Higher bike counts in 2023:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
import glob
import os
pfad = "../data/tageswerte_pro_Jahr/*.csv"

dateien = glob.glob(pfad)
dfs = []
for datei in dateien:
    df_temp = pd.read_csv(datei, low_memory=False)
    # year is taken from the file name (pattern assumed: *_*_<year>.csv)
    df_temp["year"] = os.path.basename(datei).split("_")[2].split(".")[0]
    dfs.append(df_temp.astype({"year": int}))

df = pd.concat(dfs, ignore_index=True)
# Clean the data

# replace various forms of 'na' with np.nan
df = df.replace(["na", "NA", "Na"], np.nan)

df_backup = df.copy()
print(df.isna().sum())
# TODO: check whether temperature, rain, and snow values are missing in a pattern or at random (see the quick check below)
# convert numeric values to corresponding dtypes
df["site_temperature"] = df["site_temperature"].astype(float)
df["site_rain_accumulation"] = df["site_rain_accumulation"].astype(float)
df["site_snow_accumulation"] = df["site_snow_accumulation"].astype(float)

# Mannheim subset, saved to file
df_ma = df[df["domain_name"] == 'Stadt Mannheim']
df_ma.to_csv("../data/processed/daily_bikes_mannheim.csv", index=False)
#print(df_ma.isna().sum())



# What is the ratio of rain to snow over the entire period?


labels = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]
df2 = df.copy()


df2["iso_timestamp"] = pd.to_datetime(
    df2["iso_timestamp"],
    errors="coerce", #If 'coerce', then invalid parsing will be set as NaT.
    utc=True
)

df_ma = df2.loc[df2["domain_name"] == "Stadt Mannheim"].copy()  # copy to avoid SettingWithCopyWarning
df_ma = df_ma.dropna(subset=["iso_timestamp"])
df_ma["month"] = df_ma["iso_timestamp"].dt.month.astype(int)
df_ma["site_rain_accumulation"] = pd.to_numeric(df_ma["site_rain_accumulation"], errors="coerce").clip(lower=0)
df_ma["site_snow_accumulation"] = pd.to_numeric(df_ma["site_snow_accumulation"], errors="coerce").clip(lower=0)

monthly = (
    df_ma.groupby("month")[["site_rain_accumulation", "site_snow_accumulation"]]
        .sum()
        .reindex(range(1, 13), fill_value=0)
)

rain_sum = monthly["site_rain_accumulation"].to_numpy()
snow_sum = monthly["site_snow_accumulation"].to_numpy()

x = np.arange(12)
width = 0.35
fig, ax = plt.subplots()

rects1 = ax.bar(x - width/2, rain_sum, width, label="Rain")
rects2 = ax.bar(x + width/2, snow_sum, width, label="Snow")

ax.set_ylabel("Akkumulation")
ax.set_title("Monatliche Summe Regen vs. Schnee (Mannheim)")
ax.set_xticks(x, labels, rotation=45, ha="right")
ax.legend()

ax.bar_label(rects1, padding=3, fmt="%.1f")
ax.bar_label(rects2, padding=3, fmt="%.1f")

fig.tight_layout()
plt.show()


# IDEA: one could also show how accurate the snow/rain values are by color-coding whether the temperature was above or below 0 °C.
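
A minimal sketch of that idea, using the df_ma subset built above: plot snow accumulation against temperature, split by whether the temperature was below 0 °C, to spot implausible snow readings on warm days.

below_zero = df_ma["site_temperature"] < 0
plt.scatter(df_ma.loc[below_zero, "site_temperature"],
            df_ma.loc[below_zero, "site_snow_accumulation"], s=10, label="T < 0 °C")
plt.scatter(df_ma.loc[~below_zero, "site_temperature"],
            df_ma.loc[~below_zero, "site_snow_accumulation"], s=10, label="T >= 0 °C")
plt.xlabel("Temperature (°C)")
plt.ylabel("Snow accumulation")
plt.legend()
plt.show()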
operator_name                  0
domain_name                    0
domain_id                      0
counter_site                   0
counter_site_id                0
counter_serial              1593
longitude                      0
latitude                       0
timezone                       0
iso_timestamp                  0
channels_in                26510
channels_out               26559
channels_unknown          220492
channels_all                   0
site_temperature           20642
site_rain_accumulation     20642
site_snow_accumulation     20656
year                           0
dtype: int64

# Shorten labels for better readability in the plots
max_length = 15
df_ma["short_labels"] = df_ma["counter_site"].apply(lambda x: x[:max_length] + ("…" if len(x) > max_length else ""))
avg_per_year = df_ma.groupby("year")["channels_all"].mean()
years = avg_per_year.index.astype(str)
fig, axs = plt.subplots(1,2, figsize=(15, 5))

sns.barplot(x=years, y=avg_per_year.values, ax=axs[0])
axs[0].set_yticks(range(0, 3501, 500))
axs[0].set_xlabel("Year")
axs[0].set_ylabel("Average channels_all")
axs[0].set_title("Average daily observations per year (Mannheim 2014-2025)", fontsize=16)


# IMPORTANT: the drop in the mean comes from newly added counting stations that were not located at hotspots, which pulls the mean down.
plot_data = pd.DataFrame({
    'Year': years,
    'Average': avg_per_year
})

sns.lineplot(x='Year', y='Average', data=plot_data, marker='o', ax=axs[1])
axs[1].set_yticks(range(0, 3501, 500))
axs[1].set_yticklabels(range(0,3501, 500))
axs[1].set_title("Average daily observations (Mannheim 2014-2025)", fontsize=16)
axs[1].set_xlabel("Year", fontsize=12)
axs[1].set_ylabel("Average Number of bikes", fontsize=12)

plt.tight_layout()
plt.show()

y = df_ma.groupby(by="year")["counter_site"].nunique()
plt.figure(figsize=(10, 6))
sns.lineplot(x=y.index, y=y.values, marker='o')
plt.title("Number of counting stations per year in Mannheim", fontsize=16)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Number of stations", fontsize=12)
plt.tight_layout()
plt.show()

# should this be normalized? (see the per-station version below)
sum_per_year = df_ma.groupby("year")["channels_all"].sum()

sum_per_year = sum_per_year.sort_index()

plt.figure(figsize=(10, 5))
sns.barplot(x=sum_per_year.index.astype(str), y=sum_per_year.values / 1e6)  # scale to millions to match the axis label

plt.xlabel("Year")
plt.ylabel("Total observations in millions")
plt.title("Total observations per year (Mannheim)")
plt.tight_layout()
plt.show()
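
One possible answer to the normalization question above, as a sketch: divide each year's total by the number of stations active in that year, so that newly added stations do not inflate the totals.

# hedged sketch: yearly total per active station
stations_per_year = df_ma.groupby("year")["counter_site"].nunique()
per_station = sum_per_year / stations_per_year

plt.figure(figsize=(10, 5))
sns.barplot(x=per_station.index.astype(str), y=per_station.values)
plt.xlabel("Year")
plt.ylabel("Total observations per active station")
plt.title("Yearly total per active station (Mannheim)")
plt.tight_layout()
plt.show()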

df_ma["timestamp"] = pd.to_datetime(df_ma["iso_timestamp"])
df_ma["year"] = df_ma["timestamp"].dt.year
df_ma["month"] = df_ma["timestamp"].dt.month
df_ma["date"] = df_ma["timestamp"].dt.date

df_2024 = df_ma[df_ma["year"] == 2024].copy()

monthly_avg_2024 = (df_2024.groupby("month")["channels_all"].mean().reset_index())


plt.figure(figsize=(10, 5))
plt.bar( monthly_avg_2024["month"], monthly_avg_2024["channels_all"], color="steelblue" ) 
plt.title("Average Daily Bike Counts per Month (Mannheim 2024)") 
plt.xlabel("Month") 
plt.ylabel("Average daily bike counts") 
plt.xticks(range(1, 13)) 
plt.grid(axis="y", alpha=0.3) 
plt.tight_layout() 
plt.show()

df_ma["timestamp"] = pd.to_datetime(df_ma["iso_timestamp"])
df_ma["year"] = df_ma["timestamp"].dt.year
df_ma["month"] = df_ma["timestamp"].dt.month
df_ma["date"] = df_ma["timestamp"].dt.date

df_2023 = df_ma[df_ma["year"] == 2023].copy()

monthly_avg_2023 = (df_2023.groupby("month")["channels_all"].mean().reset_index())


plt.figure(figsize=(10, 5))
plt.bar( monthly_avg_2023["month"], monthly_avg_2023["channels_all"], color="steelblue" ) 
plt.title("Average Daily Bike Counts per Month (Mannheim 2023)") 
plt.xlabel("Month") 
plt.ylabel("Average daily bike counts") 
plt.xticks(range(1, 13)) 
plt.grid(axis="y", alpha=0.3) 
plt.tight_layout() 
plt.show()
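
To tie this back to the opening claim about higher counts in 2023, a quick month-by-month comparison of the two averages computed above:

# hedged sketch: ratio of 2023 to 2024 average daily counts per month
comparison = monthly_avg_2023.merge(monthly_avg_2024, on="month", suffixes=("_2023", "_2024"))
comparison["ratio_2023_vs_2024"] = comparison["channels_all_2023"] / comparison["channels_all_2024"]
print(comparison.round(2))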

Does the BUGA season influence bicycle traffic in Mannheim? (14 April to 8 October)

df_22_23_24 = df_ma[df_ma["year"].isin([2022, 2023,2024])].copy()


monthly_sum = (df_22_23_24.groupby(["year", "month"])["channels_all"].sum().reset_index())
monthly_pivot = monthly_sum.pivot(index="month", columns="year", values="channels_all")


monthly_pivot.plot(kind="bar", figsize=(12, 6), width=0.8)  # DataFrame.plot creates its own figure

plt.title("Total Monthly Bike Counts in Mannheim (2022-2024)")
plt.xlabel("Month")
plt.ylabel("Total bike counts per month")
plt.xticks(range(0,12), range(1,13), rotation=0)
plt.grid(axis="y", alpha=0.3)
plt.legend(title="Year")

plt.tight_layout()
plt.show()

df_ma["t"] = pd.to_datetime(df_ma["iso_timestamp"], utc=True).dt.tz_convert("Europe/Berlin")
df_ma["metric"] = pd.to_numeric(df_ma["channels_all"], errors="coerce")

def buga_sum(y):
    start = pd.Timestamp(f"{y}-04-14", tz="Europe/Berlin")
    end   = pd.Timestamp(f"{y}-10-08", tz="Europe/Berlin")
    return df_ma.loc[(df_ma["t"] >= start) & (df_ma["t"] <= end), "metric"].sum()

vals = {y: buga_sum(y) for y in [2022, 2023, 2024]}


plt.figure(figsize=(8,5))
bars = plt.bar(["2022", "2023 (BUGA)", "2024"], [vals[2022], vals[2023], vals[2024]],)
plt.bar_label(bars, padding=3, fmt="%.0f")
plt.title("Bike Counts During BUGA Period (14 Apr – 8 Oct)")
plt.ylabel("Total bike counts")
plt.tight_layout()
plt.show()
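
To quantify the effect shown above, the relative change of the BUGA-year total against its neighbors (vals as computed in this cell):

print(f"2023 vs 2022: {(vals[2023] / vals[2022] - 1) * 100:+.1f} %")
print(f"2023 vs 2024: {(vals[2023] / vals[2024] - 1) * 100:+.1f} %")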

df_ma["year"] = df_ma["t"].dt.year
df_ma["month"] = df_ma["t"].dt.month

df_buga = df_ma[df_ma["month"].between(4, 10)]

monthly = df_buga.groupby(["year", "month"])["metric"].sum().reset_index()

plt.figure(figsize=(10,6))
for y in [2022, 2023, 2024]:
    subset = monthly[monthly["year"] == y]
    plt.plot(subset["month"], subset["metric"], marker="o", label=str(y))

plt.title("Monthly Bike Counts (April–October)")
plt.xlabel("Month")
plt.ylabel("Total bike counts")
plt.xticks(range(4,11))
plt.legend()
plt.grid(alpha=0.3)
plt.show()

Many people are presumably on vacation in August, but September 2023 shows clearly more bike counts than 2022 and 2024.

df_ma["weekday"] = df_ma["t"].dt.day_name()

def buga_period(y):
    start = pd.Timestamp(f"{y}-04-14", tz="Europe/Berlin")
    end   = pd.Timestamp(f"{y}-10-08", tz="Europe/Berlin")
    return df_ma[(df_ma["t"] >= start) & (df_ma["t"] <= end)]

df_buga_2023 = buga_period(2023)
df_buga_2022 = buga_period(2022)

sns.boxplot(
    data=pd.concat([df_buga_2022.assign(year="2022"), df_buga_2023.assign(year="2023")]),
    x="weekday", y="metric", hue="year"
)
plt.xticks(rotation=45)
plt.title("Weekday Bike Counts During BUGA vs. 2022")
plt.show()

sns.barplot(df_ma[df_ma["counter_site"] == "Renzstraße"],x="year", y="channels_all")
# sns.barplot(df_ma[df_ma["counter_site"] == "Kurpfalzbrücke"],x="year", y="channels_all")
plt.show()

station_sum = df_ma.groupby("short_labels")["channels_all"].sum().sort_values(ascending=False)
year_counts = df_ma.groupby("short_labels")["year"].nunique()
year_counts = year_counts[station_sum.index]  # Reorder year_counts to match the sorted station_sum

plt.figure(figsize=(10, 7))
sns.barplot(x=station_sum.index, y=station_sum.values / 1e6, hue=year_counts)  # hue encodes how many years each station has been active
plt.xticks(rotation=75)
plt.xlabel("Station")
plt.ylabel("Total observations (2014-2024) in millions")
plt.title("Total observations per station (Mannheim)")
plt.tight_layout()
plt.show()

Largest bicycle traffic volume by city

Since the counting stations were put into operation at different times, a common observation period was defined for comparative analyses. For each station, the first year with available measurement data was determined; only stations with data available no later than 2018 were included in the analysis.

station_coverage = (
    df.groupby("counter_site")
      .agg(
          first_year=("year", "min"),
          last_year=("year", "max"),
          n_observations=("year", "count")
      )
      .reset_index()
)

station_coverage = station_coverage.sort_values("first_year")
valid_stations = station_coverage[
    station_coverage["first_year"] <= 2018
]["counter_site"]

df_common = df[
    (df["counter_site"].isin(valid_stations)) &
    (df["year"] >= 2018)
]
plt.figure(figsize=(10, 6))
sns.histplot(station_coverage["first_year"], bins=range(2013, 2026))
plt.xlabel("Startjahr der Zählstelle")
plt.ylabel("Anzahl der Zählstellen")
plt.title("Verteilung der Inbetriebnahmejahre der Fahrradzählstellen")
plt.show()

# Top 10 counting stations
avg_df_common = (
    df_common.groupby(["counter_site", "domain_name"], as_index=False)
      .agg(avg_daily_counts=("channels_all", "mean"))
)

# Top 10 counting stations by average daily bicycle traffic
top10_common = avg_df_common.sort_values("avg_daily_counts", ascending=False).head(10)

# Visualization of the top 10 counting stations
plt.figure(figsize=(15, 6))
sns.barplot(
    data=top10_common,
    y="counter_site",
    x="avg_daily_counts",
    hue="domain_name"
)

plt.ylabel("Zählstelle")
plt.xlabel("Durchschnittliche Fahrradfahrten pro Tag")
plt.title("Top 10 Zählstellen mit dem höchsten durchschnittlichen Fahrradverkehr\n(2018 und später)")
plt.tight_layout()
plt.show()

# TODO: which station saw the largest jump in measured values within a year -> what caused it? (see the sketch after the boxplot below)

plt.figure(figsize=(20, 10))
sns.boxplot(data=df_ma, x="short_labels", y="channels_all")
plt.xticks(rotation=45)
plt.ylabel("Daily observations")
plt.title("Distribution of daily counts per station (Stadt Mannheim - 2014-2024)")
plt.tight_layout()
plt.show()
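
A sketch for the TODO above, assuming yearly sums per station are a reasonable proxy; stations that start or stop reporting produce NaN/inf changes and would need filtering in a real analysis.

# hedged sketch: largest relative year-over-year jump per station
yearly = df_ma.groupby(["counter_site", "year"])["channels_all"].sum().unstack("year")
rel_change = yearly.pct_change(axis=1)        # change between consecutive years
max_jump = rel_change.abs().stack().idxmax()  # (station, year) with the biggest jump
print("Largest jump:", max_jump, f"({rel_change.loc[max_jump]:.1%})")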


df["timestamp"] = pd.to_datetime(df_ma["iso_timestamp"])
#Renzstraße
renz = df_ma[df_ma["counter_site"] == "Renzstraße"]
renz_max = renz.loc[renz["channels_all"].idxmax()]
print("Renzstraße - Höchster Wert:")
print(renz_max[["date", "channels_all"]])

#Kurpfalzbrücke
kurp = df_ma[df_ma["counter_site"] == "Kurpfalzbrücke"]
kurp_max = kurp.loc[kurp["channels_all"].idxmax()]
print("Kurpfalzbrücke - Höchster Wert:")
print(kurp_max[["date", "channels_all"]])      

print("am 11.04.2018 gab es in Mannheim einen Warnstreik im öffentlichen Dienst und mehrere tausend Menschen nahmen an Demonstrationen teil.")
print("Das erklärt die Ausreißer bei den Fahrradzählungen an diesem Tag.")

Renzstraße - highest value:
date            2018-04-10
channels_all          7638
Name: 239807, dtype: object
Kurpfalzbrücke - highest value:
date            2018-04-10
channels_all         10558
Name: 245866, dtype: object
On 11 April 2018 there was a public-sector warning strike in Mannheim and several thousand people took part in demonstrations.
This explains the outliers in the bike counts on that day.
df_ma['day_of_week'] = df_ma['timestamp'].dt.dayofweek  # 0=Monday, 6=Sunday
df_ma['month'] = df_ma['timestamp'].dt.month
df_ma['is_weekend'] = df_ma['day_of_week'].isin([5, 6]).astype(int)  # 1 if weekend, 0 if weekday
# Use the `holidays` library to flag public holidays (or add them manually).
de_holidays = holidays.Germany(years=range(2014, 2026))  # cover the full data period, not just one year
df_ma['is_holiday'] = df_ma['timestamp'].dt.date.isin(de_holidays.keys()).astype(int)


variables = ['day_of_week','month','is_weekend', 'is_holiday', 'channels_all', 'site_temperature', 'site_rain_accumulation']

# Create the scatterplot matrix using Seaborn's pairplot
sns.pairplot(df_ma[variables], diag_kind='kde', plot_kws={'alpha': 0.5, 's': 50}, hue="is_holiday")

# Customize the plot
plt.suptitle('Matrix Scatterplot of Bike Counting Data', fontsize=16)
plt.show()

# is_weekend und channels_all vergleichen
sum_weekdays = df_ma.loc[df_ma["is_weekend"] == 0, "channels_all"].sum()
sum_weekends = df_ma.loc[df_ma["is_weekend"] == 1, "channels_all"].sum()
num_weekdays = len(df_ma[df_ma["is_weekend"] == 0])  # each row is one day, so len() counts days
num_weekends = len(df_ma[df_ma["is_weekend"] == 1])
avg_weekdays = sum_weekdays / num_weekdays
avg_weekends = sum_weekends / num_weekends
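
Printing both averages makes the weekday/weekend comparison explicit:

print(f"Average daily counts, weekdays: {avg_weekdays:.1f}")
print(f"Average daily counts, weekends: {avg_weekends:.1f}")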

Correlation between temperature and bicycle use

fig, axs = plt.subplots(1,2,figsize=(15, 5))
sns.scatterplot(data=df_ma, y="site_temperature", x="channels_all", ax=axs[0])
sns.regplot(data=df_ma, y="site_temperature", x="channels_all",scatter_kws={'s': 10}, line_kws={'color': 'red'}, ax=axs[1])
plt.xlabel("Site temperature")
plt.ylabel("Bike counts") 
axs[0].set_title("Temperature vs. bicycle counts (Stadt Mannheim)")
plt.tight_layout()
plt.show()
# Pearson
pearson_ma = df_ma[["channels_all", "site_temperature"]].corr(method="pearson").loc["channels_all", "site_temperature"]

# Spearman
spearman_ma = df_ma[["channels_all", "site_temperature"]].corr(method="spearman").loc["channels_all", "site_temperature"]

print(f"Pearson: {pearson_ma:.3f}")
print(f"Spearman: {spearman_ma:.3f}")

# Drop all nan values in site_temperature from big dataframe
df["site_temperature"] = df["site_temperature"].replace(["na", "NA", "Na"], np.nan)
df = df.dropna(subset=["site_temperature"])
pearson_all = df[["channels_all", "site_temperature"]].corr(method="pearson").loc["channels_all", "site_temperature"]
spearman_all = df[["channels_all", "site_temperature"]].corr(method="spearman").loc["channels_all", "site_temperature"]

print(f"Pearson: {pearson_all:.3f}")
print(f"Spearman: {spearman_all:.3f}")

Pearson: 0.202
Spearman: 0.223
Pearson: 0.195
Spearman: 0.272

The overall Pearson correlation between temperature and bicycle use in Mannheim is 0.202.

Looking at the correlation for each individual counting station, however, yields the following values.

corr_per_station = (
    df_ma.groupby("counter_site")[["channels_all", "site_temperature"]]
         .corr(method="pearson")
         .reset_index()
)

# extract only the channels_all vs. site_temperature rows
corr_per_station = corr_per_station[
    (corr_per_station["level_1"] == "site_temperature") &
    (corr_per_station["counter_site"].notna())
][["counter_site", "channels_all"]].rename(columns={"channels_all": "pearson_corr"})

print(corr_per_station.sort_values("pearson_corr"))

corr_per_station_sorted = corr_per_station.sort_values("pearson_corr")

plt.figure(figsize=(10, 5))
plt.barh(corr_per_station_sorted["counter_site"], corr_per_station_sorted["pearson_corr"])
plt.xlabel("Pearson correlation (site_temperature vs. channels_all)")
plt.ylabel("Station")
plt.title("Correlation between temperature and bicycle counts by station")
plt.tight_layout()
plt.show()
                                       counter_site  pearson_corr
1                                      B38. RI. AUS      0.011942
15                             Lindenhofüberführung      0.235979
25                    Theodor-Heuss-Anlage. RI. AUS      0.393459
21                                       Renzstraße      0.445103
19           Neckarauer Übergang -Schwetzinger Str.      0.486533
5                    Feudenheimerstr. stadteinwärts      0.510540
27                    Theodor-Heuss-Anlage. RI. IN.      0.510875
7                      Feudenheimstr. stadtauswärts      0.533080
13                                   Kurpfalzbrücke      0.548310
9                                   Jungbuschbrücke      0.557844
11                           Konrad-Adenauer-Brücke      0.568114
17                                    Luzenbergstr.      0.584917
3                                    Fernmeldeturm.      0.619432
23  Schlosspark Lindenhof (Richtung Jugendherberge)      0.629848

At most stations the correlation is positive (i.e. warmer days → more bicycle traffic). The strength, however, varies considerably between stations (from ~0 to ~0.63). At stations X, Y, Z in particular, the relationship is especially strong.

# rain categories (bin edges follow common rain-intensity thresholds: light < 2.5 mm, moderate < 7.6 mm)
df_ma["rain_category"] = pd.cut(df_ma["site_rain_accumulation"], bins=[-0.1, 0, 2.5, 7.6, 50], labels=["No Rain", "Light Rain", "Moderate Rain", "Heavy Rain"])

fig, axs = plt.subplots(1,2,figsize=(15, 5))


sns.regplot(data=df_ma, y="site_rain_accumulation", x="channels_all",scatter_kws={'s': 10, 'alpha':0.5}, line_kws={'color': 'red'}, ax=axs[0])

plt.xlabel("rain accumulation")
plt.ylabel("bike counts") 
axs[0].set_title("rain vs. bicycle counts (Stadt Mannheim)")
plt.tight_layout()

sns.violinplot(data=df_ma, x="rain_category", y="channels_all", hue="rain_category", palette="Blues", legend=False, ax=axs[1])
axs[1].set_title("Bike counts by rain category") 
axs[1].set_xlabel("Rain category") 
axs[1].set_ylabel("Bike counts") 
axs[1].grid(axis="y", alpha=0.3)
plt.tight_layout() 
plt.show()

# Spearman
spearman_ma = ( df_ma[["channels_all", "site_rain_accumulation"]] .corr(method="spearman") .loc["channels_all", "site_rain_accumulation"] ) 
print(f"Spearman (Mannheim): {spearman_ma:.3f}") 
# Clean the full dataset
df["site_rain_accumulation"] = df["site_rain_accumulation"].replace(["na", "NA", "Na"], np.nan) 
df = df.dropna(subset=["site_rain_accumulation"]) 
spearman_all = ( df[["channels_all", "site_rain_accumulation"]].corr(method="spearman").loc["channels_all", "site_rain_accumulation"] ) 
print(f"Spearman (alle Städte): {spearman_all:.3f}")

Spearman (Mannheim): -0.100
Spearman (all cities): -0.114
# Effect of the introduction of the 9-euro ticket (1 June 2022 to 31 August 2022), compared with the years before and after

# In Mannheim

df_ma["t"] = pd.to_datetime(df_ma["iso_timestamp"].astype(str).str.strip(), errors="coerce", utc=True)
df_ma = df_ma.dropna(subset=["t"])


df_ma["metric"] = pd.to_numeric(df_ma["channels_all"], errors="coerce") 

def win_sum(y):
    a = pd.Timestamp(f"{y}-06-01", tz="Europe/Berlin")
    b = pd.Timestamp(f"{y}-09-01", tz="Europe/Berlin")
    return df_ma.loc[(df_ma["t"]>=a) & (df_ma["t"]<b), "metric"].sum(skipna=True)

vals = {y: win_sum(y) for y in [2020, 2021, 2022, 2023]}
print(vals, "2022 vs 2021 %:", (vals[2022]/vals[2021]-1)*100, "2022 vs 2023 %:", (vals[2022]/vals[2023]-1)*100)

fig, ax = plt.subplots()
bars = ax.bar(["2020", "2021","2022 (9€)","2023"], [vals[2020], vals[2021], vals[2022], vals[2023]])
ax.bar_label(bars, padding=3, fmt="%.0f")
ax.set_title("Mannheim: Juni–Aug Vergleich 2021/2022/2023")
ax.set_ylabel("Summe channels_all")
plt.tight_layout()
plt.show()
{2020: np.int64(1688124), 2021: np.int64(1525185), 2022: np.int64(1789267), 2023: np.int64(2262742)} 2022 vs 2021 %: 17.31475198090724 2022 vs 2023 %: -20.924833675248877

# Monthly observations for 2021-2023 at the Kurpfalzbrücke (Mannheim) as a line chart

years  = [2021, 2022, 2023]
labels = ["Jan","Feb","Mär","Apr","Mai","Jun","Jul","Aug","Sep","Okt","Nov","Dez"]

# --- 1) prepare + filter ---
d = df_backup.copy()
d["t"] = (pd.to_datetime(d["iso_timestamp"], errors="coerce", utc=True)
            .dt.tz_convert("Europe/Berlin"))
d = d[
    (d["domain_name"].astype(str).str.strip() == "Stadt Mannheim") &
    (d["counter_site"].astype(str).str.strip() == "Kurpfalzbrücke") &
    d["t"].notna()
]
d["metric"] = pd.to_numeric(d["channels_all"], errors="coerce").fillna(0)

# --- 2) daily values -> monthly sums per year ---
daily = d.groupby(d["t"].dt.floor("D"))["metric"].sum().reset_index(name="metric")
daily["year"]  = daily["t"].dt.year
daily["month"] = daily["t"].dt.month
daily = daily[daily["year"].isin(years)]

daily["is_holiday"] = False

m = (daily.groupby(["year","month"])
          .agg(month_sum=("metric","sum"),
               holiday_share=("is_holiday","mean"))  # share of holiday days in the month
          .reset_index())

# pivot for plotting (months 1..12 as index, years as columns)
sum_pivot = (m.pivot(index="month", columns="year", values="month_sum")
               .reindex(range(1,13), fill_value=0))
hol_pivot = (m.pivot(index="month", columns="year", values="holiday_share")
               .reindex(range(1,13), fill_value=0))


x = np.arange(1, 13)
fig, ax = plt.subplots()

for y in years:
    ax.plot(x, sum_pivot[y].to_numpy(), label=str(y))


ax.set_xticks(x, labels)
ax.set_xlabel("Monat")
ax.set_ylabel("Summe channels_all")
ax.set_title("Mannheim: Monatssumme + Ferienanteil")
ax.legend()
plt.tight_layout()
plt.show()

# TODO: compute a growth forecast for 2022 to check whether the 9-euro ticket dampened growth (see the sketch at the end of this section)

grouped_df = df.groupby("domain_name")
print("Cities with counting stations since 2014:\n ")
for name, group in grouped_df:
    first_year = min(group["year"])
    if first_year <= 2014:
        print(group["domain_name"].iloc[0])
        station_means = group.groupby("counter_site")["channels_all"].mean()  # mean over all years, not just 2014
        biggest_station_in_city = station_means.idxmax()
        print(biggest_station_in_city)
Cities with counting stations since 2014:
 
Landeshauptstadt Stuttgart
König-Karls-Brücke Barometer
Stadt Freiburg
Wiwilibrücke
Stadt Heidelberg
Thedor-Heuss-Brücke Querschnitt
Stadt Heilbronn
Neckarufer
Stadt Karlsruhe
Erbprinzenstraße
Stadt Kirchheim Unter Teck
Barometer Kirchheim u. Teck
Stadt Lörrach
Untere Hartmattenstraße / Hauptfriedhof
Stadt Mannheim
Kurpfalzbrücke
Stadt Tübingen
Unterführung Steinlach/Karlstraße Südseite - Steinlachallee
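
A sketch for the forecast TODO above, assuming a simple linear trend over the two pre-ticket years is acceptable as a first cut (vals holds the June-August sums from the 9-euro cell; with only 2020 and 2021 available, the "trend" is just the line through two points, and both are pandemic years):

# hedged sketch: linear extrapolation of the June-Aug sums to 2022
fit_years = np.array([2020, 2021])
fit_vals = np.array([vals[2020], vals[2021]])
slope, intercept = np.polyfit(fit_years, fit_vals, 1)
pred_2022 = slope * 2022 + intercept
print(f"Predicted 2022 (trend):  {pred_2022:,.0f}")
print(f"Observed 2022 (9-euro):  {vals[2022]:,.0f}")
print(f"Deviation: {(vals[2022] / pred_2022 - 1) * 100:+.1f} %")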