DAT_Projekt/notebooks/hourly_bikes.ipynb

187 lines
63 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "d6fa6fc8",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import holidays"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a2f9a292",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_4598/1257658190.py:2: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" df = df.replace([\"na\", \"NA\", \"Na\"], np.nan)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"operator_name 0\n",
"domain_name 0\n",
"domain_id 0\n",
"counter_site 0\n",
"counter_site_id 0\n",
"counter_serial 0\n",
"longitude 0\n",
"latitude 0\n",
"timezone 0\n",
"iso_timestamp 0\n",
"channels_in 0\n",
"channels_out 0\n",
"channels_unknown 698715\n",
"channels_all 0\n",
"site_temperature 8846\n",
"site_rain_accumulation 8846\n",
"site_snow_accumulation 698715\n",
"year 0\n",
"dtype: int64\n"
]
}
],
"source": [
"df = pd.read_csv(\"../data/processed/hourly_bikes_mannheim.csv\", low_memory=False)\n",
"df = df.replace([\"na\", \"NA\", \"Na\"], np.nan)\n",
"\n",
"print(df.isna().sum())\n",
"df[\"site_temperature\"] = df[\"site_temperature\"].astype(float)\n",
"df[\"site_rain_accumulation\"] = df[\"site_rain_accumulation\"].astype(float)\n",
"df[\"site_snow_accumulation\"] = df[\"site_snow_accumulation\"].astype(float)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "623979f9",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_4598/2721099372.py:17: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
" agg_df = df_ma.groupby('time_of_day')['channels_all'].mean().reset_index()\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 2000x1000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Ensure the 'iso_timestamp' is properly parsed with utc=True\n",
"df['timestamp'] = pd.to_datetime(df['iso_timestamp'], utc=True) # Convert to datetime with UTC handling\n",
"\n",
"# Extract the hour from the timestamp\n",
"df['hour'] = df['timestamp'].dt.hour\n",
"\n",
"bins = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,17,18,19,20,21,22,23,24] # These are the edges of your 3-hour bins\n",
"\n",
"\n",
"# Add a new column with the time bin for each row\n",
"df['time_of_day'] = pd.cut(df['hour'], bins=bins, right=False)\n",
"\n",
"# Aggregate the data by time_of_day\n",
"#agg_df = df.groupby('time_of_day')['channels_all'].sum().reset_index()\n",
"\n",
"df_ma = df[df['counter_site'] == \"Renzstraße\"]\n",
"agg_df = df_ma.groupby('time_of_day')['channels_all'].mean().reset_index()\n",
"\n",
"\n",
"\n",
"# Create a Seaborn bar plot to visualize the aggregated data\n",
"plt.figure(figsize=(20, 10))\n",
"ax = sns.barplot( data=agg_df,x=\"time_of_day\", y=\"channels_all\")\n",
"\n",
"\n",
"n = len(agg_df) # bei dir 24\n",
"\n",
"# Ticks auf die Grenzen legen: -0.5, 0.5, 1.5, ..., 23.5\n",
"ax.set_xlim(-0.5, n - 0.5)\n",
"ax.set_xticks(np.arange(-0.5, n + 0.5, 1))\n",
"\n",
"# Tick-Labels: 0..24 (Grenzwerte)\n",
"ax.set_xticklabels([str(i) for i in range(n + 1)], rotation=0)\n",
"ax.ticklabel_format(axis='y', style='plain', useOffset=False)\n",
"\n",
"# Add some labels and title to the plot\n",
"ax.set_title('Fahrradpassagen pro Stunde als arithmetisches Mittel über den Beobachtungszeitraum (Mannheim, Renzstraße, ab 2014)', fontsize=16)\n",
"plt.xlabel('Uhrzeit', fontsize=16)\n",
"\n",
"from matplotlib.ticker import EngFormatter, FuncFormatter\n",
"\n",
"#ax.yaxis.set_major_formatter(FuncFormatter(lambda x, pos: f\"{x/1e6:.1f}\"))\n",
"plt.ylabel('Fahrradpassagen', fontsize=16)\n",
"\n",
"# Optionally, rotate the x-axis labels for better readability\n",
"plt.xticks(rotation=0)\n",
"\n",
"# Show the plot\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0364d24a",
"metadata": {
"vscode": {
"languageId": "python"
}
},
"outputs": [],
"source": [
"df['hour'] = df['timestamp'].dt.hour\n",
"df['day_of_week'] = df['timestamp'].dt.dayofweek # 0=Monday, 6=Sunday\n",
"df['month'] = df['timestamp'].dt.month\n",
"df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int) # 1 if weekend, 0 if weekday\n",
"# You can manually add holidays or use a library like `holidays` to check if the day is a public holiday.\n",
"de_holidays = holidays.Germany(years=2023) # For Germany, for example\n",
"df['is_holiday'] = df['timestamp'].dt.date.isin(de_holidays.keys()).astype(int)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "dat (3.12.3)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}