DAT_Projekt/notebooks/hourly_bikes.ipynb

181 lines
56 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d6fa6fc8",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import holidays"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a2f9a292",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_47774/1257658190.py:2: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" df = df.replace([\"na\", \"NA\", \"Na\"], np.nan)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"operator_name 0\n",
"domain_name 0\n",
"domain_id 0\n",
"counter_site 0\n",
"counter_site_id 0\n",
"counter_serial 0\n",
"longitude 0\n",
"latitude 0\n",
"timezone 0\n",
"iso_timestamp 0\n",
"channels_in 0\n",
"channels_out 0\n",
"channels_unknown 698715\n",
"channels_all 0\n",
"site_temperature 8846\n",
"site_rain_accumulation 8846\n",
"site_snow_accumulation 698715\n",
"year 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# Load the hourly counter data. The file marks missing values with the strings\n",
"# 'na' / 'NA' / 'Na'; declaring them as na_values makes pandas parse them as NaN\n",
"# while reading, which avoids the FutureWarning ('Downcasting behavior in\n",
"# `replace` is deprecated') raised by a post-hoc df.replace(...).\n",
"df = pd.read_csv(\n",
"    \"../data/processed/hourly_bikes_mannheim.csv\",\n",
"    na_values=[\"na\", \"NA\", \"Na\"],\n",
"    low_memory=False,\n",
")\n",
"\n",
"# Number of missing values per column.\n",
"print(df.isna().sum())\n",
"\n",
"# Ensure the weather measurements are floats (NaN where the value was missing).\n",
"weather_cols = [\"site_temperature\", \"site_rain_accumulation\", \"site_snow_accumulation\"]\n",
"df[weather_cols] = df[weather_cols].astype(float)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "623979f9",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_47774/833663349.py:15: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n",
" agg_df = df.groupby('time_of_day')['channels_all'].sum().reset_index()\n",
"/tmp/ipykernel_47774/833663349.py:19: FutureWarning: \n",
"\n",
"Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.\n",
"\n",
" sns.barplot(x='time_of_day', y='channels_all', data=agg_df, palette='Blues_d')\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Parse 'iso_timestamp' into a single tz-aware (UTC) datetime column.\n",
"df['timestamp'] = pd.to_datetime(df['iso_timestamp'], utc=True)\n",
"\n",
"# Hour of day in LOCAL time. Taking the hour straight from the UTC timestamp\n",
"# shifts every time-of-day bin by 1-2 hours for counters in Germany.\n",
"# NOTE(review): 'Europe/Berlin' is assumed because the data is for Mannheim --\n",
"# confirm against the dataset's 'timezone' column.\n",
"df['hour'] = df['timestamp'].dt.tz_convert('Europe/Berlin').dt.hour\n",
"\n",
"# Edges and labels of the 3-hour bins; with right=False each bin is [start, end),\n",
"# so hours 0..23 are all covered.\n",
"bins = [0, 3, 6, 9, 12, 15, 18, 21, 24]\n",
"labels = ['00:00-03:00', '03:00-06:00', '06:00-09:00', '09:00-12:00',\n",
"          '12:00-15:00', '15:00-18:00', '18:00-21:00', '21:00-24:00']\n",
"\n",
"# Assign each row to its time-of-day bin.\n",
"df['time_of_day'] = pd.cut(df['hour'], bins=bins, labels=labels, right=False)\n",
"\n",
"# Total count per bin. observed=False keeps every bin (even empty ones) and is\n",
"# passed explicitly to silence the pandas FutureWarning about the changing default.\n",
"agg_df = df.groupby('time_of_day', observed=False)['channels_all'].sum().reset_index()\n",
"\n",
"# Bar plot of the aggregated counts. seaborn >= 0.13 requires `hue` whenever a\n",
"# palette is given; legend=False keeps the figure identical to the old call.\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"sns.barplot(x='time_of_day', y='channels_all', hue='time_of_day', data=agg_df,\n",
"            palette='Blues_d', legend=False, ax=ax)\n",
"\n",
"ax.set_title('Total Bikes Counted by Time of Day', fontsize=16)\n",
"ax.set_xlabel('Time of Day', fontsize=14)\n",
"ax.set_ylabel('Total Bikes Counted', fontsize=14)\n",
"\n",
"# Rotate the x-axis labels for better readability.\n",
"ax.tick_params(axis='x', rotation=45)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0364d24a",
"metadata": {
"vscode": {
"languageId": "python"
}
},
"outputs": [],
"source": [
"# Derive calendar features from LOCAL time: 'timestamp' is tz-aware, and using it\n",
"# unconverted would put rows around midnight on the wrong day/hour.\n",
"# NOTE(review): 'Europe/Berlin' is assumed because the data is for Mannheim --\n",
"# confirm against the dataset's 'timezone' column.\n",
"local_ts = df['timestamp'].dt.tz_convert('Europe/Berlin')\n",
"\n",
"df['hour'] = local_ts.dt.hour\n",
"df['day_of_week'] = local_ts.dt.dayofweek  # 0=Monday, 6=Sunday\n",
"df['month'] = local_ts.dt.month\n",
"df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)  # 1 if weekend, 0 if weekday\n",
"\n",
"# Build the holiday calendar for every year present in the data. With a\n",
"# hard-coded years=2023, .keys() only holds 2023 dates, so rows from any other\n",
"# year could never be flagged as a holiday.\n",
"# NOTE(review): this uses nationwide German holidays only; consider a state-level\n",
"# calendar (e.g. subdiv='BW') if regional holidays matter -- verify with the holidays docs.\n",
"data_years = local_ts.dt.year.dropna().astype(int).unique().tolist()\n",
"de_holidays = holidays.Germany(years=data_years)\n",
"df['is_holiday'] = local_ts.dt.date.isin(list(de_holidays.keys())).astype(int)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0eb01368",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dat (3.12.3)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}