From 3ac48802375590146fc9655eb98f1ea4ffb3132f Mon Sep 17 00:00:00 2001
From: Felix Mucha <3016498@stud.hs-mannheim.de>
Date: Wed, 26 Jun 2024 18:33:03 +0200
Subject: [PATCH] added first biases
---
README.md | 16 +++++++++++++---
notebooks/demographic_plots.ipynb | 30 ++++++++++++++++++++++++++++++
2 files changed, 43 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 77d77ef..d4875c9 100644
--- a/README.md
+++ b/README.md
@@ -260,11 +260,21 @@ Further analysis included the creation of a Euclidean distance matrix plot to vi
The detailed procedures can be found in the following notebook:
[cluster_features.ipynb](notebooks/cluster_features.ipynb)
-## Legal basis
+## Legal Basis and Data Biases
(version 03.07)
-- The data used all come from one hospital
-- Most of the data are from people of older age, predominantly from the 60-70 age group
+### Local Bias
+- The dataset originates exclusively from one hospital and encompasses contributions from Chapman University, Shaoxing People’s Hospital (affiliated with Shaoxing Hospital Zhejiang University School of Medicine), and Ningbo First Hospital. This may introduce a local bias, as all data were collected within a specific geographic and institutional context.
+
+### Demographic Bias
+- The dataset predominantly features data from older individuals, with the largest share of participants falling within the 60-70 age group. This demographic skew is further detailed by:
+ - Average age: 59.59 years
+ - Standard deviation of age: 18.29 years
+ - Male ratio: 57.34%
+ - Female ratio: 42.66%
+- This indicates a potential demographic bias towards older age groups and a gender imbalance towards male participants (57.34% male vs. 42.66% female).
+
+### TODO
- Zustimmung und Anonymität:
- Datenschutz und Ethik:
diff --git a/notebooks/demographic_plots.ipynb b/notebooks/demographic_plots.ipynb
index 365b781..bf6bac9 100644
--- a/notebooks/demographic_plots.ipynb
+++ b/notebooks/demographic_plots.ipynb
@@ -59,6 +59,36 @@
"df_dgc['age_group'] = pd.cut(df_dgc['age'], bins=age_categories)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# avg age and std dev overall and for each group\n",
+ "avg_age = df_dgc['age'].mean()\n",
+ "std_age = df_dgc['age'].std()\n",
+ "avg_age_group = df_dgc.groupby('age_group')['age'].mean()\n",
+ "std_age_group = df_dgc.groupby('age_group')['age'].std()\n",
+ "\n",
+ "# print \n",
+ "print(\"Average age: \", avg_age)\n",
+ "print(\"Std Dev age: \", std_age)\n",
+ "print(\"Average age group: \", avg_age_group)\n",
+ "print(\"Std Dev age group: \", std_age_group)\n",
+ "\n",
+ "# female and male ratio\n",
+ "count_male = df_dgc[df_dgc['gender'] == 'Male'].shape[0]\n",
+ "count_female = df_dgc[df_dgc['gender'] == 'Female'].shape[0]\n",
+ "count_total = df_dgc.shape[0]\n",
+ "male_ratio = count_male / count_total\n",
+ "female_ratio = count_female / count_total\n",
+ "\n",
+ "# print\n",
+ "print('Male Ratio: ', male_ratio)\n",
+ "print('Female Ratio:', female_ratio)\n"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},