diff --git a/Titanic.ipynb b/Titanic.ipynb index 7b6e7ed..0f8463c 100644 --- a/Titanic.ipynb +++ b/Titanic.ipynb @@ -39,6 +39,11 @@ "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/lib/python3.11/site-packages (from matplotlib) (3.1.2)\n", "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.11/site-packages (from matplotlib) (2.8.2)\n", "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", + "Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.11/site-packages (1.4.2)\n", + "Requirement already satisfied: numpy>=1.19.5 in /opt/conda/lib/python3.11/site-packages (from scikit-learn) (1.26.4)\n", + "Requirement already satisfied: scipy>=1.6.0 in /opt/conda/lib/python3.11/site-packages (from scikit-learn) (1.11.4)\n", + "Requirement already satisfied: joblib>=1.2.0 in /opt/conda/lib/python3.11/site-packages (from scikit-learn) (1.4.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.11/site-packages (from scikit-learn) (3.4.0)\n", "Requirement already satisfied: ydata_profiling in /opt/conda/lib/python3.11/site-packages (4.7.0)\n", "Requirement already satisfied: scipy<1.12,>=1.4.1 in /opt/conda/lib/python3.11/site-packages (from ydata_profiling) (1.11.4)\n", "Requirement already satisfied: pandas!=1.4.0,<3,>1.1 in /opt/conda/lib/python3.11/site-packages (from ydata_profiling) (2.2.1)\n", @@ -73,7 +78,7 @@ "Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in /opt/conda/lib/python3.11/site-packages (from numba<1,>=0.56.0->ydata_profiling) (0.42.0)\n", "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.11/site-packages (from pandas!=1.4.0,<3,>1.1->ydata_profiling) (2023.3.post1)\n", "Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.11/site-packages (from pandas!=1.4.0,<3,>1.1->ydata_profiling) (2024.1)\n", - "Requirement already satisfied: 
joblib>=0.14.1 in /opt/conda/lib/python3.11/site-packages (from phik<0.13,>=0.11.1->ydata_profiling) (1.1.1)\n", + "Requirement already satisfied: joblib>=0.14.1 in /opt/conda/lib/python3.11/site-packages (from phik<0.13,>=0.11.1->ydata_profiling) (1.4.0)\n", "Requirement already satisfied: annotated-types>=0.4.0 in /opt/conda/lib/python3.11/site-packages (from pydantic>=2->ydata_profiling) (0.6.0)\n", "Requirement already satisfied: pydantic-core==2.16.3 in /opt/conda/lib/python3.11/site-packages (from pydantic>=2->ydata_profiling) (2.16.3)\n", "Requirement already satisfied: typing-extensions>=4.6.1 in /opt/conda/lib/python3.11/site-packages (from pydantic>=2->ydata_profiling) (4.11.0)\n", @@ -114,10 +119,9 @@ "# Needed Packages to comunicate with mySQL from python\n", "!pip install sqlalchemy mysql-connector-python\n", "\n", - "# for pandas\n", "!pip install pandas\n", "!pip install matplotlib\n", - "\n", + "!pip install scikit-learn\n", "!pip install ydata_profiling\n", "!pip install ipywidgets" ] @@ -138,6 +142,12 @@ "outputs": [], "source": [ "from sqlalchemy import create_engine, text\n", + "from ydata_profiling import ProfileReport\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix, classification_report\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", @@ -243,7 +253,7 @@ "id": "57e0ecb7-d73b-4a97-b683-d48dcd148ca7", "metadata": {}, "source": [ - "# Analyze the dataset distribution" + "# Analyze the dataset distribution with respect to the survived target class" ] }, { @@ -535,7 +545,13 @@ "id": "38843fc3-81fb-4d4a-9045-eae86ff6bd1e", "metadata": {}, "source": [ - "# Correlation Heatmap" + "# Correlation Heatmap\n", + "A correlation heatmap is a visual 
representation of the correlation between variables in a dataset. It uses colors to show the strength and direction of these relationships.\n", + "\n", + "## How to read\n", + "- Color Gradient: Colors indicate correlation strength, with red for positive correlation and blue for negative correlation.\n", + "- Diagonal Line: Represents perfect correlation (1) of variables with themselves.\n", + "- Symmetry: The heatmap is symmetrical around the diagonal." ] }, { @@ -602,7 +618,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "26b3bcf8bcf54fcea408efd0ce0f540b", + "model_id": "9b2a7638b3de4ab5a7714620ac0e560f", "version_major": 2, "version_minor": 0 }, @@ -616,7 +632,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "39795ba3d0464e72957892c668feda06", + "model_id": "ff14cd91ccaf406dbba38e9cd25f47d2", "version_major": 2, "version_minor": 0 }, @@ -630,7 +646,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e283f57ff6044fd385ab256092a82bb8", + "model_id": "664a9698faea40ae89dcc6293d53b4dc", "version_major": 2, "version_minor": 0 }, @@ -644,7 +660,7 @@ { "data": { "text/html": [ - "