ANLP_WS24_CA1/data_explo_reddit.ipynb

308 lines
54 KiB
Plaintext
Raw Normal View History

2024-11-20 11:52:27 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download the NLTK 'punkt' and 'stopwords' packages (no-op if already up to date)\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset: Reddit jokes"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Load the raw jokes from the JSON file into `data`.\n",
"# The encoding is passed explicitly because open() otherwise falls back to the\n",
"# platform locale encoding, which is not UTF-8 on every system.\n",
"# NOTE(review): assumes the file is UTF-8 encoded, as JSON normally is - confirm.\n",
"data_path = './data/reddit_jokes.json'\n",
"with open(data_path, encoding='utf-8') as f:\n",
"    data = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>id</th>\n",
" <th>score</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Now I have to say \"Leroy can you please paint ...</td>\n",
" <td>5tz52q</td>\n",
" <td>1</td>\n",
" <td>I hate how you cant even say black paint anymore</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Pizza doesn't scream when you put it in the ov...</td>\n",
" <td>5tz4dd</td>\n",
" <td>0</td>\n",
" <td>What's the difference between a Jew in Nazi Ge...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>...and being there really helped me learn abou...</td>\n",
" <td>5tz319</td>\n",
" <td>0</td>\n",
" <td>I recently went to America....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>A Sunday school teacher is concerned that his ...</td>\n",
" <td>5tz2wj</td>\n",
" <td>1</td>\n",
" <td>Brian raises his hand and says, “Hes in Heaven.”</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>He got caught trying to sell the two books to ...</td>\n",
" <td>5tz1pc</td>\n",
" <td>0</td>\n",
" <td>You hear about the University book store worke...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body id score \\\n",
"0 Now I have to say \"Leroy can you please paint ... 5tz52q 1 \n",
"1 Pizza doesn't scream when you put it in the ov... 5tz4dd 0 \n",
"2 ...and being there really helped me learn abou... 5tz319 0 \n",
"3 A Sunday school teacher is concerned that his ... 5tz2wj 1 \n",
"4 He got caught trying to sell the two books to ... 5tz1pc 0 \n",
"\n",
" title \n",
"0 I hate how you cant even say black paint anymore \n",
"1 What's the difference between a Jew in Nazi Ge... \n",
"2 I recently went to America.... \n",
"3 Brian raises his hand and says, “Hes in Heaven.” \n",
"4 You hear about the University book store worke... "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Wrap the loaded records in a DataFrame and preview the first five rows\n",
"df = pd.DataFrame(data=data)\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(82914, 4)\n",
"The Person has no Internet Connection...;-p\n",
"513ftd\n",
"14\n",
"-----------\n",
"Rubio on rails\n",
"48tsdn\n",
"6\n",
"-----------\n",
"\n",
"3qaqsy\n",
"29\n",
"-----------\n",
"After all, this isn't the first time Atlanta was burned by the north.\n",
"5soa19\n",
"16\n",
"-----------\n",
"I think conspiracy theorists are secretly working together to brainwash us\n",
"5sb13m\n",
"10\n",
"-----------\n"
]
}
],
"source": [
"# Keep only the jokes whose score reaches the minimum threshold\n",
"min_good_score = 4.5\n",
"good_jokes = df[df['score'] >= min_good_score]\n",
"print(good_jokes.shape)\n",
"\n",
"# Draw a random sample of the good jokes.\n",
"# DataFrame.sample draws without replacement, so no joke is printed twice, and the\n",
"# fixed random_state makes the selection reproducible across kernel restarts.\n",
"# The sample size is capped at the number of jokes that passed the filter.\n",
"number_of_jokes = 5\n",
"sampled_jokes = good_jokes.sample(n=min(number_of_jokes, len(good_jokes)), random_state=42)\n",
"\n",
"# Access the fields by column name instead of by position so the output does not\n",
"# silently change if the column order of the DataFrame changes.\n",
"for joke in sampled_jokes.itertuples(index=False):\n",
"    print(joke.body)\n",
"    print(joke.id)\n",
"    print(joke.score)\n",
"    print('-----------')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmIAAAHHCAYAAAAcbzQmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABLKklEQVR4nO3deVhV5d7/8c9W3BscACemRCDn2bQiTmqZHFFpsOFJzUrNIUvLqVJPznnS9GhamXZOJ2k4ZdpT2iNqEk6nJEsUDVNSc6gUtBS2YqLC/fuji/VzBw4QugTer+taV651f9da33Ur8bn2XntthzHGCAAAAFddBbsbAAAAKK8IYgAAADYhiAEAANiEIAYAAGATghgAAIBNCGIAAAA2IYgBAADYhCAGAABgE4IYAACATQhiQBk0adIkORyOq3Ku22+/Xbfffru1vm7dOjkcDn300UdX5fx9+/ZVeHj4VTlXcZ08eVIDBgxQUFCQHA6Hhg8ffkXO43A4NHTo0CtybABXBkEMuMbFxcXJ4XBYi7e3t0JCQhQTE6NXXnlFJ06cKJHzHDp0SJMmTVJKSkqJHK8kXcu9XY4XX3xRcXFxeuKJJ/Tuu+/qkUceuWBteHi47rzzzqvYHQA7edndAIDLM2XKFEVEROjs2bNKT0/XunXrNHz4cM2ePVuffvqpWrZsadWOGzdOY8aMKdLxDx06pMmTJys8PFytW7e+7P1Wr15dpPMUx8V6+9e//qW8vLwr3sOfsWbNGt1yyy2aOHGi3a0AuMYQxIBSomvXrrrxxhut9bFjx2rNmjW68847dffdd2vnzp3y8fGRJHl5ecnL68r+eJ86dUqVK1eW0+m8oue5lEqVKtl6/stx5MgRNW3a1O42yoTs7GxVqVLF7jaAEsNbk0Apdscdd2j8+PE6cOCA3nvvPWt7YfeIJSQkqF27dvL391fVqlXVqFEj/e1vf5P0+31dN910kySpX79+1tugcXFxkn6/D6x58+ZKTk5Whw4dVLlyZWvfP94jli83N1d/+9vfFBQUpCpVqujuu+/Wjz/+6FETHh6uvn37Ftj3/GNeqrfC7hHLzs7WqFGjFBoaKpfLpUaNGukf//iHjDEedfn3VC1dulTNmzeXy+VSs2bNtGrVqsIn/A+OHDmi/v37KzAwUN7e3mrVqpXefvttazz/frl9+/YpPj7e6n3//v2XdfyiXk9hpk6dqgoVKujVV1+1tq1cuVLt27dXlSpVVK1aNcXGxmrHjh2XPNbZs2c1efJkNWjQQN7e3qpZs6batWunhIQEj7pdu3bpwQcfVO3ateXj46NGjRrp+eef96jZunWrunbtKl9fX1WtWlWdOnXSV1995VGT/7b8+vXr9eSTTyogIEB16tQp0nWkp6erX79+qlOnjlwul4KDg3XPPfcU+e8AuFJ4RQwo5R555BH97W9/0+rVqzVw4MBCa3bs2KE777xTLVu21JQpU+RyubRnzx59+eWXkqQmTZpoypQpmjBhggYNGqT27dtLkv7yl79Yx/j111/VtWtX9ezZUw8//LACAwMv2tff//53ORwOjR49WkeOHNGcOXMUHR2tlJQU65W7y3E5vZ3PGKO7775ba9euVf/+/dW6dWt99tlnevbZZ/Xzzz/r5Zdf9qj/4osv9PHHH+vJJ59UtWrV9Morr+j+++/XwYMHVbNmzQv29dtvv+n222/Xnj17NHToUEVERGjJkiXq27evMjMzNWzYMDVp0kTvvvuuRowYoTp16mjUqFGSpNq1a1/29Rf1es43btw4vfjii3rjjTesfxvvvvuu+vTpo5iYGL300ks6deqU5s+fr3bt2mnr1q0X/eDDpEmTNG3aNA0YMEA333yz3G63Nm/erC1btuivf/2rJGn79u1q3769KlWqpEGDBik8PFx79+7V//3f/+nvf/+7pN//PbZv316+vr567rnnVKlSJb3xxhu6/fbbtX79ekVGRnqc98knn1Tt2r
U1YcIEZWdnF+k67r//fu3YsUNPPfWUwsPDdeTIESUkJOjgwYPX/Ic8UE4YANe0hQsXGknmm2++uWCNn5+fueGGG6z1iRMnmvN/vF9++WUjyRw9evSCx/jmm2+MJLNw4cICY7fddpuRZBYsWFDo2G233Watr1271kgy1113nXG73db2xYsXG0lm7ty51rawsDDTp0+fSx7zYr316dPHhIWFWetLly41kszUqVM96h544AHjcDjMnj17rG2SjNPp9Ni2bds2I8m8+uqrBc51vjlz5hhJ5r333rO2nTlzxkRFRZmqVat6XHtYWJiJjY296PEuVFvU6xkyZIgxxphRo0aZChUqmLi4OGv8xIkTxt/f3wwcONDjWOnp6cbPz6/A9j9q1arVJa+jQ4cOplq1aubAgQMe2/Py8qw/d+/e3TidTrN3715r26FDh0y1atVMhw4drG35//bbtWtnzp07V+TrOH78uJFkZs6cedGeATvx1iRQBlStWvWin5709/eXJC1btqzYN7a7XC7169fvsusfffRRVatWzVp/4IEHFBwcrBUrVhTr/JdrxYoVqlixop5++mmP7aNGjZIxRitXrvTYHh0drXr16lnrLVu2lK+vr3744YdLnicoKEi9evWytlWqVElPP/20Tp48qfXr15fA1RT9eowxGjp0qObOnav33ntPffr0scYSEhKUmZmpXr166ZdffrGWihUrKjIyUmvXrr1oL/7+/tqxY4d2795d6PjRo0e1YcMGPfbYY6pbt67HWP5b5bm5uVq9erW6d++u66+/3hoPDg7WQw89pC+++EJut9tj34EDB6pixYpFvg4fHx85nU6tW7dOx48fv+i1AXYhiAFlwMmTJz1Czx/16NFDt956qwYMGKDAwED17NlTixcvLlIou+6664p0Y36DBg081h0Oh+rXr3/F7805cOCAQkJCCsxHkyZNrPHz/TEwSFL16tUv+Yv7wIEDatCggSpU8Pzf6IXOU1xFvZ533nlH8+bN06uvvuoREiVZAeqOO+5Q7dq1PZbVq1fryJEjF+1lypQpyszMVMOGDdWiRQs9++yz2r59uzWeH16bN29+wWMcPXpUp06dUqNGjQqMNWnSRHl5eQXuJYyIiCjWdbhcLr300ktauXKlAgMD1aFDB82YMUPp6ekXvU7gauIeMaCU++mnn5SVlaX69etfsMbHx0cbNmzQ2rVrFR8fr1WrVunDDz/UHXfcodWrV3u82nCxY5S0Cz10Njc397J6KgkXOo+5jBvhr0W33nqrUlJS9Nprr+nBBx9UjRo1rLH84P3uu+8qKCiowL6X+qRthw4dtHfvXi1btkyrV6/Wm2++qZdfflkLFizQgAEDSvZCzvPHf3tFuY7hw4frrrvu0tKlS/XZZ59p/PjxmjZtmtasWaMbbrjhivUMXC5eEQNKuXfffVeSFBMTc9G6ChUqqFOnTpo9e7a+++47/f3vf9eaNWust3FK+kn8f3z7yhijPXv2eNwgXb16dWVmZhbY94+v8hSlt7CwMB06dKjAW7W7du2yxktCWFiYdu/eXeBVxStxnqJcT/369bV69WodOnRIXbp08dgv/y3YgIAARUdHF1gK+/TrH9WoUUP9+vXTBx98oB9//FEtW7bUpEmTJMl6qzE1NfWC+9euXVuVK1dWWlpagbFdu3apQoUKCg0NvWgPRb2OevXqadSoUVq9erVSU1N15swZzZo165LXClwNBDGgFFuzZo1eeOEFRUREqHfv3hesO3bsWIFt+Q9GzcnJkSTr2UyFBaPieOeddzxCwEcffaTDhw+ra9eu1rZ69erpq6++0pkzZ6xty5cvL/DWVFF669atm3Jzc/Xaa695bH/55ZflcDg8zv9ndOvWTenp6frwww+tbefOndOrr76qqlWr6rbbbiux8xT1elq2bKkVK1Zo586duuuuu/Tbb79J+j2s+/r66sUXX9TZs2cL7Hf06NGL9vLrr796rFetWlX169e3/g3Vrl
1bHTp00FtvvaWDBw961Oa/wlixYkV17txZy5Yt83ibOiMjQ++//77atWsnX1/fi/Zxuddx6tQpnT592mOsXr16qla
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Histogram of the score column over the whole dataset, using 100 bins\n",
"scores = df['score']\n",
"fig, ax = plt.subplots()\n",
"ax.hist(scores, bins=100)\n",
"ax.set_xlabel('score')\n",
"ax.set_ylabel('Frequency')\n",
"ax.set_title('Distribution of Joke scores')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkgAAAHHCAYAAABEEKc/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+0klEQVR4nO3de3zP9f//8fvOGzs5bbMwy/ksh9aiVJZhHRQfh88qJMQUoaJCoRzKuaL6lgklOtDH6WOfkSLkfA7JMbaJbCY2tufvD5e9f71fW2zvxnvjdr1c3pdP7+fr+X69Hq8ns/vn+Xq+Xm8XY4wRAAAAbFydXQAAAEBRQ0ACAACwICABAABYEJAAAAAsCEgAAAAWBCQAAAALAhIAAIAFAQkAAMCCgAQAAGBBQMIt5fXXX5eLi8sNOdZ9992n++67z/b+u+++k4uLi7788ssbcvxu3bqpcuXKN+RYjkpPT9czzzyjkJAQubi4aMCAAdflOC4uLurXr9912TeAmxMBCcVWfHy8XFxcbC9vb2+FhoYqOjpaU6dO1blz5wrlOCdOnNDrr7+ubdu2Fcr+ClNRri0/3nrrLcXHx6tPnz6aPXu2nnzyyb/tW7lyZT300EM3sLqCOXz4sLp3764qVarI29tbISEhuvfeezVixAhnl1ak/PTTT+rbt68aN24sDw+Pa/4flo8//li1atWSt7e3qlWrpmnTpuXZ77ffflPHjh0VGBgof39/Pfroo/r111//0T5xizNAMTVz5kwjyYwcOdLMnj3bfPLJJ+att94yrVq1Mi4uLiYsLMxs377d7jOXLl0yFy5cKNBxNm7caCSZmTNnFuhzGRkZJiMjw/Z+1apVRpJZsGBBgfbjaG2ZmZnm4sWLhXas6yEiIsI0a9YsX33DwsJMTEyMQ8eRZOLi4hz6bH4cOHDABAYGmvLly5tXX33VfPTRR2bkyJGmXbt2xsvL67odtzgaMWKE8fDwMI0bNzbVq1c3V/s1NGPGDCPJtG/f3nz44YfmySefNJLM2LFj7fqdO3fOVKtWzQQFBZlx48aZiRMnmooVK5oKFSqY33//3aF9AgQkFFs5AWnjxo25tiUmJhofHx8TFhZm/vzzz390nIIGpPPnz+fZfqMDUnEQHh6e79BTlANS3759jbu7uzl8+HCubcnJydftuHlJT0+/IcfZu3evyczMLPDnkpKSbD+TcXFxfxuQ/vzzT1OmTJlcf+axsbGmZMmS5syZM7a2cePGGUnmp59+sqvPzc3NDB061KF9Alxiw03pgQce0LBhw3TkyBHNmTPH1p7XGqSEhAQ1b95cgYGB8vX1VY0aNfTKK69IurJuqGnTppKk7t272y7nxcfHS7qyzqhu3bravHmz7r33XpUoUcL2WesapBxZWVl65ZVXFBISopIlS+qRRx7RsWPH7PpUrlxZ3bp1y/XZv+7zWrXltQbp/PnzGjRokCpWrCgvLy/VqFFD77zzjowxdv1y1uwsXLhQdevWlZeXl+rUqaPly5fnPeAWKSkp6tGjh4KDg+Xt7a0GDRpo1qxZtu0567EOHTqkJUuW2Go/fPhwvvZf0PPJy+jRo+Xq6mp3eWXZsmW65557VLJkSfn5+SkmJka7d+++5r4OHjyoChUqKCwsLNe2oKCgXG3Lli1TixYt5OfnJ39/fzVt2lSfffaZXZ8FCxaocePG8vHxUdmyZfXEE0/ot99+s+vTrVs3+fr66uDBg2rbtq38/PwUGxsrScrOztbkyZNVp04deXt7Kzg4WL1799Yff/xht49NmzYpOjpaZcuWlY+Pj8LDw/X0009f85zHjh2r2267TYMHD9bevXuv2T9HcHCwfHx8rtlv1apVOn36tPr27WvXHhcXp/Pnz2vJkiW2ti+//FJNmza1/TxIUs2aNdWyZUvNnz/foX0CBCTctHLWs6xYseJv++zevVsPPfSQMjIyNHLkSE2YMEGPPPKI1q5dK0mqVauWRo4cKU
nq1auXZs+erdmzZ+vee++17eP06dNq06aNGjZsqMmTJ+v++++/al1vvvmmlixZopdfflnPP/+8EhISFBUVpQsXLhTo/PJT218ZY/TII49o0qRJat26tSZOnKgaNWroxRdf1MCBA3P1X7Nmjfr27avOnTtr/Pjxunjxotq3b6/Tp09fta4LFy7ovvvu0+zZsxUbG6u3335bAQEB6tatm6ZMmWKrffbs2SpbtqwaNmxoq71cuXL5Pv+Cns9fvfbaaxo+fLg++OADPffcc5Kk2bNnKyYmRr6+vho3bpyGDRumPXv2qHnz5tcMbmFhYTp27JhWrlx5zbrj4+MVExOjM2fOaOjQoRo7dqwaNmxoFz7j4+PVsWNHubm5acyYMerZs6e+/vprNW/eXGfPnrXb3+XLlxUdHa2goCC98847at++vSSpd+/eevHFF9WsWTNNmTJF3bt319y5cxUdHa1Lly5JuhJkW7VqpcOHD2vIkCGaNm2aYmNjtX79+muex3PPPafWrVtr+vTpql27tpo1a6ZPPvlE6enp1/xsfmzdulWS1KRJE7v2xo0by9XV1bY9OztbO3bsyNVPku68804dPHjQth4xv/sEJLEGCcXX1S6x5QgICDB33HGH7f2IESPspvQnTZpkJJlTp0797T6udhmrRYsWRpKZMWNGnttatGhhe59zie22224zaWlptvb58+cbSWbKlCm2trCwMNO1a9dr7vNqtXXt2tWEhYXZ3i9cuNBIMqNHj7br16FDB+Pi4mJ++eUXW5sk4+npade2fft2I8lMmzYt17H+avLkyUaSmTNnjq0tMzPTREZGGl9fX7tzL8hlM2vfgp5PziW2QYMGGVdXVxMfH2/bfu7cORMYGGh69uxpt6+kpCQTEBCQq91q165dxsfHx0gyDRs2NP379zcLFy7Mdbn17Nmzxs/Pz0RERORaC5ednW2MuTJWQUFBpm7dunZ9Fi9ebCSZ4cOH29q6du1qJJkhQ4bY7euHH34wkszcuXPt2pcvX27X/s0331zzZ+haUlNTzQcffGAiIiKMJOPr62t69Ohhfvzxx2t+9mqX2OLi4oybm1ue28qVK2c6d+5sjDHm1KlTtrWIVu+9956RZH7++ecC7RMwhktsuMn5+vpe9W62wMBASdKiRYuUnZ3t0DG8vLzUvXv3fPd/6qmn5OfnZ3vfoUMHlS9fXkuXLnXo+Pm1dOlSubm56fnnn7drHzRokIwxWrZsmV17VFSUqlSpYntfv359+fv7/+2dQX89TkhIiLp06WJr8/Dw0PPPP6/09HStXr26EM6m4OdjjFG/fv00ZcoUzZkzR127drVtS0hI0NmzZ9WlSxf9/vvvtpebm5siIiK0atWqq9ZSp04dbdu2TU888YQOHz6sKVOmqF27dgoODtZHH31kd5xz585pyJAh8vb2tttHzqXfTZs2KSUlRX379rXrExMTo5o1a+Z5GahPnz527xcsWKCAgAA9+OCDdufTuHFj+fr62s4n5+//4sWLbbNKBeXv769evXpp/fr12rNnj5599lktXrxYd999t+rUqaP/+7//c2i/Fy5ckKenZ57bvL29bTOuOf/r5eWVZ7+/9snvPgGJS2y4yaWnp9uFEatOnTqpWbNmeuaZZxQcHKzOnTtr/vz5BQpLt91229/+o5uXatWq2b13cXFR1apVC7z+pqCOHDmi0NDQXONRq1Yt2/a/qlSpUq59lCpVKtcalryOU61aNbm62v/z8nfHcVRBz+fTTz/Ve++9p2nTptmFN0k6cOCApCtr18qVK2f3WrFihVJSUq5ZT/Xq1TV79mz9/vvv2rFjh9566y25u7urV69e+t///ifpylolSapbt+5Vz0uSatSokWtbzZo1c52Xu7u7KlSokOt8UlNTFRQUlOt80tPTbefTokULtW/fXm+88YbKli2rRx99VDNnzlRGRsY1zzcvtWrV0ttvv621a9cqMjJSe/bs0bvvvuvQvnx8fJSZmZnnto
sXL9rWMeX8b141X7x40a5PfvcJSJK7swsArpfjx48rNTVVVatW/ds+Pj4++v7777Vq1SotWbJEy5cv1xdffKEHHnh
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of jokes with score > 1000: 3699\n"
]
}
],
"source": [
"min_score = 1000\n",
"\n",
"# Histogram (100 bins) of the scores that lie strictly above min_score\n",
"scores = df.loc[df['score'] > min_score, 'score']\n",
"fig, ax = plt.subplots()\n",
"ax.hist(scores, bins=100)\n",
"ax.set(\n",
"    xlabel='Score',\n",
"    ylabel='Frequency',\n",
"    title=f'Distribution of Joke Scores >{min_score}',\n",
")\n",
"plt.show()\n",
"\n",
"# Report how many jokes passed the score filter\n",
"num_jokes = scores.shape[0]\n",
"print(f'Number of jokes with score > {min_score}:', num_jokes)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}