ANLP_WS24_CA1/data_explo_reddit.ipynb

396 lines
105 KiB
Plaintext
Raw Normal View History

2024-11-20 11:52:27 +01:00
{
"cells": [
{
"cell_type": "code",
2024-11-20 23:20:17 +01:00
"execution_count": 1,
2024-11-20 11:52:27 +01:00
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
2024-11-20 23:20:17 +01:00
"execution_count": 2,
2024-11-20 11:52:27 +01:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
2024-11-20 23:20:17 +01:00
"execution_count": 2,
2024-11-20 11:52:27 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download the NLTK resources needed for word counting: the punkt tokenizer models and the stopword lists\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset: Reddit jokes"
]
},
{
"cell_type": "code",
2024-11-20 23:20:17 +01:00
"execution_count": 3,
2024-11-20 11:52:27 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Load the Reddit jokes dataset from the JSON file.\n",
"# JSON is UTF-8 by specification, so the encoding is pinned explicitly instead of\n",
"# relying on the locale-dependent default of open() (e.g. cp1252 on Windows),\n",
"# which could mis-decode or reject non-ASCII characters in the joke text.\n",
"data_path = './data/reddit_jokes.json'\n",
"with open(data_path, encoding='utf-8') as f:\n",
"    data = json.load(f)"
]
},
{
"cell_type": "code",
2024-11-20 23:20:17 +01:00
"execution_count": 12,
2024-11-20 11:52:27 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>id</th>\n",
" <th>score</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Now I have to say \"Leroy can you please paint ...</td>\n",
" <td>5tz52q</td>\n",
" <td>1</td>\n",
" <td>I hate how you cant even say black paint anymore</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Pizza doesn't scream when you put it in the ov...</td>\n",
" <td>5tz4dd</td>\n",
" <td>0</td>\n",
" <td>What's the difference between a Jew in Nazi Ge...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>...and being there really helped me learn abou...</td>\n",
" <td>5tz319</td>\n",
" <td>0</td>\n",
" <td>I recently went to America....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>A Sunday school teacher is concerned that his ...</td>\n",
" <td>5tz2wj</td>\n",
" <td>1</td>\n",
" <td>Brian raises his hand and says, “Hes in Heaven.”</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>He got caught trying to sell the two books to ...</td>\n",
" <td>5tz1pc</td>\n",
" <td>0</td>\n",
" <td>You hear about the University book store worke...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body id score \\\n",
"0 Now I have to say \"Leroy can you please paint ... 5tz52q 1 \n",
"1 Pizza doesn't scream when you put it in the ov... 5tz4dd 0 \n",
"2 ...and being there really helped me learn abou... 5tz319 0 \n",
"3 A Sunday school teacher is concerned that his ... 5tz2wj 1 \n",
"4 He got caught trying to sell the two books to ... 5tz1pc 0 \n",
"\n",
" title \n",
"0 I hate how you cant even say black paint anymore \n",
"1 What's the difference between a Jew in Nazi Ge... \n",
"2 I recently went to America.... \n",
"3 Brian raises his hand and says, “Hes in Heaven.” \n",
"4 You hear about the University book store worke... "
]
},
2024-11-20 23:20:17 +01:00
"execution_count": 12,
2024-11-20 11:52:27 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Wrap the loaded records in a DataFrame and preview the first five rows\n",
"df = pd.DataFrame(data)\n",
"df.head(5)"
]
},
{
"cell_type": "code",
2024-11-20 23:20:17 +01:00
"execution_count": 5,
2024-11-20 11:52:27 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(82914, 4)\n",
2024-11-20 23:20:17 +01:00
"Now it doesn't work\n",
"\n",
"I painted it white and now the system is corrupt.\n",
"\n",
"Painted it yellow and all the drivers crashed\n",
"\n",
"Painted a vagina on it and now all it does is whine.\n",
"4i6fy5\n",
"82\n",
2024-11-20 11:52:27 +01:00
"-----------\n",
2024-11-20 23:20:17 +01:00
"A blonde walks into a bank in New York City and asks for the loan officer. She says she's going to Europe on business for two weeks and needs to borrow $5,000. The bank officer says the bank will need some kind of security for the loan, so the blonde hands over the keys to a new Rolls Royce. The car is parked on the street in front of the bank; she has the title, and everything checks out. The bank agrees to accept the car as collateral for the loan. The bank's president and its officers all enjoy a good laugh at the blonde for using a $250,000 Rolls as collateral against a $5,000 loan. An employee of the bank then drives the Rolls into the bank's underground garage and parks it there. Two weeks later, the blonde returns and repays the $5,000 and the interest, which comes to $15.41. The loan officer says, \"Miss, we are very happy to have had your business, and this transaction has worked out very nicely; but we are a little puzzled. We checked you out and found out that you are a multimillionaire. What puzzles us is - why would you bother to borrow $5,000?\" The blond replies.....\"Where else in New York City can I park my car for two weeks for only $15.41 and expect it to be there when I return?\"\n",
"5088he\n",
"308\n",
2024-11-20 11:52:27 +01:00
"-----------\n",
2024-11-20 23:20:17 +01:00
"Just one. But only if the light bulb really wants to change.\n",
"264i9p\n",
"37\n",
2024-11-20 11:52:27 +01:00
"-----------\n",
2024-11-20 23:20:17 +01:00
"He is following his grandfather around. His grandfather gets on a big tractor to mow the lawn.\n",
"\n",
"Fascinated, Johnny says, \"Wow! Can i try to drive it?\"\n",
"\n",
"His granddad replies, \"can the tip of your penis touch your asshole?\"\n",
"\n",
"Johnny, shocked, says \"No...\"\n",
"\n",
"\"Then you're not old enough\"\n",
"\n",
"In taking a break from mowing, he pulls out a cigar and starts to smoke.\n",
"\n",
"\"Wow! Can i have a puff?\"\n",
"\n",
"\"Can the tip of your penis touch your asshole?\"\n",
"\n",
"\"No...\"\n",
"\n",
"\"Then you're not old enough\"\n",
"\n",
"The old man finishes mowing and goes inside and cracks open a beer.\n",
"\n",
"\"Wow! Can i have a sip?\n",
"\n",
"\"Can the tip of your penis touch your asshole?\"\n",
"\n",
"\"No....\"\n",
"\n",
"\"Then you're not old enough\"\n",
"\n",
"Later in the evening, Johnny is sitting in front of the TV eating cookies his grandmother made him. His grandpa comes into the room and says \"Wow! Can i have a cookie?\"\n",
"\n",
"Johnny stares at him blankly and says, \"can the tip of your penis touch your asshole?\"\n",
"\n",
"His grandfather smugly says \"why as a matterof fact, it can.\"\n",
"\n",
"Johnny looks up from the TV and says, \"Then go fuck yourself, these are my cookies!\"\n",
"463nqd\n",
"108\n",
2024-11-20 11:52:27 +01:00
"-----------\n",
2024-11-20 23:20:17 +01:00
"Patience!\n",
"2awccy\n",
"5\n",
2024-11-20 11:52:27 +01:00
"-----------\n"
]
}
],
"source": [
"# Keep every joke whose score is at least 4.5, as a raw array of rows\n",
"# (column order: body, id, score, title)\n",
"good_jokes = df[df['score'] >= 4.5].values\n",
"print(good_jokes.shape)\n",
"\n",
"# Draw random row positions (unseeded, with replacement) and print the\n",
"# body, id and score of each selected joke\n",
"number_of_jokes = 5\n",
"idx = np.random.randint(0, len(good_jokes), number_of_jokes)\n",
"for body, joke_id, score in good_jokes[idx, :3]:\n",
"    print(body)\n",
"    print(joke_id)\n",
"    print(score)\n",
"    print('-----------')"
]
},
{
"cell_type": "code",
2024-11-20 23:20:17 +01:00
"execution_count": null,
2024-11-20 11:52:27 +01:00
"metadata": {},
"outputs": [
{
"data": {
2024-11-20 23:20:17 +01:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmIAAAHHCAYAAAAcbzQmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABHRklEQVR4nO3deVhV5f7//xeIG3DY4ARIInDSnIccInJokCMqDTSc1KzMKBuk41SpDeopS9NjOVRap1M2p3ZKPQ4kBxw+mTngbEpWGpZusBQQS0S4f3/4Y33dgQoELoTn47rWdbXX/d5rvdeNxOtae621PYwxRgAAALjoPO1uAAAAoLoiiAEAANiEIAYAAGATghgAAIBNCGIAAAA2IYgBAADYhCAGAABgE4IYAACATQhiAAAANiGIAVXQxIkT5eHhcVH2dd111+m6666zXq9evVoeHh769NNPL8r+77vvPoWFhV2UfZVVTk6OHnjgAQUFBcnDw0MjRoyokP14eHgoPj6+QrYNoGIQxIBKbt68efLw8LAWHx8fBQcHKzo6WrNmzdLx48fLZT+HDh3SxIkTtW3btnLZXnmqzL2VxIsvvqh58+bpkUce0fvvv6977rnnnLVhYWG68cYbL2J3AOzkZXcDAErmueeeU3h4uPLy8uRyubR69WqNGDFCL7/8spYsWaL27dtbtc8884zGjh1bqu0fOnRI//jHPxQWFqaOHTuW+H0rV64s1X7K4ny9/etf/1JBQUGF9/BnJCcn6+qrr9aECRPsbgVAJUMQAy4Rffv2VZcuXazX48aNU3Jysm688UbdfPPN2rNnj3x9fSVJXl5e8vKq2F/v3377TbVq1ZLD4ajQ/VxIzZo1bd1/SWRkZKh169Z2t1ElnDhxQrVr17a7DaDc8NEkcAm74YYb9Oyzz+rHH3/UBx98YK0v7hqxxMREde/eXf7+/qpTp45atGihp556StKZ67q6du0qSRoyZIj1Mei8efMknbkOrG3btkpJSVHPnj1Vq1Yt671/vEasUH5+vp566ikFBQWpdu3auvnmm3Xw4EG3mrCwMN13331F3nv2Ni/UW3HXiJ04cUKjR49WSEiIvL291aJFC/3zn/+UMcatrvCaqkWLFqlt27by9vZWmzZtlJCQUPyE/0FGRobi4uIUGBgoHx8fdejQQe+++641Xni93P79+7Vs2TKr9wMHDpRo+6U9nuJMmjRJnp6emj17trVuxYoV6tGjh2rXrq26desqJiZGu3fvvuC28vLy9I9//EPNmzeXj4+PGjRooO7duysxMdGtbu/evbrzzjvVqFEj+fr6qkWLFnr66afdarZu3aq+ffvK6XSqTp066tWrl77++mu3msKP5desWaNHH31UAQEBatKkSamOw+VyaciQIWrSpIm8vb3VuHFj3XLLLaX+GQAVhTNiwCXunnvu0VNPPaWVK1fqwQcfLLZm9+7duvHGG9W+fXs999xz8vb21nfffad169ZJklq1aqXnnntO48eP19ChQ9WjRw9J0jXXXGNt49dff1Xfvn01YMAA3X333QoMDDxvXy+88II8PDw0ZswYZWRkaMaMGYqKitK2bdusM3clUZLezmaM0c0336xVq1YpLi5OHTt21BdffKEnnnhCP//8s1555RW3+i+//FKfffaZHn30UdWtW1ezZs3S7bffrrS0NDVo0OCcff3++++67rrr9N133yk+Pl7h4eFauHCh7rvvPmVmZmr48OFq1aqV3n//fY0cOVJNmjTR6NGjJUmNGjUq8fGX9njO9swzz+jFF1/UG2+8Yf3beP/99zV48GBFR0frpZde0m+//aY5c+aoe/fu2rp163lvfJg4caImT56sBx54QFdddZWys7O1efNmbdmyRX/9618lSTt27FCPHj1Us2ZNDR06VGFhYfr+++/13//+Vy+88IKkM/8ee/ToIafTqSeffFI1a9bUG2+8oeuuu05r1qxRRESE234fffRRNW
rUSOPHj9eJEydKdRy33367du/erccee0xhYWHKyMhQYmKi0tLSKv1NHqgmDIBK7Z133jGSzKZNm85Z4+fnZ6688krr9YQJE8zZv96vvPKKkWSOHDlyzm1s2rTJSDLvvPNOkbFrr73WSDJz584tduzaa6+1Xq9atcpIMpdddpnJzs621i9YsMBIMjNnzrTWhYaGmsGDB19wm+frbfDgwSY0NNR6vWjRIiPJTJo0ya3ujjvuMB4eHua7776z1kkyDofDbd327duNJDN79uwi+zrbjBkzjCTzwQcfWOtOnTplIiMjTZ06ddyOPTQ01MTExJx3e+eqLe3xDBs2zBhjzOjRo42np6eZN2+eNX78+HHj7+9vHnzwQbdtuVwu4+fnV2T9H3Xo0OGCx9GzZ09Tt25d8+OPP7qtLygosP47NjbWOBwO8/3331vrDh06ZOrWrWt69uxprSv8t9+9e3dz+vTpUh/HsWPHjCQzbdq08/YM2ImPJoEqoE6dOue9e9Lf31+StHjx4jJf2O7t7a0hQ4aUuP7ee+9V3bp1rdd33HGHGjdurOXLl5dp/yW1fPly1ahRQ3//+9/d1o8ePVrGGK1YscJtfVRUlC6//HLrdfv27eV0OvXDDz9ccD9BQUEaOHCgta5mzZr6+9//rpycHK1Zs6Ycjqb0x2OMUXx8vGbOnKkPPvhAgwcPtsYSExOVmZmpgQMH6pdffrGWGjVqKCIiQqtWrTpvL/7+/tq9e7f27dtX7PiRI0e0du1a3X///WratKnbWOFH5fn5+Vq5cqViY2P1l7/8xRpv3Lix7rrrLn355ZfKzs52e++DDz6oGjVqlPo4fH195XA4tHr1ah07duy8xwbYhSAGVAE5OTluoeeP+vfvr27duumBBx5QYGCgBgwYoAULFpQqlF122WWlujC/efPmbq89PDzUrFmzCr8258cff1RwcHCR+WjVqpU1frY/BgZJqlev3gX/cP/4449q3ry5PD3d/zd6rv2UVWmP57333tNrr72m2bNnu4VESVaAuuGGG9SoUSO3ZeXKlcrIyDhvL88995wyMzN1xRVXqF27dnriiSe0Y8cOa7wwvLZt2/ac2zhy5Ih+++03tWjRoshYq1atVFBQUORawvDw8DIdh7e3t1566SWtWLFCgYGB6tmzp6ZOnSqXy3Xe4wQuJq4RAy5xP/30k7KystSsWbNz1vj6+mrt2rVatWqVli1bpoSEBM2fP1833HCDVq5c6Xa24XzbKG/neuhsfn5+iXoqD+fajynBhfCVUbdu3bRt2za9+uqruvPOO1W/fn1rrDB4v//++woKCiry3gvdaduzZ099//33Wrx4sVauXKm33npLr7zyiubOnasHHnigfA/kLH/8t1ea4xgxYoRuuukmLVq0SF988YWeffZZTZ48WcnJybryyisrrGegpDgjBlzi3n//fUlSdHT0ees8PT3Vq1cvvfzyy/rmm2/0wgsvKDk52foYp7yfxP/Hj6+MMfruu+/cLpCuV6+eMjMzi7z3j2d5StNbaGioDh06VOSj2r1791rj5SE0NFT79u0rclaxIvZTmuNp1qyZVq5cqUOHDqlPnz5u7yv8CDYgIEBRUVFFluLufv2j+vXra8iQIfr444918OBBtW/fXhMnTpQk66PGXbt2nfP9jRo1Uq1atZSamlpkbO/evfL09FRISMh5eyjtcVx++eUaPXq0Vq5cqV27dunUqVOaPn36BY8VuBgIYsAlLDk5Wc8//7zCw8M1aNCgc9YdPXq0yLrCB6Pm5uZKkvVspuKCUVm89957biHg008/1eHDh9W3b19r3eWXX66vv/5ap06dstYtXbq0yEdTpemtX79+ys/P16uvvuq2/pVXXpGHh4fb/v+Mfv36yeVyaf78+da606dPa/bs2apTp46uvfbacttPaY+nffv2Wr58ufbs2aObbrpJv//+u6QzYd3pdOrFF19UXl5ekfcdOXLkvL38+uuvbq/r1KmjZs2aWf
+GGjVqpJ49e+rtt99WWlqaW23hGcYaNWqod+/eWrx4sdvH1Onp6froo4/UvXt3OZ3O8/ZR0uP47bffdPLkSbexyy+
2024-11-20 11:52:27 +01:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Histogram of all strictly positive joke scores\n",
"scores = df.loc[df['score'] > 0, 'score']\n",
"plt.hist(scores, bins=100)\n",
"plt.xlabel('score')\n",
"plt.ylabel('Frequency')\n",
"plt.title('Distribution of Joke scores with score > 0')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABXSElEQVR4nO3deVhU1eMG8HdYZkBgQBQYSATcRVETTUnFBb6gornmRomEWgrmlqmV5o5auWZSlpJlm5ZLbkEqUkqoJG4ZqaloMmAiIJis5/eHD/fnyCLiAIP3/TzPPDX3nHvvOZc78nLuuXcUQggBIiIiIhkzqukGEBEREdU0BiIiIiKSPQYiIiIikj0GIiIiIpI9BiIiIiKSPQYiIiIikj0GIiIiIpI9BiIiIiKSPQYiIiIikj0Gomowb948KBSKatlXjx490KNHD+l9TEwMFAoFtm3bVi37HzNmDFxdXatlX5WVnZ2NsWPHQqPRQKFQYMqUKVWyH4VCgbCwsCrZNlWMq6srxowZU+G6/fr1q9oGkWxERkZCoVDgypUrNd0UqiAGosdUfJIXv8zMzODk5AR/f3+sWbMGd+7c0ct+bty4gXnz5iExMVEv29MnQ25bRSxZsgSRkZGYMGECvvjiC7z88stl1uUvyafLH3/8gXnz5vGXVDXZsGEDunfvDgcHB6hUKri5uSE4OLjM4//ZZ5+hZcuWMDMzQ9OmTbF27dpS6/3zzz8YNmwYbGxsoFarMWDAAPz9999V2BOqTjdu3MBLL72E5s2bw8rKCjY2Nnjuuefw+eefo7RvG9PX+WCij8bL0YIFC+Dm5ob8/HxotVrExMRgypQpWLFiBXbt2oU2bdpIdd955x3MmjXrsbZ/48YNzJ8/H66urmjXrl2F14uKinqs/VRGeW3bsGEDioqKqrwNT+LgwYPo3Lkz3n333ZpuClWxpKQkGBn9/999f/zxB+bPn48ePXoY/Ejm0+DkyZNwc3PDCy+8gLp16+Ly5cvYsGEDdu/ejVOnTsHJyUmq+/HHH+O1117DkCFDMG3aNPzyyy94/fXXcffuXcycOVOql52djZ49eyIzMxNvvfUWTE1NsXLlSnTv3h2JiYmoV69eTXSV9Ojff//F9evXMXToUDRs2BD5+fmIjo7GmDFjkJSUhCVLlkh19Xo+CHosmzZtEgDE8ePHS5QdOHBAmJubCxcXF3H37t0n2s/x48cFALFp06YK1c/JySl1+aFDhwQAsXXr1idqz5O0zdC4ubmJgICACtV1cXGpcN2HARChoaGVWre2yM7OrukmPJatW7cKAOLQoUMlyp7kZ20I8vPzRW5ubrXs67///hNJSUmVWvfEiRMCgAgPD5eW3b17V9SrV6/E8Q8MDBQWFhYiPT1dWrZs2TIBQBw7dkxadv78eWFsbCxmz55dqTZVheLfFZcvX67ppjySPj7H165d0/k5VYV+/foJCwsLUVBQIC3T5/nAS2Z61KtXL8yZMwdXr17Fl19+KS0vbQ5RdHQ0unbtChsbG1haWqJ58+Z46623ANyf99OxY0cAQHBwsHR5LjIyEsD9eUKtW7dGQkICvL29UadOHWndh+cQFSssLMRbb70FjUYDCwsLvPDCC7h27ZpOnbLmWzy4zUe1rbQ5RDk5OZg+fTqcnZ2hUqnQvHlzvP/++yWGPovn3OzYsQOtW7eGSqVCq1atsH///tIP+EPS0tIQEhICBwcHmJmZoW3btvj888+l8uL5VJcvX8aePXuktj/u5ZOK9qc0ixYtgpGRkc6lgH379qFbt26wsLCAlZUVAgICcO7cuUduKz8/H/Pnz0fTpk1hZmaGevXqoWvXroiOjtap9+eff2LYsGGws7ODubk5mjdvjrffflunzsmTJ9GnTx+o1WpYWlrCx8cHv/32m06d4svFhw8fxsSJE2Fvb48GDRo8Vj+0Wi2Cg4PRoEEDqFQqODo6YsCAAeX+DHbt2gWFQoHTp09Ly77//n
soFAoMHjxYp27Lli0xfPhw6f2D53RkZCRefPFFAEDPnj2ln39MTIzONn799Vc899xzMDMzQ6NGjbB58+Yy2/agb775Bp6enrCysoJarYaHhwdWr16tUycjIwNTp06Fq6srVCoVGjRogNGjR+Pff/+V6jzqPAaAK1euQKFQ4P3338eqVavQuHFjqFQq/PHHHwDu/8yHDh0KW1tbmJmZoUOHDti1a1eF+lGeU6dOYdKkSXBycsJHH31UqW0U//uQkZEhLTt06BBu3bqFiRMn6tQNDQ1FTk4O9uzZIy3btm0bOnbsKP07BAAtWrSAj48Pvvvuu0fuf9OmTejVqxfs7e2hUqng7u6O9evXl9rOfv36Veh8OHfuHHr16gVzc3M0aNAAixYtqvBIeUU/E/v27UP37t2l86tjx4746quvdOps3boVnp6eMDc3R/369fHSSy/hn3/+0akzZswYWFpa4tKlS+jbty+srKwQGBgIACgqKsKqVavQqlUrmJmZwcHBAa+++ipu3779yH78/PPPcHJyQmBgIA4dOlShfw8fl6urK+7evYu8vDxp2ZOeDzr0k9vko7wRIiHup2QAYujQodKyd999Vzx4qM+ePSuUSqXo0KGDWL16tYiIiBBvvPGG8Pb2FkIIodVqxYIFCwQAMX78ePHFF1+IL774Qly6dEkIIUT37t2FRqMRdnZ2YtKkSeLjjz8WO3bskMq6d+8u7at4hMjDw0O0adNGrFixQsyaNUuYmZmJZs2a6Yxkubi4iKCgoBJ9enCbj2pbUFCQcHFxkdYtKioSvXr1EgqFQowdO1Z8+OGHon///gKAmDJlis5+AIi2bdsKR0dHsXDhQrFq1SrRqFEjUadOHfHvv/+W+3O5e/euaNmypTA1NRVTp04Va9asEd26dRMAxKpVq6S2f/HFF6J+/fqiXbt2UtvL++vo4VGDx+3PgyNEb7/9tlAoFOKTTz6Rlm3evFkoFArRu3dvsXbtWrFs2TLh6uoqbGxsHvmX5VtvvSUUCoUYN26c2LBhg/jggw/EyJEjxdKlS6U6p06dEmq1WtSrV0/Mnj1bfPzxx+LNN98UHh4eUp2zZ88KCwsL6bgvXbpUuLm5CZVKJX777TepXvG57+7uLrp37y7Wrl0r7aui/Xj++eeFtbW1eOedd8Snn34qlixZInr27CkOHz5cZj9v3bolFAqFWLt2rbRs8uTJwsjISNjZ2UnL0tLSBADx4YcfSssePKcvXbokXn/9dQFAvPXWW9LPX6vVSnWbN28uHBwcxFtvvSU+/PBD0b59e6FQKMTZs2fL/VlERUUJAMLHx0esW7dOrFu3ToSFhYkXX3xRqnPnzh3RunVrYWxsLMaNGyfWr18vFi5cKDp27ChOnjwphKjYeSyEEJcvX5Z+Fo0aNRJLly4VK1euFFevXhVnz54V1tbWwt3dXSxbtkx8+OGHwtvbWygUCvHDDz+U24/SZGRkiI8++kh4enoKAMLKykqEhISIM2fOVHgb//77r0hNTRXHjx+XPi9RUVFS+aJFiwQAkZqaqrNebm6uMDIyEtOmTRNCCFFYWChUKpWYMGFCiX288847AoDIysoqty0dO3YUY8aMEStXrhRr164Vfn5+Jc4bISp+PqSkpAg7OztRt25dMW/ePPHee++Jpk2bijZt2lRohKgin4lNmzYJhUIhWrduLRYvXizWrVsnxo4dK15++WWdOgBEx44dxcqVK8WsWbOEubm5cHV1Fbdv35bqBQUFCZVKJRo3biyCgoJERESE2Lx5sxBCiLFjxwoTExMxbtw4ERERIWbOnCksLCxEx44dRV5eXrn9+Oeff8SMGTOERqMRAESjRo3EokWLxPXr18tdrzx3794VN2/eFJcvXxaRkZHCwsJCPP/881K5Ps6HBzEQPaZHBSIhhLC2thbPPvus9P7hQLRy5UoBQNy8ebPMbZR3Wap79+4CgIiIiCi1rLRA9Mwzz+icGN99950AIFavXi0tq0ggel
TbHg5EO3bsEADEokWLdOoNHTpUKBQKcfHiRWkZAKFUKnWWnTp1SgDQ+WVYmlWrVgkA4ssvv5SW5eXlCS8vL2Fpaan
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Zoom in on the mid-range of the distribution: keep only scores strictly\n",
"# between min_score and max_score, cutting off both the low end and the long tail\n",
"min_score = 30\n",
"max_score = 300\n",
"\n",
"in_range = (df['score'] < max_score) & (df['score'] > min_score)\n",
"scores = df.loc[in_range, 'score']\n",
"plt.hist(scores, bins=100)\n",
"plt.xlabel('score')\n",
"plt.ylabel('Frequency')\n",
"plt.title(f'Distribution of Joke scores with score < {max_score} and score > {min_score}')\n",
"plt.show()"
]
},
{
"cell_type": "code",
2024-11-20 23:20:17 +01:00
"execution_count": 44,
2024-11-20 11:52:27 +01:00
"metadata": {},
"outputs": [
{
"data": {
2024-11-20 23:20:17 +01:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB1A0lEQVR4nO3dd1hUR/s38O/SQbp0RUTELioaEQtYUEBjjKJGYxTUWFFjNySCiT6KvcSa5FGx19geW1TsNaKiUewNC2AHAenz/uHL+blSpCywu34/17WXnDmzc+6zu7C3c2bmyIQQAkRERERqSqOsAyAiIiIqSUx2iIiISK0x2SEiIiK1xmSHiIiI1BqTHSIiIlJrTHaIiIhIrTHZISIiIrXGZIeIiIjUGpMdIiIiUmtMdpTYL7/8AplMVirHatmyJVq2bCltHz16FDKZDFu3bi2V4wcEBKBy5cqlcqyiSkxMxPfffw8bGxvIZDKMHDmyRI4jk8kwbNiwEmmbqCRUrlwZAQEBZR0GUZ6Y7JSSsLAwyGQy6aGnpwc7Ozt4e3vjt99+w9u3bxVynKdPn+KXX35BZGSkQtpTJGWOrSCmTZuGsLAwDBkyBGvWrEHv3r3zrFu5cmV8+eWXpRhd4Tx48AB9+/aFk5MT9PT0YGNjAw8PD0yaNKmsQ1Mq//zzD4YOHYqGDRtCW1v7k//5WL58OWrWrAk9PT04Oztj4cKFudZ78uQJunfvDlNTUxgbG6NTp064d+9esdpUBYmJiZg0aRLq1KmDcuXKoXz58qhfvz5++OEHPH36tKzDUxoxMTH48ccf0apVKxgZGUEmk+Ho0aN51j99+jSaN28OAwMD2NjYYMSIEUhMTMxRLzU1FRMmTICdnR309fXh5uaGgwcPllqbZUpQqVi5cqUAICZPnizWrFkjVqxYIaZNmybatWsnZDKZcHBwEJcvX5Z7Tnp6unj37l2hjnP+/HkBQKxcubJQz0tNTRWpqanS9pEjRwQAsWXLlkK1U9TY0tLSREpKisKOVRLc3NxEs2bNClTXwcFBdOjQoUjHASACAwOL9NyCuH37tjA1NRW2trbi559/Fn/++aeYPHmy+Prrr4Wurm6JHVcVTZo0SWhra4uGDRuKatWqifz+ZC5btkwAEH5+fuKPP/4QvXv3FgDE9OnT5eq9fftWODs7CysrKzFjxgwxd+5cYW9vLypWrChevHhRpDbLmoODg/D398+3TlpammjQoIHQ19cXgwcPFsuWLROzZ88Wffv2FRYWFuLIkSOlEqsqyP776+zsLNzd3QWAPF+fS5cuCT09PdGgQQOxdOlS8fPPPwtdXV3h4+OTo26PHj2ElpaWGDt2rPj999+Fu7u70NLSEidOnCjxNssak51Skp3snD9/Pse+8PBwoa+vLxwcHERycnKxjlPYZCcpKSnX8tJOdlSBo6NjgRMYZU52hg4dKrS0tMSDBw9y7IuLiyux4+YmMTGxVI5z/fp1kZaWVujnxcbGSr+TgYGBeSY7ycnJonz58jne8169eoly5cqJV69eSWUzZswQAMQ///wjF5+mpqYICgoqUptlrSDJzubNmwUAsW7duhz73r17J+Lj40soupxK63NXUB9/PhMSEsTLly+FEEJs2bIl32TH19dX2Nrayr1+f/75pwAg/v77b6ns3LlzAoCYNWuWVPbu3Tvh5OQk3N3dS7zNssZkp5Tkl+wIIcS0adMEAPHHH39IZZMmTcrxx/XAgQOiWbNmwsTERJQrV05Uq1ZN+gOZnaB8/MhOLjw9PUXt2rVFRESEaNGihdDX1xc//PCDtM/T01M6TnZbGzduFEFBQcLa2loYGBiIjh07iujoaLmY8vpD92Gbn4rN399fODg4yD0/MTFRjB49WlSsWFHo6OiIatWqiVmzZomsrCy5etnJwfbt20Xt2rWFjo6OqFWrlti3b1+ur/XH4uLiRL9+/YSVlZXQ1dUVLi4uIiwsLMdr8fHj/v
37ebaZW7JT2PP50JQpU4RMJhO//fabVLZ3717RvHlzYWBgIAwNDUX79u3F1atXP3m+3t7eonLlyp+s9+FxPDw8hKGhoTAyMhKNGjXK8YW1efNm4erqKvT09ET58uVFr169xOPHj+Xq+Pv7i3Llyok7d+4IX19fYWhoKDp16iSEECIzM1PMmzdP1KpVS+jq6gorKysxcODAHF/o58+fF+3atRPly5cXenp6onLlyqJv376fPAd/f39haWkpxowZI6Kiogp87h/KL9nZs2ePACD27NkjV3769GkBQKxZs0Yq++KLL8QXX3yRo4127doJJyenIrWZmwcPHoghQ4aIatWqCT09PWFubi66du2a43Ob/bfp5MmTYtSoUcLCwkIYGBiIr7/+Wjx79kyublZWlpgyZYqoUKGC0NfXFy1bthRXr14tULITGhoqAOSaZOfm+vXrolu3bsLCwkLo6emJatWqiZ9++kmuzsWLF4WPj48wMjIS5cqVE61btxZnzpzJ9fyOHj0qhgwZIiwtLYWpqam0vyC/RzExMSIgIEBUqFBB6OjoCBsbG/HVV1/l+zfgUxITE8WKFStEs2bNBADx+vXrXOvll+zEx8cLLS0tMW7cOLny1NRUYWhoKPr37y+VjRs3TmhqauZIKrO/e7L/rpdEm8qAY3aURPb4jwMHDuRZ59q1a/jyyy+RmpqKyZMnY86cOfjqq69w6tQpAEDNmjUxefJkAMDAgQOxZs0arFmzBh4eHlIbL1++hK+vL+rXr4/58+ejVatW+cY1depU7NmzBxMmTMCIESNw8OBBeHl54d27d4U6v4LE9iEhBL766ivMmzcPPj4+mDt3LqpXr45x48Zh9OjROeqfPHkSQ4cORY8ePTBz5kykpKTAz88PL1++zDeud+/eoWXLllizZg169eqFWbNmwcTEBAEBAViwYIEU+5o1a2BhYYH69etLsVtaWhb4/At7Ph+aOHEiQkJC8Pvvv2P48OEAgDVr1qBDhw4wNDTEjBkzEBwcjKioKDRv3hwPHjzItz0HBwc8evQIhw8f/mTcYWFh6NChA169eoWgoCBMnz4d9evXx/79++XqdO/eHZqamggNDcWAAQOwbds2NG/eHG/evJFrLyMjA97e3rCyssLs2bPh5+cHABg0aBDGjRuHZs2aYcGCBejbty/WrVsHb29vpKenAwCePXuGdu3a4cGDB/jxxx+xcOFC9OrVC2fPnv3keQwfPhw+Pj5YunQpatWqhWbNmmHFihW5jkEoikuXLgEAGjVqJFfesGFDaGhoSPuzsrJw5cqVHPUAoHHjxrh79640fq+gbebl/PnzOH36NHr06IHffvsNgwcPRnh4OFq2bInk5OQc9YcPH47Lly9j0qRJGDJkCP73v//lGCgfEhKC4OBg1KtXD7NmzUKVKlXQrl07JCUl5RsL8P5zBwCrV6+GECLfuleuXIGbmxsOHz6MAQMGYMGCBfj666/xv//9T6pz7do1tGjRApcvX8b48eMRHByM+/fvo2XLljh37lyONocOHYqoqCiEhITgxx9/BFDw3yM/Pz9s374dffv2xZIlSzBixAi8ffsW0dHRnzzvj507dw4DBw6Era0t+vXrh5SUFCxatAhGRkaFbuvff/9FRkZGjs+Ijo4O6tevL/cZuXTpEqpVqwZjY2O5uo0bNwYAaSxlSbSpFMo42fpsfKpnRwghTExMRIMGDaTtj3t25s2bJwCI58+f59lGfpeKPD09BQCxbNmyXPfl1rNToUIFkZCQIJVnd0UvWLBAKitIz86nYvu4Z2fHjh0CgPjPf/4jV69r165CJpOJO3fuSGUAhI6OjlzZ5cuXBQCxcOHCHMf60Pz58wUAsXbtWqksLS1NuLu7C0NDQ7lzL8ylqY/rFvZ8snt2xowZIzQ0NOR6mt6+fStMTU3FgAED5NqKjY0VJiYmOco/dvXqVaGvry8AiPr164sffvhB7NixI8clzTdv3ggjIyPh5uaWY+xYdm
9UWlqasLKyEnXq1JGrs3v3bgFAhISESGX+/v4CgPjxxx/l2jpx4kSulzf2798vV759+/ZP/g59Snx8vPj999+Fm5u
2024-11-20 11:52:27 +01:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-20 23:20:17 +01:00
"Number of jokes with score > 1000 and <10000 : 3279\n"
2024-11-20 11:52:27 +01:00
]
}
],
"source": [
2024-11-20 23:20:17 +01:00
"min_score = 1000  # lower bound of the plotted range (exclusive)\n",
"max_score = 10_000  # upper bound of the plotted range (exclusive)\n",
"cut_off = 2500  # threshold separating the two highlighted groups\n",
"\n",
"# Filter scores within the specified range\n",
"scores = df[(df['score'] > min_score) & (df['score'] < max_score)]['score']\n",
"\n",
"# Shared bin edges so that the three histograms line up bar for bar\n",
"bins = np.linspace(min_score, max_score, 100)\n",
"\n",
"# Split the filtered scores at the cut-off.\n",
"# Note: despite its name, red_scores is drawn in blue (#0145ac).\n",
"green_scores = scores[scores > cut_off]\n",
"red_scores = scores[scores <= cut_off]\n",
"\n",
"# Each label is attached to the histogram it describes, so the legend text and\n",
"# the reported group size always refer to the same subset of scores.\n",
"plt.hist(scores, bins=bins, color='#1b212c', edgecolor='#1b212c', label='All Scores')\n",
"plt.hist(green_scores, bins=bins, color='#82c7a5', edgecolor='#1b212c', label=f'Scores > {cut_off} n: {len(green_scores)}')\n",
"plt.hist(red_scores, bins=bins, color='#0145ac', edgecolor='#1b212c', label=f'Scores <= {cut_off} n: {len(red_scores)}')\n",
"\n",
"plt.xlabel('Score')\n",
"plt.ylabel('Frequency')\n",
"plt.title(f'Distribution of Joke Scores >{min_score} and Scores <{max_score}')\n",
"plt.legend()\n",
"plt.show()\n",
"\n",
"# Print the number of jokes inside the (min_score, max_score) range\n",
"num_jokes = len(scores)\n",
"print(f'Number of jokes with score > {min_score} and <{max_score} :', num_jokes)"
2024-11-20 11:52:27 +01:00
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}