632 lines
360 KiB
Plaintext
632 lines
360 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 14,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import json\n",
|
||
|
"import os\n",
|
||
|
"import numpy as np\n",
|
||
|
"import pandas as pd\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import nltk\n",
|
||
|
"from nltk.corpus import stopwords\n",
|
||
|
"from nltk.tokenize import word_tokenize\n",
|
||
|
"from collections import Counter"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 15,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stderr",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"[nltk_data] Downloading package punkt to\n",
|
||
|
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
||
|
"[nltk_data] Package punkt is already up-to-date!\n",
|
||
|
"[nltk_data] Downloading package stopwords to\n",
|
||
|
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
||
|
"[nltk_data] Package stopwords is already up-to-date!\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"True"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 15,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# nltk count words\n",
|
||
|
"nltk.download('punkt')\n",
|
||
|
"nltk.download('stopwords')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# dataset stupidstuff"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 16,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Load the data from the JSON file\n",
|
||
|
"data_path = './data/stupidstuff.json'\n",
|
||
|
"with open(data_path) as f:\n",
|
||
|
" data = json.load(f)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 17,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>body</th>\n",
|
||
|
" <th>category</th>\n",
|
||
|
" <th>id</th>\n",
|
||
|
" <th>rating</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>A blackjack dealer and a player with a thirtee...</td>\n",
|
||
|
" <td>Children</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>2.63</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>At a dinner party, several of the guests were ...</td>\n",
|
||
|
" <td>Blonde Jokes</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>2.57</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>One day this cop pulls over a blonde for speed...</td>\n",
|
||
|
" <td>Blonde Jokes</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>3.09</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>Three women are about to be executed for crime...</td>\n",
|
||
|
" <td>Blonde Jokes</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>4.10</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>A girl came skipping home FROM school one day....</td>\n",
|
||
|
" <td>Blonde Jokes</td>\n",
|
||
|
" <td>5</td>\n",
|
||
|
" <td>4.30</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" body category id rating\n",
|
||
|
"0 A blackjack dealer and a player with a thirtee... Children 1 2.63\n",
|
||
|
"1 At a dinner party, several of the guests were ... Blonde Jokes 2 2.57\n",
|
||
|
"2 One day this cop pulls over a blonde for speed... Blonde Jokes 3 3.09\n",
|
||
|
"3 Three women are about to be executed for crime... Blonde Jokes 4 4.10\n",
|
||
|
"4 A girl came skipping home FROM school one day.... Blonde Jokes 5 4.30"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 17,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# create pandas dataframe of the data\n",
|
||
|
"df = pd.DataFrame(data)\n",
|
||
|
"df.head()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 18,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"(744, 4)\n",
|
||
|
"People clap when they see you -- their hands over their eyes or ears.\n",
|
||
|
"Insults\n",
|
||
|
"1592\n",
|
||
|
"-----------\n",
|
||
|
"It was the end of the school year. The teacher had turned in her grades; there was nothing really for the class to do.All the kids were restless and it was near the end of the day. So the teacher thought of an activity. She said, \"The first ones to answer correctly the questions I ask may leave early today.\" Little Johnny said to himself, \"Good, I'm smart and I want to get outa here.\"The teacher asked, \"Who said 'Four Score and Seven Years Ago'?\" But before Johnny could open his mouth, Susie said, \"Abraham Lincoln?\"The teacher said, \"That's right, Susie. You may go.\" Johnny was mad that Susie had answered first.The teacher asked, \"Who said 'I Have a Dream'?\" But before Johnny could open his mouth, Mary said, \"Martin Luther King!\" The teacher said, \"That's right, Mary. You may go.\" Johnny was even madder than before. Mary had answered first. The teacher asked, \"Who said 'Ask not what your country can do for you'?\" Before Johnny could open his mouth, Nancy piped, \"John Kennedy!\"The teacher said, \"That's right, Nancy. You may go.\" Now Johnny was furious! Nancy had answered first.Then the teacher turned her back, and Johnny muttered, \"I wish these girls would keep their mouths shut!\"The teacher spun around. \"WHO SAID THAT?\" Johnny said, \"BILL CLINTON. CAN I GO NOW?\"\n",
|
||
|
"Miscellaneous\n",
|
||
|
"2100\n",
|
||
|
"-----------\n",
|
||
|
"The Secret Service was looking for more employees. They put up a sign and the next day they picked the next three people. They brought the first guy into a room and gave him a pistol and said\" Your wife is in that room go in and shoot her\" The guy looked at them and said\" No I can't do it\" So the Secret Service brought out the next guy and told him the same thing and handed him the gun. \"He went into the room and came back out but he didn't want to shoot her. So the Secret Service who was really desperate brought the last person in. She was a blonde so they were worried. They said\" Your husbandis in that room and I want you to shoot him.\" \"Alright\" she announced. She went into the room and the Secret Service heard alot of crashing and banging. They went in and found the man dead. \"What the hell is going on\" \"Oh The gun was a blank so I beat him to death with a chair.\"\n",
|
||
|
"Blonde Jokes\n",
|
||
|
"114\n",
|
||
|
"-----------\n",
|
||
|
"Q: How many software people does it take to screw in a light bulb?\n",
|
||
|
"A: None. That's a hardware problem.\n",
|
||
|
"A: One, but if he changes it, the whole building will probably fall down.\n",
|
||
|
"A: Two. One always leaves in the middle of the project.\n",
|
||
|
"Light Bulbs\n",
|
||
|
"2769\n",
|
||
|
"-----------\n",
|
||
|
"\n",
|
||
|
"Bar Jokes\n",
|
||
|
"3558\n",
|
||
|
"-----------\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# get jokes with highest scores min 4.5\n",
|
||
|
"good_jokes = df[df['rating'] >= 4.5].values\n",
|
||
|
"# random sample of 5 jokes\n",
|
||
|
"print(np.array(good_jokes).shape)\n",
|
||
|
"# 5 random indices min max\n",
|
||
|
"number_of_jokes = 5\n",
|
||
|
"idx = np.random.randint(0, len(good_jokes), number_of_jokes)\n",
|
||
|
"for i in idx:\n",
|
||
|
" print(good_jokes[i][0])\n",
|
||
|
" print(good_jokes[i][1])\n",
|
||
|
" print(good_jokes[i][2])\n",
|
||
|
" print('-----------')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 19,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Animals\n",
|
||
|
"Aviation\n",
|
||
|
"Bar Jokes\n",
|
||
|
"Blind Jokes\n",
|
||
|
"Blonde Jokes\n",
|
||
|
"Business\n",
|
||
|
"Children\n",
|
||
|
"Computers\n",
|
||
|
"Crazy Jokes\n",
|
||
|
"Deep Thoughts\n",
|
||
|
"English\n",
|
||
|
"Ethnic Jokes\n",
|
||
|
"Family, Parents\n",
|
||
|
"Farmers\n",
|
||
|
"Food Jokes\n",
|
||
|
"Heaven and Hell\n",
|
||
|
"Holidays\n",
|
||
|
"Idiots\n",
|
||
|
"Insults\n",
|
||
|
"Lawyers\n",
|
||
|
"Light Bulbs\n",
|
||
|
"Love & Romance\n",
|
||
|
"Marriage\n",
|
||
|
"Medical\n",
|
||
|
"Men\n",
|
||
|
"Military\n",
|
||
|
"Miscellaneous\n",
|
||
|
"Money\n",
|
||
|
"Music\n",
|
||
|
"Office Jokes\n",
|
||
|
"Old Age\n",
|
||
|
"One Liners\n",
|
||
|
"Police Jokes\n",
|
||
|
"Political\n",
|
||
|
"Redneck\n",
|
||
|
"Religious\n",
|
||
|
"School\n",
|
||
|
"Science\n",
|
||
|
"Sex\n",
|
||
|
"Sports\n",
|
||
|
"State Jokes\n",
|
||
|
"Women\n",
|
||
|
"Yo Mama\n",
|
||
|
"amount of categories: 43\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjwAAAItCAYAAAA9haCkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAADXwElEQVR4nOzdd1gU1/s28HvpHSwUURSwo9iNEmsQRezRaFTsWKJo7FG/sWvsvaLG3mNN7C12UbEbe9eooLGAooLA8/7Bu/Nj2QV2FaJu7s917XWxM2fPnll2Zp85VSUiAiIiIiIjZvKpC0BERESU1RjwEBERkdFjwENERERGjwEPERERGT0GPERERGT0GPAQERGR0WPAQ0REREaPAQ8REREZPQY8REREZPQY8BD9RwwfPhwqlQr//PPPpy4KEdG/jgEP0RdmyZIlUKlUOHXq1KcuCgEYM2YMNm/e/KmLQUQZYMBDRPQRGPAQfRkY8BCR0YmNjf3URfjs8DOh/zoGPERG4M8//0SVKlVga2sLJycnNGzYEFeuXMnwdffu3UOBAgVQvHhxREVFAQBevnyJXr16wcPDA5aWlihQoADGjx+PpKQkjdeuWbMGZcuWhb29PRwcHODr64vp06en+353796FSqXCpEmTMHXqVOTLlw/W1taoVq0a/vrrL630V69exXfffYfs2bPDysoK5cqVwx9//KGRRt3Ed/DgQXTr1g0uLi7IkydPuuV49+4dhg8fjkKFCsHKygq5cuVC48aNcevWLSXNpEmT8PXXXyNHjhywtrZG2bJlsX79eo18VCoVYmNjsXTpUqhUKqhUKrRr107Z//DhQ3To0AGurq6wtLREsWLFsGjRIq3y3Lt3Dw0aNICtrS1cXFzQu3dv7Nq1CyqVCgcOHNBIu27dOpQtWxbW1tbImTMnWrVqhYcPH2qkadeuHezs7HDr1i3UqVMH9vb2CA4OxrBhw2Bubo6nT59qlaFz585wcnLCu3fv0v3siL5UZp+6AET0cfbu3YugoCB4e3tj+PDhePv2LWbOnIlKlSrhzJkz8PT01Pm6W7duwd/fH9mzZ8eePXuQM2dOvHnzBtWqVcPDhw/RpUsX5M2bF8eOHcOgQYPw+PFjTJs2DQCwZ88etGjRAjVq1MD48eMBAFeuXMHRo0fRs2fPDMu8bNkyvHr1CqGhoXj37h2mT58Of39/XLx4Ea6urgCAS5cuoVKlSsidOzcGDhwIW1tb/Pbbb2jUqBE2bNiAb7/9ViPPbt26wdnZGUOHDk23NiMxMRH16tXDvn370Lx5c/Ts2ROvXr3Cnj178NdffyF//vwAgOnTp6NBgwYIDg5GfHw81qxZg6ZNm2Lr1q2oW7cuAGD58uXo2LEjvvrqK3Tu3BkAlNdHRUWhYsWKUKlU6N69O5ydnbFjxw6EhIQgJiYGvXr1ApBc8+Lv74/Hjx+jZ8+ecHNzw6pVq7B//36tsi9ZsgTt27dH+fLlMXbsWERFRWH69Ok4evQozp49CycnJyVtQkICAgMDUblyZUyaNAk2Njbw8/PDyJEjsXbtWnTv3l1JGx8fj/Xr16NJkyawsrLK8P9H9EUSIvqiLF68WABIRESEiIiUKlVKXFxc5NmzZ0qa8+fPi4mJibRp00bZNmzYMAEgT58+lStXroi7u7uUL19enj9/rqQZNWqU2NrayvXr1zXec+DAgWJqair3798XEZGePXuKg4ODJCQkGFT2O3fuCACxtraWv//+W9l+4sQJASC9e/dWttWoUUN8fX3l3bt3yrakpCT5+uuvpWDBglqfR+XKlfUqz6JFiwSATJkyRWtfUlKS8vebN2809sXHx0vx4sXF399fY7utra20bdtWK6+QkBDJlSuX/PPPPxrbmzdvLo6Ojkr+kydPFgCyefNmJc3bt2+lSJEiAkD279+vvL+Li4sUL15c3r59q6TdunWrAJChQ4cq29q2bSsAZODAgVrl8vPzkwoVKmhs27
hxo8Z7ERkjNmkRfcEeP36Mc+fOoV27dsiePbuyvUSJEqhZsya2b9+u9Zq//voL1apVg6enJ/bu3Yts2bIp+9atW4cqVaogW7Zs+Oeff5RHQEAAEhMTcejQIQCAk5MTYmNjsWfPng8qd6NGjZA7d27l+VdffYUKFSoo5X3+/Dn+/PNPNGvWDK9evVLK8ezZMwQGBuLGjRtazTidOnWCqalphu+9YcMG5MyZEz169NDap1KplL+tra2Vv1+8eIHo6GhUqVIFZ86cyfA9RAQbNmxA/fr1ISIan2VgYCCio6OVfHbu3IncuXOjQYMGyuutrKzQqVMnjTxPnTqFJ0+eoFu3bhq1MHXr1kWRIkWwbds2rXJ07dpVa1ubNm1w4sQJjea7lStXwsPDA9WqVcvw2Ii+VAx4iL5g9+7dAwAULlxYa1/RokXxzz//aDXv1K9fH/b29ti1axccHBw09t24cQM7d+6Es7OzxiMgIAAA8OTJEwDJzUeFChVCUFAQ8uTJgw4dOmDnzp16l7tgwYJa2woVKoS7d+8CAG7evAkRwZAhQ7TKMmzYMI2yqHl5een13rdu3ULhwoVhZpZ+i/7WrVtRsWJFWFlZIXv27HB2dsbcuXMRHR2d4Xs8ffoUL1++xPz587XK3759e43y37t3D/nz59cItgCgQIECGs/T+18XKVJE2a9mZmamsy/T999/D0tLS6xcuRIAEB0dja1btyI4OFirDETGhH14iP5jmjRpgqVLl2LlypXo0qWLxr6kpCTUrFkTP/30k87XFipUCADg4uKCc+fOYdeuXdixYwd27NiBxYsXo02bNli6dOlHl1HdQbpfv34IDAzUmSZ1QJCyRuZjHT58GA0aNEDVqlUxZ84c5MqVC+bm5li8eDFWrVqV4evV5W/VqhXatm2rM02JEiUyrby6WFpawsRE+542W7ZsqFevHlauXImhQ4di/fr1iIuLQ6tWrbK0PESfGgMeoi9Yvnz5AADXrl3T2nf16lXkzJkTtra2GtsnTpwIMzMzdOvWDfb29mjZsqWyL3/+/Hj9+rVSo5MeCwsL1K9fH/Xr10dSUhK6deuGefPmYciQIVrBSGo3btzQ2nb9+nWlg7W3tzcAwNzcXK+yGCJ//vw4ceIE3r9/D3Nzc51pNmzYACsrK+zatQuWlpbK9sWLF2ul1VUr4uzsDHt7eyQmJmZY/nz58uHy5csQEY28bt68qZUOSP5f+/v7a+y7du2asl8fbdq0QcOGDREREYGVK1eidOnSKFasmN6vJ/oSsUmL6AuWK1culCpVCkuXLsXLly+V7X/99Rd2796NOnXqaL1GpVJh/vz5+O6779C2bVuNYd7NmjVDeHg4du3apfW6ly9fIiEhAQDw7NkzjX0mJiZKjUVcXFyG5d68ebNGH5yTJ0/ixIkTCAoKApBcg1S9enXMmzcPjx8/1nq9rmHV+mrSpAn++ecfzJo1S2ufiAAATE1NoVKpkJiYqOy7e/euzgkGbW1tNT579eubNGmCDRs26Bxun7L8gYGBePjwocb/4d27d1iwYIHGa8qVKwcXFxeEhYVpfMY7duzAlStXlJFj+ggKCkLOnDkxfvx4HDx4kLU79J/AGh6iL9zEiRMRFBQEPz8/hISEKMPSHR0dMXz4cJ2vMTExwYoVK9CoUSM0a9YM27dvh7+/P/r3748//vgD9erVQ7t27VC2bFnExsbi4sWLWL9+Pe7evYucOXOiY8eOeP78Ofz9/ZEnTx7cu3cPM2fORKlSpVC0aNEMy1ygQAFUrlwZXbt2RVxcHKZNm4YcOXJoNKXNnj0blStXhq+vLzp16gRvb29ERUUhPDwcf//9N86fP/9Bn1ebNm2wbNky9OnTBydPnkSVKlUQGxuLvXv3olu3bmjYsCHq1q2LKVOmoHbt2mjZsiWePHmC2bNno0CBArhw4YJGfmXLlsXevXsxZcoUuLu7w8vLCxUqVMC4ceOwf/9+VKhQAZ06dYKPjw+eP3+OM2
fOYO/evXj+/DkAoEuXLpg1axZatGiBnj17IleuXFi5cqXSMVld62Nubo7x48ejffv2qFatGlq0aKEMS/f09ETv3r3
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"#print unique categories\n",
|
||
|
"categories = df['category'].unique()\n",
|
||
|
"categories= sorted(categories)\n",
|
||
|
"for category in categories:\n",
|
||
|
" print(category)\n",
|
||
|
"print('amount of categories:', len(categories))\n",
|
||
|
"# plot the distribution of categories\n",
|
||
|
"category_counts = df['category'].value_counts()\n",
|
||
|
"category_counts.plot(kind='bar')\n",
|
||
|
"plt.title('Jokes per category')\n",
|
||
|
"plt.ylabel('Number of jokes')\n",
|
||
|
"plt.xlabel('Category')\n",
|
||
|
"plt.xticks(rotation=90)\n",
|
||
|
"plt.show()\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 20,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA/CklEQVR4nO3df3xP9f//8ftrm22MbU22WZiFsPx6m/AKJZZhiVCRGIn323tTTKX19kYUUvk99O6rzY+8Re9Qfs+PeJcRSkklClP7pWRDb9ts5/uHy16fXkbx2mt7zel2vVzO5dJ5nuc55/E8xu6d8zyvl8UwDEMAAAAm5ebqAgAAAMoSYQcAAJgaYQcAAJgaYQcAAJgaYQcAAJgaYQcAAJgaYQcAAJgaYQcAAJgaYQcAAJgaYQcoIxMnTpTFYimXc3Xs2FEdO3a0rX/44YeyWCx69913y+X8gwcPVt26dcvlXI46f/68nnzySQUHB8tisWjUqFFlch6LxaK4uLgyObYrJCcny2Kx6MSJE64uBXAYYQe4DsX/4Bcv3t7eCgkJUVRUlObMmaNz58455Tzp6emaOHGiDh486JTjOVNFru16TJkyRcnJyRoxYoSWLl2qgQMHXrNv3bp19cADD5RjddevOMgWL+7u7goMDFTfvn319ddfO3zcKVOmaM2aNc4rFKhAPFxdAHAzmTRpksLCwlRQUKDMzEx9+OGHGjVqlGbMmKH3339fzZo1s/UdN26cnn/++Rs6fnp6ul588UXVrVtXLVq0uO79tmzZckPnccTv1fbmm2+qqKiozGsoje3bt6tt27aaMGGCq0txiqeeekp33XWXCgoK9MUXX2jhwoX68MMP9eWXXyo4OPiGjzdlyhT17dtXvXr1smsfOHCg+vXrJy8vLydVDpQ/wg5wA7p166ZWrVrZ1hMSErR9+3Y98MADevDBB/X111+rcuXKkiQPDw95eJTtX7Fff/1VVapUkaenZ5me549UqlTJpee/HtnZ2QoPD3d1GU7ToUMH9e3b17besGFDjRgxQkuWLNFzzz3ntPO4u7vL3d3daccDXIHHWEApderUSf/85z918uRJLVu2zNZ+tTk7KSkpat++vfz9/VW1alU1bNhQL7zwgqTLjyfuuusuSdKQIUNsjymSk5MlXZ6X06RJEx04cED33HOPqlSpYtv3yjk7xQoLC/XCCy8oODhYPj4+evDBB3Xq1Cm7PnXr1tXgwYNL7PvbY/5RbVebs3PhwgWNGTNGtWvXlpeXlxo2bKjXXntNhmHY9Sue47JmzRo1adJEXl5euvPOO7Vp06arX/ArZGdna+jQoQoKCpK3t7eaN2+uxYsX27YXP/Y5fvy41q9fb6v9RuegXO94ruall16Sm5ub5s6da2vbuHGjOnToIB8fH1WrVk3R0dE6fPjwDdX0Wx06dJAkfffdd3btr732mu6++25Vr15dlStXVkRERIm5XBaLRRcuXNDixYtt16f4Z+Jqc3aKH/N99NFHat26tby9vXX77bdryZIlJer64osvdO+996py5cqqVauWXnrpJSUlJZU45v79+xUVFaVbb71VlStXVlhYmJ544gmHrwfwW9zZAZxg4MCBeuGFF7RlyxYNGzbsqn0OHz6sBx54QM2aNdOkSZPk5eWlY8eO6eOPP5YkNW7cWJMmTdL48eM1fPhw2y+vu+++23aMn3/+Wd26dVO/fv30+OOPKygo6Hfrevnll2WxWDR27FhlZ2dr1qxZioyM1MGDB213oK7H9dT2W4Zh6MEHH9SOHTs0dOhQtWjRQps3b9azzz6rH3/8UTNnzrTr/9FHH+m9997T3//+d1WrVk1z5sxRnz59lJaWpurVq1+zrv/973/q2LGjjh07pri4OIWFhWnVqlUaPHiwzp49q6efflqNGzfW0qVLNXr0aNWqVUtjxoyRJNWoUeO6x3+j4/mtcePGacqUKXrjjTdsPxtLly5VTEyMoqKi9Morr+
jXX3/VggUL1L59e3322WcOTfYuDg633HKLXfvs2bP14IMPasCAAcrPz9eKFSv08MMPa926dYqOjrbV8+STT6p169YaPny4JKlevXq/e75jx46pb9++Gjp0qGJiYvTWW29p8ODBioiI0J133ilJ+vHHH3XffffJYrEoISFBPj4++n//7/+VeCSWnZ2tLl26qEaNGnr++efl7++vEydO6L333rvh6wBclQHgDyUlJRmSjH379l2zj5+fn/GXv/zFtj5hwgTjt3/FZs6caUgyTp8+fc1j7Nu3z5BkJCUlldh27733GpKMhQsXXnXbvffea1vfsWOHIcm47bbbjNzcXFv7ypUrDUnG7NmzbW2hoaFGTEzMHx7z92qLiYkxQkNDbetr1qwxJBkvvfSSXb++ffsaFovFOHbsmK1NkuHp6WnX9vnnnxuSjLlz55Y412/NmjXLkGQsW7bM1pafn29YrVajatWqdmMPDQ01oqOjf/d41+p7o+OJjY01DMMwxowZY7i5uRnJycm27efOnTP8/f2NYcOG2R0rMzPT8PPzK9F+peI/27feess4ffq0kZ6ebmzatMmoX7++YbFYjE8++cSu/6+//mq3np+fbzRp0sTo1KmTXbuPj89Vfw6Kf/aPHz9uawsNDTUkGbt27bK1ZWdnG15eXsaYMWNsbSNHjjQsFovx2Wef2dp+/vlnIyAgwO6Yq1ev/sO/X0Bp8BgLcJKqVav+7ltZ/v7+kqS1a9c6PJnXy8tLQ4YMue7+gwYNUrVq1Wzrffv2Vc2aNbVhwwaHzn+9NmzYIHd3dz311FN27WPGjJFhGNq4caNde2RkpN2dhGbNmsnX11fff//9H54nODhY/fv3t7VVqlRJTz31lM6fP6+dO3c6YTQ3Ph7DMBQXF6fZs2dr2bJliomJsW1LSUnR2bNn1b9/f/3000+2xd3dXW3atNGOHTuuq6YnnnhCNWrUUEhIiLp27aqcnBwtXbrU9rix2G/v4P3yyy/KyclRhw4d9Omnn97oZbATHh5uu8MnXb5T1rBhQ7s/s02bNslqtdpNaA8ICNCAAQPsjlX8d2PdunUqKCgoVV3A1RB2ACc5f/68XbC40qOPPqp27drpySefVFBQkPr166eVK1feUPC57bbbbmgycoMGDezWLRaL6tevX+afmXLy5EmFhISUuB6NGze2bf+tOnXqlDjGLbfcol9++eUPz9OgQQO5udn/U3at8zjqRsezZMkSJSYmau7cuXZBTJKOHj0q6fJcrxo1atgtW7ZsUXZ29nXVNH78eKWkpGj16tUaNGiQcnJySlwH6XKAaNu2rby9vRUQEKAaNWpowYIFysnJue7xX831/JmdPHlS9evXL9HvyrZ7771Xffr00Ysvvqhbb71VPXv2VFJSkvLy8kpVI1CMOTuAE/zwww/Kycm56j/sxSpXrqxdu3Zpx44dWr9+vTZt2qR33nlHnTp10pYtW67rjZcbmWdzva71wYeFhYXl9hbOtc5jXMfk34qoXbt2OnjwoObNm6dHHnlEAQEBtm3F4Xbp0qVXfUX8et/ga9q0qSIjIyVJvXr10q+//qphw4apffv2ql27tiTpv//9rx588EHdc889mj9/vmrWrKlKlSopKSlJy5cvL9UYnflnVvwBmHv27NEHH3ygzZs364knntDrr7+uPXv2qGrVqqWqFeDODuAES5culSRFRUX9bj83Nzd17txZM2bM0FdffaWXX35Z27dvtz26cPYnLhffRShmGIaOHTtmNwH2lltu0dmzZ0vse+XdihupLTQ0VOnp6SUe633zzTe27c4QGhqqo0ePlrg7VhbnuZHx1K9fX1u2bFF6erq6du1qt1/x47rAwEBFRkaWWK72Vt31mDZtmi5evKiXX37Z1vaf//xH3t7etvDQrVs3W0C6Ull82ndoaKiOHTtWov1qbZLUtm1bvfzyy9q/f7/efvttHT58WCtWrHB6XfjzIewApbR9+3ZNnjxZYWFhJeYi/N
aZM2dKtBXPZSi+Xe/j4yNJVw0fjliyZIndL9p3331XGRkZ6tatm62tXr162rNnj/Lz821t69atK/GK+o3U1r17dxU
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# plot the distribution of ratings\n",
|
||
|
"ratings = df['rating']\n",
|
||
|
"plt.hist(ratings, bins=100)\n",
|
||
|
"plt.xlabel('Rating')\n",
|
||
|
"plt.ylabel('Frequency')\n",
|
||
|
"plt.title('Distribution of Joke Ratings')\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 21,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAItCAYAAADFZVtTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAADUSElEQVR4nOzdd1QU19sH8O/Se1ERFFFEFHvvGns3ItFo7GI3scWuiV1jizVq7A177L0QI3aNvcZeUKNYoqKoqPC8f/Du/Bh2gVlEMZvv55w9hx3uzN6dbc/c+9x7dSIiICIiIjITFmldASIiIqLUxOCGiIiIzAqDGyIiIjIrDG6IiIjIrDC4ISIiIrPC4IaIiIjMCoMbIiIiMisMboiIiMisMLghIiIis8Lghog+SzqdDsOGDfvkjxsWFgadToc1a9Z88scmotTB4IYonl9//RU6nQ6lSpVK66p8dnx9faHT6ZSbo6MjSpYsiZCQkBQfc9u2bWkSwHxOTp8+jRYtWsDHxwe2trZIly4dqlWrhoULFyImJsbk440ePRobNmxI/YoS/YtYpXUFiD4ny5Ytg6+vL/78809cu3YN/v7+aV2lz0rhwoXRu3dvAMD9+/cxb948tG7dGtHR0ejQoYPJx9u2bRtmzJhhNMB5/fo1rKzM+ytq3rx56Ny5Mzw9PdGyZUvkzJkTL168wO7du9GuXTvcv38fP/zwg0nHHD16NL7++msEBQV9nEoT/QuY9zcHkQlu3ryJQ4cOYd26dejUqROWLVuGoUOHftI6xMbG4u3bt7Czs/ukj6uVt7c3WrRoodwPDg6Gn58fJk+enKLgJimf6zlILUeOHEHnzp1RpkwZbNu2Dc7Ozsr/vv/+exw/fhznz59Pwxp+XFFRUXB0dEzrapCZYrcU0f9btmwZ3N3dUbduXXz99ddYtmyZ8r93794hXbp0aNOmjcF+kZGRsLOzQ58+fZRt0dHRGDp0KPz9/WFrawsfHx/069cP0dHRqn11Oh26du2KZcuWIV++fLC1tcWOHTsAABMmTEDZsmWRPn162Nvbo1ixYkbzQF6/fo3u3bsjQ4YMcHZ2RmBgIO7du2c0Z+XevXto27YtPD09YWtri3z58mHBggUpPmceHh7InTs3rl+/rtq+f/9+NGrUCFmzZlWef8+ePfH69WulTHBwMGbMmKGcB/0t/rmJX/9hw4ZBp9Ph2rVrCA4OhpubG1xdXdGmTRu8evUqxeckMTExMfjhhx/g5eUFR0dHBAYG4s6dO8r/hw4dCmtrazx69Mhg344dO8LNzQ1v3rxJ9PjDhw+HTqfDsmXLVIGNXvHixREcHKzc1/J+0Ol0iIqKwuLFi5XzGf8YWl//27dvIzAwEI6OjsiYMSN69uyJnTt3QqfTISwsTFV29erVKFasGOzt7ZEhQwa0aNEC9+7dU5UJDg6Gk5MTrl+/jjp16sDZ2RnNmzf/4HNIlCghIhERyZ07t7Rr105ERPbt2ycA5M8//1T+37ZtW3Fzc5Po6GjVfosXLxYAcuzYMRERiYmJkRo1aoiDg4N8//33Mnv2bOnatatYWVlJ/fr1VfsCkDx58oiHh4cMHz5cZsyYIadOnRIRkSxZssh3330n06dPl0mTJknJkiUFgGzZskV1jMaNGwsAadmypcyYMUMaN24shQoVEgAydOhQpdyDBw8kS5Ys4uPjIyNGjJCZM2dKYGCgAJDJkycne36yZcsmdevWVW179+6deHl5iaenp2p7t27dpE6dOjJ69GiZPXu2tGvXTiwtLeXrr79Wyhw6dEiqV68uAGTJkiXKLf65iV//oUOHCgApUqSINGjQQH799Vdp3769AJB+/fql6JwYs2fPHgEgBQoUkIIFC8qkSZNkwIABYmdnJ7ly5ZJXr16JiMjVq1cFgEybNk21f3R0tLi7u0vbtm0TfYyoqCixtraWKlWqJFmX+LS8H5YsWSK2trbyxRdfKOfz0KFDIqL99X/58q
X4+fmJvb29DBgwQKZMmSIlS5ZUzt+ePXuUsgsXLhQAUqJECZk8ebIMGDBA7O3txdfXV54+faqUa926tdja2kqOHDmkdevWMmvWLAkJCfmgc0iUFAY3RCJy/PhxASChoaEiIhIbGytZsmSRHj16KGV27twpAGTz5s2qfevUqSN+fn7K/SVLloiFhYXs379fVW7WrFkCQA4ePKhsAyAWFhZy4cIFgzrpf0T13r59K/nz51f9IJ44cUIAyPfff68qGxwcbPBD3q5dO8mUKZM8fvxYVbZJkybi6upq8HgJZcuWTWrUqCGPHj2SR48eyblz56Rly5YCQLp06ZJk3UVExowZIzqdTm7fvq1s69KliyR2jZVYcJPwB++rr76S9OnTK/dNOSfG6IMbb29viYyMVLb/9ttvAkCmTp2qbCtTpoyUKlVKtf+6desMgoCEzpw5IwBU76/kaHk/iIg4OjpK69atDfbX+vpPnDhRAMiGDRuUMq9fv5bcuXOrntfbt28lY8aMkj9/fnn9+rVSdsuWLQJAhgwZomxr3bq1AJABAwYY1Cul55AoKeyWIkJcl5SnpycqV64MIK55/5tvvsHKlSuVEStVqlRBhgwZsGrVKmW/p0+fIjQ0FN98842ybfXq1ciTJw9y586Nx48fK7cqVaoAAPbs2aN67IoVKyJv3rwGdbK3t1c9zvPnz/HFF1/g5MmTynZ9F9Z3332n2rdbt26q+yKCtWvXol69ehARVb1q1qyJ58+fq46bmF27dsHDwwMeHh4oUKAAlixZgjZt2uDnn39OtO5RUVF4/PgxypYtCxHBqVOnkn2cpHTu3Fl1/4svvsCTJ08QGRkJQPs5SU6rVq1U3UVff/01MmXKhG3btqnKHD16VNUtt2zZMvj4+KBixYqJHltfV2PdUYnR8n5IjCmv/44dO+Dt7Y3AwEBlfzs7O4OcquPHj+Phw4f47rvvVPlRdevWRe7cubF161aDenz77bcG21J6DomSwuCG/vNiYmKwcuVKVK5cGTdv3sS1a9dw7do1lCpVChEREdi9ezcAwMrKCg0bNsTGjRuV3Jl169bh3bt3quDm6tWruHDhghIE6G+5cuUCADx8+FD1+NmzZzdary1btqB06dKws7NDunTp4OHhgZkzZ+L58+dKmdu3b8PCwsLgGAlHeT169AjPnj3DnDlzDOqlzyNKWC9jSpUqhdDQUOzYsQMTJkyAm5sbnj59ChsbG1W58PBwBAcHI126dHBycoKHh4fyQxW//imRNWtW1X13d3cAcT/4gPZzkpycOXOq7ut0Ovj7++PWrVvKtm+++Qa2trZKftbz58+xZcsWNG/eXJU/lJCLiwsA4MWLF5rro+X9kBhTXv/bt28jR44cBvVPeP5u374NAAgICDB4vNy5cyv/17OyskKWLFkMyqb0HBIlhaOl6D/vjz/+wP3797Fy5UqsXLnS4P/Lli1DjRo1AABNmjTB7NmzsX37dgQFBeG3335D7ty5UahQIaV8bGwsChQogEmTJhl9PB8fH9X9+Ffkevv370dgYCAqVKiAX3/9FZkyZYK1tTUWLlyI5cuXm/wcY2NjAQAtWrRA69atjZYpWLBgssfJkCEDqlWrBgCoWbMmcufOjS+//BJTp05Fr169AMQFi9WrV8c///yD/v37I3fu3HB0dMS9e/cQHBys1CWlLC0tjW4XkQ86bkq4u7vjyy+/xLJlyzBkyBCsWbMG0dHRqhFlxvj7+8PKygrnzp3T9Dgf+n5Irdf/Q9ja2sLCwvB6OqXnkCgpDG7oP2/ZsmXImDGjMnInvnXr1mH9+vWYNWsW7O3tUaFCBWTKlAmrVq1C+fLl8ccff+DHH39U7ZMjRw6cOXMGVatWTfGV59q1a2FnZ4edO3fC1tZW2b5w4UJVuWzZsiE2NhY3b95UtTRcu3ZNVc7DwwPOzs6IiYlRgpPUULduXVSsWBGjR49Gp06d4OjoiHPnzuHKlStYvHgxWrVqpZQNDQ012P
9jXJlrPSfJuXr1quq+iODatWsGQUCrVq1Qv359HDt2DMuWLUORIkWQL1++JI/t4OCAKlWq4I8//sCdO3cMAt6EtL4
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# plot the average rating per category\n",
|
||
|
"means = [df[df['category'] == category]['rating'].mean() for category in categories]\n",
|
||
|
"\n",
|
||
|
"plt.bar(categories, means)\n",
|
||
|
"plt.xlabel('Category')\n",
|
||
|
"plt.ylabel('Average Rating')\n",
|
||
|
"plt.title('Average Rating by Category')\n",
|
||
|
"plt.xticks(rotation=90)\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 22,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAApYAAAHHCAYAAAAFyBjFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1QU19/H8fcC0psgCiqCgqIoWKLGjh0bsWCNDcXeO5aooLHG3jvYe4mxY8FYSKwQFcUSFDVYgoKCgpT7/MGz83MFFAz2+zpnT7Kzd+7cGVb8OjP3MyohhECSJEmSJEmS/iOtTz0ASZIkSZIk6esgC0tJkiRJkiQpR8jCUpIkSZIkScoRsrCUJEmSJEmScoQsLCVJkiRJkqQcIQtLSZIkSZIkKUfIwlKSJEmSJEnKEbKwlCRJkiRJknKELCwlSZIkSZKkHCELS0mSvgoqlQpfX99PPQwJsLe3p0mTJp96GFn2yy+/UKRIEbS1tSlTpswH2UbNmjUpVarUB+k7q4KCglCpVAQFBX3ScUhfN1lYStJ/sGjRIlQqFd9///2nHspnx97eHpVKpbyMjIyoWLEia9asee8+9+3bJ4vH/6c+rjNnzkz3WUBAACqVinPnzn2CkX1ZDh06xIgRI6hatSr+/v5Mnjw507ZeXl4YGxt/xNF9GVJSUvD396dmzZpYWFigp6eHvb09Xbp0ea/vYFhYGL6+vty+fTvnByt9cDqfegCS9CVbv3499vb2nDlzhps3b+Lo6Piph/RZKVOmDEOHDgUgKiqKFStW0LlzZxITE+nevXu2+9u3bx8LFy7MsLh8+fIlOjrf3q+0X375hd69e2NoaPiph/JFOnr0KFpaWqxcuRJdXd1PPZwPqkaNGrx8+TJH9/Ply5e0aNGCAwcOUKNGDUaPHo2FhQW3b99my5YtrF69msjISAoWLJjlPsPCwvDz86NmzZrY29vn2Filj0OesZSk9xQREcHp06eZNWsWVlZWrF+//qOPITU1lYSEhI++3awqUKAAHTp0oEOHDgwfPpyTJ09ibGzM7Nmzc3xb+vr631xhWaZMGR4+fMiSJUs+9VA+uuTkZF69evWf+3n06BEGBgZffVEJoKWlhb6+PlpaOfdX//Dhwzlw4ACzZ8/m+PHjDBs2jK5duzJhwgSuXLnC9OnTc2xbn5vP/ffvpyILS0l6T+vXryd37tw0btyYli1bahSWSUlJWFhY0KVLl3TrPXv2DH19fYYNG6YsS0xMZPz48Tg6OqKnp4etrS0jRowgMTFRY12VSkW/fv1Yv349JUuWRE9PjwMHDgAwY8YMqlSpgqWlJQYGBnz33Xds27Yt3fZfvnzJgAEDyJMnDyYmJvzwww/cv38/w3sU79+/T9euXcmXLx96enqULFmSVatWvfcxs7Kyonjx4ty6dUtj+YkTJ2jVqhWFChVS9n/w4MG8fPlSaePl5cXChQuV46B+vX5sXh+/r68vKpWKmzdv4uXlhbm5OWZmZnTp0oUXL1689zF53cOHD9HR0cHPzy/dZ+Hh4ahUKhYsWACkfSf8/PwoWrQo+vr6WFpaUq1aNQIDA7N8/N5UtWpVateuzfTp0zWOVUZq1qxJzZo10y338vLSOCt0+/ZtVCoVM2bMYOHChRQpUgRDQ0Pq16/P3bt3EUIwceJEChYsiIGBAU2bNuXJkycZbvPQoUOUKVMGfX19nJ2d2bFjR7o2MTExDBo0CFtbW/T09HB0dGTatGmkpqZmOKY5c+bg4OCAnp4eYWFhme5vcnIyEydOVNra29szevRojT9TKpUKf39/4uPjle9TQEDAW49jRhYtWqT8ecyfPz99+/YlJibmnesdOnQIQ0ND2rVrR3JyMgDXrl2jZcuWWFhYoK+vT/ny5dm9e7fGeu/7XcroHkv1vZ9hYWHUqlULQ0NDChQokKWC8N69eyxdupR69eoxaNCgdJ9ra2szbNgw5WzlnTt36NOnD05OThgYGG
BpaUmrVq00LnkHBATQqlUrAGrVqqX8XF4f8/79+6levTpGRkaYmJjQuHFjrly5km77W7duxdnZGX19fUqVKsXOnTvTfd8B4uPjGTp0qPIddHJyYsaMGQghNNpl9Pt3//792Nvb07Rp03TbT0hIwMzMjJ49e77zWH5VhCRJ76V48eLC29tbCCHE77//LgBx5swZ5fOuXbsKc3NzkZiYqLHe6tWrBSDOnj0rhBAiJSVF1K9fXxgaGopBgwaJpUuXin79+gkdHR3RtGlTjXUBUaJECWFlZSX8/PzEwoULxcWLF4UQQhQsWFD06dNHLFiwQMyaNUtUrFhRAGLPnj0afbRu3VoAomPHjmLhwoWidevWonTp0gIQ48ePV9o9ePBAFCxYUNja2ooJEyaIxYsXix9++EEAYvbs2e88PnZ2dqJx48Yay5KSkoS1tbXIly+fxvL+/fuLRo0aicmTJ4ulS5cKb29voa2tLVq2bKm0OX36tKhXr54AxNq1a5XX68fm9fGPHz9eAKJs2bKiRYsWYtGiRaJbt24CECNGjHivY5KR2rVrC2dn53TL/fz8hLa2tnjw4IEQQojRo0cLlUolunfvLpYvXy5mzpwp2rVrJ6ZOnfrW/jMDiL59+yrfvZkzZyqf+fv7a3zHhBDCzc1NuLm5peunc+fOws7OTnkfEREhAFGmTBnh7OwsZs2aJX766Sehq6srKlWqJEaPHi2qVKki5s2bJwYMGCBUKpXo0qWLRp92dnaiWLFiwtzcXIwcOVLMmjVLuLi4CC0tLXHo0CGlXXx8vHB1dRWWlpZi9OjRYsmSJaJTp05CpVKJgQMHphuTs7OzKFKkiJg6daqYPXu2uHPnTqbHp3PnzgIQLVu2FAsXLhSdOnUSgGjWrJnSZu3ataJ69epCT09P+T7dunXrrX0aGRlpLFN/z+rWrSvmz58v+vXrJ7S1tUWFChXEq1evNI5/yZIllfe//fab0NPTE506dRLJyclCCCEuX74szMzMhLOzs5g2bZpYsGCBqFGjhlCpVGLHjh3Kuu/7XTp27JgAxLFjxzTGlT9/fmFraysGDhwoFi1aJGrXri0AsW/fvrf2t2zZMgGINWvWvLWd2tatW0Xp0qXFuHHjxLJly8To0aNF7ty5hZ2dnYiPjxdCCHHr1i0xYMAAAYjRo0crPxf1n6M1a9YIlUolGjRoIObPny+mTZsm7O3thbm5uYiIiFC2tWfPHqFSqYSrq6uYNWuWGDt2rMidO7coVaqUxvc9NTVV1K5dW6hUKtGtWzexYMEC4eHhIQAxaNAgjfFn9vt3zJgxIleuXCI6Olqj/ZYtWwQgfv/99ywdn6+FLCwl6T2cO3dOACIwMFAIkfbLqWDBghp/GR48eFAA4rffftNYt1GjRqJIkSLK+7Vr1wotLS1x4sQJjXZLliwRgDh16pSyDBBaWlriypUr6cb04sULjfevXr0SpUqVErVr11aWnT9/PsNfmF5eXumKKG9vb2FjYyP+/fdfjbZt27YVZmZm6bb3Jjs7O1G/fn3x+PFj8fjxY3Hp0iXRsWNHpSB629iFEGLKlClCpVJpFA99+/YVmf17OLPCsmvXrhrtmjdvLiwtLZX32TkmGVm6dKkAxKVLlzSWOzs7axz70qVLpyu0/4vXj2OtWrWEtbW1chxzorC0srISMTExyvJRo0YJQJQuXVokJSUpy9u1ayd0dXVFQkKCsszOzk4AYvv27cqy2NhYYWNjI8qWLassmzhxojAyMhLXr1/XGNPIkSOFtra2iIyM1BiTqampePTo0TuPTUhIiABEt27dNJYPGzZMAOLo0aMa+/9msZiZN9s+evRI6Orqivr164uUlBRl+YIFCwQgVq1apSx7vbDcvn27yJUrl+jevbvGenXq1BEuLi4axzI1NVVUqVJFFC1aVFn2vt+lzArLN4vDxMREYW1tLTw9Pd/a3+DBgwWg/OP2XTL6cx4cHJxu+1u3bk03TiGEeP78uTA3Nxfdu3
fXWP7gwQNhZmamsdzFxUUULFhQPH/+XFkWFBQkAI3v+65duwQgfv75Z40+W7ZsKVQqlbh586ayLLPfv+Hh4QIQixc
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"category_means = df.groupby('category')['rating'].mean()\n",
|
||
|
"category_means = category_means[categories]\n",
|
||
|
"# give each category a id number and add to plot to each scatter\n",
|
||
|
"category_counts = df['category'].unique()\n",
|
||
|
"\n",
|
||
|
"plt.scatter(category_means, category_counts)\n",
|
||
|
"plt.xlabel('Average Rating')\n",
|
||
|
"plt.ylabel('Number of Jokes')\n",
|
||
|
"plt.title('Average Rating vs. Number of Jokes in Category')\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 23,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABIkklEQVR4nO3dd3gU1f7H8c+mh0ASWhIikERAehMUAqFHIiCC4lW8iIAIFrgUAQULIIgUlXpF1J8CXlEULyqXK02qYKSEJggISBWSoJCEIpByfn/wZC5LQgubQub9ep59HvbM2Znv2c0mH2bOzDiMMUYAAAA25pbfBQAAAOQ3AhEAALA9AhEAALA9AhEAALA9AhEAALA9AhEAALA9AhEAALA9AhEAALA9AhEAALA9AhFsY+TIkXI4HHmyrebNm6t58+bW81WrVsnhcOirr77Kk+13795d4eHhebKtnDpz5oyefvpphYSEyOFwaMCAAfld0k278nNGVgXpPcr8HfDHH3/kdykogAhEuC3NmjVLDofDevj4+Cg0NFQxMTGaOnWqTp8+7ZLtHDt2TCNHjtTWrVtdsj5XKsi13Yg333xTs2bN0nPPPad//etf6tq1a7b9qlWrptq1a2dp//rrr+VwONSsWbMsyz7++GM5HA4tXbrU5XXnVHp6umbOnKnmzZurRIkS8vb2Vnh4uHr06KFNmzbld3mSpF9++UUjR47UwYMH87uUq8r87heU9wyFh0d+FwDcilGjRikiIkKpqamKj4/XqlWrNGDAAE2cOFELFixQrVq1rL6vvvqqhg4delPrP3bsmF5//XWFh4erTp06N/y6vPhDfK3aPvzwQ2VkZOR6DbdixYoVatiwoUaMGHHNflFRUfroo4+UnJysgIAAq33dunXy8PDQxo0blZqaKk9PT6dl7u7uioyMzLX6b8Zff/2lhx9+WIsXL1bTpk318ssvq0SJEjp48KC+/PJLzZ49W4cPH1bZsmXztc5ffvlFr7/+upo3b+6yPYwFKZQC18IeItzW2rRpoyeeeEI9evTQsGHDtGTJEn3//fdKTEzUgw8+qL/++svq6+HhIR8fn1yt59y5c5IkLy8veXl55eq2rsXT01Pe3t75tv0bkZiYqMDAwOv2i4qKUkZGhn788Uen9nXr1unRRx/VX3/9pbi4OKdla9euVa1atVSsWLFbqvHs2bO39PpMQ4YM0eLFizVp0iStXr1agwcP1lNPPaVRo0Zp586dmjBhgku2UxDl93cBuFEEIhQ6LVu21GuvvaZDhw7p008/tdqzm0O0bNkyRUVFKTAwUEWLFlXlypX18ssvS7o07+eee+6RJPXo0cM6PDdr1ixJl+ZG1KhRQ3FxcWratKmKFClivfZq8ybS09P18ssvKyQkRH5+fnrwwQd15MgRpz7h4eHq3r17ltdevs7r1ZbdHKKzZ89q0KBBKleunLy9vVW5cmW9/fbbMsY49XM4HOrbt6+++eYb1ahRQ97e3qpevboWL16c/Rt+hcTERPXs2VPBwcHy8fFR7dq1NXv2bGt55nyqAwcO6L///a9V+9UO00RFRUm6FIAynT9/Xps3b9bDDz+sO++802nZiRMn9Ouvv1qvk6QtW7aoTZs28vf3V9GiRdWqVSv99NNPTtvJPBSzevVqPf/88woKCnLaY/PBBx+oQoUK8vX11b333qsffvjhht6Po0eP6v3339d9992X7Twpd3d3DR482GlbN1Lv1ebEZY7j8vczPDxcDzzwgNauXat7771XPj4+uvPOO/XJJ584ve5vf/ubJKlFixbW57Jq1SpJ0qZNmxQTE6NSpUrJ19dXEREReuqpp647/qvNp/vyyy81ZswYlS1bVj4+PmrVqpX27dt33fVdzYoVK9SkSRP5+fkpMDBQHTp00K5du677ukOHDqlixYqqUaOGEhISJElJSUkaMGCA9V2pWLGixo8fn2Wv69y5c1WvXj0VK1ZM/v7+qlmzpq
ZMmZLjMSB/ccgMhVLXrl318ssva+nSperVq1e2fXbu3KkHHnhAtWrV0qhRo+Tt7a19+/ZZf1yrVq2qUaNGafjw4erdu7eaNGkiSWrUqJG1jj///FNt2rRR586d9cQTTyg4OPiadY0ZM0YOh0MvvfSSEhMTNXnyZEVHR2vr1q3y9fW94fHdSG2XM8bowQcf1MqVK9WzZ0/VqVNHS5Ys0ZAhQ/T7779r0qRJTv3Xrl2r+fPn6/nnn1exYsU0depUderUSYcPH1bJkiWvWtdff/2l5s2ba9++ferbt68iIiI0b948de/eXUlJSerfv7+qVq2qf/3rXxo4cKDKli2rQYMGSZJKly6d7TrvvPNOhYaGau3atVbbxo0bdfHiRTVq1EiNGjXSunXrrPVk7knKDEQ7d+5UkyZN5O/vrxdffFGenp56//331bx5c61evVoNGjRw2t7zzz+v0qVLa/jw4dYeoo8++kjPPPOMGjVqpAEDBui3337Tgw8+qBIlSqhcuXJXfT8kadGiRUpLS7vqHKkr3Wy9N2rfvn165JFH1LNnT3Xr1k0ff/yxunfvrnr16ql69epq2rSp+vXrp6lTp+rll19W1apVJV36WUtMTFTr1q1VunRpDR06VIGBgTp48KDmz5+fo1okady4cXJzc9PgwYOVnJysCRMmqEuXLlq/fv1Nr+v7779XmzZtdOedd2rkyJH666+/NG3aNDVu3FibN2++6uG//fv3q2XLlipRooSWLVumUqVK6dy5c2rWrJl+//13PfPMMypfvrx+/PFHDRs2TMePH9fkyZMlXfrP1OOPP65WrVpp/PjxkqRdu3Zp3bp16t+/f07fFuQnA9yGZs6caSSZjRs3XrVPQECAqVu3rvV8xIgR5vIf+UmTJhlJ5sSJE1ddx8aNG40kM3PmzCzLmjVrZiSZGTNmZLusWbNm1vOVK1caSeaOO+4wKSkpVvuXX35pJJkpU6ZYbWFhYaZbt27XXee1auvWrZsJCwuznn/zzTdGknnjjTec+j3yyCPG4XCYffv2WW2SjJeXl1Pbtm3bjCQzbdq0LNu63OTJk40k8+mnn1ptFy9eNJGRkaZo0aJOYw8LCzPt2rW75voy/e1vfzO+vr7m4sWLxhhjxo4dayIiIowxxkyfPt0EBQVZfQcPHmwkmd9//90YY0zHjh2Nl5eX2b9/v9Xn2LFjplixYqZp06ZWW+bPVFRUlElLS3OqPygoyNSpU8dcuHDBav/ggw+MJKfPJDsDBw40ksyWLVtuaKw3Wu+VP89XjuPAgQNWW1hYmJFk1qxZY7UlJiYab29vM2jQIKtt3rx5RpJZuXKl0zq//vrr637fruZq34WqVas6vZ9TpkwxkszPP/98zfVl992vU6eOCQoKMn/++afVtm3bNuPm5maefPJJqy3zPTtx4oTZtWuXCQ0NNffcc485efKk1Wf06NHGz8/P/Prrr07bHTp0qHF3dzeHDx82xhjTv39/4+/v7/Szgtsbh8xQaBUtWvSaZ5tlzl/59ttvczwB2dvbWz169Ljh/k8++aTTvJZHHnlEZcqU0XfffZej7d+o7777Tu7u7urXr59T+6BBg2SM0aJFi5zao6OjVaFCBet5rVq15O/vr99+++262wkJCdHjjz9utXl6eqpfv346c+aMVq9enaP6o6KinOYKrVu3ztob1rhxYyUmJmrv3r3WsoiICIWGhio9PV1Lly5Vx44ddeedd1rrK1OmjP7+979r7dq1SklJcdpWr1695O7ubj3ftGmTEhMT9eyzzzrNhenevbvTJO+ryVz/jcxnykm9N6patWrWnkTp0h65ypUrX/czlf73XVm4cKFSU1NztP0r9ejRw+n9zKztRuq53PHjx7V161Z1795dJUqUsNpr1aql++67L9vv1o4dO9SsWTOFh4fr+++/V/Hixa1l8+bNU5MmTVS8eHH98ccf1iM6Olrp6elas2aNpEvvydmzZ7Vs2bKbqhcFF4EIhdaZM2
eu+UfoscceU+PGjfX0008rODhYnTt31pdffnlT4eiOO+64qQmjlSpVcnrucDhUsWLFXD/N+dChQwoNDc3yfmQeFjl
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"stop_words = set(stopwords.words('english'))\n",
|
||
|
"\n",
|
||
|
"# get all the jokes\n",
|
||
|
"jokes = df['body'].values\n",
|
||
|
"\n",
|
||
|
"# tokenize the jokes\n",
|
||
|
"tokenized_jokes = [word_tokenize(joke) for joke in jokes]\n",
|
||
|
"\n",
|
||
|
"# remove stop words\n",
|
||
|
"filtered_jokes = [[word for word in joke if word.lower() not in stop_words] for joke in tokenized_jokes]\n",
|
||
|
"\n",
|
||
|
"# count the tokens left in each joke after stop-word removal (punctuation tokens are included)\n",
|
||
|
"word_counts = [len(joke) for joke in filtered_jokes]\n",
|
||
|
"\n",
|
||
|
"# plot the distribution of word counts\n",
|
||
|
"plt.hist(word_counts, bins=100)\n",
|
||
|
"plt.xlabel('Number of Words')\n",
|
||
|
"plt.ylabel('Frequency')\n",
|
||
|
"plt.title('Distribution of Word Counts in Jokes')\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 24,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
", 25353\n",
|
||
|
". 22851\n",
|
||
|
"`` 12833\n",
|
||
|
"'' 10173\n",
|
||
|
"? 4538\n",
|
||
|
"! 3501\n",
|
||
|
"'s 3136\n",
|
||
|
": 2863\n",
|
||
|
"said 2359\n",
|
||
|
"n't 2324\n",
|
||
|
"man 1992\n",
|
||
|
"says 1836\n",
|
||
|
"one 1549\n",
|
||
|
"back 1004\n",
|
||
|
"get 970\n",
|
||
|
"asked 962\n",
|
||
|
"would 917\n",
|
||
|
"... 844\n",
|
||
|
"day 840\n",
|
||
|
"-- 824\n",
|
||
|
"'m 819\n",
|
||
|
"time 735\n",
|
||
|
"know 730\n",
|
||
|
"wife 722\n",
|
||
|
"like 719\n",
|
||
|
"' 707\n",
|
||
|
"first 695\n",
|
||
|
"guy 692\n",
|
||
|
"Well 676\n",
|
||
|
"went 668\n",
|
||
|
"got 666\n",
|
||
|
"go 664\n",
|
||
|
"'ll 648\n",
|
||
|
"'re 648\n",
|
||
|
"woman 632\n",
|
||
|
"replied 604\n",
|
||
|
"two 594\n",
|
||
|
"old 590\n",
|
||
|
"asks 581\n",
|
||
|
"- 574\n",
|
||
|
"take 550\n",
|
||
|
"'ve 536\n",
|
||
|
"could 533\n",
|
||
|
"going 487\n",
|
||
|
"little 483\n",
|
||
|
"blonde 481\n",
|
||
|
"next 479\n",
|
||
|
"want 468\n",
|
||
|
"good 467\n",
|
||
|
"see 463\n",
|
||
|
"came 448\n",
|
||
|
"husband 448\n",
|
||
|
"home 441\n",
|
||
|
") 437\n",
|
||
|
"goes 435\n",
|
||
|
"$ 405\n",
|
||
|
"car 404\n",
|
||
|
"replies 403\n",
|
||
|
"doctor 400\n",
|
||
|
"think 398\n",
|
||
|
"say 393\n",
|
||
|
"boy 393\n",
|
||
|
"told 388\n",
|
||
|
"give 385\n",
|
||
|
"three 378\n",
|
||
|
"tell 378\n",
|
||
|
"One 376\n",
|
||
|
"new 375\n",
|
||
|
"way 373\n",
|
||
|
"( 371\n",
|
||
|
"God 369\n",
|
||
|
"around 367\n",
|
||
|
"make 366\n",
|
||
|
"Q 364\n",
|
||
|
"right 362\n",
|
||
|
"* 360\n",
|
||
|
".. 358\n",
|
||
|
"people 358\n",
|
||
|
"door 348\n",
|
||
|
"Oh 342\n",
|
||
|
"years 332\n",
|
||
|
"second 323\n",
|
||
|
"later 320\n",
|
||
|
"ca 316\n",
|
||
|
"never 311\n",
|
||
|
"last 308\n",
|
||
|
"really 308\n",
|
||
|
"room 307\n",
|
||
|
"away 306\n",
|
||
|
"another 301\n",
|
||
|
"young 300\n",
|
||
|
"minutes 299\n",
|
||
|
"look 293\n",
|
||
|
"house 292\n",
|
||
|
"; 292\n",
|
||
|
"dog 292\n",
|
||
|
"night 291\n",
|
||
|
"comes 290\n",
|
||
|
"come 287\n",
|
||
|
"work 284\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# count the frequency of each remaining token (case-sensitive; punctuation tokens are not filtered out)\n",
|
||
|
"word_freq = Counter()\n",
|
||
|
"for joke in filtered_jokes:\n",
|
||
|
" word_freq.update(joke)\n",
|
||
|
"\n",
|
||
|
"# get the most common words\n",
|
||
|
"most_common = word_freq.most_common(100)\n",
|
||
|
"\n",
|
||
|
"for word, count in most_common:\n",
|
||
|
" print(word, count)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 25,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABQiUlEQVR4nO3dd1gU1/4G8HdpC9IRaZGmGLvYIqKooCiisecavcYeu1FsUZJYYkywREW9GhMTUW80RpNovEaxgb2EorF3EBUBFQFBRcr5/eHD/FwBhWVhl/H9PM88j3tmduY7Zxd4PXNmVyGEECAiIiKSKT1tF0BERERUnhh2iIiISNYYdoiIiEjWGHaIiIhI1hh2iIiISNYYdoiIiEjWGHaIiIhI1hh2iIiISNYYdoiIiEjWGHa0ZM6cOVAoFBVyLF9fX/j6+kqPDx48CIVCgd9++61Cjj9kyBC4ublVyLHUlZmZiY8//hgODg5QKBQICgrSdkml9urrTIXpUh8V/A548OCBtksptXXr1kGhUCA+Pr7E20ZHR5d/YWpQKBSYM2eOtsugcsawowEFP8wFi7GxMZycnBAQEIDly5fj8ePHGjlOYmIi5syZgzNnzmhkf5qky7WVxDfffIN169ZhzJgx+O9//4uBAwcWuV29evXg6elZqH3btm1QKBRo165doXVr166FQqHA3r17NV63uvLy8hAWFgZfX1/Y2NhAqVTCzc0NQ4cO1Zk/ShcvXsScOXNK9AdVW3T9D3lFWrVqFdatW6ftMoq0a9cuBhoN2bhxIxQKBczMzEq0/at/H19ekpKSyrna/2dQYUd6C8ydOxfu7u7IyclBUlISDh48iKCgICxZsgQ7duxAo0aNpG2/+OILzJgxo1T7T0xMxJdffgk3Nzc0bty4xM+riD+yr6ttzZo1yM/PL/cayiIiIgItW7bE7NmzX7udj48PfvrpJ6Snp8PS0lJqP3bsGAwMDBAVFYWcnBwYGhqqrNPX14e3t3e51V8aT58+Re/evREeHo62bdvis88+g42NDeLj47FlyxasX78eCQkJqF69ulbrvHjxIr788kv4+vpqbGRQlwJnZTZw4ED069cPSqVSalu1ahVsbW0xZMgQ7RVWjF27dmHlypVFBp6nT5/CwIB/CksiMzMTn376KUxNTUv93IK/jy+zsrLSUGVvxldYgwIDA9G8eXPpcXBwMCIiIvD++++je/fuuHTpEkxMTAAABgYG5f4D9uTJE1SpUgVGRkblepw3efkPv65KSUlBvXr13ridj48P1qxZg+PHjyMwMFBqP3bsGPr27YtNmzYhJiYGLVu2lNYdPXoUjRo1grm5eZlqzMrKUuuXzKumTZuG8PBwLF26tNDlutmzZ2Pp0qVlPoau0vbPglzo6+tDX19fa8fX1M8CABgbG2tkP+XhwYMHyMnJgaOjo7ZLAQDMmzcP5ubm8PPzw/bt20v13Ff/PlY0XsYqZ+3bt8fMmTNx69Yt/Pzzz1J7UXN29u3bBx8fH1hZWcHMzAy1a9fGZ599BuDFPJv33nsPADB06FBpGLBg2NjX1xcNGjRATEwM2rZtiypVqkjPLW6eQl5eHj777DM4ODjA1NQU3bt3x+3bt1W2cXNzK/J/ai/v8021FTVnJysrC1OmTIGzszOUSiVq166Nb7/9FkIIle0UCgXGjx+P7du3o0GDBlAqlahfvz7Cw8OL7vBXpKSkYPjw4bC3t4exsTE8PT2xfv16aX3B/KW4uDj89ddfUu3FXTrx8fEB8CLcFHj27BliY2PRu3dv1KhRQ2Xd/fv3cfXqVel5AHD69GkEBgbCwsICZmZm6NChA06ePKlynIKh30OHDmHs2LGws7NTGWn54YcfULNmTZiYmKBFixY4cuRIifrjzp07+P7779GxY8ci5yXp6+tj6tSpKscqSb3FzUEram6Hm5sb3n//fRw9ehQtWrSAsbExatSogQ0bNqg871//+hcAwM/PT3
pdDh48CACIjo5GQEAAbG1tYWJiAnd3dwwbNuyN51/c/LUtW7bg66+/RvXq1WFsbIwOHTrg+vXrb9xfcSIiItCmTRuYmprCysoKPXr0wKVLl974vFu3bsHDwwMNGjRAcnIyACAtLQ1BQUHSz4qHhwcWLFhQaLR08+bNaNasGczNzWFhYYGGDRti2bJlrz1e06ZN0bt3b5W2hg0bQqFQ4OzZs1Lbr7/+CoVCIZ3Dq6+rm5sbLly4gEOHDkmv1au/c7KzszF58mRUq1YNpqam6NWrF+7fv//GPhkyZAjMzMxw48YNdOnSBebm5hgwYAAA4MiRI/jXv/4FFxcXKJVKODs7Y9KkSXj69KnK81euXAkAKpdQCrw6Z6fgvXz9+nUMGTIEVlZWsLS0xNChQ/HkyROV2p4+fYoJEybA1tYW5ubm6N69O+7evVton48fP0ZQUBDc3NygVCphZ2eHjh07IjY29rXnfv78ebi4uKBHjx7YsWMHcnNz39hf5eXatWtYunQplixZovZ/1B8/foy8vDwNV1YyHNmpAAMHDsRnn32GvXv3YsSIEUVuc+HCBbz//vto1KgR5s6dC6VSievXr0t/OOvWrYu5c+di1qxZGDlyJNq0aQMAaNWqlbSPhw8fIjAwEP369cNHH30Ee3v719b19ddfQ6FQYPr06UhJSUFoaCj8/f1x5swZaQSqJEpS28uEEOjevTsiIyMxfPhwNG7cGHv27MG0adNw9+7dQiMLR48exR9//IGxY8fC3Nwcy5cvR58+fZCQkICqVasWW9fTp0/h6+uL69evY/z48XB3d8fWrVsxZMgQpKWlYeLEiahbty7++9//YtKkSahevTqmTJkCAKhWrVqR+6xRowacnJxw9OhRqS0qKgrPnz9Hq1at0KpVKxw7dkzaz/HjxwH8f0i6cOEC2rRpAwsLC3z66acwNDTE999/D19fXxw6dAheXl4qxxs7diyqVauGWbNmISsrCwDw008/YdSoUWjVqhWCgoJw8+ZNdO/eHTY2NnB2di62PwBg9+7dyM3NLXZO0qtKW29JXb9+HR988AGGDx+OwYMHY+3atRgyZAiaNWuG+vXro23btpgwYQKWL1+Ozz77DHXr1gXw4r2WkpKCTp06oVq1apgxYwasrKwQHx+PP/74Q61aAGD+/PnQ09PD1KlTkZ6ejoULF2LAgAE4depUqfe1f/9+BAYGokaNGpgzZw6ePn2KFStWoHXr1oiNjS32ktyNGzfQvn172NjYYN++fbC1tcWTJ0/Qrl073L17F6NGjYKLiwuOHz+O4OBg3Lt3D6GhoQBe/Eepf//+6NChAxYsWAAAuHTpEo4dO4aJEycWW2ubNm3wyy+/SI9TU1Nx4cIF6Onp4ciRI9Kl9yNHjqBatWrS6/Cq0NBQfPLJJzAzM8Pnn38OAIV+/3zyySewtrbG7NmzER8fj9DQUIwfPx6//vrrG/s0NzcXAQEB8PHxwbfffosqVaoAALZu3YonT55gzJgxqFq1Kv7++2+sWLECd+7cwdatWwEAo0aNQmJiIvbt24f//ve/bzxWgb59+8Ld3R0hISGIjY3Fjz/+CDs7O6l/gRdBasuWLRg4cCBatmyJQ4cOoWvXroX2NXr0aPz2228YP3486tWrh4cPH+Lo0aO4dOkSmjZtWmwNjRs3xsyZM7Fu3Tr06NEDjo6OGDx4MIYNG4ZatWoV+7ycnBykp6eX6DxtbGygp/fmcY+goCD4+fmhS5cu2LJlS4n2/TI/Pz9kZmbCyMgIAQEBWLx48WvPQeMElVlYWJgAIKKioordxtLSUjRp0kR6PHv2bPFy9y9dulQAEPfv3y92H1FRUQKACAsLK7SuXbt2AoBYvXp1kevatWsnPY6MjBQAxDvvvCMyMjKk9i1btggAYtmyZVKbq6urGDx48Bv3+braBg8eLFxdXaXH27dvFwDEvHnzVLb74IMPhEKhENevX5faAAgjIyOVtn/++UcAECtWrCh0rJeFho
YKAOLnn3+W2p4/fy68vb2FmZmZyrm7urqKrl27vnZ/Bf71r38JExMT8fz5cyGEECEhIcLd3V0IIcSqVauEnZ2dtO3
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"\n",
|
||
|
"# get all the jokes with a rating of at least 4.5\n",
|
||
|
"good_jokes = df[df['rating'] >= 4.5]['body'].values\n",
|
||
|
"\n",
|
||
|
"# tokenize the jokes\n",
|
||
|
"tokenized_good_jokes = [word_tokenize(joke) for joke in good_jokes]\n",
|
||
|
"\n",
|
||
|
"# remove stop words\n",
|
||
|
"filtered_good_jokes = [[word for word in joke if word.lower() not in stop_words] for joke in tokenized_good_jokes]\n",
|
||
|
"\n",
|
||
|
"# count the tokens left in each joke after stop-word removal (punctuation tokens are included)\n",
|
||
|
"word_good_counts = [len(joke) for joke in filtered_good_jokes]\n",
|
||
|
"\n",
|
||
|
"# plot the distribution of word counts\n",
|
||
|
"plt.hist(word_good_counts, bins=100)\n",
|
||
|
"plt.xlabel('Number of Words')\n",
|
||
|
"plt.ylabel('Frequency')\n",
|
||
|
"plt.title('Distribution of Word Counts in Jokes with ratings >= 4.5')\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 26,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAioAAAHHCAYAAACRAnNyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABvIUlEQVR4nO3dd3hUVf4/8PedSTKTOmmEhJaEbuhFIHQQpC2w69oQEF1XReGrorKArgK6CMjPBdcCK+6yrijgqigoolQphk7QEFAMoQgJmIT0PnN+f4QZM71n7iTv1/PMI7nl3HNuYu4n557zOZIQQoCIiIhIhhS+rgARERGRNQxUiIiISLYYqBAREZFsMVAhIiIi2WKgQkRERLLFQIWIiIhki4EKERERyRYDFSIiIpItBipEREQkWwxUiBrQ3r17IUkSPv74Y19XxSHXrl3DnXfeiZiYGEiShFWrVvm6Sk5LSkrCAw884Otq+AX9z+fevXt9XRUiAwYq1Oj85z//gSRJUKvVuHLlitn+4cOHo2vXrj6omf+ZM2cOvv76ayxYsADvv/8+xo4da3aMVqtFREQEJk+ebLZv5cqVkCQJM2bMMNv34osvQpIk/PTTT16pe2Nw4cIFSJJk+CgUCkRHR2PcuHFIS0tzudy3334b//nPfzxXUSIvCvB1BYi8paqqCsuWLcMbb7zh66r4rd27d2Py5Ml49tlnrR6jVCoxYMAAfPfdd2b7Dh48iICAABw8eNDivri4OHTs2NGjdW6MpkyZgvHjx0Or1eKnn37C22+/jREjRuDo0aPo1q2b0+W9/fbbiI2NNetpGjp0KCoqKhAUFOShmhO5jz0q1Gj17NkTa9euxdWrV31dlQZXVlbmkXKuX7+OyMhIu8cNHjwYeXl5OHPmjNH2gwcP4u6770ZWVhZyc3MN22tra3H48GEMGjTI7Tp6qq1y1rt3b0ybNg0zZszAkiVLsGHDBlRVVWH16tUevY5CoYBarYZCwUcDyQd/GqnReu6556DVarFs2TKbx+m71y11hUuShEWLFhm+XrRokeF1xbRp06DRaNCsWTO88MILEELg8uXLmDx5MiIiIhAfH4/XXnvN4jW1Wi2ee+45xMfHIzQ0FJMmTcLly5fNjjt8+DDGjh0LjUaDkJAQDBs2zKx3Ql+nzMxM3HfffYiKisLgwYNttvn8+fO46667EB0djZCQEAwYMABffvmlYb/+9ZkQAm+99Zbh1YM1+uvVr9v58+eRm5uL2bNnQ61WG+1LT09HWVmZUT13796NIUOGIDQ0FJGRkZg8ebJZ4GOrrUII/O1vf0OrVq0QEhKCESNG4PTp02Z1rampweLFi9GhQweo1WrExMRg8ODB2LFjh9X2HTt2DJIk4b333jPb9/XXX0OSJHzxxRcAgJKSEjz11FNISkqCSqVCXFwcRo8ejRMnTlgt31lDhgwBAGRlZRltX7duHUaOHIm4uDioVCqkpKSYBTNJSUk4ffo0vv32W8P3dfjw4QAsj1HRvyrNzMzEiBEjEBISgpYtW+LVV181q9fFixcxadIkhIaGIi4uzvDqkONeyB0MVKjRSk5Oxv333++VXpV77rkHOp0Oy5YtQ//+/fG3v/0Nq1atwujRo9GyZUssX74c7du3x7PPPot9+/aZnb9kyRJ8+eWXmDdvHp544gns2LEDo0aNQkVFheGY3bt3Y+jQoSguLsbChQvxyiuvoLCwECNHjsSRI0fMyrzrrrtQXl6OV155BQ8//LDVul+7dg0DBw7E119/jccffxxLlixBZWUlJk2ahM2bNwOoewXw/vvvAwBGjx6N999/3/C1JQMGDEBAQAAOHDhg2Hbw4EGEhobi1ltvRd++fY0CFf2/9UHGzp07MWbMGFy/fh2LFi3C008/je+++w6DBg3ChQsXHGrriy++iBdeeAE9evTAihUr0LZtW9x+++1mPS6LFi3C4sWLMWLECLz55pt4/vnn0aZNG5uBRN
++fdG2bVt89NFHZvs2bdqEqKgojBkzBgAwc+ZMrF69Gn/84x/x9ttv49lnn0VwcLBZ0OUO/T2Jiooy2r569WokJibiueeew2uvvYbWrVvj8ccfx1tvvWU4ZtWqVWjVqhU6d+5s+L4+//zzNq9348YNjB07Fj169MBrr72Gzp07Y968efjqq68Mx5SVlWHkyJHYuXMnnnjiCTz//PP47rvvMG/ePI+1m5ooQdTIrFu3TgAQR48eFVlZWSIgIEA88cQThv3Dhg0TXbp0MXydnZ0tAIh169aZlQVALFy40PD1woULBQDxyCOPGLbV1taKVq1aCUmSxLJlywzbb9y4IYKDg8WMGTMM2/bs2SMAiJYtW4ri4mLD9o8++kgAEK+//roQQgidTic6dOggxowZI3Q6neG48vJykZycLEaPHm1WpylTpjh0f5566ikBQOzfv9+wraSkRCQnJ4ukpCSh1WqN2j9r1iyHyr311ltFu3btDF8/+uijYsSIEUIIIf7yl7+IW2+91bDvzjvvFCEhIaKmpkYIIUTPnj1FXFycyM/PNxxz6tQpoVAoxP3332+3rdevXxdBQUFiwoQJRvfrueeeEwCMvgc9evQQEyZMcKhN9S1YsEAEBgaKgoICw7aqqioRGRkp/vSnPxm2aTQah++ZPfqfzcWLF4tff/1V5Obmiv3794tbb71VABD/+9//jI4vLy83K2PMmDGibdu2Rtu6dOkihg0bZnas/udzz549hm3Dhg0TAMR///tfw7aqqioRHx8v/vjHPxq2vfbaawKA+OyzzwzbKioqROfOnc3KJHIGe1SoUWvbti2mT5+Od955Bzk5OR4r989//rPh30qlEn379oUQAg899JBhe2RkJDp16oTz58+bnX///fcjPDzc8PWdd96JhIQEbNu2DUDdq5Fz587hvvvuQ35+PvLy8pCXl4eysjLcdttt2LdvH3Q6nVGZM2fOdKju27ZtQ79+/Yxeu4SFheGRRx7BhQsXkJmZ6dhNMDF48GCjsSgHDx7EwIEDAQCDBg3CyZMnUV5ebtjXv39/BAQEICcnB+np6XjggQcQHR1tKK979+4YPXq04Z7YauvOnTtRXV2N//u//zN6RfXUU0+ZnRsZGYnTp0/j3LlzTrXvnnvuQU1NDT799FPDtm+++QaFhYW45557jMo/fPiwR3vxFi5ciGbNmiE+Ph5DhgzBmTNn8Nprr+HOO+80Oi44ONjw76KiIuTl5WHYsGE4f/48ioqKXL5+WFgYpk2bZvg6KCgI/fr1M/rZ3r59O1q2bIlJkyYZtqnVapu9e0SOYKBCjd5f//pX1NbW2h2r4ow2bdoYfa3RaKBWqxEbG2u2/caNG2bnd+jQwehrSZLQvn17Q5e+/iE6Y8YMNGvWzOjz7rvvoqqqyuzBk5yc7FDdL168iE6dOpltv+WWWwz7XVF/nEphYSFOnz5tGCw7cOBA1NbW4siRI8jOzkZOTo7heP31rNVJH6DVZ9pWfRmm97VZs2Zmr0deeuklFBYWomPHjujWrRvmzp2L77//3m77evTogc6dO2PTpk2GbZs2bUJsbCxGjhxp2Pbqq68iIyMDrVu3Rr9+/bBo0SKLwaozHnnkEezYsQNbt27FnDlzUFFRAa1Wa3bcwYMHMWrUKMM4n2bNmuG5554DALcClVatWpmNUYqKijL62b548SLatWtndlz79u1dvi4RwOnJ1AS0bdsW06ZNwzvvvIP58+eb7bc2SNTSg0BPqVQ6tA2oG+TpLH1vyYoVK9CzZ0+Lx4SFhRl9Xf+vaV/QBx4HDhxASEgIACA1NRUAEBsbiw4dOuDAgQOGQcP2Bvza4k5bhw4diqysLHz++ef45ptv8O6772LlypVYs2aNUU+ZJffccw+WLFmCvLw8hIeHY8uWLZgyZQoCAn77VXr33XdjyJAh2Lx5M7755husWLECy5cvx6effopx48a5VOcOHTpg1KhRAIDf/e53UCqVmD9/PkaMGIG+ff
sCqBtYe9ttt6Fz5874+9//jtatWyMoKAjbtm3DypUrzXrgnOHJn20iZ7FHhZoEfa/K8uXLzfbp/+IuLCw02u5qz4I
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# joke length vs rating\n",
|
||
|
"plt.scatter(word_counts, ratings)\n",
|
||
|
"plt.xlabel('Number of Words')\n",
|
||
|
"plt.ylabel('Rating')\n",
|
||
|
"plt.title('Number of Words vs. Rating')\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.10.4"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|