ANLP_WS24_CA1/data_explo_wocka.ipynb

423 lines
86 KiB
Plaintext
Raw Normal View History

2024-11-20 11:52:27 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fetch the NLTK resources used further down: the Punkt tokenizer models\n",
"# needed by word_tokenize and the English stop-word list.\n",
"# Newer NLTK releases (3.9+) look up 'punkt_tab' instead of 'punkt';\n",
"# both are requested so the notebook runs on either version.\n",
"for resource in ('punkt', 'punkt_tab', 'stopwords'):\n",
"    nltk.download(resource)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Wocka joke dataset"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Load the raw joke records from the JSON file.\n",
"# JSON text is UTF-8 by specification, so the encoding is pinned here instead\n",
"# of relying on the platform default, which differs between operating systems.\n",
"data_path = './data/wocka.json'\n",
"with open(data_path, encoding='utf-8') as f:\n",
"    data = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>category</th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
" <td>Animal</td>\n",
" <td>1</td>\n",
" <td>Cow With No Legs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What do you call a cow jumping over a barbed w...</td>\n",
" <td>Animal</td>\n",
" <td>2</td>\n",
" <td>Jumping Cow</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
" <td>Other / Misc</td>\n",
" <td>4</td>\n",
" <td>Black, White and Red</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
" <td>Bar</td>\n",
" <td>5</td>\n",
" <td>Guy in a Bar</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
" <td>One Liners</td>\n",
" <td>6</td>\n",
" <td>Progress</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body category id \\\n",
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
"\n",
" title \n",
"0 Cow With No Legs \n",
"1 Jumping Cow \n",
"2 Black, White and Red \n",
"3 Guy in a Bar \n",
"4 Progress "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Put the loaded records into a table and preview the first five rows\n",
"df = pd.DataFrame(data=data)\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Animal\n",
"At Work\n",
"Bar\n",
"Blond\n",
"Blonde\n",
"Children\n",
"College\n",
"Gross\n",
"Insults\n",
"Knock-Knock\n",
"Lawyer\n",
"Lightbulb\n",
"Medical\n",
"Men / Women\n",
"News / Politics\n",
"One Liners\n",
"Other / Misc\n",
"Puns\n",
"Redneck\n",
"Religious\n",
"Sports\n",
"Tech\n",
"Yo Mama\n",
"Yo Momma\n",
"amount of categories: 24\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAIcCAYAAAAXPbHAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAACRRElEQVR4nOzdd1RT2fc28CcU6UUQRRQEuyiWEXvHgtjL2AY79t7Frw0dxzaKfcQO9t67omDvgxUbImABC4MIKgqc9w9/3NcIahISQPN81spa5t6bkx2EZOeUfWRCCAEiIiIiLaaT3QEQERERZTcmRERERKT1mBARERGR1mNCRERERFqPCRERERFpPSZEREREpPWYEBEREZHWY0JEREREWo8JEREREWk9JkREJPHx8YFMJsOrV6+yOxQioizFhIjoF+Tv7w+ZTIYrV65kdygEYPr06di9e3d2h0FE38GEiIhIw5gQEeV8TIiISCslJiZmdwg5Dn8mpM2YEBFpiRMnTqBWrVowMTGBpaUlWrZsidDQ0B8+LiIiAkWLFkWZMmUQExMDAIiLi8OwYcNgb28PAwMDFC1aFLNmzUJqaqrcYzdv3oyKFSvCzMwM5ubmcHFxwYIFC777fI8fP4ZMJsOcOXMwb948FCpUCEZGRqhTpw5u3bqV7vq7d+/i999/h5WVFQwNDeHq6oq9e/fKXZM2hBgcHIwBAwYgb968KFiw4Hfj+PDhA3x8fFC8eHEYGhoif/78aNOmDcLCwqRr5syZg+rVq8Pa2hpGRkaoWLEitm/fLteOTCZDYmIiAgICIJPJIJPJ0L17d+n806dP0bNnT+TLlw8GBgYoXbo0Vq9enS6eiIgItGjRAiYmJsibNy+GDx+OI0eOQCaTISgoSO7abdu2oWLFijAyMkKePHnQuXNnPH36VO6a7t27w9TUFGFhYWjSpAnMzMzg6emJyZMnQ19fHy9fvkwXQ58+fWBpaYkPHz5892dH9DPSy+4AiEjzjh8/Dg8PDxQuXBg+Pj54//49Fi1ahBo1auDatWtwdHTM8HFhYWFwc3ODlZUVjh07hjx58uDdu3eoU6cOnj59ir59+8LBwQHnzp3DuHHj8Pz5c8yfPx8AcOzYMXTq1An169fHrFmzAAChoaE4e/Yshg4d+sOY165di7dv32LgwIH48OEDFixYADc3N9y8eRP58uUDANy+fRs1atRAgQIF4O3tDRMTE2zduhWtWrXCjh070Lp1a7k2BwwYABsbG0yaNOm7vSEpKSlo1qwZAgMD0bFjRwwdOhRv377FsWPHcOvWLRQpUgQAsGDBArRo0QKenp74+PEjNm/ejHbt2mH//v1o2rQpAGDdunXo1asXKleujD59+gCA9PiYmBhUrVoVMpkMgwYNgo2NDQ4dOgQvLy/Ex8dj2LBhAD733Li5ueH58+cYOnQobG1tsXHjRpw8eTJd7P7+/ujRowcqVaqEGTNmICYmBgsWLMDZs2fx77//wtLSUro2OTkZ7u7uqFmzJubMmQNjY2NUq1YNU6dOxZYtWzBo0CDp2o8fP2L79u1o27YtDA0Nf/j/R/TTEUT0y1mzZo0AIC5fviyEEKJ8+fIib9684vXr19I1169fFzo6OqJr167SscmTJwsA4uXLlyI0NFTY2dmJSpUqidjYWOmaP//8U5iYmIj79+/LPae3t7fQ1dUVkZGRQgghhg4dKszNzUVycrJSsYeHhwsAwsjISDx58kQ6fvHiRQFADB8+XDpWv3594eLiIj58+CAdS01NFdWrVxfFihVL9/OoWbOmQvGsXr1aABC+vr7pzqWmpkr/fvfundy5jx8/ijJlygg3Nze54yYmJqJbt27p2vLy8hL58+cXr169kjvesWNHYWFhIbU/d+5cAUDs3r1buub9+/eiZMmSAoA4efKk9Px58+YVZcqUEe/fv5eu3b9/vwAgJk2aJB3r1q2bACC8vb3TxVWtWjVRpUoVuWM7d+6Uey
6iXw2HzIh+cc+fP0dISAi6d+8OKysr6XjZsmXRsGFDHDx4MN1jbt26hTp16sDR0RHHjx9H7ty5pXPbtm1DrVq1kDt3brx69Uq6NWjQACkpKTh16hQAwNLSEomJiTh27JhKcbdq1QoFChSQ7leuXBlVqlSR4o2NjcWJEyfQvn17vH37Vorj9evXcHd3x4MHD9INE/Xu3Ru6uro/fO4dO3YgT548GDx4cLpzMplM+reRkZH07//++w9v3rxBrVq1cO3atR8+hxACO3bsQPPmzSGEkPtZuru7482bN1I7hw8fRoECBdCiRQvp8YaGhujdu7dcm1euXMGLFy8wYMAAuV6cpk2bomTJkjhw4EC6OPr375/uWNeuXXHx4kW54cENGzbA3t4ederU+eFrI/oZMSEi+sVFREQAAEqUKJHuXKlSpfDq1at0w0fNmzeHmZkZjhw5AnNzc7lzDx48wOHDh2FjYyN3a9CgAQDgxYsXAD4PTxUvXhweHh4oWLAgevbsicOHDyscd7FixdIdK168OB4/fgwAePjwIYQQmDhxYrpYJk+eLBdLGicnJ4WeOywsDCVKlICe3vdnFezfvx9Vq1aFoaEhrKysYGNjg6VLl+LNmzc/fI6XL18iLi4Oy5cvTxd/jx495OKPiIhAkSJF5JIxAChatKjc/e/9X5csWVI6n0ZPTy/DuVQdOnSAgYEBNmzYAAB48+YN9u/fD09Pz3QxEP0qOIeIiNJp27YtAgICsGHDBvTt21fuXGpqKho2bIgxY8Zk+NjixYsDAPLmzYuQkBAcOXIEhw4dwqFDh7BmzRp07doVAQEBmY4xbQL3qFGj4O7unuE1XycMX/boZNbp06fRokUL1K5dG//88w/y588PfX19rFmzBhs3bvzh49Pi79y5M7p165bhNWXLllVbvBkxMDCAjk7678W5c+dGs2bNsGHDBkyaNAnbt29HUlISOnfurNF4iLITEyKiX1yhQoUAAPfu3Ut37u7du8iTJw9MTEzkjv/999/Q09PDgAEDYGZmhj/++EM6V6RIESQkJEg9Qt+TK1cuNG/eHM2bN0dqaioGDBiAZcuWYeLEiemSla89ePAg3bH79+9LE8ALFy4MANDX11coFmUUKVIEFy9exKdPn6Cvr5/hNTt27IChoSGOHDkCAwMD6fiaNWvSXZtRr4qNjQ3MzMyQkpLyw/gLFSqEO3fuQAgh19bDhw/TXQd8/r92c3OTO3fv3j3pvCK6du2Kli1b4vLly9iwYQMqVKiA0qVLK/x4op8Nh8yIfnH58+dH+fLlERAQgLi4OOn4rVu3cPToUTRp0iTdY2QyGZYvX47ff/8d3bp1k1vG3r59e5w/fx5HjhxJ97i4uDgkJycDAF6/fi13TkdHR+rxSEpK+mHcu3fvlpsDdOnSJVy8eBEeHh4APvdA1a1bF8uWLcPz58/TPT6jZeOKatu2LV69eoXFixenOyeEAADo6upCJpMhJSVFOvf48eMMCzCamJjI/ezTHt+2bVvs2LEjw3ICX8bv7u6Op0+fyv0/fPjwAStWrJB7jKurK/LmzQs/Pz+5n/GhQ4cQGhoqrXxThIeHB/LkyYNZs2YhODiYvUP0y2MPEZEW+Pvvv+Hh4YFq1arBy8tLWnZvYWEBHx+fDB+jo6OD9evXo1WrVmjfvj0OHjwINzc3jB49Gnv37kWzZs3QvXt3VKxYEYmJibh58ya2b9+Ox48fI0+ePOjVqxdiY2Ph5uaGggULIiIiAosWLUL58uVRqlSpH8ZctGhR1KxZE/3790dSUhLmz58Pa2truaG6JUuWoGbNmnBxcUHv3r1RuHBhxMTE4Pz583jy5AmuX7+u0s+ra9euWLt2LUaMGIFLly6hVq1aSExMxPHjxzFgwAC0bNkSTZs2ha+vLxo3bow//vgDL168wJIlS1C0aFHcuHFDrr2KFSvi+PHj8PX1hZ2dHZycnFClShXMnDkTJ0+eRJUqVdC7d284OzsjNjYW165dw/
HjxxEbGwsA6Nu3LxYvXoxOnTph6NChyJ8/PzZs2CBNnE7rNdLX18esWbPQo0cP1KlTB506dZKW3Ts6OmL48OEK/wz
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# List every distinct category label in alphabetical order\n",
"categories = sorted(df['category'].unique())\n",
"for label in categories:\n",
"    print(label)\n",
"print('amount of categories:', len(categories))\n",
"\n",
"# Bar chart: number of jokes in each category (most frequent first)\n",
"category_counts = df['category'].value_counts()\n",
"ax = category_counts.plot(kind='bar')\n",
"ax.set_title('Jokes per category')\n",
"ax.set_ylabel('Number of jokes')\n",
"ax.set_xlabel('Category')\n",
"plt.xticks(rotation=90)\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABSRElEQVR4nO3deVxUVf8H8M+wzIDAgIhsIYtiKu5L4STuKBmaJpaaKSpWGjwK7laPmpXr41qprWJPmkmPmmmiiKCpuKG4ixuKJoulMODCen5/+OL+vIIKCA5wP+/X675ezTlnzv2eGYFPd+69oxJCCBAREREpmJGhCyAiIiIyNAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiJSjJkzZ0KlUj2XfXXp0gVdunSRHsfGxkKlUuHXX399LvsfPnw43N3dn8u+yis7OxujRo2Co6MjVCoVQkNDDV1SmT36PlNxVek1Kvod8Pfffxu6FKqCGIioWgoPD4dKpZI2MzMzODs7w8/PD8uWLUNWVlaF7OfGjRuYOXMmEhISKmS+ilSVayuN2bNnIzw8HGPGjMF///tfDB06tMRxXl5eaNmyZbH2jRs3QqVSoXPnzsX6fvjhB6hUKuzYsaPC6y6vgoICrFq1Cl26dIGtrS00Gg3c3d0xYsQIHDlyxNDlAQDOnDmDmTNn4sqVK4Yu5bGKfvarymtGNYeJoQsgehazZs2Ch4cH8vLykJqaitjYWISGhmLRokXYvHkzWrRoIY39+OOPMXXq1DLNf+PGDXzyySdwd3dHq1atSv285/GH+Em1ffvttygsLKz0Gp7Frl270L59e8yYMeOJ43x8fPD9998jMzMT1tbWUvu+fftgYmKCw4cPIy8vD6amprI+Y2Nj6HS6Squ/LO7du4f+/fsjMjISnTp1wocffghbW1tcuXIF69evx+rVq5GcnAwXFxeD1nnmzBl88skn6NKlS4UdYaxKoZToSXiEiKq1Xr164Z133sGIESMwbdo0bN++HTt37kR6ejpef/113Lt3TxprYmICMzOzSq3n7t27AAC1Wg21Wl2p+3oSU1NTaDQag+2/NNLT02FjY/PUcT4+PigsLMT+/ftl7fv27cNbb72Fe/fuIT4+Xta3d+9etGjRAlZWVs9U4507d57p+UUmTZqEyMhILF68GLt378bEiRMxcuRIzJo1C6dPn8b8+fMrZD9VkaF/FohKi4GIapxu3brh3//+N65evYqffvpJai/pHKKoqCj4+PjAxsYGlpaWaNSoET788EMAD877eemllwAAI0aMkD6eCw8PB/Dg3IhmzZohPj4enTp1Qq1ataTnPu68iYKCAnz44YdwdHSEhYUFXn/9dVy7dk02xt3dHcOHDy/23IfnfFptJZ1DdOfOHUyYMAH16tWDRqNBo0aN8J///AdCCNk4lUqFkJAQbNq0Cc2aNYNGo0HTpk0RGRlZ8gv+iPT0dAQFBcHBwQFmZmZo2bIlVq9eLfUXnU+VlJSErVu3SrU/7mMaHx8fAA8CUJH79+/j6NGj6N+/P+rXry/ru3nzJs6fPy89DwCOHTuGXr16QavVwtLSEt27d8eBAwdk+yn6KGb37t344IMPYG9vLzti880336BBgwYwNzfHyy+/jD///LNUr8f169fx9ddfo0ePHiWeJ2VsbIyJEyfK9lWaeh93TlzROh5+Pd3d3dG7d2/s3bsXL7/8MszMzFC/fn38+OOPsue9+eabAICuXbtK70tsbCwA4MiRI/Dz84OdnR3Mzc3h4eGBkSNHPnX9jzufbv369fj888/h4uICMzMzdO/eHRcvXnzqfI+za9cudOzYERYWFrCxsUHfvn1x9uzZpz7v6tWr8PT0RLNmzZCWlgYAyMjIQGhoqPSz4unpiXnz5hU76rpu3Tq0bdsWVlZW0Gq1aN68OZYuXVruNZBh8SMzqpGGDh
2KDz/8EDt27MC7775b4pjTp0+jd+/eaNGiBWbNmgWNRoOLFy9Kf1ybNGmCWbNmYfr06XjvvffQsWNHAMArr7wizfHPP/+gV69eGDRoEN555x04ODg8sa7PP/8cKpUKU6ZMQXp6OpYsWQJfX18kJCTA3Ny81OsrTW0PE0Lg9ddfR0xMDIKCgtCqVSts374dkyZNwl9//YXFixfLxu/duxcbNmzABx98ACsrKyxbtgwBAQFITk5GnTp1HlvXvXv30KVLF1y8eBEhISHw8PBAREQEhg8fjoyMDIwbNw5NmjTBf//7X4SFhcHFxQUTJkwAANStW7fEOevXrw9nZ2fs3btXajt8+DByc3Pxyiuv4JVXXsG+ffukeYqOJBUFotOnT6Njx47QarWYPHkyTE1N8fXXX6NLly7YvXs3vL29Zfv74IMPULduXUyfPl06QvT999/j/fffxyuvvILQ0FBcvnwZr7/+OmxtbVGvXr3Hvh4AsG3bNuTn5z/2HKlHlbXe0rp48SIGDBiAoKAgBAYG4ocffsDw4cPRtm1bNG3aFJ06dcLYsWOxbNkyfPjhh2jSpAmAB//W0tPT0bNnT9StWxdTp06FjY0Nrly5gg0bNpSrFgCYO3cujIyMMHHiRGRmZmL+/PkYMmQIDh48WOa5du7ciV69eqF+/fqYOXMm7t27hy+++AIdOnTA0aNHH/vx36VLl9CtWzfY2toiKioKdnZ2uHv3Ljp37oy//voL77//PlxdXbF//35MmzYNKSkpWLJkCYAH/zM1ePBgdO/eHfPmzQMAnD17Fvv27cO4cePK+7KQIQmiamjVqlUCgDh8+PBjx1hbW4vWrVtLj2fMmCEe/ie/ePFiAUDcvHnzsXMcPnxYABCrVq0q1te5c2cBQKxcubLEvs6dO0uPY2JiBADxwgsvCL1eL7WvX79eABBLly6V2tzc3ERgYOBT53xSbYGBgcLNzU16vGnTJgFAfPbZZ7JxAwYMECqVSly8eFFqAyDUarWs7fjx4wKA+OKLL4rt62FLliwRAMRPP/0kteXm5gqdTicsLS1la3dzcxP+/v5PnK/Im2++KczNzUVubq4QQog5c+YIDw8PIYQQy5cvF/b29tLYiRMnCgDir7/+EkII0a9fP6FWq8WlS5ekMTdu3BBWVlaiU6dOUlvRvykfHx+Rn58vq9/e3l60atVK5OTkSO3ffPONACB7T0oSFhYmAIhjx46Vaq2lrffRf8+PriMpKUlqc3NzEwDEnj17pLb09HSh0WjEhAkTpLaIiAgBQMTExMjm3Lhx41N/3h7ncT8LTZo0kb2eS5cuFQDEyZMnnzhfST/7rVq1Evb29uKff/6R2o4fPy6MjIzEsGHDpLai1+zmzZvi7NmzwtnZWbz00kvi1q1b0phPP/1UWFhYiPPnz8v2O3XqVGFsbCySk5OFEEKMGzdOaLVa2b8Vqt74kRnVWJaWlk+82qzo/JXffvut3CcgazQajBgxotTjhw0bJjuvZcCAAXBycsIff/xRrv2X1h9//AFjY2OMHTtW1j5hwgQIIbBt2zZZu6+vLxo0aCA9btGiBbRaLS5fvvzU/Tg6OmLw4MFSm6mpKcaOHYvs7Gzs3r27XPX7+PjIzhXat2+fdDSsQ4cOSE9Px4ULF6Q+Dw8PODs7o6CgADt27EC/fv1Qv359aT4nJye8/fbb2Lt3L/R6vWxf7777LoyNjaXHR44cQXp6OkaPHi07F2b48OGyk7wfp2j+0pzPVJ56S8vLy0s6kgg8OCLXqFGjp76nwP//rGzZsgV5eXnl2v+jRowYIXs9i2orTT0PS0lJQUJCAoYPHw5bW1upvUWLFujRo0eJP1unTp1C586d4e7ujp07d6J27dpSX0REBDp27IjatWvj77//ljZfX18UFBRgz549AB68Jnfu3EFUVFSZ6qWqi4GIaqzs7Own/hEaOHAgOnTogFGjRsHBwQGDBg3C+vXryxSOXnjhhTKdMNqwYUPZY5VKBU
9Pz0q/zPnq1atwdnYu9noUfSxy9epVWburq2uxOWrXro3bt28/dT8NGzaEkZH8V8vj9lNaD59HJITA/v370aFDBwB
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# English stop words used for filtering\n",
"stop_words = set(stopwords.words('english'))\n",
"\n",
"# get all the jokes\n",
"jokes = df['body'].values\n",
"\n",
"# tokenize the jokes\n",
"tokenized_jokes = [word_tokenize(joke) for joke in jokes]\n",
"\n",
"# keep only real words: drop stop words and tokens that contain no letter or\n",
"# digit (e.g. '.', ',', '``', '--'), which would otherwise be counted as words\n",
"filtered_jokes = [\n",
"    [\n",
"        word\n",
"        for word in joke\n",
"        if word.lower() not in stop_words and any(ch.isalnum() for ch in word)\n",
"    ]\n",
"    for joke in tokenized_jokes\n",
"]\n",
"\n",
"# count the number of remaining words in each joke\n",
"word_counts = [len(joke) for joke in filtered_jokes]\n",
"\n",
"# plot the distribution of word counts\n",
"plt.hist(word_counts, bins=100)\n",
"plt.xlabel('Number of Words')\n",
"plt.ylabel('Frequency')\n",
"plt.title('Distribution of Word Counts in Jokes')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
". 71849\n",
", 61789\n",
"'' 25674\n",
"`` 23253\n",
"? 14285\n",
"! 10304\n",
": 10000\n",
"'s 9615\n",
"n't 6959\n",
"-- 5676\n",
"said 4701\n",
") 3969\n",
"man 3745\n",
"one 3715\n",
"* 3580\n",
"- 3213\n",
"says 2901\n",
"get 2802\n",
"... 2630\n",
"( 2429\n",
"like 2314\n",
"would 2205\n",
"know 2083\n",
"'m 2081\n",
"asked 2054\n",
"' 1964\n",
"back 1959\n",
"'re 1951\n",
"time 1843\n",
"go 1835\n",
"day 1812\n",
"got 1661\n",
"first 1497\n",
"people 1455\n",
"say 1431\n",
"could 1419\n",
"two 1415\n",
"'ll 1373\n",
"went 1346\n",
"take 1301\n",
"little 1298\n",
"see 1289\n",
"; 1260\n",
"wife 1238\n",
"going 1207\n",
"old 1187\n",
"'ve 1176\n",
"want 1175\n",
"woman 1156\n",
"car 1156\n",
"replied 1142\n",
"Well 1139\n",
"make 1126\n",
"guy 1122\n",
"blonde 1117\n",
"think 1107\n",
"home 1086\n",
"$ 1074\n",
"good 1054\n",
"Q 1044\n",
"next 1039\n",
"around 1025\n",
"One 1024\n",
"2 1016\n",
"right 976\n",
"ca 975\n",
"asks 967\n",
"way 959\n",
"Yo 953\n",
"new 952\n",
"tell 948\n",
"put 947\n",
"call 942\n",
"'d 934\n",
"1 933\n",
"came 920\n",
"3 892\n",
"told 883\n",
"boy 877\n",
"come 834\n",
"God 826\n",
"never 818\n",
"room 816\n",
"name 816\n",
"door 808\n",
"really 806\n",
"night 803\n",
"house 800\n",
"goes 794\n",
"look 767\n",
"• 767\n",
"love 758\n",
"give 755\n",
"dog 754\n",
"work 752\n",
"still 734\n",
"called 724\n",
"years 719\n",
"last 712\n",
"Oh 712\n"
]
}
],
"source": [
"# count how often each word occurs, ignoring case so that e.g. 'One' and\n",
"# 'one' are tallied together, and skipping tokens without any letter or digit\n",
"# (punctuation such as '.', ',' or '--' is not a word)\n",
"word_freq = Counter(\n",
"    word.lower()\n",
"    for joke in filtered_jokes\n",
"    for word in joke\n",
"    if any(ch.isalnum() for ch in word)\n",
")\n",
"\n",
"# get the most common words\n",
"most_common = word_freq.most_common(100)\n",
"\n",
"for word, count in most_common:\n",
"    print(word, count)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}