423 lines
86 KiB
Plaintext
423 lines
86 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import json\n",
|
||
|
"import os\n",
|
||
|
"import numpy as np\n",
|
||
|
"import pandas as pd\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import nltk\n",
|
||
|
"from nltk.corpus import stopwords\n",
|
||
|
"from nltk.tokenize import word_tokenize\n",
|
||
|
"from collections import Counter"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stderr",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"[nltk_data] Downloading package punkt to\n",
|
||
|
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
||
|
"[nltk_data] Package punkt is already up-to-date!\n",
|
||
|
"[nltk_data] Downloading package stopwords to\n",
|
||
|
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
||
|
"[nltk_data] Package stopwords is already up-to-date!\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"True"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 8,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Fetch the NLTK resources this notebook relies on: 'punkt' for word_tokenize\n",
|
||
|
"# and 'stopwords' for stopwords.words('english').\n",
|
||
|
"# quiet=True keeps the download log, which echoes the local nltk_data path,\n",
|
||
|
"# out of the saved cell outputs; the loop also avoids displaying a stray True.\n",
|
||
|
"for resource in ('punkt', 'stopwords'):\n",
|
||
|
"    nltk.download(resource, quiet=True)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Wocka joke dataset"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 9,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Load the joke records from the JSON file.\n",
|
||
|
"# The encoding is passed explicitly because open() otherwise falls back to the\n",
|
||
|
"# platform locale (e.g. cp1252 on Windows), which makes the result machine-dependent.\n",
|
||
|
"# NOTE(review): assumes the file is UTF-8, the standard JSON encoding - confirm.\n",
|
||
|
"data_path = './data/wocka.json'\n",
|
||
|
"with open(data_path, encoding='utf-8') as f:\n",
|
||
|
"    data = json.load(f)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 10,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>body</th>\n",
|
||
|
" <th>category</th>\n",
|
||
|
" <th>id</th>\n",
|
||
|
" <th>title</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>What do you call a cow with no legs?\\r\\n\\r\\nGr...</td>\n",
|
||
|
" <td>Animal</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>Cow With No Legs</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>What do you call a cow jumping over a barbed w...</td>\n",
|
||
|
" <td>Animal</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>Jumping Cow</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>What's black and white and red all over?\\r\\n\\r...</td>\n",
|
||
|
" <td>Other / Misc</td>\n",
|
||
|
" <td>4</td>\n",
|
||
|
" <td>Black, White and Red</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>So, this guy walks into a bar.\\r\\n\\r\\nAnd says...</td>\n",
|
||
|
" <td>Bar</td>\n",
|
||
|
" <td>5</td>\n",
|
||
|
" <td>Guy in a Bar</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>If the opposite of pro is con, isn't the oppos...</td>\n",
|
||
|
" <td>One Liners</td>\n",
|
||
|
" <td>6</td>\n",
|
||
|
" <td>Progress</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" body category id \\\n",
|
||
|
"0 What do you call a cow with no legs?\\r\\n\\r\\nGr... Animal 1 \n",
|
||
|
"1 What do you call a cow jumping over a barbed w... Animal 2 \n",
|
||
|
"2 What's black and white and red all over?\\r\\n\\r... Other / Misc 4 \n",
|
||
|
"3 So, this guy walks into a bar.\\r\\n\\r\\nAnd says... Bar 5 \n",
|
||
|
"4 If the opposite of pro is con, isn't the oppos... One Liners 6 \n",
|
||
|
"\n",
|
||
|
" title \n",
|
||
|
"0 Cow With No Legs \n",
|
||
|
"1 Jumping Cow \n",
|
||
|
"2 Black, White and Red \n",
|
||
|
"3 Guy in a Bar \n",
|
||
|
"4 Progress "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 10,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Wrap the loaded records in a pandas DataFrame and preview the first five rows\n",
|
||
|
"df = pd.DataFrame(data=data)\n",
|
||
|
"df.head(n=5)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 11,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Animal\n",
|
||
|
"At Work\n",
|
||
|
"Bar\n",
|
||
|
"Blond\n",
|
||
|
"Blonde\n",
|
||
|
"Children\n",
|
||
|
"College\n",
|
||
|
"Gross\n",
|
||
|
"Insults\n",
|
||
|
"Knock-Knock\n",
|
||
|
"Lawyer\n",
|
||
|
"Lightbulb\n",
|
||
|
"Medical\n",
|
||
|
"Men / Women\n",
|
||
|
"News / Politics\n",
|
||
|
"One Liners\n",
|
||
|
"Other / Misc\n",
|
||
|
"Puns\n",
|
||
|
"Redneck\n",
|
||
|
"Religious\n",
|
||
|
"Sports\n",
|
||
|
"Tech\n",
|
||
|
"Yo Mama\n",
|
||
|
"Yo Momma\n",
|
||
|
"amount of categories: 24\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAIcCAYAAAAXPbHAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAACRRElEQVR4nOzdd1RT2fc28CcU6UUQRRQEuyiWEXvHgtjL2AY79t7Frw0dxzaKfcQO9t67omDvgxUbImABC4MIKgqc9w9/3NcIahISQPN81spa5t6bkx2EZOeUfWRCCAEiIiIiLaaT3QEQERERZTcmRERERKT1mBARERGR1mNCRERERFqPCRERERFpPSZEREREpPWYEBEREZHWY0JEREREWo8JEREREWk9JkREJPHx8YFMJsOrV6+yOxQioizFhIjoF+Tv7w+ZTIYrV65kdygEYPr06di9e3d2h0FE38GEiIhIw5gQEeV8TIiISCslJiZmdwg5Dn8mpM2YEBFpiRMnTqBWrVowMTGBpaUlWrZsidDQ0B8+LiIiAkWLFkWZMmUQExMDAIiLi8OwYcNgb28PAwMDFC1aFLNmzUJqaqrcYzdv3oyKFSvCzMwM5ubmcHFxwYIFC777fI8fP4ZMJsOcOXMwb948FCpUCEZGRqhTpw5u3bqV7vq7d+/i999/h5WVFQwNDeHq6oq9e/fKXZM2hBgcHIwBAwYgb968KFiw4Hfj+PDhA3x8fFC8eHEYGhoif/78aNOmDcLCwqRr5syZg+rVq8Pa2hpGRkaoWLEitm/fLteOTCZDYmIiAgICIJPJIJPJ0L17d+n806dP0bNnT+TLlw8GBgYoXbo0Vq9enS6eiIgItGjRAiYmJsibNy+GDx+OI0eOQCaTISgoSO7abdu2oWLFijAyMkKePHnQuXNnPH36VO6a7t27w9TUFGFhYWjSpAnMzMzg6emJyZMnQ19fHy9fvkwXQ58+fWBpaYkPHz5892dH9DPSy+4AiEjzjh8/Dg8PDxQuXBg+Pj54//49Fi1ahBo1auDatWtwdHTM8HFhYWFwc3ODlZUVjh07hjx58uDdu3eoU6cOnj59ir59+8LBwQHnzp3DuHHj8Pz5c8yfPx8AcOzYMXTq1An169fHrFmzAAChoaE4e/Yshg4d+sOY165di7dv32LgwIH48OEDFixYADc3N9y8eRP58uUDANy+fRs1atRAgQIF4O3tDRMTE2zduhWtWrXCjh070Lp1a7k2BwwYABsbG0yaNOm7vSEpKSlo1qwZAgMD0bFjRwwdOhRv377FsWPHcOvWLRQpUgQAsGDBArRo0QKenp74+PEjNm/ejHbt2mH//v1o2rQpAGDdunXo1asXKleujD59+gCA9PiYmBhUrVoVMpkMgwYNgo2NDQ4dOgQvLy/Ex8dj2LBhAD733Li5ueH58+cYOnQobG1tsXHjRpw8eTJd7P7+/ujRowcqVaqEGTNmICYmBgsWLMDZs2fx77//wtLSUro2OTkZ7u7uqFmzJubMmQNjY2NUq1YNU6dOxZYtWzBo0CDp2o8fP2L79u1o27YtDA0Nf/j/R/TTEUT0y1mzZo0AIC5fviyEEKJ8+fIib9684vXr19I1169fFzo6OqJr167SscmTJwsA4uXLlyI0NFTY2dmJSpUqidjYWOmaP//8U5iYmIj79+/LPae3t7fQ1dUVkZGRQgghhg4dKszNzUVycrJSsYeHhwsAwsjISDx58kQ6fvHiRQFADB8+XDpWv3594eLiIj58+CAdS01NFdWrVxfFihVL9/OoWbOmQvGsXr1aABC+vr7pzqWmpkr/fvfundy5jx8/ijJlygg3Nze54yYmJqJbt27p2vLy8hL58+cXr169kjvesWNHYWFhIbU/d+5cAUDs3r1buub9+/eiZMmSAoA4efKk9Px58+YVZcqUEe/fv5eu3b9/vwAgJk2aJB3r1q2bACC8vb3TxVWtWjVRpUoVuWM7d+6Uey
6iXw2HzIh+cc+fP0dISAi6d+8OKysr6XjZsmXRsGFDHDx4MN1jbt26hTp16sDR0RHHjx9H7ty5pXPbtm1DrVq1kDt3brx69Uq6NWjQACkpKTh16hQAwNLSEomJiTh27JhKcbdq1QoFChSQ7leuXBlVqlSR4o2NjcWJEyfQvn17vH37Vorj9evXcHd3x4MHD9INE/Xu3Ru6uro/fO4dO3YgT548GDx4cLpzMplM+reRkZH07//++w9v3rxBrVq1cO3atR8+hxACO3bsQPPmzSGEkPtZuru7482bN1I7hw8fRoECBdCiRQvp8YaGhujdu7dcm1euXMGLFy8wYMAAuV6cpk2bomTJkjhw4EC6OPr375/uWNeuXXHx4kW54cENGzbA3t4ederU+eFrI/oZMSEi+sVFREQAAEqUKJHuXKlSpfDq1at0w0fNmzeHmZkZjhw5AnNzc7lzDx48wOHDh2FjYyN3a9CgAQDgxYsXAD4PTxUvXhweHh4oWLAgevbsicOHDyscd7FixdIdK168OB4/fgwAePjwIYQQmDhxYrpYJk+eLBdLGicnJ4WeOywsDCVKlICe3vdnFezfvx9Vq1aFoaEhrKysYGNjg6VLl+LNmzc/fI6XL18iLi4Oy5cvTxd/jx495OKPiIhAkSJF5JIxAChatKjc/e/9X5csWVI6n0ZPTy/DuVQdOnSAgYEBNmzYAAB48+YN9u/fD09Pz3QxEP0qOIeIiNJp27YtAgICsGHDBvTt21fuXGpqKho2bIgxY8Zk+NjixYsDAPLmzYuQkBAcOXIEhw4dwqFDh7BmzRp07doVAQEBmY4xbQL3qFGj4O7unuE1XycMX/boZNbp06fRokUL1K5dG//88w/y588PfX19rFmzBhs3bvzh49Pi79y5M7p165bhNWXLllVbvBkxMDCAjk7678W5c+dGs2bNsGHDBkyaNAnbt29HUlISOnfurNF4iLITEyKiX1yhQoUAAPfu3Ut37u7du8iTJw9MTEzkjv/999/Q09PDgAEDYGZmhj/++EM6V6RIESQkJEg9Qt+TK1cuNG/eHM2bN0dqaioGDBiAZcuWYeLEiemSla89ePAg3bH79+9LE8ALFy4MANDX11coFmUUKVIEFy9exKdPn6Cvr5/hNTt27IChoSGOHDkCAwMD6fiaNWvSXZtRr4qNjQ3MzMyQkpLyw/gLFSqEO3fuQAgh19bDhw/TXQd8/r92c3OTO3fv3j3pvCK6du2Kli1b4vLly9iwYQMqVKiA0qVLK/x4op8Nh8yIfnH58+dH+fLlERAQgLi4OOn4rVu3cPToUTRp0iTdY2QyGZYvX47ff/8d3bp1k1vG3r59e5w/fx5HjhxJ97i4uDgkJycDAF6/fi13TkdHR+rxSEpK+mHcu3fvlpsDdOnSJVy8eBEeHh4APvdA1a1bF8uWLcPz58/TPT6jZeOKatu2LV69eoXFixenOyeEAADo6upCJpMhJSVFOvf48eMMCzCamJjI/ezTHt+2bVvs2LEjw3ICX8bv7u6Op0+fyv0/fPjwAStWrJB7jKurK/LmzQs/Pz+5n/GhQ4cQGhoqrXxThIeHB/LkyYNZs2YhODiYvUP0y2MPEZEW+Pvvv+Hh4YFq1arBy8tLWnZvYWEBHx+fDB+jo6OD9evXo1WrVmjfvj0OHjwINzc3jB49Gnv37kWzZs3QvXt3VKxYEYmJibh58ya2b9+Ox48fI0+ePOjVqxdiY2Ph5uaGggULIiIiAosWLUL58uVRqlSpH8ZctGhR1KxZE/3790dSUhLmz58Pa2truaG6JUuWoGbNmnBxcUHv3r1RuHBhxMTE4Pz583jy5AmuX7+u0s+ra9euWLt2LUaMGIFLly6hVq1aSExMxPHjxzFgwAC0bNkSTZs2ha+vLxo3bow//vgDL168wJIlS1C0aFHcuHFDrr2KFSvi+PHj8PX1hZ2dHZycnFClShXMnDkTJ0+eRJUqVdC7d284OzsjNjYW165dw/
HjxxEbGwsA6Nu3LxYvXoxOnTph6NChyJ8/PzZs2CBNnE7rNdLX18esWbPQo0cP1KlTB506dZKW3Ts6OmL48OEK/wz
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# List every distinct category in alphabetical order, then report how many there are\n",
|
||
|
"categories = sorted(df['category'].unique())\n",
|
||
|
"for category in categories:\n",
|
||
|
"    print(category)\n",
|
||
|
"print('amount of categories:', len(categories))\n",
|
||
|
"\n",
|
||
|
"# Bar chart of the number of jokes in each category (most frequent first)\n",
|
||
|
"category_counts = df['category'].value_counts()\n",
|
||
|
"ax = category_counts.plot(kind='bar')\n",
|
||
|
"ax.set_title('Jokes per category')\n",
|
||
|
"ax.set_ylabel('Number of jokes')\n",
|
||
|
"ax.set_xlabel('Category')\n",
|
||
|
"ax.tick_params(axis='x', labelrotation=90)\n",
|
||
|
"plt.show()\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 12,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABSRElEQVR4nO3deVxUVf8H8M+wzIDAgIhsIYtiKu5L4STuKBmaJpaaKSpWGjwK7laPmpXr41qprWJPmkmPmmmiiKCpuKG4ixuKJoulMODCen5/+OL+vIIKCA5wP+/X675ezTlnzv2eGYFPd+69oxJCCBAREREpmJGhCyAiIiIyNAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiIiIiJSPAYiIiIiUjwGIiIiIlI8BiJSjJkzZ0KlUj2XfXXp0gVdunSRHsfGxkKlUuHXX399LvsfPnw43N3dn8u+yis7OxujRo2Co6MjVCoVQkNDDV1SmT36PlNxVek1Kvod8Pfffxu6FKqCGIioWgoPD4dKpZI2MzMzODs7w8/PD8uWLUNWVlaF7OfGjRuYOXMmEhISKmS+ilSVayuN2bNnIzw8HGPGjMF///tfDB06tMRxXl5eaNmyZbH2jRs3QqVSoXPnzsX6fvjhB6hUKuzYsaPC6y6vgoICrFq1Cl26dIGtrS00Gg3c3d0xYsQIHDlyxNDlAQDOnDmDmTNn4sqVK4Yu5bGKfvarymtGNYeJoQsgehazZs2Ch4cH8vLykJqaitjYWISGhmLRokXYvHkzWrRoIY39+OOPMXXq1DLNf+PGDXzyySdwd3dHq1atSv285/GH+Em1ffvttygsLKz0Gp7Frl270L59e8yYMeOJ43x8fPD9998jMzMT1tbWUvu+fftgYmKCw4cPIy8vD6amprI+Y2Nj6HS6Squ/LO7du4f+/fsjMjISnTp1wocffghbW1tcuXIF69evx+rVq5GcnAwXFxeD1nnmzBl88skn6NKlS4UdYaxKoZToSXiEiKq1Xr164Z133sGIESMwbdo0bN++HTt37kR6ejpef/113Lt3TxprYmICMzOzSq3n7t27AAC1Wg21Wl2p+3oSU1NTaDQag+2/NNLT02FjY/PUcT4+PigsLMT+/ftl7fv27cNbb72Fe/fuIT4+Xta3d+9etGjRAlZWVs9U4507d57p+UUmTZqEyMhILF68GLt378bEiRMxcuRIzJo1C6dPn8b8+fMrZD9VkaF/FohKi4GIapxu3brh3//+N65evYqffvpJai/pHKKoqCj4+PjAxsYGlpaWaNSoET788EMAD877eemllwAAI0aMkD6eCw8PB/Dg3IhmzZohPj4enTp1Qq1ataTnPu68iYKCAnz44YdwdHSEhYUFXn/9dVy7dk02xt3dHcOHDy/23IfnfFptJZ1DdOfOHUyYMAH16tWDRqNBo0aN8J///AdCCNk4lUqFkJAQbNq0Cc2aNYNGo0HTpk0RGRlZ8gv+iPT0dAQFBcHBwQFmZmZo2bIlVq9eLfUXnU+VlJSErVu3SrU/7mMaHx8fAA8CUJH79+/j6NGj6N+/P+rXry/ru3nzJs6fPy89DwCOHTuGXr16QavVwtLSEt27d8eBAwdk+yn6KGb37t344IMPYG9vLzti880336BBgwYwNzfHyy+/jD///LNUr8f169fx9ddfo0ePHiWeJ2VsbIyJEyfK9lWaeh93TlzROh5+Pd3d3dG7d2/s3bsXL7/8MszMzFC/fn38+OOPsue9+eabAICuXbtK70tsbCwA4MiRI/Dz84OdnR3Mzc3h4eGBkSNHPnX9jzufbv369fj888/h4uICMzMzdO/eHRcvXnzqfI+za9cudOzYERYWFrCxsUHfvn1x9uzZpz7v6tWr8PT0RLNmzZCWlgYAyMjIQGhoqPSz4unpiXnz5hU76rpu3Tq0bdsWVlZW0Gq1aN68OZYuXVruNZBh8SMzqpGGDh
2KDz/8EDt27MC7775b4pjTp0+jd+/eaNGiBWbNmgWNRoOLFy9Kf1ybNGmCWbNmYfr06XjvvffQsWNHAMArr7wizfHPP/+gV69eGDRoEN555x04ODg8sa7PP/8cKpUKU6ZMQXp6OpYsWQJfX18kJCTA3Ny81OsrTW0PE0Lg9ddfR0xMDIKCgtCqVSts374dkyZNwl9//YXFixfLxu/duxcbNmzABx98ACsrKyxbtgwBAQFITk5GnTp1HlvXvXv30KVLF1y8eBEhISHw8PBAREQEhg8fjoyMDIwbNw5NmjTBf//7X4SFhcHFxQUTJkwAANStW7fEOevXrw9nZ2fs3btXajt8+DByc3Pxyiuv4JVXXsG+ffukeYqOJBUFotOnT6Njx47QarWYPHkyTE1N8fXXX6NLly7YvXs3vL29Zfv74IMPULduXUyfPl06QvT999/j/fffxyuvvILQ0FBcvnwZr7/+OmxtbVGvXr3Hvh4AsG3bNuTn5z/2HKlHlbXe0rp48SIGDBiAoKAgBAYG4ocffsDw4cPRtm1bNG3aFJ06dcLYsWOxbNkyfPjhh2jSpAmAB//W0tPT0bNnT9StWxdTp06FjY0Nrly5gg0bNpSrFgCYO3cujIyMMHHiRGRmZmL+/PkYMmQIDh48WOa5du7ciV69eqF+/fqYOXMm7t27hy+++AIdOnTA0aNHH/vx36VLl9CtWzfY2toiKioKdnZ2uHv3Ljp37oy//voL77//PlxdXbF//35MmzYNKSkpWLJkCYAH/zM1ePBgdO/eHfPmzQMAnD17Fvv27cO4cePK+7KQIQmiamjVqlUCgDh8+PBjx1hbW4vWrVtLj2fMmCEe/ie/ePFiAUDcvHnzsXMcPnxYABCrVq0q1te5c2cBQKxcubLEvs6dO0uPY2JiBADxwgsvCL1eL7WvX79eABBLly6V2tzc3ERgYOBT53xSbYGBgcLNzU16vGnTJgFAfPbZZ7JxAwYMECqVSly8eFFqAyDUarWs7fjx4wKA+OKLL4rt62FLliwRAMRPP/0kteXm5gqdTicsLS1la3dzcxP+/v5PnK/Im2++KczNzUVubq4QQog5c+YIDw8PIYQQy5cvF/b29tLYiRMnCgDir7/+EkII0a9fP6FWq8WlS5ekMTdu3BBWVlaiU6dOUlvRvykfHx+Rn58vq9/e3l60atVK5OTkSO3ffPONACB7T0oSFhYmAIhjx46Vaq2lrffRf8+PriMpKUlqc3NzEwDEnj17pLb09HSh0WjEhAkTpLaIiAgBQMTExMjm3Lhx41N/3h7ncT8LTZo0kb2eS5cuFQDEyZMnnzhfST/7rVq1Evb29uKff/6R2o4fPy6MjIzEsGHDpLai1+zmzZvi7NmzwtnZWbz00kvi1q1b0phPP/1UWFhYiPPnz8v2O3XqVGFsbCySk5OFEEKMGzdOaLVa2b8Vqt74kRnVWJaWlk+82qzo/JXffvut3CcgazQajBgxotTjhw0bJjuvZcCAAXBycsIff/xRrv2X1h9//AFjY2OMHTtW1j5hwgQIIbBt2zZZu6+vLxo0aCA9btGiBbRaLS5fvvzU/Tg6OmLw4MFSm6mpKcaOHYvs7Gzs3r27XPX7+PjIzhXat2+fdDSsQ4cOSE9Px4ULF6Q+Dw8PODs7o6CgADt27EC/fv1Qv359aT4nJye8/fbb2Lt3L/R6vWxf7777LoyNjaXHR44cQXp6OkaPHi07F2b48OGyk7wfp2j+0pzPVJ56S8vLy0s6kgg8OCLXqFGjp76nwP//rGzZsgV5eXnl2v+jRowYIXs9i2orTT0PS0lJQUJCAoYPHw5bW1upvUWLFujRo0eJP1unTp1C586d4e7ujp07d6J27dpSX0REBDp27IjatWvj77//ljZfX18UFBRgz549AB68Jnfu3EFUVFSZ6qWqi4GIaqzs7Own/hEaOHAgOnTogFGjRsHBwQGDBg3C+vXryxSOXnjhhTKdMNqwYUPZY5VKBU
9Pz0q/zPnq1atwdnYu9noUfSxy9epVWburq2uxOWrXro3bt28/dT8NGzaEkZH8V8vj9lNaD59HJITA/v370aFDBwB
|
||
|
"text/plain": [
|
||
|
"<Figure size 640x480 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"stop_words = set(stopwords.words('english'))\n",
|
||
|
"\n",
|
||
|
"# get all the jokes\n",
|
||
|
"jokes = df['body'].values\n",
|
||
|
"\n",
|
||
|
"# tokenize the jokes\n",
|
||
|
"tokenized_jokes = [word_tokenize(joke) for joke in jokes]\n",
|
||
|
"\n",
|
||
|
"# remove stop words (compared in lower case); punctuation tokens stay in this list\n",
|
||
|
"filtered_jokes = [[word for word in joke if word.lower() not in stop_words] for joke in tokenized_jokes]\n",
|
||
|
"\n",
|
||
|
"# count the words in each joke, skipping tokens that contain no letter or digit\n",
|
||
|
"# (e.g. '.', ',', '``'): those are punctuation and would inflate the word counts\n",
|
||
|
"word_counts = [\n",
|
||
|
"    sum(1 for word in joke if any(ch.isalnum() for ch in word))\n",
|
||
|
"    for joke in filtered_jokes\n",
|
||
|
"]\n",
|
||
|
"\n",
|
||
|
"# plot the distribution of word counts\n",
|
||
|
"plt.hist(word_counts, bins=100)\n",
|
||
|
"plt.xlabel('Number of Words')\n",
|
||
|
"plt.ylabel('Frequency')\n",
|
||
|
"plt.title('Distribution of Word Counts in Jokes')\n",
|
||
|
"plt.show()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 13,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
". 71849\n",
|
||
|
", 61789\n",
|
||
|
"'' 25674\n",
|
||
|
"`` 23253\n",
|
||
|
"? 14285\n",
|
||
|
"! 10304\n",
|
||
|
": 10000\n",
|
||
|
"'s 9615\n",
|
||
|
"n't 6959\n",
|
||
|
"-- 5676\n",
|
||
|
"said 4701\n",
|
||
|
") 3969\n",
|
||
|
"man 3745\n",
|
||
|
"one 3715\n",
|
||
|
"* 3580\n",
|
||
|
"- 3213\n",
|
||
|
"says 2901\n",
|
||
|
"get 2802\n",
|
||
|
"... 2630\n",
|
||
|
"( 2429\n",
|
||
|
"like 2314\n",
|
||
|
"would 2205\n",
|
||
|
"know 2083\n",
|
||
|
"'m 2081\n",
|
||
|
"asked 2054\n",
|
||
|
"' 1964\n",
|
||
|
"back 1959\n",
|
||
|
"'re 1951\n",
|
||
|
"time 1843\n",
|
||
|
"go 1835\n",
|
||
|
"day 1812\n",
|
||
|
"got 1661\n",
|
||
|
"first 1497\n",
|
||
|
"people 1455\n",
|
||
|
"say 1431\n",
|
||
|
"could 1419\n",
|
||
|
"two 1415\n",
|
||
|
"'ll 1373\n",
|
||
|
"went 1346\n",
|
||
|
"take 1301\n",
|
||
|
"little 1298\n",
|
||
|
"see 1289\n",
|
||
|
"; 1260\n",
|
||
|
"wife 1238\n",
|
||
|
"going 1207\n",
|
||
|
"old 1187\n",
|
||
|
"'ve 1176\n",
|
||
|
"want 1175\n",
|
||
|
"woman 1156\n",
|
||
|
"car 1156\n",
|
||
|
"replied 1142\n",
|
||
|
"Well 1139\n",
|
||
|
"make 1126\n",
|
||
|
"guy 1122\n",
|
||
|
"blonde 1117\n",
|
||
|
"think 1107\n",
|
||
|
"home 1086\n",
|
||
|
"$ 1074\n",
|
||
|
"good 1054\n",
|
||
|
"Q 1044\n",
|
||
|
"next 1039\n",
|
||
|
"around 1025\n",
|
||
|
"One 1024\n",
|
||
|
"2 1016\n",
|
||
|
"right 976\n",
|
||
|
"ca 975\n",
|
||
|
"asks 967\n",
|
||
|
"way 959\n",
|
||
|
"Yo 953\n",
|
||
|
"new 952\n",
|
||
|
"tell 948\n",
|
||
|
"put 947\n",
|
||
|
"call 942\n",
|
||
|
"'d 934\n",
|
||
|
"1 933\n",
|
||
|
"came 920\n",
|
||
|
"3 892\n",
|
||
|
"told 883\n",
|
||
|
"boy 877\n",
|
||
|
"come 834\n",
|
||
|
"God 826\n",
|
||
|
"never 818\n",
|
||
|
"room 816\n",
|
||
|
"name 816\n",
|
||
|
"door 808\n",
|
||
|
"really 806\n",
|
||
|
"night 803\n",
|
||
|
"house 800\n",
|
||
|
"goes 794\n",
|
||
|
"look 767\n",
|
||
|
"⢠767\n",
|
||
|
"love 758\n",
|
||
|
"give 755\n",
|
||
|
"dog 754\n",
|
||
|
"work 752\n",
|
||
|
"still 734\n",
|
||
|
"called 724\n",
|
||
|
"years 719\n",
|
||
|
"last 712\n",
|
||
|
"Oh 712\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# count the frequency of each word\n",
|
||
|
"# - lower-case first so that e.g. 'One' and 'one' are a single entry\n",
|
||
|
"# - keep only purely alphabetic tokens so punctuation, digits and contraction\n",
|
||
|
"#   fragments such as 's or n't do not crowd out real words\n",
|
||
|
"word_freq = Counter(\n",
|
||
|
"    word.lower()\n",
|
||
|
"    for joke in filtered_jokes\n",
|
||
|
"    for word in joke\n",
|
||
|
"    if word.isalpha()\n",
|
||
|
")\n",
|
||
|
"\n",
|
||
|
"# get the most common words\n",
|
||
|
"most_common = word_freq.most_common(100)\n",
|
||
|
"\n",
|
||
|
"for word, count in most_common:\n",
|
||
|
"    print(word, count)"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.10.4"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|