568 lines
41 KiB
Plaintext
568 lines
41 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Tokenization adn Normalization"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import json\n",
|
|||
|
"import os\n",
|
|||
|
"from collections import Counter\n",
|
|||
|
"\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import nltk\n",
|
|||
|
"\n",
|
|||
|
"from nltk.corpus import stopwords\n",
|
|||
|
"from nltk.tokenize import word_tokenize, sent_tokenize\n",
|
|||
|
"from nltk.stem import WordNetLemmatizer\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 47,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[nltk_data] Downloading package punkt to\n",
|
|||
|
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
|||
|
"[nltk_data] Package punkt is already up-to-date!\n",
|
|||
|
"[nltk_data] Downloading package stopwords to\n",
|
|||
|
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
|||
|
"[nltk_data] Package stopwords is already up-to-date!\n",
|
|||
|
"[nltk_data] Downloading package wordnet to\n",
|
|||
|
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
|||
|
"[nltk_data] Package wordnet is already up-to-date!\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"True"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 47,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# nltk count words\n",
|
|||
|
"nltk.download('punkt')\n",
|
|||
|
"nltk.download('stopwords')\n",
|
|||
|
"nltk.download('wordnet')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 48,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Load the data\n",
|
|||
|
"# Load the data from the JSON file\n",
|
|||
|
"data_path = './data/reddit_jokes.json'\n",
|
|||
|
"with open(data_path) as f:\n",
|
|||
|
" data = json.load(f)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 49,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>body</th>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th>score</th>\n",
|
|||
|
" <th>title</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>Now I have to say \"Leroy can you please paint ...</td>\n",
|
|||
|
" <td>5tz52q</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>I hate how you cant even say black paint anymore</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>Pizza doesn't scream when you put it in the ov...</td>\n",
|
|||
|
" <td>5tz4dd</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>What's the difference between a Jew in Nazi Ge...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>...and being there really helped me learn abou...</td>\n",
|
|||
|
" <td>5tz319</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>I recently went to America....</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>A Sunday school teacher is concerned that his ...</td>\n",
|
|||
|
" <td>5tz2wj</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>Brian raises his hand and says, “He’s in Heaven.”</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>He got caught trying to sell the two books to ...</td>\n",
|
|||
|
" <td>5tz1pc</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>You hear about the University book store worke...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" body id score \\\n",
|
|||
|
"0 Now I have to say \"Leroy can you please paint ... 5tz52q 1 \n",
|
|||
|
"1 Pizza doesn't scream when you put it in the ov... 5tz4dd 0 \n",
|
|||
|
"2 ...and being there really helped me learn abou... 5tz319 0 \n",
|
|||
|
"3 A Sunday school teacher is concerned that his ... 5tz2wj 1 \n",
|
|||
|
"4 He got caught trying to sell the two books to ... 5tz1pc 0 \n",
|
|||
|
"\n",
|
|||
|
" title \n",
|
|||
|
"0 I hate how you cant even say black paint anymore \n",
|
|||
|
"1 What's the difference between a Jew in Nazi Ge... \n",
|
|||
|
"2 I recently went to America.... \n",
|
|||
|
"3 Brian raises his hand and says, “He’s in Heaven.” \n",
|
|||
|
"4 You hear about the University book store worke... "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 49,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# create pandas dataframe of the data\n",
|
|||
|
"df = pd.DataFrame(data)\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 50,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# NOTE: bit more than 1000 jokes for removing duplicates and empty jokes\n",
|
|||
|
"num_good = 1046\n",
|
|||
|
"num_bad = 1033\n",
|
|||
|
"min_score = 50\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"df_sroted = df.sort_values(by='score', ascending=False)\n",
|
|||
|
"df_sroted = df_sroted.reset_index(drop=True)\n",
|
|||
|
"\n",
|
|||
|
"df_good = df_sroted.head(num_good)\n",
|
|||
|
"df_good = df_good.reset_index(drop=True)\n",
|
|||
|
"\n",
|
|||
|
"df_bad = df_sroted[df_sroted['score'] > min_score].tail(num_bad)\n",
|
|||
|
"df_bad = df_bad.reset_index(drop=True)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 51,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"avg score good jokes: 10731.98 shape: (1046, 4)\n",
|
|||
|
"avg score bad jokes: 52.32 shape: (1033, 4)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# avg score and shape\n",
|
|||
|
"print('avg score good jokes:', df_good['score'].mean().round(2), 'shape:', df_good.shape)\n",
|
|||
|
"print('avg score bad jokes:', df_bad['score'].mean().round(2), 'shape:', df_bad.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 52,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"####################################################################################################\n",
|
|||
|
"5 random good jokes:\n",
|
|||
|
"####################################################################################################\n",
|
|||
|
"I told my wife we can have sex or go see Star Wars, she said, I'm on my period and Star Wars is sold out.\n",
|
|||
|
"....what happens next will shock you.\"\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"Two men from Texas were sitting at a bar\n",
|
|||
|
"\"No way. That's impossible!\" she said.\n",
|
|||
|
"\n",
|
|||
|
"\"Trust me,\" I said, \"I have no idea where our baby is.\"\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"My wife always accuses me of having a favorite child.\n",
|
|||
|
"...hands down.\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"If i got 50 cents for every failed math exam,\n",
|
|||
|
"Two fleas had an arrangement to meet every winter in Miami for a vacation. Last year, when one flea gets to Miami he is shivering and shaking.\n",
|
|||
|
"\n",
|
|||
|
"The other flea asked him, \"Why are shaking so badly?\"\n",
|
|||
|
"\n",
|
|||
|
"The first flea says, \"I rode down here from New Jersey in the moustache of a guy on a Harley.\"\n",
|
|||
|
"\n",
|
|||
|
"The other flea says, \"That's the worst way to travel. Do what I do. Go to the New Jersey airport bar. Have a few drinks. While there, look for a nice stewardess, crawl up her leg and nestle in where it's warm and cozy. It's the best way to travel that I can think of.\"\n",
|
|||
|
"\n",
|
|||
|
"The first flea thanks the second flea and says he will give it a try next winter. A year goes by... When the first flea shows up in Miami he shivering and shaking again.\n",
|
|||
|
"\n",
|
|||
|
"The second flea says, \"Didn't you try what I told you?\"\n",
|
|||
|
"\n",
|
|||
|
"\"Yes,\" says the first flea. \"I did exactly what you said. I went to the New Jersey airport bar. I had a few drinks. Finally, this nice young stewardess came in. I crawled right up to her warm cozy spot. It was so nice and warm that I fell asleep. When I woke up, I was back in the moustache of a guy on a Harley.\"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
" Edit: Front page already!!! Gr8 Guys.\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"I told god a Holocaust joke. He didn't laugh.\n",
|
|||
|
"Whoops, wrong sub.\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# print random 5 good jokes\n",
|
|||
|
"print('#' * 100)\n",
|
|||
|
"print('5 random good jokes:')\n",
|
|||
|
"print('#' * 100)\n",
|
|||
|
"for i in range(5):\n",
|
|||
|
" print(df_good['title'][np.random.randint(0, num_good)])\n",
|
|||
|
" print(df_good['body'][np.random.randint(0, num_good)])\n",
|
|||
|
" print('-' * 100)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 53,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"####################################################################################################\n",
|
|||
|
"5 random bad jokes:\n",
|
|||
|
"####################################################################################################\n",
|
|||
|
"why dont they have drivers ed and sex ed on the same day in mexico?\n",
|
|||
|
"As he was praying a black guy was walking nearby with groceries when he dropped his cheese wheel and it rolled to the Mexican. The Mexican grabbed it, praised god, and ran home. \n",
|
|||
|
"\n",
|
|||
|
"When he gets home he instructs his wife to make nachos with the cheese. \n",
|
|||
|
"\n",
|
|||
|
"\"Why nachos\" asks his wife \"we can make so many better meals with this cheese\"\n",
|
|||
|
"\n",
|
|||
|
"\"No\" said the Mexican \"god instructs me to make nachos.\"\n",
|
|||
|
"\n",
|
|||
|
"\"What do you mean\" asked the wife \n",
|
|||
|
"\n",
|
|||
|
"\"As I was praying God sent me the cheese wheel and as I was running home with it I heard him yelling That's Nacho cheese, that's nacho cheese!\"\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"2 Priests go to the beach...\n",
|
|||
|
"a navel..\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"Two parrots were sitting on a perch...\n",
|
|||
|
"Okay I'm donesn't born yesterday..\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"I recently had sex with a biologist at her laboratory.\n",
|
|||
|
"MAN: I’d like to buy some dog food.\n",
|
|||
|
"CHECKOUT LADY: Do you have a dog?\n",
|
|||
|
"MAN: Yes.\n",
|
|||
|
"CHECKOUT LADY: Where is he?\n",
|
|||
|
"MAN: He’s at home.\n",
|
|||
|
"CHECKOUT LADY: I’m sorry; I can’t sell this dog food to you unless Isee the dog. Store policy.\n",
|
|||
|
"The next day, the man returns.\n",
|
|||
|
"MAN: I’d like to buy some cat food.\n",
|
|||
|
"CHECKOUT LADY: Do you have a cat?\n",
|
|||
|
"MAN: Yes.\n",
|
|||
|
"CHECKOUT LADY: Well...where is he?\n",
|
|||
|
"MAN: He’s at home!\n",
|
|||
|
"CHECKOUT LADY: Sorry, I can’t sell this cat food to you unless I see\n",
|
|||
|
"your cat.\n",
|
|||
|
"The next day the man returns.\n",
|
|||
|
"CHECKOUT LADY: What’s in the sack?\n",
|
|||
|
"MAN: Put your hand inside.\n",
|
|||
|
"CHECKOUT LADY: Hmmm... It’s warm and moist! What is it?\n",
|
|||
|
"MAN: I would like to buy some toilet paper.\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"Farmer Dan\n",
|
|||
|
"He hit me until I was 21. \n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# print random 5 bad jokes\n",
|
|||
|
"print('#' * 100)\n",
|
|||
|
"print('5 random bad jokes:')\n",
|
|||
|
"print('#' * 100)\n",
|
|||
|
"for i in range(5):\n",
|
|||
|
" print(df_bad['title'][np.random.randint(0, num_bad)])\n",
|
|||
|
" print(df_bad['body'][np.random.randint(0, num_bad)])\n",
|
|||
|
" print('-' * 100)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"shape after tokenizing: (1046, 6) (1033, 6)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# tokenize the jokes\n",
|
|||
|
"stop_words = set(stopwords.words('english'))\n",
|
|||
|
"\n",
|
|||
|
"def sentinize(joke):\n",
|
|||
|
" tokens = sent_tokenize(joke)\n",
|
|||
|
" tokens = [word for word in tokens if word not in stop_words]\n",
|
|||
|
" return tokens\n",
|
|||
|
"\n",
|
|||
|
"# Tokenize and lemmatize the jokes\n",
|
|||
|
"df_good['tok_body'] = df_good['body'].apply(sentinize)\n",
|
|||
|
"df_bad['tok_body'] = df_bad['body'].apply(sentinize)\n",
|
|||
|
"\n",
|
|||
|
"df_good['tok_title'] = df_good['title'].apply(sentinize)\n",
|
|||
|
"df_bad['tok_title'] = df_bad['title'].apply(sentinize)\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"print('shape after tokenizing:', df_good.shape, df_bad.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def rem_null_dubl(df):\n",
|
|||
|
" # Remove rows where 'tok_body' or 'tok_title' is empty\n",
|
|||
|
" df = df[df['tok_body'].map(len) > 0]\n",
|
|||
|
" df = df[df['tok_title'].map(len) > 0]\n",
|
|||
|
" \n",
|
|||
|
" # Convert list-type columns to strings for deduplication\n",
|
|||
|
" df['tok_body_str'] = df['tok_body'].apply(lambda x: ' '.join(x))\n",
|
|||
|
" df['tok_title_str'] = df['tok_title'].apply(lambda x: ' '.join(x))\n",
|
|||
|
" \n",
|
|||
|
" # Remove rows where both 'tok_body' and 'tok_title' are duplicates\n",
|
|||
|
" df = df.drop_duplicates(subset=['tok_body_str', 'tok_title_str'], keep=False)\n",
|
|||
|
" \n",
|
|||
|
" # Drop the temporary string columns\n",
|
|||
|
" df = df.drop(columns=['tok_body_str', 'tok_title_str'])\n",
|
|||
|
" \n",
|
|||
|
" return df\n",
|
|||
|
"\n",
|
|||
|
"df_good = rem_null_dubl(df_good)\n",
|
|||
|
"df_bad = rem_null_dubl(df_bad)\n",
|
|||
|
"print('shape after removing duplicates:', df_good.shape, df_bad.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df_good['tokens'] = df_good['tok_body'] + df_good['tok_title']\n",
|
|||
|
"df_bad['tokens'] = df_bad['tok_body'] + df_bad['tok_title']"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 58,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"good jokes with more than 1 sentence: 576 total: 1008\n",
|
|||
|
"bad jokes with more than 1 sentence: 477 total: 1016\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAGwCAYAAABPSaTdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABCtklEQVR4nO3deVhWdf7/8dfNKsiWKCAJbomC4pIrWWZJKprpaKXFpJbLWDiKmJZTqWmFWqnpuMy0YDNpNk1qZYU55FKGGw2uiGvhjCKNC4gmKpzfH/28v925xI337Q3H5+O6znV5PudzPud97vOdeH3PajEMwxAAAIBJubm6AAAAAGci7AAAAFMj7AAAAFMj7AAAAFMj7AAAAFMj7AAAAFMj7AAAAFPzcHUBlUFZWZmOHDkif39/WSwWV5cDAADKwTAMnT59WuHh4XJzu/r5G8KOpCNHjigiIsLVZQAAgAo4fPiw6tSpc9XlhB1J/v7+kn7+sQICAlxcDQAAKI+ioiJFRERY/45fDWFHsl66CggIIOwAAFDF/NYtKNygDAAATI2wAwAATI2wAwAATI17dgAAVVJpaakuXLjg6jLgRJ6ennJ3d7/ucQg7AIAqxTAM5efn69SpU64uBTdAUFCQwsLCrus9eIQdAECVcinohISEyNfXl5fBmpRhGDp79qwKCgokSbVr167wWIQdAECVUVpaag06wcHBri4HTubj4yNJKigoUEhISIUvaXGDMgCgyrh0j46vr6+LK8GNculYX8/9WYQdAECVw6Wrm4cjjjVhBwAAmBphBwAAmBo3KAMAqrxZq/fe0O2NuS/qhm7vetSrV0/JyclKTk6+4vLJkydrxYoVys7OLtd433//verXr69///vfatmypcPqdCaXntlZsGCBmjdvbv0AZ1xcnL744gvr8nPnzikpKUnBwcHy8/NTv379dOzYMZsx8vLy1LNnT/n6+iokJETjxo3TxYsXb/SuAABQJT399NPKyMhwdRlO5dKwU6dOHU2bNk1ZWVnaunWr7r33XvXu3Vu7du2SJI0ZM0affvqpPvzwQ61bt05HjhxR3759reuXlpaqZ8+eOn/+vL799lu9++67WrRokSZOnOiqXQIAoErx8/Mz/WP8Lg07vXr1Uo8ePdSoUSNFRUXp5Zdflp+fnzZu3KjCwkK9/fbbmjlzpu699161bt1aaWlp+vbbb7Vx40ZJ0pdffqndu3frvffeU8uWLZWQkKCpU6dq3rx5On/+vCt3DQAAq9OnTysxMVHVq1dX7dq1NWvWLHXu3Nnm0tLJkyc1cOBA3XLLLfL19VVCQoL27dtnM85HH32kpk2bytvbW/Xq1dPrr79us7ygoEC9evWSj4+P6tevr8WLF/9mbZMnT7a5HFVWVqYpU6aoTp068vb2VsuWLZWenn7V9UtLS/XEE0+oSZMmysvLkyR9/PHHuv3221WtWjU1aNBAL774ovWqi2EYmjx5siIjI+Xt7a3w8HCNGjXqN+u8HpXmnp3S0lJ9+OGHOnPmjOLi4pSVlaULFy4oPj7e2qdJkyaKjIxUZmamOnTooMzMTMXGxio0NNTap1u3bnryySe1a9cutWrV6orbKikpUUlJiXW+qKjIafvlzOvIVemaMQDczFJSUrRhwwZ98sknCg0N1cSJE/Xdd9/ZhIzBgwdr3759+uSTTxQQEKBnnnlGPXr00O7du+Xp6amsrCw9/PDDmjx5svr3769vv/1WTz31lIKDgzV48GDrGEeOHNGaNWvk6empUaNGWd9AXF5vvPGGXn/9df3lL39Rq1at9M477+iBBx7Qrl271KhRI5u+JSUleuSRR/T999/r66+/Vq1atfT1119r4MCBmjNnju666y4dOHBAw4cPlyRNmjRJH330kWbNmqWlS5eqadOmys/P17Zt267r9/0tLg87O3bsUFxcnM6dOyc/Pz8tX75cMTExys7OlpeXl4KCgmz6h4aGKj8/X9LPrwz/ZdC5tPzSsqtJTU3Viy++6NgdAQDgCk6fPq13331XS5YsUZcuXSRJaWlpCg8Pt/a5FHI2bNigO+64Q5K0ePFiRUREaMWKFXrooYc0c+ZMdenSRS+88IIkKSoqSrt379arr76qwYMHa+/evfriiy+0efNmtW3bVpL09ttvKzo62q56X3vtNT3zzDMaMGCAJGn69Olas2aNZs+erXnz5ln7FRcXq2fPniopKdGaNWsUGBgoSXrxxRf17LPPatCgQZKkBg0aaOrUqRo/frwmTZqkvLw8hYWFKT4+Xp6enoqMjFS7du0q8tOWm8sfPW/cuLGys7O1adMmPfnkkxo0aJB2797t1G1OmDBBhYWF1unw4cNO3R4A4OZ18OBBXbhwweYPemBgoBo3bmydz8nJkYeHh9q3b29tCw4OVuPGjZWTk2Pt07FjR5uxO3bsqH379qm0tNQ6RuvWra3LmzRpctlJg2spKirSkSNHrridS3Vc8sgjj+jMmTP68ssvrUFHkrZt26YpU6bIz8/POg0bNkxHjx7V2bNn9dBDD+mnn35SgwYNNGzYMC1fvtzpDxa5POx4eXnptttuU+vWrZWamqoWLVrojTfeUFhYmM6fP3/ZV22PHTumsLAwSVJYWNhlT2ddmr/U50q8vb2tT4BdmgAAQPn16NFD27dvV2Zmpk17cXGxXnzxRWVnZ1unHTt2aN++fapWrZoiIiKUm5ur+fPny8fHR0899ZQ6dep0XZ+D+C0uDzu/VlZWppKSErVu3Vqenp42j8Pl5uYqLy9PcXFxkqS4uDjt2LHD5nrk6tWrFRAQoJiYmBteOwAAv9agQQN5enpqy5Yt1rbCwkLt3ft/93RGR0fr4sWL2rRpk7Xt+PHjys3Ntf49i46O1oYNG2zG3rBhg6KiouTu7q4mTZro4sWLysrKsi7Pzc297KTBtQQEBCg8PPyK2/n139Unn3xS06ZN0wMPPKB169ZZ22+//Xbl5ubqtttuu2xyc/s5dvj4+KhXr16aM2eO1q5dq8zMTO3YsaPcddrLpffsTJgwQQkJCYqMjNTp06e1ZMkSrV27VqtWrVJgYKCGDBmilJQU1ahRQwEBAfrjH/+ouLg4dejQQZLUtWtXxcTE6LHHHtOMGTOUn5+v559/XklJSfL29nblrgEAIEny9/fXoEGDNG7cONWoUUMhISGaNGmS3NzcrN99atSokXr37q1hw4bpL3/5i/z9/fXss8/q1ltvVe/evSVJY8eOVdu2bTV16lT1799fmZmZ+vOf/6z58+dL+vm2kO7du+sPf/iDFixYIA8PDyUnJ1u/HF5e48aN06RJk9SwYUO1bNlSaWlpys7OvuKTXX/84x9VWlqq+++/X1988YXuvPNOTZw4Uffff78iIyP14IMPys3NTdu2bdPOnTv10ksvadGiRSotLVX79u3l6+ur9957Tz4+Pqpbt+51/tJX59KwU1BQoIEDB+ro0aMKDAxU8+bNtWrVKt13332SpFmzZsnNzU39+vVTSUmJunXrZj2okuTu7q6VK1fqySefVFxcnKpXr65BgwZpypQprtolAIALVPanU2fOnKkRI0bo/vvvV0BAgMaPH6/Dhw+rWrVq1j5paWkaPXq07r//fp0/f16dOnXS559/Lk9PT0k/nzH5xz/+oYkTJ2rq1KmqXbu2pkyZYn0S69IYQ4cO1d13363Q0FC99NJL1huay2vUqFEqLCzU2LFjVVBQoJiYGH3yySeXPYl1SXJyssrKytSjRw+lp6erW7duWrlypaZMmaLp06fL09NTTZo00dChQyVJQUFBmjZtmlJSUlRaWqrY2Fh9+umnTn3Xj8UwDMNpo1cRRUVFCgwMVGFhocPv3+HRcwBwnHPnzunQoUOqX7++TVCoas6cOaNbb71Vr7/+uoYMGeLSWiZMmKCvv/5a33zzjUvruJprHfPy/v12+aPnAACY3b///W/t2bNH7dq1U2FhofUKxKVLVK5gGIYOHjyojIyMq76Xziwq3Q3KAACY0WuvvaYWLVooPj5eZ86c0ddff62aNWu6rJ7CwkLFxMTIy8tLf/rTn1xWx43AmR0AAJysVatWNk9
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# plot avarage number of sentences in jokes\n",
|
|||
|
"good_sent = df_good['tokens'].apply(len)\n",
|
|||
|
"bad_sent = df_bad['tokens'].apply(len)\n",
|
|||
|
"\n",
|
|||
|
"# bigger than 1\n",
|
|||
|
"good_sent = good_sent[good_sent > 2]\n",
|
|||
|
"bad_sent = bad_sent[bad_sent > 2]\n",
|
|||
|
"\n",
|
|||
|
"# print how many jokes have more than 1 sentence relative to the total number of jokes\n",
|
|||
|
"print('good jokes with more than 1 sentence:', good_sent.shape[0], 'total:', df_good.shape[0])\n",
|
|||
|
"print('bad jokes with more than 1 sentence:', bad_sent.shape[0], 'total:', df_bad.shape[0])\n",
|
|||
|
"\n",
|
|||
|
"plt.hist(good_sent, bins=20, alpha=0.5, label='good jokes')\n",
|
|||
|
"plt.hist(bad_sent, bins=20, alpha=0.5, label='bad jokes')\n",
|
|||
|
"plt.xlabel('Number of sentences')\n",
|
|||
|
"plt.ylabel('Number of jokes')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Find Patterns\n",
|
|||
|
"use tok_title and tok_body for it"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Oneliner"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Questions -> Answer"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Wordplay "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Dialog\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Knock-Knock Jokes"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.10.4"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|