568 lines
41 KiB
Plaintext
568 lines
41 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Tokenization and Normalization"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 46,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import json\n",
|
||
"import os\n",
|
||
"from collections import Counter\n",
|
||
"\n",
|
||
"import numpy as np\n",
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import nltk\n",
|
||
"\n",
|
||
"from nltk.corpus import stopwords\n",
|
||
"from nltk.tokenize import word_tokenize, sent_tokenize\n",
|
||
"from nltk.stem import WordNetLemmatizer\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 47,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[nltk_data] Downloading package punkt to\n",
|
||
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
||
"[nltk_data] Package punkt is already up-to-date!\n",
|
||
"[nltk_data] Downloading package stopwords to\n",
|
||
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
||
"[nltk_data] Package stopwords is already up-to-date!\n",
|
||
"[nltk_data] Downloading package wordnet to\n",
|
||
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
||
"[nltk_data] Package wordnet is already up-to-date!\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"True"
|
||
]
|
||
},
|
||
"execution_count": 47,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# download the NLTK resources used below (punkt tokenizer, stop words, WordNet)\n",
|
||
"nltk.download('punkt')\n",
|
||
"nltk.download('stopwords')\n",
|
||
"nltk.download('wordnet')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 48,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Load the data\n",
|
||
"# Load the data from the JSON file\n",
|
||
"data_path = './data/reddit_jokes.json'\n",
|
||
"with open(data_path) as f:\n",
|
||
" data = json.load(f)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 49,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>body</th>\n",
|
||
" <th>id</th>\n",
|
||
" <th>score</th>\n",
|
||
" <th>title</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Now I have to say \"Leroy can you please paint ...</td>\n",
|
||
" <td>5tz52q</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>I hate how you cant even say black paint anymore</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Pizza doesn't scream when you put it in the ov...</td>\n",
|
||
" <td>5tz4dd</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>What's the difference between a Jew in Nazi Ge...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>...and being there really helped me learn abou...</td>\n",
|
||
" <td>5tz319</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>I recently went to America....</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>A Sunday school teacher is concerned that his ...</td>\n",
|
||
" <td>5tz2wj</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Brian raises his hand and says, “He’s in Heaven.”</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>He got caught trying to sell the two books to ...</td>\n",
|
||
" <td>5tz1pc</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>You hear about the University book store worke...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" body id score \\\n",
|
||
"0 Now I have to say \"Leroy can you please paint ... 5tz52q 1 \n",
|
||
"1 Pizza doesn't scream when you put it in the ov... 5tz4dd 0 \n",
|
||
"2 ...and being there really helped me learn abou... 5tz319 0 \n",
|
||
"3 A Sunday school teacher is concerned that his ... 5tz2wj 1 \n",
|
||
"4 He got caught trying to sell the two books to ... 5tz1pc 0 \n",
|
||
"\n",
|
||
" title \n",
|
||
"0 I hate how you cant even say black paint anymore \n",
|
||
"1 What's the difference between a Jew in Nazi Ge... \n",
|
||
"2 I recently went to America.... \n",
|
||
"3 Brian raises his hand and says, “He’s in Heaven.” \n",
|
||
"4 You hear about the University book store worke... "
|
||
]
|
||
},
|
||
"execution_count": 49,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# create pandas dataframe of the data\n",
|
||
"df = pd.DataFrame(data)\n",
|
||
"df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 50,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# NOTE: bit more than 1000 jokes for removing duplicates and empty jokes\n",
|
||
"num_good = 1046\n",
|
||
"num_bad = 1033\n",
|
||
"min_score = 50\n",
|
||
"\n",
|
||
"\n",
|
||
"df_sroted = df.sort_values(by='score', ascending=False)\n",
|
||
"df_sroted = df_sroted.reset_index(drop=True)\n",
|
||
"\n",
|
||
"df_good = df_sroted.head(num_good)\n",
|
||
"df_good = df_good.reset_index(drop=True)\n",
|
||
"\n",
|
||
"df_bad = df_sroted[df_sroted['score'] > min_score].tail(num_bad)\n",
|
||
"df_bad = df_bad.reset_index(drop=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 51,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"avg score good jokes: 10731.98 shape: (1046, 4)\n",
|
||
"avg score bad jokes: 52.32 shape: (1033, 4)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# avg score and shape\n",
|
||
"print('avg score good jokes:', df_good['score'].mean().round(2), 'shape:', df_good.shape)\n",
|
||
"print('avg score bad jokes:', df_bad['score'].mean().round(2), 'shape:', df_bad.shape)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"####################################################################################################\n",
|
||
"5 random good jokes:\n",
|
||
"####################################################################################################\n",
|
||
"I told my wife we can have sex or go see Star Wars, she said, I'm on my period and Star Wars is sold out.\n",
|
||
"....what happens next will shock you.\"\n",
|
||
"----------------------------------------------------------------------------------------------------\n",
|
||
"Two men from Texas were sitting at a bar\n",
|
||
"\"No way. That's impossible!\" she said.\n",
|
||
"\n",
|
||
"\"Trust me,\" I said, \"I have no idea where our baby is.\"\n",
|
||
"----------------------------------------------------------------------------------------------------\n",
|
||
"My wife always accuses me of having a favorite child.\n",
|
||
"...hands down.\n",
|
||
"----------------------------------------------------------------------------------------------------\n",
|
||
"If i got 50 cents for every failed math exam,\n",
|
||
"Two fleas had an arrangement to meet every winter in Miami for a vacation. Last year, when one flea gets to Miami he is shivering and shaking.\n",
|
||
"\n",
|
||
"The other flea asked him, \"Why are shaking so badly?\"\n",
|
||
"\n",
|
||
"The first flea says, \"I rode down here from New Jersey in the moustache of a guy on a Harley.\"\n",
|
||
"\n",
|
||
"The other flea says, \"That's the worst way to travel. Do what I do. Go to the New Jersey airport bar. Have a few drinks. While there, look for a nice stewardess, crawl up her leg and nestle in where it's warm and cozy. It's the best way to travel that I can think of.\"\n",
|
||
"\n",
|
||
"The first flea thanks the second flea and says he will give it a try next winter. A year goes by... When the first flea shows up in Miami he shivering and shaking again.\n",
|
||
"\n",
|
||
"The second flea says, \"Didn't you try what I told you?\"\n",
|
||
"\n",
|
||
"\"Yes,\" says the first flea. \"I did exactly what you said. I went to the New Jersey airport bar. I had a few drinks. Finally, this nice young stewardess came in. I crawled right up to her warm cozy spot. It was so nice and warm that I fell asleep. When I woke up, I was back in the moustache of a guy on a Harley.\"\n",
|
||
"\n",
|
||
"\n",
|
||
" Edit: Front page already!!! Gr8 Guys.\n",
|
||
"----------------------------------------------------------------------------------------------------\n",
|
||
"I told god a Holocaust joke. He didn't laugh.\n",
|
||
"Whoops, wrong sub.\n",
|
||
"----------------------------------------------------------------------------------------------------\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# print random 5 good jokes\n",
|
||
"print('#' * 100)\n",
|
||
"print('5 random good jokes:')\n",
|
||
"print('#' * 100)\n",
|
||
"for _ in range(5):\n",
|
||
"    # draw ONE position per joke so the printed title and body belong to the same row\n",
|
||
"    idx = np.random.randint(0, len(df_good))\n",
|
||
"    # positional lookup keeps working even if the index has gaps (e.g. after de-duplication)\n",
|
||
"    print(df_good['title'].iloc[idx])\n",
|
||
"    print(df_good['body'].iloc[idx])\n",
|
||
"    print('-' * 100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"####################################################################################################\n",
|
||
"5 random bad jokes:\n",
|
||
"####################################################################################################\n",
|
||
"why dont they have drivers ed and sex ed on the same day in mexico?\n",
|
||
"As he was praying a black guy was walking nearby with groceries when he dropped his cheese wheel and it rolled to the Mexican. The Mexican grabbed it, praised god, and ran home. \n",
|
||
"\n",
|
||
"When he gets home he instructs his wife to make nachos with the cheese. \n",
|
||
"\n",
|
||
"\"Why nachos\" asks his wife \"we can make so many better meals with this cheese\"\n",
|
||
"\n",
|
||
"\"No\" said the Mexican \"god instructs me to make nachos.\"\n",
|
||
"\n",
|
||
"\"What do you mean\" asked the wife \n",
|
||
"\n",
|
||
"\"As I was praying God sent me the cheese wheel and as I was running home with it I heard him yelling That's Nacho cheese, that's nacho cheese!\"\n",
|
||
"----------------------------------------------------------------------------------------------------\n",
|
||
"2 Priests go to the beach...\n",
|
||
"a navel..\n",
|
||
"----------------------------------------------------------------------------------------------------\n",
|
||
"Two parrots were sitting on a perch...\n",
|
||
"Okay I'm donesn't born yesterday..\n",
|
||
"----------------------------------------------------------------------------------------------------\n",
|
||
"I recently had sex with a biologist at her laboratory.\n",
|
||
"MAN: I’d like to buy some dog food.\n",
|
||
"CHECKOUT LADY: Do you have a dog?\n",
|
||
"MAN: Yes.\n",
|
||
"CHECKOUT LADY: Where is he?\n",
|
||
"MAN: He’s at home.\n",
|
||
"CHECKOUT LADY: I’m sorry; I can’t sell this dog food to you unless Isee the dog. Store policy.\n",
|
||
"The next day, the man returns.\n",
|
||
"MAN: I’d like to buy some cat food.\n",
|
||
"CHECKOUT LADY: Do you have a cat?\n",
|
||
"MAN: Yes.\n",
|
||
"CHECKOUT LADY: Well...where is he?\n",
|
||
"MAN: He’s at home!\n",
|
||
"CHECKOUT LADY: Sorry, I can’t sell this cat food to you unless I see\n",
|
||
"your cat.\n",
|
||
"The next day the man returns.\n",
|
||
"CHECKOUT LADY: What’s in the sack?\n",
|
||
"MAN: Put your hand inside.\n",
|
||
"CHECKOUT LADY: Hmmm... It’s warm and moist! What is it?\n",
|
||
"MAN: I would like to buy some toilet paper.\n",
|
||
"----------------------------------------------------------------------------------------------------\n",
|
||
"Farmer Dan\n",
|
||
"He hit me until I was 21. \n",
|
||
"----------------------------------------------------------------------------------------------------\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# print random 5 bad jokes\n",
|
||
"print('#' * 100)\n",
|
||
"print('5 random bad jokes:')\n",
|
||
"print('#' * 100)\n",
|
||
"for _ in range(5):\n",
|
||
"    # draw ONE position per joke so the printed title and body belong to the same row\n",
|
||
"    idx = np.random.randint(0, len(df_bad))\n",
|
||
"    # positional lookup keeps working even if the index has gaps (e.g. after de-duplication)\n",
|
||
"    print(df_bad['title'].iloc[idx])\n",
|
||
"    print(df_bad['body'].iloc[idx])\n",
|
||
"    print('-' * 100)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"shape after tokenizing: (1046, 6) (1033, 6)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# tokenize the jokes\n",
|
||
"stop_words = set(stopwords.words('english'))\n",
|
||
"\n",
|
||
"def sentinize(joke):\n",
|
||
"    \"\"\"Split a joke text into a list of sentence strings.\n",
|
||
"\n",
|
||
"    The text is split with sent_tokenize; a sentence is dropped only when the\n",
|
||
"    whole sentence string is itself a member of the global stop_words set.\n",
|
||
"\n",
|
||
"    NOTE(review): stop_words is built from stopwords.words('english'), so this\n",
|
||
"    filter compares full sentences against stop-word entries and presumably\n",
|
||
"    removes almost nothing - confirm whether word-level filtering was intended.\n",
|
||
"    \"\"\"\n",
|
||
"    return [sentence for sentence in sent_tokenize(joke) if sentence not in stop_words]\n",
|
||
"\n",
|
||
"# Split the joke bodies and titles into sentences (no lemmatization is applied here)\n",
|
||
"df_good['tok_body'] = df_good['body'].apply(sentinize)\n",
|
||
"df_bad['tok_body'] = df_bad['body'].apply(sentinize)\n",
|
||
"\n",
|
||
"df_good['tok_title'] = df_good['title'].apply(sentinize)\n",
|
||
"df_bad['tok_title'] = df_bad['title'].apply(sentinize)\n",
|
||
"\n",
|
||
"\n",
|
||
"\n",
|
||
"print('shape after tokenizing:', df_good.shape, df_bad.shape)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def rem_null_dubl(df, keep=False):\n",
|
||
"    \"\"\"Drop jokes with an empty title or body, then remove duplicated jokes.\n",
|
||
"\n",
|
||
"    Args:\n",
|
||
"        df: DataFrame with list-valued 'tok_body' and 'tok_title' columns.\n",
|
||
"        keep: Forwarded to DataFrame.duplicated. The default False removes\n",
|
||
"            every copy of a duplicated joke; 'first' or 'last' keeps one copy.\n",
|
||
"\n",
|
||
"    Returns:\n",
|
||
"        A new DataFrame with the same columns as df and the surviving rows'\n",
|
||
"        original index labels. The input frame is not modified.\n",
|
||
"    \"\"\"\n",
|
||
"    # Remove rows where 'tok_body' or 'tok_title' is empty\n",
|
||
"    df = df[df['tok_body'].map(len) > 0]\n",
|
||
"    df = df[df['tok_title'].map(len) > 0]\n",
|
||
"\n",
|
||
"    # Lists are unhashable, so rows are compared on joined-string keys. The keys\n",
|
||
"    # live in a separate frame: writing temporary columns into the filtered\n",
|
||
"    # slice of df is what raised pandas' SettingWithCopyWarning before.\n",
|
||
"    keys = pd.DataFrame({\n",
|
||
"        'body': df['tok_body'].map(' '.join).to_numpy(),\n",
|
||
"        'title': df['tok_title'].map(' '.join).to_numpy(),\n",
|
||
"    })\n",
|
||
"\n",
|
||
"    # A row counts as a duplicate only when BOTH body and title match another row\n",
|
||
"    is_dup = keys.duplicated(keep=keep).to_numpy()\n",
|
||
"\n",
|
||
"    # Return an explicit copy so later column assignments do not target a view\n",
|
||
"    return df[~is_dup].copy()\n",
|
||
"\n",
|
||
"df_good = rem_null_dubl(df_good)\n",
|
||
"df_bad = rem_null_dubl(df_bad)\n",
|
||
"print('shape after removing duplicates:', df_good.shape, df_bad.shape)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df_good['tokens'] = df_good['tok_body'] + df_good['tok_title']\n",
|
||
"df_bad['tokens'] = df_bad['tok_body'] + df_bad['tok_title']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 58,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"good jokes with more than 1 sentence: 576 total: 1008\n",
|
||
"bad jokes with more than 1 sentence: 477 total: 1016\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# plot the distribution of the number of sentences (title + body) per joke\n",
|
||
"good_sent = df_good['tokens'].apply(len)\n",
|
||
"bad_sent = df_bad['tokens'].apply(len)\n",
|
||
"\n",
|
||
"# keep only jokes whose title + body together contain more than 2 sentences\n",
|
||
"good_sent = good_sent[good_sent > 2]\n",
|
||
"bad_sent = bad_sent[bad_sent > 2]\n",
|
||
"\n",
|
||
"# print how many jokes have more than 1 sentence relative to the total number of jokes\n",
|
||
"print('good jokes with more than 1 sentence:', good_sent.shape[0], 'total:', df_good.shape[0])\n",
|
||
"print('bad jokes with more than 1 sentence:', bad_sent.shape[0], 'total:', df_bad.shape[0])\n",
|
||
"\n",
|
||
"plt.hist(good_sent, bins=20, alpha=0.5, label='good jokes')\n",
|
||
"plt.hist(bad_sent, bins=20, alpha=0.5, label='bad jokes')\n",
|
||
"plt.xlabel('Number of sentences')\n",
|
||
"plt.ylabel('Number of jokes')\n",
|
||
"plt.legend()\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Find Patterns\n",
|
||
"Use the `tok_title` and `tok_body` columns to detect the joke patterns below."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Oneliner"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Questions -> Answer"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Wordplay "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Dialog\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Knock-Knock Jokes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.4"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|