554 lines
270 KiB
Plaintext
554 lines
270 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Tokenization and Normalization"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 125,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import json\n",
|
|||
|
"import os\n",
|
|||
|
"from collections import Counter\n",
|
|||
|
"\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import nltk\n",
|
|||
|
"\n",
|
|||
|
"from nltk.corpus import stopwords\n",
|
|||
|
"from nltk.tokenize import word_tokenize\n",
|
|||
|
"from nltk.stem import WordNetLemmatizer\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 126,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[nltk_data] Downloading package punkt to\n",
|
|||
|
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
|||
|
"[nltk_data] Package punkt is already up-to-date!\n",
|
|||
|
"[nltk_data] Downloading package stopwords to\n",
|
|||
|
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
|||
|
"[nltk_data] Package stopwords is already up-to-date!\n",
|
|||
|
"[nltk_data] Downloading package wordnet to\n",
|
|||
|
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
|
|||
|
"[nltk_data] Package wordnet is already up-to-date!\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"True"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 126,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# nltk count words\n",
|
|||
|
"nltk.download('punkt')\n",
|
|||
|
"nltk.download('stopwords')\n",
|
|||
|
"nltk.download('wordnet')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 127,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Load the data\n",
|
|||
|
"# Load the data from the JSON file\n",
|
|||
|
"data_path = './data/reddit_jokes.json'\n",
|
|||
|
"with open(data_path) as f:\n",
|
|||
|
" data = json.load(f)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 128,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>body</th>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th>score</th>\n",
|
|||
|
" <th>title</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>Now I have to say \"Leroy can you please paint ...</td>\n",
|
|||
|
" <td>5tz52q</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>I hate how you cant even say black paint anymore</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>Pizza doesn't scream when you put it in the ov...</td>\n",
|
|||
|
" <td>5tz4dd</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>What's the difference between a Jew in Nazi Ge...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>...and being there really helped me learn abou...</td>\n",
|
|||
|
" <td>5tz319</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>I recently went to America....</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>A Sunday school teacher is concerned that his ...</td>\n",
|
|||
|
" <td>5tz2wj</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>Brian raises his hand and says, “He’s in Heaven.”</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>He got caught trying to sell the two books to ...</td>\n",
|
|||
|
" <td>5tz1pc</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>You hear about the University book store worke...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" body id score \\\n",
|
|||
|
"0 Now I have to say \"Leroy can you please paint ... 5tz52q 1 \n",
|
|||
|
"1 Pizza doesn't scream when you put it in the ov... 5tz4dd 0 \n",
|
|||
|
"2 ...and being there really helped me learn abou... 5tz319 0 \n",
|
|||
|
"3 A Sunday school teacher is concerned that his ... 5tz2wj 1 \n",
|
|||
|
"4 He got caught trying to sell the two books to ... 5tz1pc 0 \n",
|
|||
|
"\n",
|
|||
|
" title \n",
|
|||
|
"0 I hate how you cant even say black paint anymore \n",
|
|||
|
"1 What's the difference between a Jew in Nazi Ge... \n",
|
|||
|
"2 I recently went to America.... \n",
|
|||
|
"3 Brian raises his hand and says, “He’s in Heaven.” \n",
|
|||
|
"4 You hear about the University book store worke... "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 128,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# create pandas dataframe of the data\n",
|
|||
|
"df = pd.DataFrame(data)\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 129,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# NOTE: bit more than 1000 jokes for removing duplicates and empty jokes\n",
|
|||
|
"num_good = 1033\n",
|
|||
|
"num_bad = 1019\n",
|
|||
|
"min_score = 50\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"df_sroted = df.sort_values(by='score', ascending=False)\n",
|
|||
|
"df_sroted = df_sroted.reset_index(drop=True)\n",
|
|||
|
"\n",
|
|||
|
"df_good = df_sroted.head(num_good)\n",
|
|||
|
"df_good = df_good.reset_index(drop=True)\n",
|
|||
|
"\n",
|
|||
|
"df_bad = df_sroted[df_sroted['score'] > min_score].tail(num_bad)\n",
|
|||
|
"df_bad = df_bad.reset_index(drop=True)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 130,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"avg score good jokes: 10802.19 shape: (1033, 4)\n",
|
|||
|
"avg score bad jokes: 52.3 shape: (1019, 4)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# avg score and shape\n",
|
|||
|
"print('avg score good jokes:', df_good['score'].mean().round(2), 'shape:', df_good.shape)\n",
|
|||
|
"print('avg score bad jokes:', df_bad['score'].mean().round(2), 'shape:', df_bad.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 131,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"####################################################################################################\n",
|
|||
|
"5 random good jokes:\n",
|
|||
|
"####################################################################################################\n",
|
|||
|
"George R R Martin, dead after reaching peak popularity\n",
|
|||
|
"[Removed]\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"Royal Wedding\n",
|
|||
|
"My dad was holding me from behind .\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"I want to be a millionaire just like my dad!!\n",
|
|||
|
"...The King of England and The King of Spain are having an argument over who has the biggest penis. Eventually they decide to let the people judge.\n",
|
|||
|
"They all stand on a stage in front of the people and drop their pants one by one.\n",
|
|||
|
"\n",
|
|||
|
"The king of France drops his and the French crowd shout \"viva la france!!\"\n",
|
|||
|
"\n",
|
|||
|
"The king of Spain drops his and the Spanish crowd shout \"Viva la españa!!\"\n",
|
|||
|
"\n",
|
|||
|
"The king of England drops his, a long silence from the crowd, and then everybody shouts \"God save the Queen!!!\"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"Edit: I posted this whilst high, The title of the post is part of the joke the ellipses symbolise this \n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"A women is cooking eggs in the kitchen when her husband comes running in…\n",
|
|||
|
"You can hide, but you can't run.\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"You know that tingly sensation you get when you like somebody?\n",
|
|||
|
"...and sees a gorgeous woman nursing a drink. Walking up behind her he says: \"Hi there, good lookin'. How's it going?\"\n",
|
|||
|
"Having already downed a few power drinks, she turns around, faces him, looks him straight in the eye and says: \"Listen up, buddy. I screw anybody, anytime, anywhere, your place, my place, in the car, front door,back door, on the ground, standing up, sitting down, naked or with clothes on,dirty, clean... It just doesn't matter to me. I've been doing it ever since I got out of college and I just flat-ass love it.\"\n",
|
|||
|
"Eyes now wide with interest, he responds: \"No kidding. I'm a lawyer too. What firm are you with?\"\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# print random 5 good jokes\n",
|
|||
|
"print('#' * 100)\n",
|
|||
|
"print('5 random good jokes:')\n",
|
|||
|
"print('#' * 100)\n",
|
|||
|
"for i in range(5):\n",
|
|||
|
" print(df_good['title'][np.random.randint(0, num_good)])\n",
|
|||
|
" print(df_good['body'][np.random.randint(0, num_good)])\n",
|
|||
|
" print('-' * 100)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 132,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"####################################################################################################\n",
|
|||
|
"5 random bad jokes:\n",
|
|||
|
"####################################################################################################\n",
|
|||
|
"Saying Goodbye to Mother\n",
|
|||
|
"There was a farmer who had a brown cow and a white cow and he wanted to get them bred, so borrowed his neighbor's bull and turned it loose in the pasture. He told his son to watch and come in and tell him when the bull was finished.\n",
|
|||
|
"\n",
|
|||
|
"After a while the boy came into the living where his father was talking with some friends. \"Say, Pop,\" said the boy. \"Yes,\" replied his father.\n",
|
|||
|
"\n",
|
|||
|
"\"The bull just fucked the brown cow.\"\n",
|
|||
|
"\n",
|
|||
|
"There was a sudden lull in the conversation. The father said \"Excuse me\" and took his son outside. \"Son, you mustn't use language like that in front of company. You should say 'The bull surprised the brown cow'. Now go and watch and tell me when the bull surprises the white cow.\"\n",
|
|||
|
"\n",
|
|||
|
"The father went back inside the house. After a while the boy came in and said, \"Hey, Daddy.\"\n",
|
|||
|
"\n",
|
|||
|
"\"Yes, son. Did the bull surprise the white cow?\"\n",
|
|||
|
"\n",
|
|||
|
"\"He sure did, Pop! He fucked the brown cow again!\"\n",
|
|||
|
"\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"I organized a threesome this past weekend.\n",
|
|||
|
"But now I am a well-rounded person, so it worked out pretty well.\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"A guy gets his bike stolen from synagogue...\n",
|
|||
|
"Eclipse it!\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"Why are Hispanic dwarves called Paragraphs?\n",
|
|||
|
"Seriously. That other guy hasn't answered yet, and I'm dying to find out!\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"What do a burnt pizza, a frozen beer, and a pregnant woman have in common?\n",
|
|||
|
"They steal all the green cards.\n",
|
|||
|
"\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# print random 5 bad jokes\n",
|
|||
|
"print('#' * 100)\n",
|
|||
|
"print('5 random bad jokes:')\n",
|
|||
|
"print('#' * 100)\n",
|
|||
|
"for i in range(5):\n",
|
|||
|
" print(df_bad['title'][np.random.randint(0, num_bad)])\n",
|
|||
|
" print(df_bad['body'][np.random.randint(0, num_bad)])\n",
|
|||
|
" print('-' * 100)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 133,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"shape after tokenizing: (1033, 7) (1019, 7)\n",
|
|||
|
"shape after removing empty jokes: (1033, 7) (1019, 7)\n",
|
|||
|
"shape after removing duplicates: (1002, 7) (1000, 7)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"bool_lemma = True\n",
|
|||
|
"\n",
|
|||
|
"# tokenize the jokes\n",
|
|||
|
"stop_words = set(stopwords.words('english'))\n",
|
|||
|
"\n",
|
|||
|
"lemmatizer = WordNetLemmatizer()\n",
|
|||
|
"\n",
|
|||
|
"def tokenize_and_lemmatize_joke(joke):\n",
|
|||
|
" tokens = word_tokenize(joke)\n",
|
|||
|
" tokens = [word.lower() for word in tokens if word.isalpha()]\n",
|
|||
|
" tokens = [word for word in tokens if word not in stop_words]\n",
|
|||
|
" if bool_lemma:\n",
|
|||
|
" tokens = [lemmatizer.lemmatize(word) for word in tokens]\n",
|
|||
|
" return tokens\n",
|
|||
|
"\n",
|
|||
|
"# Tokenize and lemmatize the jokes\n",
|
|||
|
"df_good['tok_body'] = df_good['body'].apply(tokenize_and_lemmatize_joke)\n",
|
|||
|
"df_bad['tok_body'] = df_bad['body'].apply(tokenize_and_lemmatize_joke)\n",
|
|||
|
"\n",
|
|||
|
"df_good['tok_title'] = df_good['title'].apply(tokenize_and_lemmatize_joke)\n",
|
|||
|
"df_bad['tok_title'] = df_bad['title'].apply(tokenize_and_lemmatize_joke)\n",
|
|||
|
"\n",
|
|||
|
"df_good['tokens'] = df_good['tok_body'] + df_good['tok_title']\n",
|
|||
|
"df_bad['tokens'] = df_bad['tok_body'] + df_bad['tok_title']\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"print('shape after tokenizing:', df_good.shape, df_bad.shape)\n",
|
|||
|
"# remove empty jokes\n",
|
|||
|
"df_good = df_good[df_good['tokens'].map(len) > 0]\n",
|
|||
|
"df_bad = df_bad[df_bad['tokens'].map(len) > 0]\n",
|
|||
|
"print('shape after removing empty jokes:', df_good.shape, df_bad.shape)\n",
|
|||
|
"\n",
|
|||
|
"# remove duplicates\n",
|
|||
|
"df_good = df_good.drop_duplicates(subset='body')\n",
|
|||
|
"df_bad = df_bad.drop_duplicates(subset='body')\n",
|
|||
|
"print('shape after removing duplicates:', df_good.shape, df_bad.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 134,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"good words: Counter({'say': 523, 'man': 401, 'said': 393, 'one': 309, 'get': 252, 'go': 238, 'woman': 203, 'wife': 188, 'guy': 187, 'like': 169, 'asked': 164, 'back': 164, 'know': 163, 'would': 162, 'asks': 162, 'day': 160, 'time': 153, 'girl': 147, 'husband': 142, 'well': 141, 'two': 138, 'edit': 138, 'want': 137, 'see': 134, 'old': 129, 'think': 127, 'joke': 126, 'take': 125, 'could': 124, 'look': 122, 'first': 119, 'year': 118, 'give': 116, 'told': 112, 'got': 110, 'come': 109, 'people': 108, 'little': 105, 'went': 104, 'boy': 104, 'going': 99, 'next': 98, 'tell': 98, 'father': 98, 'sex': 97, 'walk': 96, 'good': 93, 'bar': 93, 'son': 92, 'make': 90, 'friend': 84, 'replied': 82, 'dad': 82, 'hand': 81, 'find': 80, 'reply': 79, 'ca': 78, 'way': 78, 'never': 76, 'really': 75, 'minute': 75, 'home': 71, 'right': 71, 'around': 71, 'doctor': 69, 'car': 68, 'put': 67, 'still': 66, 'night': 66, 'later': 66, 'blonde': 65, 'black': 65, 'priest': 64, 'god': 63, 'much': 63, 'new': 63, 'house': 63, 'let': 62, 'front': 62, 'door': 62, 'lady': 61, 'girlfriend': 60, 'oh': 60, 'second': 60, 'every': 59, 'last': 59, 'thing': 58, 'dollar': 58, 'sure': 58, 'drink': 57, 'came': 57, 'eye': 57, 'room': 57, 'thought': 56, 'kid': 56, 'yes': 56, 'three': 55, 'ever': 54, 'head': 54, 'bartender': 54, 'nothing': 53, 'call': 53, 'today': 52, 'word': 51, 'finally': 51, 'mother': 51, 'took': 51, 'thanks': 50, 'turn': 50, 'hell': 50, 'job': 49, 'men': 49, 'ask': 49, 'another': 48, 'even': 48, 'couple': 48, 'officer': 48, 'line': 48, 'teacher': 47, 'bed': 46, 'started': 46, 'heard': 46, 'difference': 46, 'please': 45, 'left': 45, 'great': 45, 'sitting': 45, 'work': 44, 'many': 44, 'everyone': 43, 'start': 43, 'life': 43, 'best': 43, 'love': 42, 'trump': 41, 'need': 41, 'sir': 41, 'must': 41, 'found': 40, 'long': 40, 'show': 40, 'answer': 40, 'satan': 40, 'better': 39, 'try': 39, 'mean': 39, 'open': 39, 'big': 38, 'young': 38, 'wow': 38, 'driver': 38, 'lawyer': 38, 'looked': 37, 'gold': 37, 
'help': 37, 'question': 37, 'mom': 36, 'bet': 36, 'anything': 36, 'president': 35, 'wrong': 35, 'world': 35, 'getting': 35, 'enough': 35, 'week': 35, 'use': 35, 'money': 35, 'change': 35, 'arm': 34, 'away': 34, 'pull': 34, 'morning': 34, 'married': 34, 'bit': 34, 'also': 33, 'saying': 33, 'child': 33, 'white': 33, 'nice': 33, 'wanted': 33, 'face': 33, 'always': 33, 'u': 33, 'beer': 33, 'sorry': 32, 'order': 32, 'ok': 32, 'keep': 32, 'page': 32, 'fuck': 32, 'called': 32, 'owner': 32, 'fucking': 32, 'made': 31, 'without': 31, 'thank': 31, 'immediately': 31, 'hot': 31, 'happy': 31, 'na': 31, 'shit': 31, 'name': 31, 'grandpa': 31, 'place': 30, 'since': 30, 'free': 30, 'student': 30, 'behind': 30, 'stop': 30, 'something': 30, 'talk': 30, 'end': 30, 'pulled': 29, 'step': 29, 'school': 29, 'water': 29, 'bedroom': 29, 'lot': 29, 'table': 29, 'responds': 29, 'wo': 28, 'jew': 28, 'happened': 28, 'police': 28, 'buy': 28, 'check': 28, 'light': 28, 'looking': 28, 'someone': 28, 'run': 28, 'engineer': 28, 'cop': 28, 'bill': 27, 'problem': 27, 'smile': 27, 'saw': 27, 'turned': 27, 'leaf': 27, 'third': 27, 'penis': 27, 'beggar': 27, 'class': 27, 'wall': 26, 'person': 26, 'tried': 26, 'daughter': 26, 'family': 26, 'leg': 26, 'sound': 26, 'pretty': 26, 'beautiful': 26, 'baby': 26, 'dinner': 26, 'trying': 26, 'hey': 26, 'five': 26, 'ten': 25, 'course': 25, 'bad': 25, 'together': 25, 'hole': 25, 'feel': 25, 'suit': 25, 'devil': 25, 'dead': 25, 'tree': 25, 'drug': 25, 'hear': 24, 'seen': 24, 'month': 24, 'watch': 24, 'gay': 24, 'kind': 24, 'parent': 24, 'post': 24, 'floor': 24, 'ground': 24, 'john': 24, 'ran': 23, 'glass': 23, 'hit': 23, 'irishman': 23, 'office': 23, 'frog': 23, 'muslim': 23, 'gate': 22, 'pay': 22, 'everything': 22, 'laugh': 22, 'may': 22, 'drinking': 22, 'decides': 22, 'air': 22, 'wish': 22, 'fell': 22, 'yeah': 22, 'dick': 22, 'peter': 22, 'okay': 22, 'coming': 22, 'nbsp': 22, 'donald': 21, 'sleep': 21, 'body': 21, 'hour': 21, 'point': 21, 'side': 21, 'decided': 21, 
'number': 21, 'done': 21, 'suddenly': 21, 'knock': 21, 'naked': 21, 'standing': 21, 'leave': 21, '
|
|||
|
"bad words: Counter({'say': 557, 'man': 430, 'one': 295, 'guy': 233, 'go': 232, 'get': 222, 'said': 218, 'woman': 178, 'day': 175, 'asks': 170, 'like': 155, 'wife': 148, 'back': 136, 'know': 135, 'walk': 131, 'first': 130, 'want': 124, 'well': 120, 'see': 119, 'two': 115, 'time': 115, 'come': 114, 'look': 113, 'would': 113, 'little': 110, 'take': 109, 'tell': 103, 'reply': 101, 'give': 101, 'husband': 101, 'next': 96, 'got': 96, 'bar': 90, 'door': 90, 'year': 88, 'could': 84, 'life': 82, 'new': 81, 'asked': 81, 'girl': 79, 'think': 79, 'going': 79, 'call': 79, 'son': 78, 'second': 77, 'three': 77, 'make': 75, 'last': 74, 'boy': 74, 'went': 74, 'home': 73, 'around': 72, 'doctor': 72, 'old': 70, 'ca': 68, 'officer': 68, 'good': 67, 'joke': 66, 'night': 64, 'friend': 64, 'car': 63, 'hand': 62, 'later': 62, 'put': 61, 'house': 60, 'dog': 59, 'right': 58, 'oh': 58, 'sex': 56, 'nun': 56, 'every': 56, 'another': 55, 'head': 55, 'blonde': 55, 'people': 54, 'room': 54, 'still': 53, 'dad': 53, 'never': 53, 'really': 53, 'let': 53, 'lawyer': 53, 'way': 51, 'thing': 50, 'find': 50, 'black': 49, 'start': 48, 'told': 48, 'turn': 47, 'ever': 46, 'came': 46, 'drink': 45, 'replied': 45, 'front': 45, 'stop': 44, 'many': 44, 'buy': 44, 'eye': 43, 'sitting': 43, 'bartender': 43, 'cat': 42, 'bed': 42, 'teacher': 42, 'god': 41, 'decided': 41, 'driver': 40, 'long': 40, 'yes': 40, 'young': 40, 'mother': 39, 'away': 39, 'open': 39, 'light': 39, 'something': 39, 'sure': 38, 'thought': 38, 'love': 38, 'lady': 38, 'week': 38, 'difference': 37, 'father': 37, 'big': 37, 'money': 37, 'couple': 37, 'horse': 37, 'job': 36, 'hear': 36, 'bill': 36, 'hour': 36, 'finally': 36, 'white': 36, 'cop': 36, 'great': 35, 'sorry': 35, 'best': 35, 'work': 35, 'need': 35, 'even': 35, 'chicken': 35, 'dollar': 35, 'took': 35, 'farmer': 35, 'mean': 34, 'ask': 34, 'johnny': 34, 'name': 34, 'much': 34, 'heard': 34, 'morning': 33, 'problem': 33, 'must': 33, 'baby': 33, 'made': 33, 'cow': 33, 'pull': 32, 'walking': 32, 
'found': 32, 'shit': 32, 'left': 32, 'milk': 32, 'minute': 32, 'try': 32, 'saw': 32, 'kid': 31, 'always': 31, 'knock': 31, 'school': 31, 'as': 30, 'fuck': 30, 'third': 30, 'small': 30, 'face': 30, 'show': 29, 'please': 29, 'men': 29, 'looking': 29, 'part': 29, 'sign': 29, 'run': 28, 'wrong': 28, 'wish': 28, 'shot': 28, 'gave': 28, 'child': 28, 'lost': 27, 'beautiful': 27, 'also': 27, 'question': 27, 'order': 26, 'pant': 26, 'started': 26, 'dinner': 26, 'sir': 26, 'table': 26, 'tax': 26, 'notice': 25, 'suddenly': 25, 'jump': 25, 'dead': 25, 'foot': 25, 'help': 25, 'family': 25, 'change': 25, 'without': 25, 'party': 25, 'local': 25, 'bos': 25, 'answer': 24, 'keep': 24, 'nothing': 24, 'mom': 24, 'play': 24, 'hell': 24, 'getting': 24, 'priest': 24, 'responds': 23, 'hard': 23, 'ok': 23, 'dick': 23, 'seen': 23, 'bus': 23, 'turned': 23, 'stand': 23, 'world': 23, 'owner': 23, 'store': 23, 'cowboy': 23, 'side': 22, 'bit': 22, 'behind': 22, 'mexican': 22, 'drive': 22, 'hey': 22, 'better': 22, 'looked': 22, 'penis': 22, 'window': 22, 'word': 22, 'thinking': 22, 'president': 22, 'wood': 22, 'sound': 22, 'everyone': 22, 'free': 22, 'full': 22, 'use': 21, 'number': 21, 'leaf': 21, 'chinese': 21, 'poor': 21, 'moment': 21, 'watch': 21, 'voice': 21, 'read': 21, 'point': 21, 'today': 21, 'sits': 21, 'na': 21, 'chief': 21, 'leg': 20, 'called': 20, 'driving': 20, 'honey': 20, 'kill': 20, 'paddy': 20, 'died': 20, 'glass': 20, 'leave': 20, 'inside': 20, 'step': 20, 'student': 20, 'gay': 20, 'bad': 20, 'brother': 20, 'middle': 19, 'wearing': 19, 'road': 19, 'fucking': 19, 'police': 19, 'decides': 19, 'pretty': 19, 'wear': 19, 'fly': 19, 'food': 19, 'saying': 19, 'course': 19, 'hole': 19, 'across': 19, 'screw': 19, 'used': 19, 'monk': 19, 'eat': 19, 'class': 19, 'cry': 18, 'month': 18, 'wanted': 18, 'water': 18, 'since': 18, 'mind': 18, 'jesus': 18, 'rabbit': 18, 'wait': 18, 'catch': 18, 'end': 18, 'peter': 18, 'enough': 18, 'american': 18, 'pig': 18, 'ten': 18, 'large': 18, 'dave': 18, 
'fire': 17, 'quite': 17, 'done': 17, 'daddy': 17, 'six': 17, 'shocked': 17, 'shall': 17, 's
|
|||
|
"avg relative frequency difference: 0.04720864704418073\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# count the words\n",
|
|||
|
"good_words = Counter()\n",
|
|||
|
"for tokens in df_good['tokens']:\n",
|
|||
|
" good_words.update(tokens)\n",
|
|||
|
"\n",
|
|||
|
"bad_words = Counter()\n",
|
|||
|
"for tokens in df_bad['tokens']:\n",
|
|||
|
" bad_words.update(tokens)\n",
|
|||
|
"\n",
|
|||
|
"print('good words:', good_words)\n",
|
|||
|
"print('bad words:', bad_words)\n",
|
|||
|
"\n",
|
|||
|
"# if words in both good and bad jokes\n",
|
|||
|
"common_words = good_words.keys() & bad_words.keys()\n",
|
|||
|
"# plot counter of common words for good and bad jokes\n",
|
|||
|
"common_words_good = {word: good_words[word] for word in common_words}\n",
|
|||
|
"common_words_bad = {word: bad_words[word] for word in common_words}\n",
|
|||
|
"# realtive how often word is used in good and bad jokes\n",
|
|||
|
"common_words_good_rel = {word: good_words[word] / (good_words[word] + bad_words[word]) for word in common_words}\n",
|
|||
|
"common_words_bad_rel = {word: bad_words[word] / (good_words[word] + bad_words[word]) for word in common_words}\n",
|
|||
|
"# avg relative frequency difference between good and bad jokes\n",
|
|||
|
"avg_rel_diff = np.mean([common_words_good_rel[word] - common_words_bad_rel[word] for word in common_words])\n",
|
|||
|
"print('avg relative frequency difference:', avg_rel_diff)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 135,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAABRmElEQVR4nO3deVRU9f8/8OewiuAMgsJIsrgLirvCpKYlioqmHymXSDS3jwYqS6Z8crcETcUl07QC+7TY6pILinsZKqIkKiLmAp8UMBFQTGR5//7w5/02ocbADIOX5+Oce47zfr/n3td944HnuatCCCFAREREJFMmxi6AiIiIyJAYdoiIiEjWGHaIiIhI1hh2iIiISNYYdoiIiEjWGHaIiIhI1hh2iIiISNYYdoiIiEjWGHaIiIhI1hh2iKjG6927N9q2bVst23r//ffRtGlTmJqaokOHDk8cN3bsWLi5uVVqGwqFAsHBwZUrkIh0xrBDVEvExsZCoVDg5MmTxi7lsa5fv4758+cjOTnZaDXs3bsXb7/9Nrp3746YmBgsXrzYaLUQkf6YGbsAIiLgYdhZsGAB3NzcnnpExZAOHDgAExMTfPLJJ7CwsHjq2I0bN6KsrKyaKiOiqmDYISL6/3JycmBlZfWPQQcAzM3Nq6EiItIHnsYiIi2///47xo0bB0dHR1haWqJNmzb49NNPtcYcOnQICoUC33zzDd577z00btwYderUQZ8+fXDp0qVy61y7di2aNm0KKysrdOvWDT/99BN69+6N3r17S+vr2rUrAOCNN96AQqGAQqFAbGys1nrOnz+PF198EXXr1sVzzz2HpUuXVmifSkpKsGjRIjRr1gyWlpZwc3PDf/7zHxQVFUljFAoFYmJiUFhY+MTt/9XjrtkpLCxEeHg4nJ2dYWlpiVatWmHZsmUQQvxjje+++y5MTEywZs0aqW337t3o2bMnrK2tUa9ePfj5+eHcuXNa38vKysIbb7yBxo0bw9LSEo0aNcKQIUNw9erVCs0NUW3AIztEJMnOzoa3t7d0AW3Dhg2xe/dujB8/HgUFBQgJCdEaHxUVBRMTE7z11lvIz8/H0qVLERAQgOPHj0tj1q1bh+DgYPTs2ROhoaG4evUqhg4divr166Nx48YAAHd3dyxcuBBz587FpEmT0LNnTwDA888/L63n9u3b6N+/P4YNG4bhw4fju+++w8yZM+Hp6YkBAwY8db8mTJiATZs24ZVXXkF4eDiOHz+OyMhIpKamYsuWLQCA//73v9iwYQNOnDiBjz/+uNz2/4kQAi+//DIOHjyI8ePHo0OHDtizZw9mzJiB33//HdHR0U/87uzZs7F48WJ89NFHmDhxolTPmDFj4OvriyVLluDevXtYt24devTogdOnT0tBy9/fH+fOncPUqVPh5uaGnJwcxMfHIyMjo9IXUBPJjiCiWiEmJkYAEImJiU8cM378eNGoUSPxxx9/aLWPHDlSqFQqce/ePSGEEAcPHhQAhLu7uygqKpLGrVq1SgAQKSkpQgghioqKhL29vejatasoLi6WxsXGxgoAolevXlJbYmKiACBiYmLK1dWrVy8BQHz22WdSW1FRkVCr1cLf3/+p+52cnCwAiAkTJmi1v/XWWwKAOHDggNQ2ZswYYW1t/dT1/XWsq6ur9Hnr1q0CgHj33Xe1xr3yyitCoVCIS5cuSW0ARFBQkBBCiPDwcGFiYiJiY2Ol/jt37ghbW1sxceJErXVlZWUJlUoltd++fVsAEO+//36FaiaqrXgai4gAPDwy8f3332Pw4MEQQuCPP/6QFl9fX+Tn5+PUqVNa33njjTe0rm95dETm8uXLAICTJ0/i1q1bmDhxIszM/u9AckBAAOrXr69TfTY2Nnj99delzxYWFujWrZu0rSfZtWsXACAsLEyrPTw8HACwc+dOnep42nZMTU0xbdq0ctsRQmD37t1a7UIIBAcHY9WqVfj8888xZswYqS8+Ph55eXkYNW
qU1s/B1NQUXl5eOHjwIABI1xcdOnQIt2/f1st+EMkRT2MREQDg5s2byMvLw4YNG7Bhw4bHjsnJydH67OLiovX5UYB59If32rVrAIDmzZtrjTMzM9P5FEvjxo2hUCjKbe/MmTNP/d61a9dgYmJSrga1Wg1bW1upxqq6du0anJycUK9ePa12d3d3qf+vPvvsM9y9exfr1q3DqFGjtPrS09MBAC+99NJjt6VUKgEAlpaWWLJkCcLDw+Ho6Ahvb28MGjQIgYGBUKvVetkvIjlg2CEiAJBuo3799de1jjL8Vbt27bQ+m5qaPnacqMAFubqq6rb+HpSMrXv37khOTsYHH3yA4cOHw87OTup79LP473//+9jQ8tejZCEhIRg8eDC2bt2KPXv2YM6cOYiMjMSBAwfQsWNHw+8I0TOAYYeIAAANGzZEvXr1UFpaCh8fH72s09XVFQBw6dIlvPjii1J7SUkJrl69qhWeDBVGXF1dUVZWhvT0dOkoC/DwYuy8vDypRn1sZ9++fbhz547W0Z0LFy5I/X/VvHlzLF26FL1790b//v2xf/9+6XvNmjUDADg4OFToZ9GsWTOEh4cjPDwc6enp6NChA5YvX47PP/9cL/tG9KzjNTtEBODhkRN/f398//33OHv2bLn+mzdv6rzOLl26wN7eHhs3bkRJSYnU/sUXX5S7xsTa2hoAkJeXp/N2nmbgwIEAgJUrV2q1r1ixAgDg5+ent+2Ulpbigw8+0GqPjo6GQqF47B1j7dq1w65du5CamorBgwfjzz//BAD4+vpCqVRi8eLFKC4uLve9Rz+Le/fu4f79+1p9zZo1Q7169bRuqyeq7Xhkh6iW+fTTTxEXF1euffr06YiKisLBgwfh5eWFiRMnwsPDA7m5uTh16hT27duH3NxcnbZlYWGB+fPnY+rUqXjppZcwfPhwXL16FbGxsWjWrJnW0ZxmzZrB1tYW69evR7169WBtbQ0vLy80adKkSvvbvn17jBkzBhs2bEBeXh569eqFEydOYNOmTRg6dKjWEaeqGDx4MF588UW88847uHr1Ktq3b4+9e/di27ZtCAkJkY7W/J23tze2bduGgQMH4pVXXsHWrVuhVCqxbt06jB49Gp06dcLIkSPRsGFDZGRkYOfOnejevTs++OADXLx4EX369MHw4cPh4eEBMzMzbNmyBdnZ2Rg5cqRe9otIFox5KxgRVZ9Ht54/acnMzBRCCJGdnS2CgoKEs7OzMDc3F2q1WvTp00ds2LBBWtejW8+//fZbrW1cuXLlsbePr169Wri6ugpLS0vRrVs3cfToUdG5c2fRv39/rXHbtm0THh4ewszMTGs9vXr1Em3atCm3T3+//ftJiouLxYIFC0STJk2Eubm5cHZ2FhEREeL+/fvl1lfZW8+FeHjLeGhoqHBychLm5uaiRYsW4v333xdlZWVa4/CXW88f2bZtmzAzMxMjRowQpaWlQoiH8+zr6ytUKpWoU6eOaNasmRg7dqw4efKkEEKIP/74QwQFBYnWrVsLa2troVKphJeXl/jmm28qtA9EtYVCCANcSUhE9BRlZWVo2LAhhg0bho0bNxq7nEoZPXo0EhISHvvEaCKqWXjNDhEZ1P3798vdMfXZZ58hNzdXel3Es+jGjRto0KCBscsgogrgNTtEZFDHjh1DaGgoXn31Vdjb2+PUqVP45JNP0LZtW7z66qvGLk9nZ86cwdatW3HkyBHMmDHD2OUQUQUw7BCRQbm5ucHZ2RmrV69Gbm4u7OzsEBgYiKioqAq9Xbym+eGHH7BmzRqMHDkSERERxi6HiCqA1+wQERGRrPGaHSIiIpI1hh0iIiKSNV6zg4e3wV6/fh316tWrce/PISIioscTQuDOnTtwcnKCicmTj98w7AC4fv06nJ2djV0GERERVUJmZiYaN278xH6GHUB6+V5mZiaUSqWRqyEiIqKKKCgogLOzs9bLdx+HYQf/97ZlpVLJsENERPSM+adLUHiBMhEREckaww
4RERHJGsMOERERyZpRr9lxc3PDtWvXyrW/+eabWLt2Le7fv4/w8HBs3rwZRUVF8PX1xYcffghHR0dpbEZGBqZMmYK
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# count length of jokes\n",
|
|||
|
"df_good['length'] = df_good['tokens'].apply(len)\n",
|
|||
|
"df_bad['length'] = df_bad['tokens'].apply(len)\n",
|
|||
|
"\n",
|
|||
|
"# plot the length of jokes\n",
|
|||
|
"plt.hist(df_good['length'], bins=20, alpha=0.5, label='good jokes')\n",
|
|||
|
"plt.hist(df_bad['length'], bins=20, alpha=0.5, label='bad jokes')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"plt.xlabel('Number of words')\n",
|
|||
|
"plt.ylabel('Number of jokes')\n",
|
|||
|
"plt.title('Length of jokes')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 136,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHHCAYAAACiOWx7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAA9hAAAPYQGoP6dpAADGz0lEQVR4nOzdeXhV1dX48e85585j5gEI84w4oQi1LVqpsWJbq7W2tQqobfXFAWirdnJ8W21/bdXWoTPYvrZWOztLcagiKoIoICBzgIxkuPN0ht8fl3tNIEASAklgfZ4nT829O+fse5Mmi73XXkuxLMtCCCGEEEIclNrXExBCCCGEGAgkaBJCCCGE6AIJmoQQQgghukCCJiGEEEKILpCgSQghhBCiCyRoEkIIIYToAgmahBBCCCG6QIImIYQQQogukKBJCCGEEKILJGgSQnTbWWedxQknnNDX0zikxYsXoygK27dv79N5KIrC7bff3qdzOJRoNMrVV19NRUUFiqIwf/78A44dPnw4c+bM6fY9Xn75ZRRF4a9//WvPJypEH5KgSYg+kvuD/vbbb/f1VDpVW1vL7bffzurVq/t6KgPCM8880+8Do4P54Q9/yOLFi7n22mv54x//yOWXX97XUxKi37H19QSEEP1TbW0td9xxB8OHD+fkk0/u6+n0e8888wwPPvhgp4FTIpHAZuvfv25ffPFFpk2bxm233XbIsRs3bkRV5d/c4vgjP/VCCNGJWCzWa9dyuVz9PmhqbGykoKCgS2OdTid2u/3ITkiIfkiCJiH6ud27d3PllVdSXl6O0+lk0qRJ/P73v+8wJpcr8vjjj/ODH/yAIUOG4HK5OOecc9i8efN+13zwwQcZOXIkbrebqVOn8uqrr3LWWWdx1lln5a93+umnAzB37lwURUFRFBYvXtzhOu+//z5nn302Ho+HwYMH8+Mf/7hLr2nRokV84hOfoKysDKfTycSJE3n44Yf3Gzd8+HAuuOACXnvtNaZOnYrL5WLkyJH84Q9/2G/sunXr+MQnPoHb7WbIkCH87//+L6Zpdmk+c+bMwefzsWXLFs4//3z8fj+XXXYZAK+++iqXXHIJQ4cOxel0UlVVxYIFC0gkEh2+/sEHHwTIv1eKouSf3zen6fbbb0dRFDZv3sycOXMoKCggGAwyd+5c4vF4h7klEgluuOEGSkpK8Pv9fOYzn2H37t1dzpNqbGzkqquuory8HJfLxUknncQjjzySfz73s7Nt2zaefvrp/NwPlgfWWU7T1q1bueSSSygqKsLj8TBt2jSefvrpQ84vlUpxwQUXEAwGef311wEwTZP77ruPSZMm4XK5KC8v5+tf/zqtra0dvvbtt9+murqakpIS3G43I0aM4MorrzzkPYXoqf79Tx8hjnMNDQ1MmzYNRVG47rrrKC0t5dlnn+Wqq64iHA7vl6x7zz33oKoq3/zmNwmFQvz4xz/msssu480338yPefjhh7nuuuv42Mc+xoIFC9i+fTsXXnghhYWFDBkyBIAJEyZw5513cuutt/K1r32Nj33sYwB85CMfyV+ntbWV8847j4suuogvfOEL/PWvf+Xmm29m8uTJfOpTnzro63r44YeZNGkSn/nMZ7DZbDz55JP8z//8D6ZpMm/evA5jN2/ezOc//3muuuoqZs+eze9//3vmzJnDlClTmDRpEgD19fWcffbZ6LrOLbfcgtfr5de//jVut7vL77Wu61RXV/PRj36Un/zkJ3g8HgCeeOIJ4vE41157LcXFxbz11lv84he/YNeuXTzxxBMAfP3rX6e2tpYlS5bwxz/+scv3/MIXvsCIESO4++67WbVqFb/97W8pKyvjRz/6UX7MnDlzePzxx7n88suZNm0ar7zyCrNmzerS9ROJBGeddRabN2/muuuuY8SIETzxxBPMmTOHtrY2brzxRiZMmMAf//hHFixYwJAhQ/jGN74BQGlpaZdfR0NDAx/5yEeIx+PccMMNFBcX88
gjj/CZz3yGv/71r3zuc5874Pw++9nP8vbbb/Of//wnH6h//etfZ/HixcydO5cbbriBbdu28cADD/DOO++wbNky7HY7jY2NnHvuuZSWlnLLLbdQUFDA9u3b+fvf/97leQvRbZYQok8sWrTIAqwVK1YccMxVV11lVVZWWnv27Onw+Be/+EUrGAxa8XjcsizLeumllyzAmjBhgpVKpfLj7r//fguw1qxZY1mWZaVSKau4uNg6/fTTrUwmkx+3ePFiC7BmzJiRf2zFihUWYC1atGi/ec2YMcMCrD/84Q/5x1KplFVRUWFdfPHFh3ztuXm3V11dbY0cObLDY8OGDbMA67///W/+scbGRsvpdFrf+MY38o/Nnz/fAqw333yzw7hgMGgB1rZt2w46n9mzZ1uAdcstt3RprnfffbelKIq1Y8eO/GPz5s2zDvQrFbBuu+22/Oe33XabBVhXXnllh3Gf+9znrOLi4vznK1eutABr/vz5HcbNmTNnv2t25r777rMA6//+7//yj6XTaWv69OmWz+ezwuFw/vFhw4ZZs2bNOuj12o+dPXt2/vPc+//qq6/mH4tEItaIESOs4cOHW4ZhWJb14c/pE088YUUiEWvGjBlWSUmJ9c477+S/7tVXX7UA69FHH+1wz+eee67D4//4xz8O+f8fIXqbbM8J0U9ZlsXf/vY3Pv3pT2NZFnv27Ml/VFdXEwqFWLVqVYevmTt3Lg6HI/95boVo69atQHY7o7m5ma9+9asdcmwuu+wyCgsLuzU/n8/HV77ylfznDoeDqVOn5u91MO1XgEKhEHv27GHGjBls3bqVUCjUYezEiRPzrwOyKyDjxo3rcJ9nnnmGadOmMXXq1A7jcltsXXXttdcedK6xWIw9e/bwkY98BMuyeOedd7p1/X1dc801HT7/2Mc+RnNzM+FwGIDnnnsOgP/5n//pMO7666/v0vWfeeYZKioq+NKXvpR/zG63c8MNNxCNRnnllVcOZ/od7jN16lQ++tGP5h/z+Xx87WtfY/v27bz//vsdxodCIc4991w2bNjAyy+/3OGgwRNPPEEwGOSTn/xkh5/5KVOm4PP5eOmllwDy+VdPPfUUmUymV16HEIciQZMQ/VRTUxNtbW38+te/prS0tMPH3LlzgWy+SntDhw7t8HkuEMrlguzYsQOA0aNHdxhns9kYPnx4t+Y3ZMiQDnk7ufvtm3fSmWXLljFz5ky8Xi8FBQWUlpbyne98B2C/oGnf19TZfXbs2MGYMWP2Gzdu3LguvRbIvge57cn2ampqmDNnDkVFRfh8PkpLS5kxY0anc+2urny/VFVlxIgRHcbt+/07kNz7su9JtwkTJuSf7w07duzo9L0+0H3mz5/PihUr+M9//pPfYs3ZtGkToVCIsrKy/X7uo9Fo/md+xowZXHzxxdxxxx2UlJTw2c9+lkWLFpFKpXrlNQnRGclpEqKfyiUxf+UrX2H27NmdjjnxxBM7fK5pWqfjLMvq3ckdxr22bNnCOeecw/jx4/nZz35GVVUVDoeDZ555hnvvvXe/5O2j9ZqcTud+wYVhGHzyk5+kpaWFm2++mfHjx+P1etm9ezdz5szpcqL5gRzN71d/8tnPfpbHHnuMe+65hz/84Q8d3nfTNCkrK+PRRx/t9GtzuVa5IplvvPEGTz75JM8//zxXXnklP/3pT3njjTfw+XxH5bWI44sETUL0U6Wlpfj9fgzDYObMmb1yzWHDhgHZ5Oqzzz47/7iu62zfvr1DELbvKlJvefLJJ0mlUvz73//usNKS23bpiWHDhrFp06b9Ht+4cWOPrwmwZs0aPvjgAx555BGuuOKK/ONLlizZb+yReL+GDRuGaZps27atw0paZyciD/T17733HqZpdghMNmzYkH++t+bZ2Xt9oPtceOGFnHvuucyZMwe/39/h5OSoUaP4z3/+w5lnntmlRP5p06Yxbdo0fvCDH/CnP/2Jyy67jMcee4yrr776MF+VEPuT7Tkh+ilN07j44o
v529/+xtq1a/d7vqmpqdvXPO200yguLuY3v/kNuq7nH3/00Uf321bzer0AtLW1dfs+B5NbXWm/mhIKhVi0aFGPr3n
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Scatter plot of joke length against rating, one series per joke group\n",
|
|||
|
"fig, ax = plt.subplots()\n",
|
|||
|
"for frame, group_label in ((df_good, 'good jokes'), (df_bad, 'bad jokes')):\n",
|
|||
|
"    ax.scatter(frame['length'], frame['score'], alpha=0.5, label=group_label)\n",
|
|||
|
"ax.legend()\n",
|
|||
|
"ax.set_xlabel('Number of words')\n",
|
|||
|
"ax.set_ylabel('Rating')\n",
|
|||
|
"ax.set_title('Length and rating of jokes')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 137,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"####################################################################################################\n",
|
|||
|
"5 random good jokes:\n",
|
|||
|
"####################################################################################################\n",
|
|||
|
"['one', 'buy', 'going', 'kill', 'writing', 'book', 'called', 'overreacting']\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"['christian', 'priest', 'came', 'laid', 'hand', 'hand', 'said', 'jesus', 'christ', 'walk', 'today', 'smiled', 'told', 'paralysed', 'rabbi', 'came', 'laid', 'hand', 'hand', 'said', 'god', 'almighty', 'walk', 'today', 'le', 'amused', 'told', 'nothing', 'wrong', 'mullah', 'came', 'took', 'hand', 'said', 'insha', 'allah', 'walk', 'today', 'snapped', 'nothing', 'wrong', 'buddhist', 'monk', 'came', 'held', 'hand', 'said', 'great', 'buddha', 'walk', 'today', 'rudely', 'told', 'nothing', 'wrong', 'sermon', 'stepped', 'outside', 'found', 'car', 'stolen', 'edit', 'thanks', 'upvotes', 'inadvertently', 'upset', 'anyone', 'apologize', 'meant', 'joke', 'intended', 'give', 'u', 'chuckle', 'went', 'mixed', 'religion', 'seminar']\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"['attire', 'edit', 'aware', 'posted', 'recently', 'care', 'research', 'either', 'deal', 'difference', 'well', 'dressed', 'man', 'unicycle', 'poorly', 'dressed', 'man', 'bicycle']\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"['fell', 'forward', 'still', 'boat', 'scuba', 'diver', 'fall', 'backwards', 'boat']\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n",
|
|||
|
"['found', 'website', 'guaranteed', 'real', 'virgin', 'nsfw']\n",
|
|||
|
"----------------------------------------------------------------------------------------------------\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# print 5 random good jokes\n",
|
|||
|
"print('#' * 100)\n",
|
|||
|
"print('5 random good jokes:')\n",
|
|||
|
"print('#' * 100)\n",
|
|||
|
"# Draw rows by position with Series.sample instead of looking up random integers\n",
|
|||
|
"# as index labels: every row is eligible, none repeats, and gaps in the index\n",
|
|||
|
"# cannot raise KeyError. min() keeps the cell working with fewer than 5 jokes.\n",
|
|||
|
"for tokens in df_good['tokens'].sample(n=min(5, len(df_good))):\n",
|
|||
|
"    print(tokens)\n",
|
|||
|
"    print('-' * 100)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# save the data (currently disabled; uncomment the lines below to write the CSV files)\n",
|
|||
|
"#df_good.to_csv('./data/tok_good_jokes.csv', index=False)\n",
|
|||
|
"#df_bad.to_csv('./data/tok_bad_jokes.csv', index=False)"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.10.4"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|