ANLP_WS24_CA1/structual_pattern.ipynb

568 lines
41 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tokenization and Normalization"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"from collections import Counter\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import nltk\n",
"\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize, sent_tokenize\n",
"from nltk.stem import WordNetLemmatizer\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fetch the NLTK data packages: 'punkt' (used by sent_tokenize), 'stopwords'\n",
"# (used by stopwords.words) and 'wordnet' (for the imported WordNetLemmatizer).\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"# Load the jokes from the JSON file.\n",
"# The encoding is passed explicitly: without it open() falls back to the\n",
"# platform default (e.g. cp1252 on Windows), which can fail on non-ASCII text.\n",
"data_path = './data/reddit_jokes.json'\n",
"with open(data_path, encoding='utf-8') as f:\n",
"    data = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>body</th>\n",
" <th>id</th>\n",
" <th>score</th>\n",
" <th>title</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Now I have to say \"Leroy can you please paint ...</td>\n",
" <td>5tz52q</td>\n",
" <td>1</td>\n",
" <td>I hate how you cant even say black paint anymore</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Pizza doesn't scream when you put it in the ov...</td>\n",
" <td>5tz4dd</td>\n",
" <td>0</td>\n",
" <td>What's the difference between a Jew in Nazi Ge...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>...and being there really helped me learn abou...</td>\n",
" <td>5tz319</td>\n",
" <td>0</td>\n",
" <td>I recently went to America....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>A Sunday school teacher is concerned that his ...</td>\n",
" <td>5tz2wj</td>\n",
" <td>1</td>\n",
" <td>Brian raises his hand and says, “Hes in Heaven.”</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>He got caught trying to sell the two books to ...</td>\n",
" <td>5tz1pc</td>\n",
" <td>0</td>\n",
" <td>You hear about the University book store worke...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" body id score \\\n",
"0 Now I have to say \"Leroy can you please paint ... 5tz52q 1 \n",
"1 Pizza doesn't scream when you put it in the ov... 5tz4dd 0 \n",
"2 ...and being there really helped me learn abou... 5tz319 0 \n",
"3 A Sunday school teacher is concerned that his ... 5tz2wj 1 \n",
"4 He got caught trying to sell the two books to ... 5tz1pc 0 \n",
"\n",
" title \n",
"0 I hate how you cant even say black paint anymore \n",
"1 What's the difference between a Jew in Nazi Ge... \n",
"2 I recently went to America.... \n",
"3 Brian raises his hand and says, “Hes in Heaven.” \n",
"4 You hear about the University book store worke... "
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build a pandas DataFrame from the loaded JSON data and preview the first rows\n",
"# (the preview shows the columns body, id, score and title).\n",
"df = pd.DataFrame(data)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: the sample sizes are slightly above 1000 per class because duplicates\n",
"# and empty jokes are removed in a later step.\n",
"num_good = 1046\n",
"num_bad = 1033\n",
"min_score = 50\n",
"\n",
"# Rank all jokes from the highest to the lowest score.\n",
"df_sroted = df.sort_values(by='score', ascending=False).reset_index(drop=True)\n",
"\n",
"# Good jokes: the num_good highest-scored rows.\n",
"df_good = df_sroted.iloc[:num_good].reset_index(drop=True)\n",
"\n",
"# Bad jokes: the num_bad lowest-scored rows among those scoring above min_score.\n",
"df_bad = df_sroted.loc[df_sroted['score'] > min_score].tail(num_bad).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"avg score good jokes: 10731.98 shape: (1046, 4)\n",
"avg score bad jokes: 52.32 shape: (1033, 4)\n"
]
}
],
"source": [
"# Sanity check of the selection: mean score (rounded to 2 decimals) and (rows, columns) of each subset\n",
"print('avg score good jokes:', df_good['score'].mean().round(2), 'shape:', df_good.shape)\n",
"print('avg score bad jokes:', df_bad['score'].mean().round(2), 'shape:', df_bad.shape)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"####################################################################################################\n",
"5 random good jokes:\n",
"####################################################################################################\n",
"I told my wife we can have sex or go see Star Wars, she said, I'm on my period and Star Wars is sold out.\n",
"....what happens next will shock you.\"\n",
"----------------------------------------------------------------------------------------------------\n",
"Two men from Texas were sitting at a bar\n",
"\"No way. That's impossible!\" she said.\n",
"\n",
"\"Trust me,\" I said, \"I have no idea where our baby is.\"\n",
"----------------------------------------------------------------------------------------------------\n",
"My wife always accuses me of having a favorite child.\n",
"...hands down.\n",
"----------------------------------------------------------------------------------------------------\n",
"If i got 50 cents for every failed math exam,\n",
"Two fleas had an arrangement to meet every winter in Miami for a vacation. Last year, when one flea gets to Miami he is shivering and shaking.\n",
"\n",
"The other flea asked him, \"Why are shaking so badly?\"\n",
"\n",
"The first flea says, \"I rode down here from New Jersey in the moustache of a guy on a Harley.\"\n",
"\n",
"The other flea says, \"That's the worst way to travel. Do what I do. Go to the New Jersey airport bar. Have a few drinks. While there, look for a nice stewardess, crawl up her leg and nestle in where it's warm and cozy. It's the best way to travel that I can think of.\"\n",
"\n",
"The first flea thanks the second flea and says he will give it a try next winter. A year goes by... When the first flea shows up in Miami he shivering and shaking again.\n",
"\n",
"The second flea says, \"Didn't you try what I told you?\"\n",
"\n",
"\"Yes,\" says the first flea. \"I did exactly what you said. I went to the New Jersey airport bar. I had a few drinks. Finally, this nice young stewardess came in. I crawled right up to her warm cozy spot. It was so nice and warm that I fell asleep. When I woke up, I was back in the moustache of a guy on a Harley.\"\n",
"\n",
"\n",
" Edit: Front page already!!! Gr8 Guys.\n",
"----------------------------------------------------------------------------------------------------\n",
"I told god a Holocaust joke. He didn't laugh.\n",
"Whoops, wrong sub.\n",
"----------------------------------------------------------------------------------------------------\n"
]
}
],
"source": [
"# print random 5 good jokes\n",
"print('#' * 100)\n",
"print('5 random good jokes:')\n",
"print('#' * 100)\n",
"for i in range(5):\n",
"    # Draw ONE random row per iteration so that title and body belong to the same\n",
"    # joke; two independent draws paired titles with unrelated bodies.\n",
"    # Positional .iloc with len(df_good) stays valid even if rows were removed.\n",
"    joke = df_good.iloc[np.random.randint(0, len(df_good))]\n",
"    print(joke['title'])\n",
"    print(joke['body'])\n",
"    print('-' * 100)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"####################################################################################################\n",
"5 random bad jokes:\n",
"####################################################################################################\n",
"why dont they have drivers ed and sex ed on the same day in mexico?\n",
"As he was praying a black guy was walking nearby with groceries when he dropped his cheese wheel and it rolled to the Mexican. The Mexican grabbed it, praised god, and ran home. \n",
"\n",
"When he gets home he instructs his wife to make nachos with the cheese. \n",
"\n",
"\"Why nachos\" asks his wife \"we can make so many better meals with this cheese\"\n",
"\n",
"\"No\" said the Mexican \"god instructs me to make nachos.\"\n",
"\n",
"\"What do you mean\" asked the wife \n",
"\n",
"\"As I was praying God sent me the cheese wheel and as I was running home with it I heard him yelling That's Nacho cheese, that's nacho cheese!\"\n",
"----------------------------------------------------------------------------------------------------\n",
"2 Priests go to the beach...\n",
"a navel..\n",
"----------------------------------------------------------------------------------------------------\n",
"Two parrots were sitting on a perch...\n",
"Okay I'm donesn't born yesterday..\n",
"----------------------------------------------------------------------------------------------------\n",
"I recently had sex with a biologist at her laboratory.\n",
"MAN: Id like to buy some dog food.\n",
"CHECKOUT LADY: Do you have a dog?\n",
"MAN: Yes.\n",
"CHECKOUT LADY: Where is he?\n",
"MAN: Hes at home.\n",
"CHECKOUT LADY: Im sorry; I cant sell this dog food to you unless Isee the dog. Store policy.\n",
"The next day, the man returns.\n",
"MAN: Id like to buy some cat food.\n",
"CHECKOUT LADY: Do you have a cat?\n",
"MAN: Yes.\n",
"CHECKOUT LADY: Well...where is he?\n",
"MAN: Hes at home!\n",
"CHECKOUT LADY: Sorry, I cant sell this cat food to you unless I see\n",
"your cat.\n",
"The next day the man returns.\n",
"CHECKOUT LADY: Whats in the sack?\n",
"MAN: Put your hand inside.\n",
"CHECKOUT LADY: Hmmm... Its warm and moist! What is it?\n",
"MAN: I would like to buy some toilet paper.\n",
"----------------------------------------------------------------------------------------------------\n",
"Farmer Dan\n",
"He hit me until I was 21. \n",
"----------------------------------------------------------------------------------------------------\n"
]
}
],
"source": [
"# print random 5 bad jokes\n",
"print('#' * 100)\n",
"print('5 random bad jokes:')\n",
"print('#' * 100)\n",
"for i in range(5):\n",
"    # Draw ONE random row per iteration so that title and body belong to the same\n",
"    # joke; two independent draws paired titles with unrelated bodies.\n",
"    # Positional .iloc with len(df_bad) stays valid even if rows were removed.\n",
"    joke = df_bad.iloc[np.random.randint(0, len(df_bad))]\n",
"    print(joke['title'])\n",
"    print(joke['body'])\n",
"    print('-' * 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape after tokenizing: (1046, 6) (1033, 6)\n"
]
}
],
"source": [
"# Split the jokes into sentences\n",
"# English stopword list from NLTK, stored as a set for membership tests\n",
"stop_words = set(stopwords.words('english'))\n",
"\n",
"def sentinize(joke):\n",
" \"\"\"Split a joke text into a list of sentence strings with sent_tokenize.\n",
"\n",
" Sentences that are exactly equal to an entry of stop_words are dropped.\n",
" NOTE(review): the filter compares whole sentences (not single words) with\n",
" the stopword set, so it can only remove a sentence that consists of one\n",
" stopword. Confirm whether word-level filtering was intended.\n",
" \"\"\"\n",
" tokens = sent_tokenize(joke)\n",
" # each element of tokens is a sentence string, despite the loop variable name\n",
" tokens = [word for word in tokens if word not in stop_words]\n",
" return tokens\n",
"\n",
"# Split the joke bodies and titles into sentences (no lemmatization is applied here)\n",
"df_good['tok_body'] = df_good['body'].apply(sentinize)\n",
"df_bad['tok_body'] = df_bad['body'].apply(sentinize)\n",
"\n",
"df_good['tok_title'] = df_good['title'].apply(sentinize)\n",
"df_bad['tok_title'] = df_bad['title'].apply(sentinize)\n",
"\n",
"\n",
"\n",
"print('shape after tokenizing:', df_good.shape, df_bad.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def rem_null_dubl(df):\n",
"    \"\"\"Drop jokes with an empty title or body and jokes that occur more than once.\n",
"\n",
"    Args:\n",
"        df: DataFrame with list-valued columns 'tok_body' and 'tok_title'.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with the same columns; the input frame is not modified.\n",
"        Rows whose (tok_body, tok_title) pair occurs more than once are removed\n",
"        entirely (keep=False), so not even the first occurrence survives.\n",
"        Index labels of the remaining rows are left unchanged.\n",
"    \"\"\"\n",
"    # Remove rows where 'tok_body' or 'tok_title' is empty\n",
"    df = df[df['tok_body'].map(len) > 0]\n",
"    df = df[df['tok_title'].map(len) > 0]\n",
"\n",
"    # Lists are unhashable, so deduplicate on their joined string form.\n",
"    # assign() returns a new frame instead of writing columns into the filtered\n",
"    # slice, which avoids pandas' SettingWithCopyWarning.\n",
"    keyed = df.assign(\n",
"        tok_body_str=df['tok_body'].map(' '.join),\n",
"        tok_title_str=df['tok_title'].map(' '.join),\n",
"    )\n",
"\n",
"    # Remove every row whose body AND title both match another row\n",
"    keyed = keyed.drop_duplicates(subset=['tok_body_str', 'tok_title_str'], keep=False)\n",
"\n",
"    # Drop the temporary string columns\n",
"    return keyed.drop(columns=['tok_body_str', 'tok_title_str'])\n",
"\n",
"# Apply the cleaning to both subsets; the names are rebound to the filtered frames\n",
"df_good = rem_null_dubl(df_good)\n",
"df_bad = rem_null_dubl(df_bad)\n",
"print('shape after removing duplicates:', df_good.shape, df_bad.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Element-wise list concatenation: for each joke, the body sentences followed by the title sentences\n",
"df_good['tokens'] = df_good['tok_body'] + df_good['tok_title']\n",
"df_bad['tokens'] = df_bad['tok_body'] + df_bad['tok_title']"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"good jokes with more than 1 sentence: 576 total: 1008\n",
"bad jokes with more than 1 sentence: 477 total: 1016\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# plot the distribution (histogram) of the number of sentences per joke\n",
"# len() of 'tokens' counts the body sentences plus the title sentences\n",
"good_sent = df_good['tokens'].apply(len)\n",
"bad_sent = df_bad['tokens'].apply(len)\n",
"\n",
"# keep only jokes with more than 2 sentences in total (title + body)\n",
"# NOTE(review): the printed labels below say 'more than 1 sentence' while the filter is > 2 - confirm which is intended\n",
"good_sent = good_sent[good_sent > 2]\n",
"bad_sent = bad_sent[bad_sent > 2]\n",
"\n",
"# print how many jokes pass the filter relative to the total number of jokes\n",
"print('good jokes with more than 1 sentence:', good_sent.shape[0], 'total:', df_good.shape[0])\n",
"print('bad jokes with more than 1 sentence:', bad_sent.shape[0], 'total:', df_bad.shape[0])\n",
"\n",
"# bins=20 is computed separately by each hist call, so the bin edges of the two series may differ\n",
"plt.hist(good_sent, bins=20, alpha=0.5, label='good jokes')\n",
"plt.hist(bad_sent, bins=20, alpha=0.5, label='bad jokes')\n",
"plt.xlabel('Number of sentences')\n",
"plt.ylabel('Number of jokes')\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Find Patterns\n",
"Use the sentence lists in `tok_title` and `tok_body` to detect the joke patterns listed below."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Oneliner"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Questions -> Answer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wordplay "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dialog\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Knock-Knock Jokes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}