{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data] Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data] Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Download the NLTK packages needed for word counting:\n",
    "# 'punkt' (tokenizer models used by word_tokenize) and the 'stopwords' corpus.\n",
    "# nltk.download() skips the transfer when the package is already up to date.\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset: Reddit jokes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the jokes from the JSON file.\n",
    "# JSON text is UTF-8, so the encoding is passed explicitly: without it,\n",
    "# open() falls back to the platform locale encoding (e.g. cp1252 on\n",
    "# Windows), which can garble or reject non-ASCII characters in the jokes.\n",
    "data_path = './data/reddit_jokes.json'\n",
    "with open(data_path, encoding='utf-8') as f:\n",
    "    data = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | body | \n", "id | \n", "score | \n", "title | \n", "
---|---|---|---|---|
0 | \n", "Now I have to say \"Leroy can you please paint ... | \n", "5tz52q | \n", "1 | \n", "I hate how you cant even say black paint anymore | \n", "
1 | \n", "Pizza doesn't scream when you put it in the ov... | \n", "5tz4dd | \n", "0 | \n", "What's the difference between a Jew in Nazi Ge... | \n", "
2 | \n", "...and being there really helped me learn abou... | \n", "5tz319 | \n", "0 | \n", "I recently went to America.... | \n", "
3 | \n", "A Sunday school teacher is concerned that his ... | \n", "5tz2wj | \n", "1 | \n", "Brian raises his hand and says, “He’s in Heaven.” | \n", "
4 | \n", "He got caught trying to sell the two books to ... | \n", "5tz1pc | \n", "0 | \n", "You hear about the University book store worke... | \n", "