{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Tokenization and Normalization" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "from collections import Counter\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import matplotlib.pyplot as plt\n", "import nltk\n", "\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize, sent_tokenize\n", "from nltk.stem import WordNetLemmatizer\n" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fetch the NLTK data packages (punkt, stopwords, wordnet) that back the\n", "# tokenizer, stopword list and lemmatizer imported above.\n", "nltk.download('punkt')\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "# Load the Reddit jokes data from the JSON file.\n", "# Decode explicitly as UTF-8 so the result does not depend on the platform's locale default.\n", "data_path = './data/reddit_jokes.json'\n", "with open(data_path, encoding='utf-8') as f:\n", "    data = json.load(f)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | body | \n", "id | \n", "score | \n", "title | \n", "
---|---|---|---|---|
0 | \n", "Now I have to say \"Leroy can you please paint ... | \n", "5tz52q | \n", "1 | \n", "I hate how you cant even say black paint anymore | \n", "
1 | \n", "Pizza doesn't scream when you put it in the ov... | \n", "5tz4dd | \n", "0 | \n", "What's the difference between a Jew in Nazi Ge... | \n", "
2 | \n", "...and being there really helped me learn abou... | \n", "5tz319 | \n", "0 | \n", "I recently went to America.... | \n", "
3 | \n", "A Sunday school teacher is concerned that his ... | \n", "5tz2wj | \n", "1 | \n", "Brian raises his hand and says, “He’s in Heaven.” | \n", "
4 | \n", "He got caught trying to sell the two books to ... | \n", "5tz1pc | \n", "0 | \n", "You hear about the University book store worke... | \n", "