{ "cells": [ { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Download the NLTK resources this notebook imports support for:\n", "# 'punkt' (tokenizer models used by word_tokenize) and the 'stopwords' corpus.\n", "# Each call is a no-op when the package is already up to date.\n", "nltk.download('punkt')\n", "nltk.download('stopwords')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# dataset wocka" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Load the wocka dataset from its JSON file.\n", "data_path = './data/wocka.json'\n", "# Decode explicitly as UTF-8: without `encoding`, open() falls back to the\n", "# platform locale codec (e.g. cp1252 on Windows), which can corrupt or reject\n", "# non-ASCII characters in the file.\n", "with open(data_path, encoding='utf-8') as f:\n", "    data = json.load(f)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | body | \n", "category | \n", "id | \n", "title | \n", "
---|---|---|---|---|
0 | \n", "What do you call a cow with no legs?\\r\\n\\r\\nGr... | \n", "Animal | \n", "1 | \n", "Cow With No Legs | \n", "
1 | \n", "What do you call a cow jumping over a barbed w... | \n", "Animal | \n", "2 | \n", "Jumping Cow | \n", "
2 | \n", "What's black and white and red all over?\\r\\n\\r... | \n", "Other / Misc | \n", "4 | \n", "Black, White and Red | \n", "
3 | \n", "So, this guy walks into a bar.\\r\\n\\r\\nAnd says... | \n", "Bar | \n", "5 | \n", "Guy in a Bar | \n", "
4 | \n", "If the opposite of pro is con, isn't the oppos... | \n", "One Liners | \n", "6 | \n", "Progress | \n", "