{ "cells": [ { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\felix\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Download the NLTK 'punkt' and 'stopwords' packages.\n", "# Presumably these back the word_tokenize / stopwords imports above -- confirm against later cells.\n", "nltk.download('punkt')\n", "nltk.download('stopwords')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# dataset stupidstuff" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Load the data from the JSON file.\n", "# The encoding is passed explicitly: without it, open() uses the platform\n", "# locale encoding, which can mis-decode non-ASCII text in a UTF-8 file.\n", "# Assumes the file is UTF-8 encoded -- TODO confirm.\n", "data_path = './data/stupidstuff.json'\n", "with open(data_path, encoding='utf-8') as f:\n", "    data = json.load(f)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | body | \n", "category | \n", "id | \n", "rating | \n", "
---|---|---|---|---|
0 | \n", "A blackjack dealer and a player with a thirtee... | \n", "Children | \n", "1 | \n", "2.63 | \n", "
1 | \n", "At a dinner party, several of the guests were ... | \n", "Blonde Jokes | \n", "2 | \n", "2.57 | \n", "
2 | \n", "One day this cop pulls over a blonde for speed... | \n", "Blonde Jokes | \n", "3 | \n", "3.09 | \n", "
3 | \n", "Three women are about to be executed for crime... | \n", "Blonde Jokes | \n", "4 | \n", "4.10 | \n", "
4 | \n", "A girl came skipping home FROM school one day.... | \n", "Blonde Jokes | \n", "5 | \n", "4.30 | \n", "