{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data\n",
    "def load_json(path):\n",
    "    \"\"\"Return the parsed content of the JSON file at `path`.\n",
    "\n",
    "    The file is always decoded as UTF-8 (the encoding JSON is exchanged in)\n",
    "    instead of the platform's locale default, so the notebook reads the same\n",
    "    text on every operating system.\n",
    "    \"\"\"\n",
    "    with open(path, encoding='utf-8') as f:\n",
    "        return json.load(f)\n",
    "\n",
    "\n",
    "data_het = load_json('data/pun_anno/pun_het.json')\n",
    "data_hom = load_json('data/pun_anno/pun_hom.json')\n",
    "data_anno = load_json('data/pun_annotated.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the DataFrames\n",
    "df_anno = pd.DataFrame(data_anno)\n",
    "\n",
    "# The het/hom frames are built with the pun IDs as columns; transpose them so\n",
    "# that every ID becomes a row label (see the printed indexes in the next cell).\n",
    "df_het = pd.DataFrame(data_het).T\n",
    "df_hom = pd.DataFrame(data_hom).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 hom_362\n",
      "1 het_837\n",
      "2 het_635\n",
      "3 hom_657\n",
      "4 het_1275\n",
      " ... \n",
      "1894 hom_2076\n",
      "1895 hom_1437\n",
      "1896 het_1530\n",
      "1897 het_100\n",
      "1898 hom_364\n",
      "Name: ID, Length: 1899, dtype: object\n",
      "Index(['het_991', 'het_990', 'het_987', 'het_982', 'het_980', 'het_978',\n",
      " 'het_973', 'het_958', 'het_956', 'het_955',\n",
      " ...\n",
      " 'het_1739', 'het_1741', 'het_1747', 'het_1748', 'het_1753', 'het_1757',\n",
      " 'het_1758', 'het_1759', 'het_1764', 'het_1770'],\n",
      " dtype='object', length=1146)\n",
      "Index(['hom_998', 'hom_996', 'hom_994', 'hom_993', 'hom_992', 'hom_990',\n",
      " 'hom_99', 'hom_985', 'hom_984', 'hom_981',\n",
      " ...\n",
      " 'hom_2221', 'hom_2223', 'hom_2225', 'hom_2226', 'hom_2230', 'hom_2232',\n",
      " 'hom_2234', 'hom_2243', 'hom_2246', 'hom_2247'],\n",
      " dtype='object', length=1443)\n"
     ]
    }
   ],
   "source": [
    "# Print the join keys of each frame: the annotation IDs live in the 'ID' column,\n",
    "# while the het/hom IDs are the row index.\n",
    "print(df_anno['ID'])\n",
    "print(df_het.index)\n",
    "print(df_hom.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(655, 8) (1146, 11) (1899, 8)\n",
      "(825, 8) (1443, 11) (1899, 8)\n"
     ]
    }
   ],
   "source": [
    "# Annotation rows whose ID also appears in the df_het index\n",
    "df_het_match = df_anno[df_anno['ID'].isin(df_het.index)]\n",
    "print(df_het_match.shape, df_het.shape, df_anno.shape)\n",
    "\n",
    "# Annotation rows whose ID also appears in the df_hom index\n",
    "df_hom_match = df_anno[df_anno['ID'].isin(df_hom.index)]\n",
    "print(df_hom_match.shape, df_hom.shape, df_anno.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 hom_362\n",
      "3 hom_657\n",
      "6 hom_1510\n",
      "7 hom_955\n",
      "8 hom_1505\n",
      " ... \n",
      "1893 hom_151\n",
      "1894 hom_2076\n",
      "1895 hom_1437\n",
      "1896 het_1530\n",
      "1898 hom_364\n",
      "Name: ID, Length: 1244, dtype: object\n",
      "Index(['het_955', 'het_907', 'het_905', 'het_786', 'het_783', 'het_777',\n",
      " 'het_639', 'het_573', 'het_466', 'het_435',\n",
      " ...\n",
      " 'het_1739', 'het_1741', 'het_1747', 'het_1748', 'het_1753', 'het_1757',\n",
      " 'het_1758', 'het_1759', 'het_1764', 'het_1770'],\n",
      " dtype='object', length=491)\n"
     ]
    }
   ],
   "source": [
    "# Print what does not match, in both directions:\n",
    "# annotation IDs that are absent from the df_het index ...\n",
    "print(df_anno[~df_anno['ID'].isin(df_het.index)]['ID'])\n",
    "# ... and df_het index labels that are absent from the annotation IDs.\n",
    "print(df_het.index[~df_het.index.isin(df_anno['ID'])])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge df_anno and df_het where the 'ID' column matches the df_het index.\n",
    "# pd.merge defaults to an inner join, so only IDs present in both frames are kept.\n",
    "df_het_merge = pd.merge(df_anno, df_het, left_on='ID', right_index=True)\n",
    "# score_avg: per-row mean of the 'Funniness (1-5)' entry\n",
    "# NOTE(review): presumably each entry is a list of individual ratings - confirm.\n",
    "df_het_merge['score_avg'] = df_het_merge['Funniness (1-5)'].apply(lambda x: np.mean(x))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}