diff --git a/.gitignore b/.gitignore
index b81c6af..bf9a08a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Ignore pycache directory
+__pycache__/
+
 # Ignore virtual environment directory
 .venv/
 
@@ -17,4 +20,7 @@ plots/
 
 # Ignore plot file
 *.png
-*.jpg
\ No newline at end of file
+*.jpg
+
+# Ignore everything with delete_me in name
+*delete_me*
\ No newline at end of file
diff --git a/data/embedded_padded/test.pt b/data/embedded_padded/test.pt
new file mode 100644
index 0000000..c806369
Binary files /dev/null and b/data/embedded_padded/test.pt differ
diff --git a/data/embedded_padded/train.pt b/data/embedded_padded/train.pt
new file mode 100644
index 0000000..0a04318
Binary files /dev/null and b/data/embedded_padded/train.pt differ
diff --git a/data/embedded_padded/val.pt b/data/embedded_padded/val.pt
new file mode 100644
index 0000000..2da4843
Binary files /dev/null and b/data/embedded_padded/val.pt differ
diff --git a/data/idx_based_padded/test.pt b/data/idx_based_padded/test.pt
new file mode 100644
index 0000000..91a44cd
Binary files /dev/null and b/data/idx_based_padded/test.pt differ
diff --git a/data/idx_based_padded/train.pt b/data/idx_based_padded/train.pt
new file mode 100644
index 0000000..9e28544
Binary files /dev/null and b/data/idx_based_padded/train.pt differ
diff --git a/data/idx_based_padded/val.pt b/data/idx_based_padded/val.pt
new file mode 100644
index 0000000..c59c227
Binary files /dev/null and b/data/idx_based_padded/val.pt differ
diff --git a/dataset_generator.py b/dataset_generator.py
new file mode 100644
index 0000000..a0f7118
--- /dev/null
+++ b/dataset_generator.py
@@ -0,0 +1,123 @@
+"""
+This file contains the dataset generation and preprocessing.
+"""
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from nltk.tokenize import word_tokenize
+import gensim
+import torch
+import os
+
+from HumorDataset import HumorDataset
+
+def get_embedding_idx(model, word):
+    if word in model.wv:
+        return model.wv.key_to_index[word]
+    else:
+        return unk_index
+
+def get_embedding_vector(model, word):
+    if word in model.wv:
+        return model.wv[word]
+    else:
+        return np.zeros(model.vector_size)
+
+def encode_tokens(tokens, vector=False):
+    if vector:
+        return [get_embedding_vector(model_embedding, token) for token in tokens]
+    else:
+        return [get_embedding_idx(model_embedding, token) for token in tokens]
+
+def pad_sequences(sequences, max_len, pad_index):
+    return np.array([list(seq[:max_len]) + [pad_index] * (max_len - len(seq)) for seq in sequences])
+
+
+def split_data(X, y, test_size=0.1, val_size=0.1):
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size + val_size, random_state=42)
+    val_split_ratio = val_size / (val_size + test_size)
+    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=val_split_ratio, random_state=42)
+
+    ret_dict = {
+        'train': {'X': X_train, 'y': y_train},
+        'test': {'X': X_test, 'y': y_test},
+        'val': {'X': X_val, 'y': y_val}
+    }
+    return ret_dict
+
+def save_data(data_dict, path, prefix, vocab_size=0, emb_dim=None):
+    if not os.path.exists(path):
+        print('Creating directory:', path)
+        os.makedirs(path)
+    print('saving data into:', path)
+    for key, value in data_dict.items():
+        # transform to Dataset
+        dataset = HumorDataset(value['X'], value['y'], vocab_size, emb_dim)
+        # save dataset
+        torch.save(dataset, path + prefix + key + '.pt')
+
+if __name__ == "__main__":
+    # Load the data from csv
+    df = pd.read_csv('data/hack.csv')
+    print(df.shape)
+
+    df = df.dropna(subset=['humor_rating'])
+
+    # find median of humor_rating
+    median_rating = df['humor_rating'].median()
+    #print('median and therefore middle of humor_rating:', median_rating)
+
+    df['y'] = df['humor_rating'] > median_rating
+
+    # transform data into features and labels
+    X = df['text']
+    y = df['y']
+
+    # Tokenize the data with nltk
+    tokens = [word_tokenize(text.lower()) for text in X]
+
+    vocab_size = len(set([word for sentence in tokens for word in sentence]))
+    print('vocab size:', vocab_size)
+
+    # Pad the sequences
+    # NOTE: Info comes from data explore notebook: 280 is max length,
+    # 139 contains 80% and 192 contains 95% of the data
+    max_len = 280
+    padded_indices = pad_sequences(tokens, max_len=max_len, pad_index='<PAD>')
+
+    # split data into train, test, and validation
+    data_dict = split_data(padded_indices, y)
+
+    # TODO: test GloVe embeddings
+    # Embed the data with word2vec
+    model_embedding = gensim.models.Word2Vec(tokens, window=5, min_count=1, workers=4)
+
+    # Add a special token for out-of-vocabulary words
+    model_embedding.wv.add_vector('<UNK>', np.zeros(model_embedding.vector_size))
+    unk_index = model_embedding.wv.key_to_index['<UNK>']
+
+    # Add padding index for padding
+    model_embedding.wv.add_vector('<PAD>', np.zeros(model_embedding.vector_size))
+    pad_index = model_embedding.wv.key_to_index['<PAD>']
+
+
+    data_idx_based = {key: dict(value) for key, value in data_dict.items()}  # copy inner dicts so data_dict keeps the raw token sequences
+    vector_based = False
+
+    for key in data_idx_based.keys():
+        data_idx_based[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
+        # print shape of data
+        #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
+
+    # save the data
+    save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)
+
+    vector_based = True
+    # Encode the tokens
+    for key in data_dict.keys():
+        data_dict[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
+        # print shape of data
+        #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
+
+    # Save the data
+    save_data(data_dict, 'data/embedded_padded/', '', vocab_size, emb_dim=model_embedding.vector_size)
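A minimal sketch of how the .pt splits written by this script could be consumed, assuming HumorDataset subclasses torch.utils.data.Dataset and its __getitem__ yields a (features, label) tensor pair; the batch size and the weights_only flag are illustrative assumptions, not taken from this PR.

# Load one generated split and batch it with a DataLoader.
import torch
from torch.utils.data import DataLoader

from HumorDataset import HumorDataset  # class must be importable so torch.load can unpickle it

# Newer PyTorch versions may require weights_only=False to unpickle a custom Dataset object.
train_set = torch.load('data/idx_based_padded/train.pt', weights_only=False)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)

# Assumes each item is a (features, label) tensor pair, so the default collate stacks them.
features, labels = next(iter(train_loader))
print(len(train_set), features.shape, labels.shape)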