added data preprocessing and data

parent 12e638cec3
commit b5f315b3a9
@@ -1,3 +1,6 @@
# Ignore pycache directory
__pycache__/

# Ignore virtual environment directory
.venv/

@@ -17,4 +20,7 @@ plots/

# Ignore plot file
*.png
*.jpg

# Ignore everything with delete_me in name
*delete_me*
Binary file not shown. (6 binary files)
@@ -0,0 +1,123 @@
"""
This file contains the dataset generation and preprocessing.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import gensim
import torch
import os

from HumorDataset import HumorDataset

def get_embedding_idx(model, word):
    # NOTE: falls back to the module-level unk_index defined in __main__ below
    if word in model.wv:
        return model.wv.key_to_index[word]
    else:
        return unk_index


def get_embedding_vector(model, word):
    if word in model.wv:
        return model.wv[word]
    else:
        return np.zeros(model.vector_size)


def encode_tokens(tokens, vector=False):
    # NOTE: uses the module-level model_embedding defined in __main__ below
    if vector:
        return [get_embedding_vector(model_embedding, token) for token in tokens]
    else:
        return [get_embedding_idx(model_embedding, token) for token in tokens]


def pad_sequences(sequences, max_len, pad_index):
    # Pad short sequences with pad_index and truncate long ones to max_len.
    # dtype=object keeps the full '<PAD>' string; padding a fixed-width string
    # array would truncate it to the width of the longest token in a sentence.
    return np.array([list(seq)[:max_len] + [pad_index] * max(0, max_len - len(seq))
                     for seq in sequences], dtype=object)

def split_data(X, y, test_size=0.1, val_size=0.1):
    # Hold out test + val together first, then split that hold-out set in two.
    X_train, X_hold, y_train, y_hold = train_test_split(X, y, test_size=test_size + val_size, random_state=42)
    val_split_ratio = val_size / (val_size + test_size)
    X_test, X_val, y_test, y_val = train_test_split(X_hold, y_hold, test_size=val_split_ratio, random_state=42)

    ret_dict = {
        'train': {'X': X_train, 'y': y_train},
        'test': {'X': X_test, 'y': y_test},
        'val': {'X': X_val, 'y': y_val}
    }
    return ret_dict

def save_data(data_dict, path, prefix, vocab_size=0, emb_dim=None):
    if not os.path.exists(path):
        print('Creating directory:', path)
        os.makedirs(path)
    print('saving data into:', path)
    for key, value in data_dict.items():
        # transform to Dataset
        dataset = HumorDataset(value['X'], value['y'], vocab_size, emb_dim)
        # save dataset
        torch.save(dataset, path + prefix + key + '.pt')

if __name__ == "__main__":
    # Load the data from csv
    df = pd.read_csv('data/hack.csv')
    print(df.shape)

    df = df.dropna(subset=['humor_rating'])

    # find median of humor_rating
    median_rating = df['humor_rating'].median()
    #print('median and therefore middle of humor_rating:', median_rating)

    # binary label: above-median humor rating
    df['y'] = df['humor_rating'] > median_rating

    # transform data into dataset
    X = df['text']
    y = df['y']

    # Tokenize the data with nltk
    tokens = [word_tokenize(text.lower()) for text in X]

    vocab_size = len(set([word for sentence in tokens for word in sentence]))
    print('vocab size:', vocab_size)

    # Pad the sequences
    # NOTE: Info comes from data explore notebook: 280 is max length,
    # 139 contains 80% and 192 contains 95% of the data
    max_len = 280
    padded_tokens = pad_sequences(tokens, max_len=max_len, pad_index='<PAD>')

    # split data into train, test, and validation
    data_dict = split_data(padded_tokens, y)

    # TODO: test GloVe embeddings
    # Embed the data with word2vec
    model_embedding = gensim.models.Word2Vec(tokens, window=5, min_count=1, workers=4)

    # Add a special token for out-of-vocabulary words
    model_embedding.wv.add_vector('<UNK>', np.zeros(model_embedding.vector_size))
    unk_index = model_embedding.wv.key_to_index['<UNK>']

    # Add padding index for padding
    model_embedding.wv.add_vector('<PAD>', np.zeros(model_embedding.vector_size))
    pad_index = model_embedding.wv.key_to_index['<PAD>']

    # Copy into new inner dicts so that re-encoding below does not overwrite
    # the token sequences that the vector-based encoding still needs.
    data_idx_based = {key: dict(value) for key, value in data_dict.items()}
    vector_based = False

    # Encode the tokens as vocabulary indices
    for key in data_idx_based.keys():
        data_idx_based[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
        # print shape of data
        #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))

    # save the data
    save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)

    vector_based = True
    # Encode the tokens as word2vec vectors
    for key in data_dict.keys():
        data_dict[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
        # print shape of data
        #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))

    # Save the data
    save_data(data_dict, 'data/embedded_padded/', '', vocab_size, emb_dim=model_embedding.vector_size)
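For context, a minimal sketch (not part of this commit) of how the saved splits could be loaded downstream. It assumes HumorDataset is importable on the loading side and behaves like a standard torch.utils.data.Dataset; the paths follow the save_data() calls above ('' prefix + split name + '.pt'), and the batch size is an arbitrary choice.

import torch
from torch.utils.data import DataLoader

from HumorDataset import HumorDataset  # class must be importable so torch.load can unpickle it

train_set = torch.load('data/idx_based_padded/train.pt')  # PyTorch >= 2.6 may require weights_only=False
val_set = torch.load('data/idx_based_padded/val.pt')
print('train samples:', len(train_set))

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False)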