added glove embeddings
parent 8279123019
commit a358432c91
@@ -23,4 +23,8 @@ plots/
 *.jpg
 # Ignore everything with delete_me in name
 *delete_me*
 *delete_me*
+
+# Ignore glove
+*.zip
+*glove*/
@@ -28,6 +28,8 @@ https://aclanthology.org/2021.semeval-1.9.pdf#:~:text=HaHackathon%20is%20the%20f
 
 
 
+## Data embeddings
+- gloVe 6B tokens: https://nlp.stanford.edu/projects/glove/
 
 
 
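The new README section links the GloVe 6B download. For context: each `glove.6B.<dim>d.txt` file in that release (50d, 100d, 200d, and 300d variants) is plain text, one token per line followed by its vector components, which is the format the `load_glove_embeddings` helper added below parses. A minimal sketch with an illustrative, truncated line:

```python
# A minimal sketch of the glove.6B file format: one token per line,
# followed by its space-separated vector components. The numbers here
# are illustrative and truncated, not copied from the real file.
line = 'the 0.418 0.24968 -0.41242 0.1217'

values = line.split()
word, coefs = values[0], [float(v) for v in values[1:]]
print(word, len(coefs))  # the 4
```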
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -8,6 +8,7 @@ from nltk.tokenize import word_tokenize
 import gensim
 import torch
 import os
+import copy
 
 from HumorDataset import HumorDataset
@@ -23,6 +24,27 @@ def get_embedding_vector(model, word):
     else:
         return np.zeros(model.vector_size)
 
+def load_glove_embeddings(glove_file_path):
+    embeddings_index = {}
+    with open(glove_file_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            values = line.split()
+            word = values[0]
+            coefs = np.asarray(values[1:], dtype='float32')
+            embeddings_index[word] = coefs
+    return embeddings_index
+
+def get_embedding_glove_vector(tokens, embeddings_index, default_vector_len=100, pad_tok='<PAD>'):
+    default_vec = [0] * default_vector_len
+    emb_matrix = []
+    for token in tokens:
+        if token == pad_tok:
+            embedding_vector = default_vec
+        else:
+            embedding_vector = embeddings_index.get(token, default_vec)
+        emb_matrix.append(embedding_vector)
+    return emb_matrix
+
 def encode_tokens(tokens, vector=False):
     if vector:
         return [get_embedding_vector(model_embedding, token) for token in tokens]
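For a quick sanity check of the two new helpers, here is a minimal sketch with a hypothetical 4-dimensional toy index standing in for the real 100d GloVe file:

```python
import numpy as np

# Hypothetical toy index instead of glove.6B.100d.txt; 4-dimensional
# vectors keep the example readable.
embeddings_index = {
    'funny': np.asarray([0.1, 0.2, 0.3, 0.4], dtype='float32'),
    'joke':  np.asarray([0.5, 0.6, 0.7, 0.8], dtype='float32'),
}

tokens = ['funny', 'joke', 'zzz_unknown', '<PAD>']
matrix = get_embedding_glove_vector(tokens, embeddings_index, default_vector_len=4)

# '<PAD>' and out-of-vocabulary tokens both map to the all-zeros default,
# so the result has one length-4 row per input token.
assert len(matrix) == 4 and len(matrix[0]) == 4
```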
@@ -87,8 +109,7 @@ if __name__ == "__main__":
 
     # split data into train, test, and validation
     data_dict = split_data(padded_indices, y)
 
-    # TODO: test gloVe embeddings
 
     # Embed the data with word2vec
     model_embedding = gensim.models.Word2Vec(tokens, window=5, min_count=1, workers=4)
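The Word2Vec call above relies on `tokens` being an iterable of token lists. A minimal sketch of the same call on toy data, assuming gensim 4.x, where the unspecified `vector_size` defaults to 100:

```python
import gensim

# Toy corpus: Word2Vec expects an iterable of token lists, which is the
# shape `tokens` has after the tokenization step earlier in the script.
tokens = [['what', 'a', 'funny', 'joke', '<PAD>'],
          ['not', 'funny', 'at', 'all', '<PAD>']]

model_embedding = gensim.models.Word2Vec(tokens, window=5, min_count=1, workers=4)

# In gensim 4.x the default vector_size is 100 dimensions, so these
# word2vec vectors line up with the 100d GloVe vectors used later.
print(model_embedding.wv['funny'].shape)         # (100,)
print(model_embedding.wv.key_to_index['<PAD>'])  # integer index of the pad token
```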
@@ -101,7 +122,7 @@ if __name__ == "__main__":
     pad_index = model_embedding.wv.key_to_index['<PAD>']
 
 
-    data_idx_based = data_dict.copy()
+    data_idx_based = copy.deepcopy(data_dict)
     vector_based = False
 
     for key in data_idx_based.keys():
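The `-`/`+` pair above swaps a shallow `dict.copy()` for `copy.deepcopy()`. That matters here because the per-split dicts are nested: with a shallow copy, re-encoding one copy in place would also mutate the other. A minimal self-contained illustration:

```python
import copy

# dict.copy() is shallow: the nested per-split dicts are shared.
data_dict = {'train': {'X': [[1, 2]], 'y': [0]}}
shallow = data_dict.copy()
shallow['train']['X'] = [[9, 9]]      # also mutates data_dict['train']
assert data_dict['train']['X'] == [[9, 9]]

# copy.deepcopy() duplicates the nested structure as well.
data_dict = {'train': {'X': [[1, 2]], 'y': [0]}}
deep = copy.deepcopy(data_dict)
deep['train']['X'] = [[9, 9]]         # leaves data_dict untouched
assert data_dict['train']['X'] == [[1, 2]]
```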
@@ -112,10 +133,15 @@ if __name__ == "__main__":
     # save the data
     save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)
 
+    print('loading GloVe embeddings')
+    vector_based = True
+    # Load GloVe embeddings
+    glove_file_path = 'glove.6B/glove.6B.100d.txt'
+    embeddings_index = load_glove_embeddings(glove_file_path)
     print('starting with embedding the data')
     # Encode the tokens
     for key in data_dict.keys():
-        data_dict[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
+        data_dict[key]['X'] = [get_embedding_glove_vector(tokens, embeddings_index) for tokens in data_dict[key]['X']]
         # print shape of data
         #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
 
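After this loop, each entry of `data_dict[key]['X']` is a seq_len x 100 matrix mixing float32 GloVe rows with plain-list zero rows. Since this commit also imports torch, here is a hedged sketch of handing one such matrix to a model; the `'train'` key is an assumption about what `split_data` returns:

```python
import numpy as np
import torch

# Assumes the loop above has run and split_data produced a 'train' split:
# each entry of data_dict[key]['X'] is now a (seq_len, 100) matrix.
example = data_dict['train']['X'][0]

# Rows mix float32 GloVe arrays with plain-list zero vectors, so go
# through np.asarray to get one uniform array before building the tensor.
x = torch.as_tensor(np.asarray(example, dtype='float32'))
print(x.shape)  # torch.Size([seq_len, 100])
```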
File diff suppressed because one or more lines are too long