added glove embeddings
parent 8279123019
commit a358432c91
@@ -24,3 +24,7 @@ plots/
 # Ignore everything with delete_me in name
 *delete_me*
+
+# Ignore glove
+*.zip
+*glove*/
 
@@ -28,6 +28,8 @@ https://aclanthology.org/2021.semeval-1.9.pdf#:~:text=HaHackathon%20is%20the%20f
 
 
+## Data embeddings
+- GloVe 6B tokens: https://nlp.stanford.edu/projects/glove/
 
 
 
 
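For anyone reproducing this, a minimal sketch of fetching and unpacking the 6B archive; the download URL and rough size come from the Stanford GloVe page, and the extraction target is chosen to match the glove.6B/ path the preprocessing script expects. Both the zip and the extracted directory are covered by the new .gitignore rules above.

```python
# Sketch: download and unpack GloVe 6B (assumes the standard Stanford URL).
import urllib.request
import zipfile

url = 'https://nlp.stanford.edu/data/glove.6B.zip'  # ~822 MB archive
urllib.request.urlretrieve(url, 'glove.6B.zip')

# Extract into glove.6B/ so glove.6B/glove.6B.100d.txt exists, matching
# the glove_file_path used by the script in this commit.
with zipfile.ZipFile('glove.6B.zip') as zf:
    zf.extractall('glove.6B')
```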
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -8,6 +8,7 @@ from nltk.tokenize import word_tokenize
 import gensim
 import torch
 import os
+import copy
 
 from HumorDataset import HumorDataset
@ -23,6 +24,27 @@ def get_embedding_vector(model, word):
|
||||||
else:
|
else:
|
||||||
return np.zeros(model.vector_size)
|
return np.zeros(model.vector_size)
|
||||||
|
|
||||||
|
def load_glove_embeddings(glove_file_path):
|
||||||
|
embeddings_index = {}
|
||||||
|
with open(glove_file_path, 'r', encoding='utf-8') as f:
|
||||||
|
for line in f:
|
||||||
|
values = line.split()
|
||||||
|
word = values[0]
|
||||||
|
coefs = np.asarray(values[1:], dtype='float32')
|
||||||
|
embeddings_index[word] = coefs
|
||||||
|
return embeddings_index
|
||||||
|
|
||||||
|
def get_embedding_glove_vector(tokens, embeddings_index, default_vector_len=100, pad_tok='<PAD>'):
|
||||||
|
default_vec = [0] * default_vector_len
|
||||||
|
emb_matrix = []
|
||||||
|
for token in tokens:
|
||||||
|
if token == pad_tok:
|
||||||
|
embedding_vector = default_vec
|
||||||
|
else:
|
||||||
|
embedding_vector = embeddings_index.get(token, default_vec)
|
||||||
|
emb_matrix.append(embedding_vector)
|
||||||
|
return emb_matrix
|
||||||
|
|
||||||
def encode_tokens(tokens, vector=False):
|
def encode_tokens(tokens, vector=False):
|
||||||
if vector:
|
if vector:
|
||||||
return [get_embedding_vector(model_embedding, token) for token in tokens]
|
return [get_embedding_vector(model_embedding, token) for token in tokens]
|
||||||
|
|
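A quick sketch of how the two new helpers compose. The token list here is hypothetical; glove.6B.100d.txt holds 400k entries, so building the index takes a moment and costs a few hundred MB of RAM.

```python
# Hypothetical usage of the new helpers; tokens are made up for illustration.
embeddings_index = load_glove_embeddings('glove.6B/glove.6B.100d.txt')

tokens = ['the', 'joke', 'landed', 'zqxw_oov', '<PAD>']
matrix = get_embedding_glove_vector(tokens, embeddings_index)

print(len(matrix), len(matrix[0]))  # 5 100
# '<PAD>' and out-of-vocabulary tokens both map to the zero default vector.
# Note the rows mix numpy arrays (hits) and plain lists (misses/padding),
# which is worth normalizing if the matrix is stacked with numpy later.
```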
@@ -88,7 +110,6 @@ if __name__ == "__main__":
     # split data into train, test, and validation
     data_dict = split_data(padded_indices, y)
 
-    # TODO: test gloVe embeddings
     # Embed the data with word2vec
     model_embedding = gensim.models.Word2Vec(tokens, window=5, min_count=1, workers=4)
 
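The word2vec path itself is unchanged; for orientation, a couple of sanity checks one could run on the model trained above (assuming gensim 4.x, whose default vector_size is 100):

```python
# Not part of the commit: quick checks on the trained word2vec model.
print(model_embedding.wv.vector_size)        # 100 (gensim 4.x default)
print(len(model_embedding.wv.key_to_index))  # vocabulary size
print(model_embedding.wv['<PAD>'][:5])       # learned vector for the pad token
```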
@@ -101,7 +122,7 @@ if __name__ == "__main__":
     pad_index = model_embedding.wv.key_to_index['<PAD>']
 
 
-    data_idx_based = data_dict.copy()
+    data_idx_based = copy.deepcopy(data_dict)
     vector_based = False
 
     for key in data_idx_based.keys():
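The dict.copy() to copy.deepcopy swap (together with the new import copy) is the substantive fix in this hunk: a shallow copy shares the nested per-split lists, so the in-place index conversion on data_idx_based would also clobber data_dict before the GloVe pass below reads it. A minimal demo, not from the repo:

```python
import copy

data = {'train': {'X': [['a', 'b']], 'y': [0]}}

shallow = data.copy()
shallow['train']['X'][0][0] = 'mutated'
print(data['train']['X'])   # [['mutated', 'b']] -- inner lists are shared

deep = copy.deepcopy(data)
deep['train']['X'][0][0] = 'independent'
print(data['train']['X'])   # still [['mutated', 'b']] -- deep copy is isolated
```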
@@ -112,10 +133,15 @@ if __name__ == "__main__":
     # save the data
     save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)
 
+    print('loading GloVe embeddings')
     vector_based = True
+    # Load GloVe embeddings
+    glove_file_path = 'glove.6B/glove.6B.100d.txt'
+    embeddings_index = load_glove_embeddings(glove_file_path)
+    print('starting with embedding the data')
     # Encode the tokens
     for key in data_dict.keys():
-        data_dict[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
+        data_dict[key]['X'] = [get_embedding_glove_vector(tokens, embeddings_index) for tokens in data_dict[key]['X']]
         # print shape of data
         #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
 
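After this loop, each split's X is a list of seq_len x 100 GloVe matrices rather than index sequences. A hedged sketch of the shape check one might run before handing the splits to HumorDataset (the stacking step and dtype are assumptions, not in the commit):

```python
import numpy as np
import torch

# Hypothetical shape check: sequences are already padded to equal length,
# so each split stacks into (num_samples, seq_len, 100).
for key in data_dict.keys():
    X = torch.tensor(np.asarray(data_dict[key]['X'], dtype=np.float32))
    print(key, tuple(X.shape))
```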
File diff suppressed because one or more lines are too long