added glove embeddings

main
Felix Jan Michael Mucha 2025-01-27 20:55:22 +01:00
parent 8279123019
commit a358432c91
7 changed files with 107 additions and 24 deletions

.gitignore

@@ -23,4 +23,8 @@ plots/
*.jpg
# Ignore everything with delete_me in name
*delete_me*
# Ignore glove
*.zip
*glove*/


@@ -28,6 +28,8 @@ https://aclanthology.org/2021.semeval-1.9.pdf#:~:text=HaHackathon%20is%20the%20f
## Data embeddings
- GloVe 6B tokens: https://nlp.stanford.edu/projects/glove/
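
To fetch the vectors, a minimal sketch; the direct zip URL is an assumption based on the project page above (the archive is roughly 822 MB):

```python
# Download and unpack the GloVe 6B vectors (zip URL assumed from the project page).
import urllib.request
import zipfile

urllib.request.urlretrieve('https://nlp.stanford.edu/data/glove.6B.zip', 'glove.6B.zip')
with zipfile.ZipFile('glove.6B.zip') as zf:
    zf.extractall('glove.6B')  # both paths are covered by the .gitignore rules above
```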

Binary file not shown.

Binary file not shown.

Binary file not shown.


@ -8,6 +8,7 @@ from nltk.tokenize import word_tokenize
import gensim
import torch
import os
import copy
from HumorDataset import HumorDataset
@@ -23,6 +24,27 @@ def get_embedding_vector(model, word):
    else:
        return np.zeros(model.vector_size)
def load_glove_embeddings(glove_file_path):
    """Read a GloVe text file into a dict mapping word -> float32 vector."""
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Each line is: word followed by its space-separated vector components.
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index
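
A quick sanity check for the loader; a sketch assuming the unzipped `glove.6B/` directory from the README link. The 6B set has a 400k-word vocabulary, so the eagerly loaded 100d index costs roughly 160 MB of float32 data:

```python
# Illustrative check only; path assumes the unzipped glove.6B/ directory.
embeddings_index = load_glove_embeddings('glove.6B/glove.6B.100d.txt')
print(len(embeddings_index))          # ~400000 words in the 6B vocabulary
print(embeddings_index['the'].shape)  # (100,) float32 vector
```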
def get_embedding_glove_vector(tokens, embeddings_index, default_vector_len=100, pad_tok='<PAD>'):
    """Map a token list to a list of GloVe vectors; pad and unknown tokens get zeros."""
    default_vec = [0] * default_vector_len
    emb_matrix = []
    for token in tokens:
        if token == pad_tok:
            embedding_vector = default_vec
        else:
            # Fall back to the zero vector for out-of-vocabulary tokens.
            embedding_vector = embeddings_index.get(token, default_vec)
        emb_matrix.append(embedding_vector)
    return emb_matrix
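
Embedding one padded token list with the helper above; the tokens are illustrative. Note that the 6B vectors are uncased, so tokens should be lowercased before lookup or they silently fall back to the zero default vector:

```python
# Illustrative: a padded, lowercased token sequence.
tokens = ['this', 'is', 'funny', '<PAD>', '<PAD>']
emb = get_embedding_glove_vector(tokens, embeddings_index)
print(len(emb), len(emb[0]))  # 5 tokens, 100 dimensions each
```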
def encode_tokens(tokens, vector=False):
    if vector:
        return [get_embedding_vector(model_embedding, token) for token in tokens]
@@ -87,8 +109,7 @@ if __name__ == "__main__":
    # split data into train, test, and validation
    data_dict = split_data(padded_indices, y)
-    # TODO: test gloVe embeddings
    # Embed the data with word2vec
    model_embedding = gensim.models.Word2Vec(tokens, window=5, min_count=1, workers=4)
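
For comparison with the GloVe path, the trained Word2Vec model exposes both vectors and integer indices (gensim 4.x API; `vector_size` defaults to 100):

```python
# Lookups on the trained Word2Vec model (names as in the script above).
vec = model_embedding.wv['<PAD>']               # 100-dim vector for the pad token
idx = model_embedding.wv.key_to_index['<PAD>']  # integer index used for padding
```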
@@ -101,7 +122,7 @@
    pad_index = model_embedding.wv.key_to_index['<PAD>']
-    data_idx_based = data_dict.copy()
+    data_idx_based = copy.deepcopy(data_dict)  # deep copy: a shallow copy would share the inner dicts
    vector_based = False
    for key in data_idx_based.keys():
@@ -112,10 +133,15 @@
    # save the data
    save_data(data_idx_based, 'data/idx_based_padded/', '', vocab_size)
    print('loading GloVe embeddings')
    vector_based = True
    # Load GloVe embeddings
    glove_file_path = 'glove.6B/glove.6B.100d.txt'
    embeddings_index = load_glove_embeddings(glove_file_path)
    print('starting with embedding the data')
    # Encode the tokens
    for key in data_dict.keys():
-        data_dict[key]['X'] = [encode_tokens(tokens, vector_based) for tokens in data_dict[key]['X']]
+        data_dict[key]['X'] = [get_embedding_glove_vector(tokens, embeddings_index) for tokens in data_dict[key]['X']]
        # print shape of data
        #print(key, len(data_dict[key]['X']), len(data_dict[key]['y']))
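
Downstream, the embedded splits can be stacked into tensors; a sketch assuming all sequences were padded to one length upstream, with a hypothetical `'train'` split key:

```python
import numpy as np
import torch

# Assumes padding made every sequence the same length, so the nested
# lists form a regular (n_samples, seq_len, 100) array.
X_train = torch.tensor(np.asarray(data_dict['train']['X'], dtype=np.float32))
y_train = torch.tensor(np.asarray(data_dict['train']['y'], dtype=np.float32))
print(X_train.shape)
```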

File diff suppressed because one or more lines are too long