init structure, added data exploration hack, added init transformer

main
Felix Jan Michael Mucha 2025-01-23 21:28:45 +01:00
parent f1f25b4f8a
commit d43d44d103
12 changed files with 129996 additions and 369 deletions

20
.gitignore vendored 100644

@@ -0,0 +1,20 @@
# Ignore virtual environment directory
.venv/
# Ignore requirements file
reqs_venv.txt
# Ignore models directory
models/
# Ignore model files
*.h5
*.keras
*.pth
# Ignore plots directory
plots/
# Ignore plot files
*.png
*.jpg

README.md

@@ -1,3 +1,23 @@
# ANLP_WS24_CA2
# Master MDS Use NLP techniques to analyse texts or to build an application. Document your approach.
## Data
- Hackathon: https://homepages.inf.ed.ac.uk/s1573290/data.html
#### Not Prioritised (Pun data)
- Challenge: https://alt.qcri.org/semeval2017/task7/
- Pun Annotated Amazon (joke not included ...): https://github.com/amazon-science/expunations/tree/main/data
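
A quick sanity check of the hackathon data (a minimal sketch; it assumes the CSV has been saved as `data/hack.csv` with the `text` and `is_humor` columns that `transformer.py` expects):

```python
import pandas as pd

df = pd.read_csv('data/hack.csv')
print(df.shape)
print(df['is_humor'].value_counts())                 # class balance of the binary humor label
print(df['text'].str.split().str.len().describe())   # rough token-count stats (informs MAX_LEN)
```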

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

16
gpu_check.py 100644

@@ -0,0 +1,16 @@
import torch
# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    # Print the current CUDA device
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device: {current_device}")
    # Print the name of the current CUDA device
    device_name = torch.cuda.get_device_name(current_device)
    print(f"CUDA device name: {device_name}")
else:
    print("CUDA is not available. Please check your CUDA installation and PyTorch configuration.")

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,187 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the data\n",
"with open('data/pun_anno/pun_het.json') as f:\n",
" data_het = json.load(f)\n",
"\n",
"with open('data/pun_anno/pun_hom.json') as f:\n",
" data_hom = json.load(f)\n",
"\n",
"with open('data/pun_annotated.json') as f:\n",
" data_anno = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Create a DataFrame\n",
"df_anno = pd.DataFrame(data_anno)\n",
"\n",
"df_het = pd.DataFrame(data_het)\n",
"# df switch columns to rows\n",
"df_het = df_het.T\n",
"\n",
"df_hom = pd.DataFrame(data_hom)\n",
"# df switch columns to rows\n",
"df_hom = df_hom.T"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 hom_362\n",
"1 het_837\n",
"2 het_635\n",
"3 hom_657\n",
"4 het_1275\n",
" ... \n",
"1894 hom_2076\n",
"1895 hom_1437\n",
"1896 het_1530\n",
"1897 het_100\n",
"1898 hom_364\n",
"Name: ID, Length: 1899, dtype: object\n",
"Index(['het_991', 'het_990', 'het_987', 'het_982', 'het_980', 'het_978',\n",
" 'het_973', 'het_958', 'het_956', 'het_955',\n",
" ...\n",
" 'het_1739', 'het_1741', 'het_1747', 'het_1748', 'het_1753', 'het_1757',\n",
" 'het_1758', 'het_1759', 'het_1764', 'het_1770'],\n",
" dtype='object', length=1146)\n",
"Index(['hom_998', 'hom_996', 'hom_994', 'hom_993', 'hom_992', 'hom_990',\n",
" 'hom_99', 'hom_985', 'hom_984', 'hom_981',\n",
" ...\n",
" 'hom_2221', 'hom_2223', 'hom_2225', 'hom_2226', 'hom_2230', 'hom_2232',\n",
" 'hom_2234', 'hom_2243', 'hom_2246', 'hom_2247'],\n",
" dtype='object', length=1443)\n"
]
}
],
"source": [
"# print index for each df\n",
"print(df_anno['ID'])\n",
"print(df_het.index)\n",
"print(df_hom.index)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(655, 8) (1146, 11) (1899, 8)\n",
"(825, 8) (1443, 11) (1899, 8)\n"
]
}
],
"source": [
"# find matches from df_anno['ID'] to df_het.index\n",
"df_het_match = df_anno[df_anno['ID'].isin(df_het.index)]\n",
"print(df_het_match.shape, df_het.shape, df_anno.shape)\n",
"\n",
"# find matches from df_anno['ID'] to df_hom.index\n",
"df_hom_match = df_anno[df_anno['ID'].isin(df_hom.index)]\n",
"print(df_hom_match.shape, df_hom.shape, df_anno.shape)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 hom_362\n",
"3 hom_657\n",
"6 hom_1510\n",
"7 hom_955\n",
"8 hom_1505\n",
" ... \n",
"1893 hom_151\n",
"1894 hom_2076\n",
"1895 hom_1437\n",
"1896 het_1530\n",
"1898 hom_364\n",
"Name: ID, Length: 1244, dtype: object\n",
"Index(['het_955', 'het_907', 'het_905', 'het_786', 'het_783', 'het_777',\n",
" 'het_639', 'het_573', 'het_466', 'het_435',\n",
" ...\n",
" 'het_1739', 'het_1741', 'het_1747', 'het_1748', 'het_1753', 'het_1757',\n",
" 'het_1758', 'het_1759', 'het_1764', 'het_1770'],\n",
" dtype='object', length=491)\n"
]
}
],
"source": [
"# print not matched IDs and index\n",
"print(df_anno[~df_anno['ID'].isin(df_het.index)]['ID'])\n",
"print(df_het.index[~df_het.index.isin(df_anno['ID'])])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# merge df_anno and df_het where ID matches with index\n",
"df_het_merge = pd.merge(df_anno, df_het, left_on='ID', right_index=True)\n",
"# score_avg \n",
"df_het_merge['score_avg'] = df_het_merge['Funniness (1-5)'].apply(lambda x: np.mean(x))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
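
A natural follow-up cell for this exploration notebook (illustrative only, not part of the committed file) would look at how the averaged funniness scores are distributed in the merged frame:

# Illustrative follow-up cell: distribution of the averaged funniness scores
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df_het_merge['score_avg'].dropna(), bins=20)
ax.set_xlabel('Average funniness (1-5)')
ax.set_ylabel('Number of puns')
ax.set_title('Heterographic puns: mean annotator funniness')
plt.show()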

128031
puns/pun_annotated.json 100644

File diff suppressed because it is too large

233
transformer.py 100644

@@ -0,0 +1,233 @@
"""
This file contains the transformer model.
"""
# TODO refactor the code
# TODO create ml helper script
# TODO create ml evaluation script
# TODO track overfitting better
# TODO validate model during training (accuracy, loss, etc.)
# TODO set MAX_LEN to (roughly) the maximum sentence length in the data
# TODO use GloVe embeddings
# TODO add attention mask
# TODO add positional encoding
# TODO add dropout (if needed)
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score
import gensim
import time
# Disable torchvision's beta transforms warning
import torchvision
torchvision.disable_beta_transforms_warning()
# Test if GPU is available
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)
# Input maximum length
MAX_LEN = 100
# download nltk data
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
def get_embedding(model, word):
    # Returns the vocabulary index of `word` (the nn.Embedding layer in the model
    # looks up the actual vector); unknown words map to the <UNK> index.
    if word in model.wv:
        return model.wv.key_to_index[word]
    else:
        return unk_index

def encode_tokens(tokens):
    return [get_embedding(model_embedding, token) for token in tokens]

def pad_sequences(sequences, MAX_LEN):
    # Pad with the <UNK> index up to MAX_LEN, or truncate longer sequences
    return np.array([np.pad(seq, (0, MAX_LEN - len(seq)), mode='constant', constant_values=unk_index)
                     if len(seq) < MAX_LEN else seq[:MAX_LEN]
                     for seq in sequences])
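
# Illustrative usage of the helpers above (kept as a comment because
# `model_embedding` and `unk_index` only exist once the __main__ block below has run):
#   tokens = word_tokenize("an example sentence".lower())
#   ids = encode_tokens(tokens)             # list of vocabulary indices; unknown words -> unk_index
#   padded = pad_sequences([ids], MAX_LEN)  # numpy array of shape (1, MAX_LEN)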
class HumorDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {'input_ids': torch.tensor(self.encodings[idx], dtype=torch.float)}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)
class TransformerBinaryClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, dropout=0.1):
        super(TransformerBinaryClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout;
        # batch_first=True because the inputs below are shaped (batch, seq_len, embed_dim)
        self.transformer = nn.Transformer(embed_dim, num_heads, num_layers, num_layers,
                                          hidden_dim, dropout, batch_first=True)
        self.fc = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids):
        input_ids = input_ids.long()
        embedded = self.embedding(input_ids)
        transformer_output = self.transformer(embedded, embedded)
        pooled_output = transformer_output.mean(dim=1)  # mean-pool over the sequence dimension
        logits = self.fc(pooled_output)
        return self.sigmoid(logits)
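
# Sketch for the "add positional encoding" TODO above (illustrative only, not yet
# wired into TransformerBinaryClassifier): a standard sinusoidal encoding that could
# be added to the embedded inputs before they enter the transformer.
# (Assumes an even embed_dim, e.g. the gensim default of 100.)
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=MAX_LEN):
        super().__init__()
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)            # (max_len, 1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-np.log(10000.0) / embed_dim))
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))                                  # (1, max_len, embed_dim)

    def forward(self, x):
        # x: (batch, seq_len, embed_dim) -> add the encoding for the first seq_len positions
        return x + self.pe[:, :x.size(1)]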
if __name__ == "__main__":
    # Load the data from csv
    df = pd.read_csv('data/hack.csv')
    print(df.shape)

    # Transform the data into a dataset
    X = df['text']
    y = df['is_humor']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Tokenize the data with nltk
    train_tokens = [word_tokenize(text.lower()) for text in X_train]
    test_tokens = [word_tokenize(text.lower()) for text in X_test]

    # Embed the data with word2vec
    model_embedding = gensim.models.Word2Vec(train_tokens, window=5, min_count=1, workers=4)

    # Add a special token for out-of-vocabulary words
    model_embedding.wv.add_vector('<UNK>', np.zeros(model_embedding.vector_size))
    unk_index = model_embedding.wv.key_to_index['<UNK>']

    # Encode the tokens
    train_encodings = [encode_tokens(tokens) for tokens in train_tokens]
    test_encodings = [encode_tokens(tokens) for tokens in test_tokens]

    # Pad/truncate the sequences to the maximum length
    train_encodings = pad_sequences(train_encodings, MAX_LEN)
    test_encodings = pad_sequences(test_encodings, MAX_LEN)

    train_dataset = HumorDataset(train_encodings, y_train.reset_index(drop=True))
    test_dataset = HumorDataset(test_encodings, y_test.reset_index(drop=True))

    vocab_size = len(model_embedding.wv.key_to_index)
    embed_dim = model_embedding.vector_size
    num_heads = 2
    num_layers = 2
    hidden_dim = 256
    print(f"Vocabulary size: {vocab_size}")
    print(f"Embedding dimension: {embed_dim}")

    model = TransformerBinaryClassifier(vocab_size, embed_dim, num_heads, num_layers, hidden_dim)
    # Training parameters
    epochs = 30  # 3
    batch_size = 8
    learning_rate = 2e-5

    # Optimizer and loss function
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # The model already applies a sigmoid in forward(), so plain BCELoss is used here;
    # BCEWithLogitsLoss would apply the sigmoid a second time.
    criterion = nn.BCELoss()

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Quick shape check on one sample and one batch
    for td in train_dataset:
        print(td['input_ids'].shape)
        print(td['labels'])
        break

    for batch in train_loader:
        print(batch['input_ids'].shape)
        print(batch['labels'])
        break
    # Model to device
    model.to(DEVICE)

    print("Starting training...")
    start_training_time = time.time()
    losses = []

    # Training loop
    model.train()
    for epoch in range(epochs):
        epoch_start_time = time.time()
        batch_losses = []
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(DEVICE)
            labels = batch['labels'].unsqueeze(1).to(DEVICE)
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        losses.append(np.mean(batch_losses))
        epoch_end_time = time.time()
        print(f"Epoch {epoch + 1}/{epochs}, Time: {epoch_end_time - epoch_start_time:.2f} sec, Loss: {losses[-1]:.5f}")

    end_training_time = time.time()
    print(f"Training finished in {end_training_time - start_training_time:.2f} seconds")
print("Starting evaluation...")
# Evaluation
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
for batch in test_loader:
input_ids = batch['input_ids'].to(DEVICE)
labels = batch['labels'].unsqueeze(1).to(DEVICE)
outputs = model(input_ids)
preds = outputs.round()
predictions.extend(preds.cpu().numpy())
true_labels.extend(labels.cpu().numpy())
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")
# Save the model
timestamp = time.strftime("%Y%m%d-%H%M%S")
torch.save(model.state_dict(), f'models/transformer_acc_{accuracy}_{timestamp}.pth')
print("Model saved.")
# Save model hyperparameters as json
hyperparameters = {
'max_len': MAX_LEN,
'vocab_size': vocab_size,
'embed_dim': embed_dim,
'num_heads': num_heads,
'num_layers': num_layers,
'hidden_dim': hidden_dim,
'epochs': epochs,
'batch_size': batch_size,
'learning_rate': learning_rate,
'accuracy': accuracy
}
pd.DataFrame(hyperparameters, index=[0]).to_json(f'models/transformer_acc_{accuracy}_{timestamp}.json')
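
To reuse a trained run later, the saved weights and the hyperparameter JSON can be loaded back together. A minimal sketch (the file names are placeholders for whatever accuracy/timestamp a run produced, and it assumes TransformerBinaryClassifier from above is available; note that the fitted Word2Vec vocabulary is not persisted by this commit, so inference-time encoding would also need the embedding model saved):

import json
import torch

# Placeholder paths: substitute the accuracy/timestamp printed during training
weights_path = 'models/transformer_acc_<accuracy>_<timestamp>.pth'
params_path = 'models/transformer_acc_<accuracy>_<timestamp>.json'

with open(params_path) as f:
    raw = json.load(f)                            # DataFrame.to_json stores {column: {"0": value}}
hp = {key: value['0'] for key, value in raw.items()}

model = TransformerBinaryClassifier(hp['vocab_size'], hp['embed_dim'],
                                    hp['num_heads'], hp['num_layers'], hp['hidden_dim'])
model.load_state_dict(torch.load(weights_path, map_location='cpu'))
model.eval()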