From a25821f20c2d2855362e416e1ef1d9f606232796 Mon Sep 17 00:00:00 2001
From: klara
Date: Mon, 27 Jan 2025 13:56:00 +0100
Subject: [PATCH] update

---
 cnn.py | 110 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 54 insertions(+), 56 deletions(-)

diff --git a/cnn.py b/cnn.py
index 8272d3a..53cde1d 100644
--- a/cnn.py
+++ b/cnn.py
@@ -5,31 +5,24 @@
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split
 from nltk.tokenize import word_tokenize
-import gensim
 from torch.utils.data import DataLoader, Dataset
 from sklearn.metrics import accuracy_score
+import gensim
+import nltk
+import time
 import matplotlib.pyplot as plt
 
-# NLTK downloads
-import nltk
-nltk.download('punkt_tab')
-nltk.download('punkt')
-
-# Check if GPU is available (CUDA for NVIDIA or MPS for Apple devices)
-if torch.cuda.is_available():
-    DEVICE = torch.device('cuda')  # Use CUDA if available
-elif torch.backends.mps.is_available():
-    DEVICE = torch.device('mps')  # Use MPS if available
-else:
-    DEVICE = torch.device('cpu')  # Default to CPU if no GPU support is available
+# NLTK Downloads
+nltk.download('punkt')  # Removed punkt_tab since it does not exist
 
+# Check if GPU is available
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print('Using device:', DEVICE)
 
 # Maximum sequence length
 MAX_LEN = 100
-
-# Data processing helpers
+# Data helpers
 def get_embedding(model, word):
     if word in model.wv:
         return model.wv.key_to_index[word]
@@ -40,11 +33,9 @@ def encode_tokens(tokens):
     return [get_embedding(model_embedding, token) for token in tokens]
 
 def pad_sequences(sequences, MAX_LEN):
-    return np.array([
-        np.pad(seq, (0, MAX_LEN - len(seq)), mode='constant', constant_values=unk_index)
-        if len(seq) < MAX_LEN else seq[:MAX_LEN]
-        for seq in sequences
-    ])
+    return np.array([np.pad(seq, (0, MAX_LEN - len(seq)), mode='constant', constant_values=unk_index)
+                     if len(seq) < MAX_LEN else seq[:MAX_LEN] for seq in sequences])
+
 # Dataset class
 class HumorDataset(Dataset):
     def __init__(self, encodings, labels):
@@ -83,23 +74,41 @@ class CNNBinaryClassifier(nn.Module):
         logits = self.out(fc_out)
         return self.sigmoid(logits)
 
-# Main
+# Main script
 if __name__ == "__main__":
     # Load and process data
-    df = pd.read_csv('ANLP_WS24_CA2/data/hack.csv')  # Ensure this file exists
+    df = pd.read_csv('/content/hack.csv')
     print(f"Loaded dataset: {df.shape}")
 
-    X = df['text']
+    X = df['text'].fillna("unknown").astype(str)
     y = df['is_humor']
 
+    # Train-test split
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
-    # Tokenize the datapp
-    train_tokens = [word_tokenize(text.lower()) for text in X_train]
-    test_tokens = [word_tokenize(text.lower()) for text in X_test]
+    # Tokenization with error handling
+    train_tokens = []
+    test_tokens = []
+
+    for text in X_train:
+        try:
+            train_tokens.append(word_tokenize(text.lower()))
+        except Exception as e:
+            print(f"Error tokenizing: {text}. Error: {e}")
+            train_tokens.append(["unknown"])
+
+    for text in X_test:
+        try:
+            test_tokens.append(word_tokenize(text.lower()))
+        except Exception as e:
+            print(f"Error tokenizing: {text}. Error: {e}")
+            test_tokens.append(["unknown"])
+
+    print("Sample tokenization (Train):", train_tokens[:2])
+    print("Sample tokenization (Test):", test_tokens[:2])
 
     # Train Word2Vec model
-    model_embedding = gensim.models.Word2Vec(train_tokens, window=5, min_count=5, workers=4)
+    model_embedding = gensim.models.Word2Vec(train_tokens, vector_size=100, window=5, min_count=1, workers=4)
 
     # Add unknown token
     model_embedding.wv.add_vector('', np.zeros(model_embedding.vector_size))
@@ -109,10 +118,13 @@ if __name__ == "__main__":
     train_encodings = [encode_tokens(tokens) for tokens in train_tokens]
     test_encodings = [encode_tokens(tokens) for tokens in test_tokens]
 
-    # Pad sequences
+    # Pad sequences with validation
     train_encodings = pad_sequences(train_encodings, MAX_LEN)
     test_encodings = pad_sequences(test_encodings, MAX_LEN)
 
+    if len(train_encodings) == 0 or len(test_encodings) == 0:
+        raise ValueError("Tokenization or padding failed. Please check your input data.")
+
     # Create datasets
     train_dataset = HumorDataset(train_encodings, y_train.reset_index(drop=True))
     test_dataset = HumorDataset(test_encodings, y_test.reset_index(drop=True))
@@ -132,20 +144,21 @@ if __name__ == "__main__":
     batch_size = 8
     learning_rate = 2e-5
 
+    # Optimizer and loss function
     optimizer = optim.Adam(model.parameters(), lr=learning_rate)
     criterion = nn.BCELoss()
 
+    # Data loaders
     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
 
     # Move model to device
     model.to(DEVICE)
 
-    # Training loop with loss visualization
     print("Starting training...")
     train_losses = []
-    val_losses = []
 
+    # Training loop
    for epoch in range(epochs):
         epoch_loss = 0
         model.train()
@@ -161,25 +174,20 @@ if __name__ == "__main__":
 
         train_loss = epoch_loss / len(train_loader)
         train_losses.append(train_loss)
+        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}")
 
-        # Evaluation during training
-        model.eval()
-        val_loss = 0
-        with torch.no_grad():
-            for batch in test_loader:
-                input_ids = batch['input_ids'].to(DEVICE)
-                labels = batch['labels'].unsqueeze(1).to(DEVICE)
-                outputs = model(input_ids)
-                loss = criterion(outputs, labels)
-                val_loss += loss.item()
+    # Visualize training loss
+    plt.figure(figsize=(10, 6))
+    plt.plot(range(1, epochs + 1), train_losses, marker='o', linestyle='-', label='Train Loss')
+    plt.xlabel('Epoch')
+    plt.ylabel('Loss')
+    plt.title('Training Loss Over Epochs')
+    plt.legend()
+    plt.grid(True)
+    plt.show()
 
-        val_loss /= len(test_loader)
-        val_losses.append(val_loss)
-
-        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
-
-    # Final evaluation
     print("Starting evaluation...")
+    # Evaluation
     model.eval()
     predictions, true_labels = [], []
     with torch.no_grad():
@@ -193,13 +201,3 @@ if __name__ == "__main__":
 
     accuracy = accuracy_score(true_labels, predictions)
     print(f"Final Accuracy: {accuracy:.4f}")
-
-    # Visualize Losses
-    # Visualize Losses with custom colors
-plt.plot(train_losses, label="Train Loss", color='#0032ff')  # Blue color for train loss
-plt.plot(val_losses, label="Validation Loss", color='#0fff00')  # Green color for validation loss
-plt.xlabel("Epochs")
-plt.ylabel("Loss")
-plt.title("Loss Curve")
-plt.legend()
-plt.show()
\ No newline at end of file