diff --git a/cnn.py b/cnn.py
index 25c0442..41f38e2 100644
--- a/cnn.py
+++ b/cnn.py
@@ -8,18 +8,26 @@ from nltk.tokenize import word_tokenize
 import gensim
 from torch.utils.data import DataLoader, Dataset
 from sklearn.metrics import accuracy_score
-import time
+import matplotlib.pyplot as plt
 
-# Check if GPU is available
-DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# NLTK downloads
+import nltk
+nltk.download('punkt_tab')
+nltk.download('punkt')
+
+# Check if GPU is available (CUDA for NVIDIA or MPS for Apple devices)
+if torch.cuda.is_available():
+    DEVICE = torch.device('cuda')  # Use CUDA if available
+elif torch.backends.mps.is_available():
+    DEVICE = torch.device('mps')  # Use MPS if available
+else:
+    DEVICE = torch.device('cpu')  # Default to CPU if no GPU support is available
+
 print('Using device:', DEVICE)
 
 # Maximum sequence length
 MAX_LEN = 100
 
-# NLTK downloads
-import nltk
-nltk.download('punkt')
 
 # Data processing helpers
 def get_embedding(model, word):
@@ -78,7 +86,7 @@ class CNNBinaryClassifier(nn.Module):
 # Main
 if __name__ == "__main__":
     # Load and process data
-    df = pd.read_csv('data/hack.csv') # Ensure this file exists
+    df = pd.read_csv('ANLP_WS24_CA2/data/hack.csv') # Ensure this file exists
     print(f"Loaded dataset: {df.shape}")
 
     X = df['text']
@@ -112,7 +120,7 @@ if __name__ == "__main__":
     # Model parameters
     vocab_size = len(model_embedding.wv.key_to_index)
     embed_dim = model_embedding.vector_size
-    num_filters = 100
+    num_filters = 200
     kernel_sizes = [3, 4, 5]
     hidden_dim = 128
     dropout = 0.5
@@ -133,11 +141,14 @@ if __name__ == "__main__":
     # Move model to device
     model.to(DEVICE)
 
-    # Training loop
+    # Training loop with loss visualization
     print("Starting training...")
-    model.train()
+    train_losses = []
+    val_losses = []
+
     for epoch in range(epochs):
         epoch_loss = 0
+        model.train()
         for batch in train_loader:
             optimizer.zero_grad()
             input_ids = batch['input_ids'].to(DEVICE)
@@ -147,9 +158,27 @@ if __name__ == "__main__":
             loss.backward()
             optimizer.step()
             epoch_loss += loss.item()
-        print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_loader):.4f}")
 
-    # Evaluation loop
+        train_loss = epoch_loss / len(train_loader)
+        train_losses.append(train_loss)
+
+        # Evaluation during training
+        model.eval()
+        val_loss = 0
+        with torch.no_grad():
+            for batch in test_loader:
+                input_ids = batch['input_ids'].to(DEVICE)
+                labels = batch['labels'].unsqueeze(1).to(DEVICE)
+                outputs = model(input_ids)
+                loss = criterion(outputs, labels)
+                val_loss += loss.item()
+
+        val_loss /= len(test_loader)
+        val_losses.append(val_loss)
+
+        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
+
+    # Final evaluation
     print("Starting evaluation...")
     model.eval()
     predictions, true_labels = [], []
@@ -158,9 +187,18 @@ if __name__ == "__main__":
             input_ids = batch['input_ids'].to(DEVICE)
             labels = batch['labels'].unsqueeze(1).to(DEVICE)
             outputs = model(input_ids)
-            preds = outputs.round()
+            preds = (outputs > 0.5).float()
             predictions.extend(preds.cpu().numpy())
             true_labels.extend(labels.cpu().numpy())
 
     accuracy = accuracy_score(true_labels, predictions)
-    print(f"Accuracy: {accuracy}")
+    print(f"Final Accuracy: {accuracy:.4f}")
+
+    # Visualize Losses
+    plt.plot(train_losses, label="Train Loss")
+    plt.plot(val_losses, label="Validation Loss")
+    plt.xlabel("Epochs")
+    plt.ylabel("Loss")
+    plt.title("Loss Curve")
+    plt.legend()
+    plt.show()
\ No newline at end of file
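
For reference, here is a minimal, self-contained sketch (not part of cnn.py) of the two patterns this diff introduces: the CUDA -> MPS -> CPU device fallback and per-epoch train/validation loss tracking. The toy linear model and random tensors below are illustrative placeholders only, standing in for the real CNN and embedded text data.

# Standalone sketch of the device fallback and loss-tracking loop from the diff.
# The model, data, and hyperparameters here are placeholders, not the ones in cnn.py.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Device fallback: prefer CUDA, then Apple MPS, then CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

# Dummy binary-classification data (placeholder for the real embedded sequences)
X = torch.randn(256, 16)
y = (X.sum(dim=1, keepdim=True) > 0).float()
train_loader = DataLoader(TensorDataset(X[:200], y[:200]), batch_size=32, shuffle=True)
val_loader = DataLoader(TensorDataset(X[200:], y[200:]), batch_size=32)

model = nn.Sequential(nn.Linear(16, 8), nn.ReLU(), nn.Linear(8, 1), nn.Sigmoid()).to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train_losses, val_losses = [], []
for epoch in range(5):
    # Training pass: accumulate the average loss over all batches
    model.train()
    epoch_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    train_losses.append(epoch_loss / len(train_loader))

    # Validation pass: eval mode, no gradients
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            val_loss += criterion(model(xb), yb).item()
    val_losses.append(val_loss / len(val_loader))

    print(f"Epoch {epoch + 1}: train={train_losses[-1]:.4f}, val={val_losses[-1]:.4f}")

On a machine without CUDA or MPS the same script runs unchanged on the CPU, which is the point of the fallback chain; the two loss lists can then be plotted exactly as in the final block of the diff.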