Added Plot

main
Michelle Goeppinger 2025-01-26 12:55:48 +01:00
parent d25573c5c2
commit 8ea3305644
1 changed file with 52 additions and 14 deletions

cnn.py

@@ -8,18 +8,26 @@ from nltk.tokenize import word_tokenize
 import gensim
 from torch.utils.data import DataLoader, Dataset
 from sklearn.metrics import accuracy_score
+import time
+import matplotlib.pyplot as plt
+
+# NLTK downloads
+import nltk
+nltk.download('punkt_tab')
+nltk.download('punkt')
 
-# Check if GPU is available
-DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# Check if GPU is available (CUDA for NVIDIA or MPS for Apple devices)
+if torch.cuda.is_available():
+    DEVICE = torch.device('cuda')  # Use CUDA if available
+elif torch.backends.mps.is_available():
+    DEVICE = torch.device('mps')  # Use MPS if available
+else:
+    DEVICE = torch.device('cpu')  # Default to CPU if no GPU support is available
 print('Using device:', DEVICE)
 
 # Maximum sequence length
 MAX_LEN = 100
 
-# NLTK downloads
-import nltk
-nltk.download('punkt')
 
 # Data processing helpers
 def get_embedding(model, word):
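
Two notes on this hunk. Downloading both punkt and punkt_tab keeps word_tokenize working across NLTK versions (newer NLTK releases look for punkt_tab where older ones used punkt). And torch.backends.mps only exists in PyTorch 1.12 and later, so the elif line can raise AttributeError on older installs. A defensive variant, as a sketch (the getattr guard is my addition, not part of this commit):

    # Sketch: device selection that also tolerates PyTorch builds without the MPS backend
    if torch.cuda.is_available():
        DEVICE = torch.device('cuda')
    elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
        DEVICE = torch.device('mps')  # Apple Silicon GPU
    else:
        DEVICE = torch.device('cpu')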
@@ -78,7 +86,7 @@ class CNNBinaryClassifier(nn.Module):
 # Main
 if __name__ == "__main__":
     # Load and process data
-    df = pd.read_csv('data/hack.csv')  # Ensure this file exists
+    df = pd.read_csv('ANLP_WS24_CA2/data/hack.csv')  # Ensure this file exists
     print(f"Loaded dataset: {df.shape}")
 
     X = df['text']
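
The new hardcoded path ANLP_WS24_CA2/data/hack.csv only resolves when the script is launched from the directory above the repo. A sketch that locates the CSV relative to cnn.py instead, so the working directory stops mattering (DATA_PATH is a name introduced here, not from the commit):

    # Sketch: resolve the data file relative to this script
    from pathlib import Path

    DATA_PATH = Path(__file__).resolve().parent / 'data' / 'hack.csv'
    df = pd.read_csv(DATA_PATH)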
@@ -112,7 +120,7 @@ if __name__ == "__main__":
     # Model parameters
     vocab_size = len(model_embedding.wv.key_to_index)
     embed_dim = model_embedding.vector_size
-    num_filters = 100
+    num_filters = 200
     kernel_sizes = [3, 4, 5]
     hidden_dim = 128
     dropout = 0.5
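
For a sense of what doubling num_filters costs, assuming the usual text-CNN layout (one convolution per kernel size over embed_dim-wide embeddings; the diff doesn't show the layer definitions, so this layout is an assumption):

    # Sketch: rough weight count for the convolution branches under the assumptions above
    embed_dim = 100      # hypothetical; the real value is model_embedding.vector_size
    num_filters = 200
    kernel_sizes = [3, 4, 5]
    conv_params = sum(num_filters * (k * embed_dim + 1) for k in kernel_sizes)
    print(conv_params)   # 240600 under these assumptions, vs 120300 with num_filters = 100

The concatenated feature vector after max-pooling also doubles (3 kernel sizes x num_filters, so 300 to 600), which grows the first dense layer proportionally.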
@@ -133,11 +141,14 @@ if __name__ == "__main__":
     # Move model to device
     model.to(DEVICE)
 
-    # Training loop
+    # Training loop with loss visualization
     print("Starting training...")
     model.train()
+    train_losses = []
+    val_losses = []
     for epoch in range(epochs):
         epoch_loss = 0
+        model.train()
         for batch in train_loader:
             optimizer.zero_grad()
             input_ids = batch['input_ids'].to(DEVICE)
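
The model.train() added inside the epoch loop is what keeps the new per-epoch validation correct: the next hunk switches the model to eval() for the validation pass, so train mode (active dropout) has to be restored at the top of each epoch. The pre-loop model.train() that remains above is now redundant but harmless.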
@@ -147,9 +158,27 @@ if __name__ == "__main__":
             loss.backward()
             optimizer.step()
             epoch_loss += loss.item()
-        print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_loader):.4f}")
+        train_loss = epoch_loss / len(train_loader)
+        train_losses.append(train_loss)
+
+        # Evaluation during training
+        model.eval()
+        val_loss = 0
+        with torch.no_grad():
+            for batch in test_loader:
+                input_ids = batch['input_ids'].to(DEVICE)
+                labels = batch['labels'].unsqueeze(1).to(DEVICE)
+                outputs = model(input_ids)
+                loss = criterion(outputs, labels)
+                val_loss += loss.item()
+        val_loss /= len(test_loader)
+        val_losses.append(val_loss)
+        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
 
-    # Evaluation loop
+    # Final evaluation
     print("Starting evaluation...")
     model.eval()
     predictions, true_labels = [], []
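
Two observations on this hunk. First, the per-epoch "validation" pass iterates test_loader, so the plotted validation curve is really test loss; with a separate validation split, the test set would stay untouched until the end. Second, the same evaluation loop now appears twice (here and in the final evaluation below). A sketch of a shared helper that both call sites could use (evaluate is a name introduced here, not from the commit; it assumes criterion and DEVICE as defined in the script):

    # Sketch: average loss over a DataLoader, with gradients and dropout disabled
    def evaluate(model, loader, criterion):
        model.eval()
        total_loss = 0.0
        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(DEVICE)
                labels = batch['labels'].unsqueeze(1).to(DEVICE)
                outputs = model(input_ids)
                total_loss += criterion(outputs, labels).item()
        return total_loss / len(loader)

    # inside the epoch loop:
    # val_losses.append(evaluate(model, test_loader, criterion))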
@@ -158,9 +187,18 @@ if __name__ == "__main__":
             input_ids = batch['input_ids'].to(DEVICE)
             labels = batch['labels'].unsqueeze(1).to(DEVICE)
             outputs = model(input_ids)
-            preds = outputs.round()
+            preds = (outputs > 0.5).float()
             predictions.extend(preds.cpu().numpy())
             true_labels.extend(labels.cpu().numpy())
 
     accuracy = accuracy_score(true_labels, predictions)
-    print(f"Accuracy: {accuracy}")
+    print(f"Final Accuracy: {accuracy:.4f}")
+
+    # Visualize Losses
+    plt.plot(train_losses, label="Train Loss")
+    plt.plot(val_losses, label="Validation Loss")
+    plt.xlabel("Epochs")
+    plt.ylabel("Loss")
+    plt.title("Loss Curve")
+    plt.legend()
+    plt.show()
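
One caveat on the new plot: plt.show() needs an interactive backend and a display, so on a headless machine (a remote GPU server, say) the figure is silently lost. Writing it to disk first is a cheap safeguard (loss_curve.png is a filename chosen here, not from the commit):

    # Sketch: persist the loss curve before (or instead of) showing it
    plt.savefig('loss_curve.png', dpi=150, bbox_inches='tight')
    plt.show()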