actually implemented that RL

parent a891d51ca9
commit a53583b1d7
@@ -1,8 +1,6 @@
 import pygame
-import random
 import math
 import reinforcement_learning as rl
-import time
 
 # Initialize pygame
 pygame.init()
@@ -125,19 +123,14 @@ def move_pacman(pacman, a):
 # Main game function
 def main():
     global labyrinth
-    clock = pygame.time.Clock()
 
-    # Initialize Pacman and Ghost positions
-    pacman = Pacman(1, 1)
-    ghost = Ghost(COLS - 2, ROWS - 2)
 
-    s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
     q = rl.q_init()
+    clock = pygame.time.Clock()
 
     # Game loop
     not_won = True
-    running = True
-    iter = 0
+    outer_iter = 0
     while not_won:
 
         labyrinth = [
@@ -147,20 +140,30 @@ def main():
            "#........#",
            "##########"
        ]
+        running = True
+        iter = 0
+
+        # Initialize Pacman and Ghost positions
+        pacman = Pacman(1, 1)
+        ghost = Ghost(COLS - 2, ROWS - 2)
+        s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
 
        # Handle events
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                not_won = False
 
-        while running:
+        print(outer_iter)
+        while running or iter < 100:
            screen.fill(BLACK)
            iter = iter + 1
 
            # Check for collisions (game over if ghost catches pacman)
            if pacman.x == ghost.x and pacman.y == ghost.y:
                print("Game Over! The ghost caught Pacman.")
+                outer_iter = outer_iter + 1
                running = False
+                break
 
            # Eat cookies
            if labyrinth[pacman.y][pacman.x] == ".":
@@ -171,25 +174,29 @@ def main():
                print("You Win! Pacman ate all the cookies.")
                running = False
                not_won = False
+                break
 
-            # Start of my code ######################################################################
+            # Q-Learning part ############################################################################
 
-            labyrinth_copy = [list(row) for row in labyrinth]  # Create proper deep copy
 
            a = rl.epsilon_greedy(q, s)  # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
-            s_new, r, labyrinth_copy = rl.take_action(s, a, labyrinth_copy)
+            s_new, r, labyrinth = rl.take_action(s, a, labyrinth)
+            # print(s)  # debugging
+            # print(q[s])  # debugging
 
-            q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth_copy) - q[s][a])
+            q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth) - q[s][a])
 
            s = s_new
 
-            # zumindest angeben wo der nächste punkt ist, ohne geist im zustand s.
-            # After everything was calculated; just move Pacman according to highest action a in Q-Table q.
            move_pacman(pacman, a)
 
            if iter % 3 == 0:
                # Ghost moves towards Pacman
                ghost.move_towards_pacman(pacman)
+            # Update state
+            s = (pacman.x, pacman.y, ghost.x, ghost.y)
 
+            # End of Q-Learning part ######################################################################
+
            # Draw the labyrinth, pacman, and ghost
            draw_labyrinth()
 
@@ -200,7 +207,9 @@ def main():
            pygame.display.flip()
 
            # Cap the frame rate
-            clock.tick(5)
+            # tick_speed = 100
+            tick_speed = 5 if outer_iter % 20 == 0 else 100
+            clock.tick(tick_speed)
 
    pygame.quit()
 
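For reference, the update applied inside the game loop above is the standard tabular Q-learning rule: the value of the chosen action is nudged toward the observed reward plus the discounted best value of the next state. Below is a minimal, self-contained sketch of that single step; the dictionary layout (state tuple mapped to a list of four action values) mirrors what q_init() builds, and ALPHA and GAMMA use the values set in the RL module, but the toy states and numbers are made up for illustration.

    # One tabular Q-learning step: Q(s, a) += ALPHA * (r + GAMMA * max_a' Q(s', a') - Q(s, a))
    ALPHA = 0.2   # learning rate, same value as in the diff
    GAMMA = 0.90  # discount factor, same value as in the diff

    # Toy Q-table: state tuple -> list of four action values (None would mark a wall-blocked action).
    q = {
        (1, 1, 8, 8): [0.0, 0.0, 0.0, 0.0],
        (2, 1, 8, 8): [0.5, 0.0, 0.0, 0.0],
    }
    s, a, r, s_new = (1, 1, 8, 8), 1, 1.0, (2, 1, 8, 8)  # took action 1 (right), got reward 1.0

    max_next = max(v for v in q[s_new] if v is not None)  # best value reachable from s_new
    q[s][a] += ALPHA * (r + GAMMA * max_next - q[s][a])
    print(q[s][a])  # 0.2 * (1.0 + 0.9 * 0.5 - 0.0) = 0.29, up to float rounding

The hunks that follow are from the reinforcement-learning module that main() imports as rl.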
@@ -6,7 +6,7 @@ ausweicht und somit vermeidet gefressen zu werden.
 """
 
 import numpy as np
-from collections import deque
+import random
 
 GAMMA = 0.90
 ALPHA = 0.2
@@ -16,7 +16,8 @@ def q_init():
 
    # Configuration
    NUM_ACTIONS = 4
-    INITIAL_Q_VALUE = 2.0  # Small value for initialization
+    RAND_Q_VALUES = [random.uniform(-0.1, 0.1), random.uniform(-0.1, 0.1), random.uniform(-0.1, 0.1), random.uniform(-0.1, 0.1)]
+    # print(RAND_Q_VALUES)  # debugging
 
    # Labyrinth layout
    labyrinth = [
@@ -50,7 +51,7 @@ def q_init():
 
                    # Assign all possible states a tuple of values
                    state_key = (s0, s1, s2, s3)
-                    q_values = [INITIAL_Q_VALUE] * NUM_ACTIONS
+                    q_values = RAND_Q_VALUES.copy()  # Create a copy for each state
 
                    # Check which actions are blocked by walls
                    # Action 0: move left (s0 - 1)
@@ -72,7 +73,7 @@ def q_init():
    # print(list(q_table.items())[:5])  # Uncomment to see the first 5 entries
    return q_table
 
-def epsilon_greedy(q, s, epsilon=0.025):
+def epsilon_greedy(q, s, epsilon=0.1):
    """
    Return which direction Pacman should move to using epsilon-greedy algorithm
    With probability epsilon, choose a random action. Otherwise choose the greedy action.
@@ -102,44 +103,15 @@ def epsilon_greedy(q, s, epsilon=0.025):
        elif a == 3:  # down
            s_test[1] += 1
 
-        # Check if this action would cause collision
-        if s_test[0] == s[2] and s_test[1] == s[3]:
-            continue  # Skip this action, try next highest Q-value
-
        return a
 
-def max_q(q, s_new, labyrinth, depth=0, max_depth=1):
-    """Calculate Q-values for all possible actions in state s_new and return the maximum"""
-    q_max = 0
-    for a in range(4):
-        if q[s_new][a] != None and s_new in q:  # Only consider valid (non-blocked) actions
-            s_test = tuple(list(s_new)[:2] + [s_new[2], s_new[3]])  # Keep ghost position
-            s_test_list = list(s_test)
-            if a == 0:  # left
-                s_test_list[0] -= 1
-            elif a == 1:  # right
-                s_test_list[0] += 1
-            elif a == 2:  # up
-                s_test_list[1] -= 1
-            elif a == 3:  # down
-                s_test_list[1] += 1
-            s_test = tuple(s_test_list)
-
-            if s_test in q and depth < max_depth:
-                q[s_new][a] += ALPHA * (calc_reward(s_test, labyrinth) + GAMMA * max_q(q, s_test, labyrinth, depth + 1, max_depth) - q[s_new][a])
-            q_max = max(q_max, q[s_new][a])
-
-    return q_max
-
 def calc_reward(s_new, labyrinth):
 
    # Reward for cookies; punish for not eating cookies
    r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -1.0
 
    return r
 
 def take_action(s, a, labyrinth):
-    # Use the labyrinth parameter (already updated from previous iterations)
    s_new = list(s)
    if a == 0:  # left
        s_new[0] -= 1
@@ -150,10 +122,30 @@ def take_action(s, a, labyrinth):
    if a == 3:  # down
        s_new[1] += 1
 
-    # Mark new Pacman position as eaten (if it's a cookie)
-    if labyrinth[s_new[1]][s_new[0]] == ".":
-        labyrinth[s_new[1]][s_new[0]] = " "
-    r = calc_reward(tuple(s_new), labyrinth)
+    # Check if action caused gameover (Pacman caught by ghost)
+    if s_new[0] == s_new[2] and s_new[1] == s_new[3]:
+        r = -100.0
+        print("Invalid action type shit")
+    else:
+        r = calc_reward(tuple(s_new), labyrinth)
+
+        # Mark new Pacman position as eaten (if it's a cookie)
+        if labyrinth[s_new[1]][s_new[0]] == ".":
+            # Convert string row to list, modify it, then convert back to string
+            row_list = list(labyrinth[s_new[1]])
+            row_list[s_new[0]] = " "
+            labyrinth[s_new[1]] = "".join(row_list)
 
    return tuple(s_new), r, labyrinth
 
+def max_q(q, s_new, labyrinth):
+    """Return the maximum Q-value among valid actions in state s_new"""
+    if s_new not in q:
+        return 0
+
+    q_max = 0
+    for a in range(4):
+        if q[s_new][a] is not None:  # Only consider valid (non-blocked) actions
+            q_max = max(q_max, q[s_new][a])
+
+    return q_max
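One detail worth noting in the rewritten take_action: the commit stops working on a per-frame list-of-lists copy of the maze and mutates the shared labyrinth directly, whose rows are strings. Since Python strings are immutable, eating a cookie now rebuilds the affected row instead of assigning into it. A standalone illustration of that idiom, with a made-up 3x3 maze:

    labyrinth = [
        "###",
        "#.#",
        "###",
    ]
    x, y = 1, 1  # cell Pacman just moved onto

    # labyrinth[y][x] = " " would raise TypeError: strings don't support item assignment.
    if labyrinth[y][x] == ".":
        row = list(labyrinth[y])      # "#.#" -> ['#', '.', '#']
        row[x] = " "                  # eat the cookie
        labyrinth[y] = "".join(row)   # write the rebuilt row back

    print(labyrinth)  # ['###', '# #', '###']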