actually implemented that RL

parent a891d51ca9
commit a53583b1d7
@@ -1,8 +1,6 @@
 import pygame
 import random
 import math
 import reinforcement_learning as rl
 import time
 
 # Initialize pygame
 pygame.init()

@@ -125,19 +123,14 @@ def move_pacman(pacman, a):
 # Main game function
 def main():
     global labyrinth
     clock = pygame.time.Clock()
 
     # Initialize Pacman and Ghost positions
     pacman = Pacman(1, 1)
     ghost = Ghost(COLS - 2, ROWS - 2)
 
     s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
     q = rl.q_init()
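     # q maps each state tuple (pacman_x, pacman_y, ghost_x, ghost_y) to a list of four
     # action values, indexed 0 = left, 1 = right, 2 = up, 3 = down.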
 
     clock = pygame.time.Clock()
 
     # Game loop
     not_won = True
     running = True
     iter = 0
     outer_iter = 0
 
     while not_won:
 
         labyrinth = [

@@ -147,20 +140,30 @@ def main():
             "#........#",
             "##########"
         ]
         running = True
         iter = 0
 
         # Initialize Pacman and Ghost positions
         pacman = Pacman(1, 1)
         ghost = Ghost(COLS - 2, ROWS - 2)
         s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
 
         # Handle events
         for event in pygame.event.get():
             if event.type == pygame.QUIT:
                 not_won = False
 
-        while running:
+        print(outer_iter)
+        while running or iter < 100:
             screen.fill(BLACK)
             iter = iter + 1
 
             # Check for collisions (game over if ghost catches pacman)
             if pacman.x == ghost.x and pacman.y == ghost.y:
                 print("Game Over! The ghost caught Pacman.")
                 outer_iter = outer_iter + 1
                 running = False
                 break
 
             # Eat cookies
             if labyrinth[pacman.y][pacman.x] == ".":

@@ -171,25 +174,29 @@ def main():
                 print("You Win! Pacman ate all the cookies.")
                 running = False
                 not_won = False
                 break
 
             # Start of my code ######################################################################
 
-            labyrinth_copy = [list(row) for row in labyrinth]  # Create proper deep copy
             # Q-Learning part ############################################################################
 
             a = rl.epsilon_greedy(q, s)  # 0 = Left; 1 = Right; 2 = Up; 3 = Down
-            s_new, r, labyrinth_copy = rl.take_action(s, a, labyrinth_copy)
+            s_new, r, labyrinth = rl.take_action(s, a, labyrinth)
             # print(s)  # debugging
             # print(q[s])  # debugging
 
-            q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth_copy) - q[s][a])
+            q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth) - q[s][a])
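             # Tabular Q-learning update: move Q(s, a) toward r + GAMMA * max_a' Q(s', a'),
             # with ALPHA as the learning rate and GAMMA as the discount factor.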
 
             s = s_new
 
             # At least indicate where the next dot is, without the ghost in state s.
             # After everything was calculated, just move Pacman according to the highest action a in Q-Table q.
 
             move_pacman(pacman, a)
 
-            if iter%3==0:
+            if iter % 3 == 0:
                 # Ghost moves towards Pacman
                 ghost.move_towards_pacman(pacman)
             # Update state
             s = (pacman.x, pacman.y, ghost.x, ghost.y)
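             # Re-sync s with the actual on-screen positions before the next step, since the
             # ghost only moves every third iteration and may have changed the state.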
 
             # End of Q-Learning part ######################################################################
 
             # Draw the labyrinth, pacman, and ghost
             draw_labyrinth()

@@ -200,7 +207,9 @@ def main():
             pygame.display.flip()
 
             # Cap the frame rate
-            clock.tick(5)
+            # tick_speed = 100
+            tick_speed = 5 if outer_iter % 20 == 0 else 100
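             # Every 20th episode runs at a watchable 5 FPS; all other episodes run at 100 FPS to speed up training.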
+            clock.tick(tick_speed)
 
     pygame.quit()

@@ -6,7 +6,7 @@ dodges and thus avoids being eaten.
 """
 
 import numpy as np
 from collections import deque
 import random
 
 GAMMA = 0.90
 ALPHA = 0.2

@@ -16,7 +16,8 @@ def q_init():
 
     # Configuration
     NUM_ACTIONS = 4
     INITIAL_Q_VALUE = 2.0  # Small value for initialization
     RAND_Q_VALUES = [random.uniform(-0.1, 0.1), random.uniform(-0.1, 0.1), random.uniform(-0.1, 0.1), random.uniform(-0.1, 0.1)]
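     # Note: these four values are drawn once per q_init() call and then copied for every
     # state, so every state starts from the same small random action values.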
     # print(RAND_Q_VALUES)  # debugging
 
     # Labyrinth layout
     labyrinth = [

@@ -50,7 +51,7 @@ def q_init():
 
     # Assign all possible states a tuple of values
     state_key = (s0, s1, s2, s3)
-    q_values = [INITIAL_Q_VALUE] * NUM_ACTIONS
+    q_values = RAND_Q_VALUES.copy()  # Create a copy for each state
 
     # Check which actions are blocked by walls
     # Action 0: move left (s0 - 1)

@@ -72,7 +73,7 @@ def q_init():
     # print(list(q_table.items())[:5])  # Uncomment to see the first 5 entries
     return q_table
 
-def epsilon_greedy(q, s, epsilon=0.025):
+def epsilon_greedy(q, s, epsilon=0.1):
     """
     Return which direction Pacman should move to using epsilon-greedy algorithm
     With probability epsilon, choose a random action. Otherwise choose the greedy action.
@@ -102,44 +103,15 @@ def epsilon_greedy(q, s, epsilon=0.025):
         elif a == 3:  # down
             s_test[1] += 1
 
         # Check if this action would cause collision
         if s_test[0] == s[2] and s_test[1] == s[3]:
             continue  # Skip this action, try next highest Q-value
 
         return a
 
-def max_q(q, s_new, labyrinth, depth=0, max_depth=1):
-    """Calculate Q-values for all possible actions in state s_new and return the maximum"""
-    q_max = 0
-    for a in range(4):
-        if q[s_new][a] != None and s_new in q:  # Only consider valid (non-blocked) actions
-            s_test = tuple(list(s_new)[:2] + [s_new[2], s_new[3]])  # Keep ghost position
-            s_test_list = list(s_test)
-            if a == 0:  # left
-                s_test_list[0] -= 1
-            elif a == 1:  # right
-                s_test_list[0] += 1
-            elif a == 2:  # up
-                s_test_list[1] -= 1
-            elif a == 3:  # down
-                s_test_list[1] += 1
-            s_test = tuple(s_test_list)
-
-            if s_test in q and depth < max_depth:
-                q[s_new][a] += ALPHA * (calc_reward(s_test, labyrinth) + GAMMA * max_q(q, s_test, labyrinth, depth + 1, max_depth) - q[s_new][a])
-            q_max = max(q_max, q[s_new][a])
-
-    return q_max
 
 def calc_reward(s_new, labyrinth):
 
     # Reward for cookies; punish for not eating cookies
     r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -1.0
 
     return r
 
 def take_action(s, a, labyrinth):
     # Use the labyrinth parameter (already updated from previous iterations)
     s_new = list(s)
     if a == 0:  # left
         s_new[0] -= 1

@@ -150,10 +122,30 @@ def take_action(s, a, labyrinth):
     if a == 3:  # down
         s_new[1] += 1
 
+    # Check if action caused gameover (Pacman caught by ghost)
+    if s_new[0] == s_new[2] and s_new[1] == s_new[3]:
+        r = -100.0
+        print("Invalid action type shit")
+    else:
+        r = calc_reward(tuple(s_new), labyrinth)
+
     # Mark new Pacman position as eaten (if it's a cookie)
     if labyrinth[s_new[1]][s_new[0]] == ".":
-        labyrinth[s_new[1]][s_new[0]] = " "
-
-    r = calc_reward(tuple(s_new), labyrinth)
+        # Convert string row to list, modify it, then convert back to string
+        row_list = list(labyrinth[s_new[1]])
+        row_list[s_new[0]] = " "
+        labyrinth[s_new[1]] = "".join(row_list)
 
     return tuple(s_new), r, labyrinth
 
+def max_q(q, s_new, labyrinth):
+    """Return the maximum Q-value among valid actions in state s_new"""
+    if s_new not in q:
+        return 0
+
+    q_max = 0
+    for a in range(4):
+        if q[s_new][a] is not None:  # Only consider valid (non-blocked) actions
+            q_max = max(q_max, q[s_new][a])
+
+    return q_max