commencing with actual reinforcement learning

2025-12-01 15:16:34 +01:00 · 2025-12-01 15:16:34 +01:00 · a891d51ca9
parent 48a351518d
commit a891d51ca9
2 changed files with 48 additions and 53 deletions
--- a/04_pacman_rl/pacman.py
+++ b/04_pacman_rl/pacman.py
@@ -135,15 +135,27 @@ def main():
    q = rl.q_init()
    # Game loop
    not_won = True
    running = True
    iter = 0
-    while running:
+    while not_won:
-        screen.fill(BLACK)
+        
-        iter = iter + 1
+        labyrinth = [
            "##########",
            "#........#",
            "#.##..##.#",
            "#........#",
            "##########"
        ]
        # Handle events
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
-                running = False
+                not_won = False
        while running:
            screen.fill(BLACK)
            iter = iter + 1
            # Check for collisions (game over if ghost catches pacman)
            if pacman.x == ghost.x and pacman.y == ghost.y:
@@ -158,38 +170,21 @@ def main():
            if all("." not in row for row in labyrinth):
                print("You Win! Pacman ate all the cookies.")
                running = False
                not_won = False
            # Start of my code ######################################################################
        # Start of my code
            labyrinth_copy = [list(row) for row in labyrinth]  # Create proper deep copy            
        s_not_terminal = True
        a = None
        iteration = 0
        max_iterations = 50  # Prevent infinite loops
        while s_not_terminal and iteration < max_iterations:
            iteration += 1
            # print("s: " + str(s)) # debugging
            # print("q[s] before action: " + str(q[s])) # debugging
            a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
            s_new, r, labyrinth_copy = rl.take_action(s, a, labyrinth_copy)
            q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth_copy) - q[s][a])
            if all("." not in row for row in labyrinth_copy):
                s_not_terminal = False
                q[s][a] = 10.0
                print("There is a parallel universe with victory")
            s = s_new
            time.sleep(0.025)
-        if iteration >= max_iterations:
+            # At least indicate where the next dot (cookie) is, without the ghost in state s.
-            print(f"Max iterations reached for this loop ")
+            # After everything has been calculated, just move Pacman according to the highest-valued action a in Q-table q.
        s = (pacman.x, pacman.y, ghost.x, ghost.y) # as a tuple so the state becomes hashable
        a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
            move_pacman(pacman, a)
            if iter%3==0:
--- a/04_pacman_rl/reinforcement_learning.py
+++ b/04_pacman_rl/reinforcement_learning.py
@@ -108,7 +108,7 @@ def epsilon_greedy(q, s, epsilon=0.025):
            return a    
-def max_q(q, s_new, labyrinth, depth=0, max_depth=2):
+def max_q(q, s_new, labyrinth, depth=0, max_depth=1):
    """Calculate Q-values for all possible actions in state s_new and return the maximum"""
    q_max = 0
    for a in range(4):
@@ -133,7 +133,7 @@ def max_q(q, s_new, labyrinth, depth=0, max_depth=2):
 def calc_reward(s_new, labyrinth):
-    # Reward for cookies
+    # Reward +1.0 for moving onto a cookie ("."); penalize any other cell with -1.0
    r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -1.0
    return r