From 8049bfe29ff423abf346c43a02b82ec150410ba9 Mon Sep 17 00:00:00 2001 From: Ruben Seitz Date: Mon, 24 Nov 2025 21:56:57 +0100 Subject: [PATCH] removed 0's; set q[s][a]=-10 at the right place --- 04_pacman_rl/pacman.py | 10 ++--- 04_pacman_rl/reinforcement_learning.py | 55 +++++++++++++------------- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/04_pacman_rl/pacman.py b/04_pacman_rl/pacman.py index cb5bee8..ce62ffc 100644 --- a/04_pacman_rl/pacman.py +++ b/04_pacman_rl/pacman.py @@ -175,8 +175,6 @@ def main(): s_new, r, labyrinth_copy = rl.take_action(s, a, labyrinth_copy) q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth_copy) - q[s][a]) - - s = s_new if all("." not in row for row in labyrinth_copy): s_not_terminal = False @@ -185,11 +183,11 @@ # Check for collisions (game over if ghost catches pacman) if s[0] == s[2] and s[1] == s[3]: s_not_terminal = False - q[s][a] = 0.01 - print("There was just a collision!!!") - print("s: " + str(s)) - print("Crashed values now q[s]: " + str(q[s])) + q[s][a] = -10.0 + # print("Collision at s!!! s: " + str(s)) # debugging + print("Crashed values now q[s]: " + str(q[s])) # debugging + s = s_new time.sleep(0.025) if iteration >= max_iterations: diff --git a/04_pacman_rl/reinforcement_learning.py b/04_pacman_rl/reinforcement_learning.py index ea2c851..f2ac958 100644 --- a/04_pacman_rl/reinforcement_learning.py +++ b/04_pacman_rl/reinforcement_learning.py @@ -55,16 +55,16 @@ def q_init(): # Check which actions are blocked by walls # Action 0: move left (s0 - 1) if labyrinth[s1][s0 - 1] == "#": - q_values[0] = 0 + q_values[0] = None # Action 1: move right (s0 + 1) if labyrinth[s1][s0 + 1] == "#": - q_values[1] = 0 + q_values[1] = None # Action 2: move up (s1 - 1) if labyrinth[s1 - 1][s0] == "#": - q_values[2] = 0 + q_values[2] = None # Action 3: move down (s1 + 1) if labyrinth[s1 + 1][s0] == "#": - q_values[3] = 0 + q_values[3] = None q_table[state_key] = q_values @@ -72,7 +72,7 @@ # print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries return q_table -def epsilon_greedy(q, s, epsilon=0.1): +def epsilon_greedy(q, s, epsilon=0.2): """ Return which direction Pacman should move to using epsilon-greedy algorithm With probability epsilon, choose a random action. Otherwise choose the greedy action. @@ -80,35 +80,34 @@ Never allows Pacman to move backwards (opposite direction).
""" - """ - q_max = max(q[s]) + q_max = max(x for x in q[s] if isinstance(x, (int, float))) a = q[s].index(q_max) return a - """ - if np.random.random() < epsilon: - # Explore: choose random action (excluding blocked actions with Q=0) - valid_actions = [i for i in range(len(q[s])) if q[s][i] > 0] - if valid_actions: - return np.random.choice(valid_actions) - else: - return np.random.randint(0, len(q[s])) - else: - # Exploit: choose best action - valid_q_values = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] > 0] - if valid_q_values: - # Get max Q-value among valid actions - best_action = max(valid_q_values, key=lambda x: x[1])[0] - return best_action - else: - return 0 + # if np.random.random() < epsilon: + # # Explore: choose random action (excluding blocked actions with Q=0) + # valid_actions = [i for i in range(len(q[s])) if q[s][i] != None] + # if valid_actions: + # return np.random.choice(valid_actions) + # else: + # return np.random.randint(0, len(q[s])) + # else: + # # Exploit: choose best action + # valid_q_values = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] != None] + # if valid_q_values: + # # Get max Q-value among valid actions + # best_action = max(valid_q_values, key=lambda x: x[1])[0] + # return best_action + # else: + # return 0 + -def max_q(q, s_new, labyrinth, depth=0, max_depth=2): +def max_q(q, s_new, labyrinth, depth=0, max_depth=4): """Calculate Q-values for all possible actions in state s_new and return the maximum""" - q_max = 0.01 + q_max = 0 for a in range(4): - if q[s_new][a] > 0: # Only consider valid (non-blocked) actions + if q[s_new][a] != None and s_new in q: # Only consider valid (non-blocked) actions s_test = tuple(list(s_new)[:2] + [s_new[2], s_new[3]]) # Keep ghost position s_test_list = list(s_test) if a == 0: # left @@ -147,7 +146,7 @@ """ # Reward for cookies - r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -2.0 + r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -1.0 return r