From 8049bfe29ff423abf346c43a02b82ec150410ba9 Mon Sep 17 00:00:00 2001 From: Ruben Seitz Date: Mon, 24 Nov 2025 21:56:57 +0100 Subject: [PATCH] removed 0's; set q[s][a]=-10 at the right place --- 04_pacman_rl/pacman.py | 10 ++--- 04_pacman_rl/reinforcement_learning.py | 55 +++++++++++++------------- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/04_pacman_rl/pacman.py b/04_pacman_rl/pacman.py index cb5bee8..ce62ffc 100644 --- a/04_pacman_rl/pacman.py +++ b/04_pacman_rl/pacman.py @@ -175,8 +175,6 @@ def main(): s_new, r, labyrinth_copy = rl.take_action(s, a, labyrinth_copy) q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth_copy) - q[s][a]) - - s = s_new if all("." not in row for row in labyrinth_copy): s_not_terminal = False @@ -185,11 +183,11 @@ # Check for collisions (game over if ghost catches pacman) if s[0] == s[2] and s[1] == s[3]: s_not_terminal = False - q[s][a] = 0.01 - print("There was just a collision!!!") - print("s: " + str(s)) - print("Crashed values now q[s]: " + str(q[s])) + q[s][a] = -10.0 + # print("Collision at s!!! s: " + str(s)) # debugging + print("Crashed values now q[s]: " + str(q[s])) # debugging + s = s_new time.sleep(0.025) if iteration >= max_iterations: diff --git a/04_pacman_rl/reinforcement_learning.py b/04_pacman_rl/reinforcement_learning.py index ea2c851..f2ac958 100644 --- a/04_pacman_rl/reinforcement_learning.py +++ b/04_pacman_rl/reinforcement_learning.py @@ -55,16 +55,16 @@ def q_init(): # Check which actions are blocked by walls # Action 0: move left (s0 - 1) if labyrinth[s1][s0 - 1] == "#": - q_values[0] = 0 + q_values[0] = None # Action 1: move right (s0 + 1) if labyrinth[s1][s0 + 1] == "#": - q_values[1] = 0 + q_values[1] = None # Action 2: move up (s1 - 1) if labyrinth[s1 - 1][s0] == "#": - q_values[2] = 0 + q_values[2] = None # Action 3: move down (s1 + 1) if labyrinth[s1 + 1][s0] == "#": - q_values[3] = 0 + q_values[3] = None q_table[state_key] = q_values @@ -72,7 +72,7 @@ # print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries return q_table -def epsilon_greedy(q, s, epsilon=0.1): +def epsilon_greedy(q, s, epsilon=0.2): """ Return which direction Pacman should move to using epsilon-greedy algorithm With probability epsilon, choose a random action. Otherwise choose the greedy action. @@ -80,35 +80,34 @@ Never allows Pacman to move backwards (opposite direction).
""" - """ - q_max = max(q[s]) + q_max = max(x for x in q[s] if isinstance(x, (int, float))) a = q[s].index(q_max) return a - """ - if np.random.random() < epsilon: - # Explore: choose random action (excluding blocked actions with Q=0) - valid_actions = [i for i in range(len(q[s])) if q[s][i] > 0] - if valid_actions: - return np.random.choice(valid_actions) - else: - return np.random.randint(0, len(q[s])) - else: - # Exploit: choose best action - valid_q_values = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] > 0] - if valid_q_values: - # Get max Q-value among valid actions - best_action = max(valid_q_values, key=lambda x: x[1])[0] - return best_action - else: - return 0 + # if np.random.random() < epsilon: + # # Explore: choose random action (excluding blocked actions with Q=0) + # valid_actions = [i for i in range(len(q[s])) if q[s][i] != None] + # if valid_actions: + # return np.random.choice(valid_actions) + # else: + # return np.random.randint(0, len(q[s])) + # else: + # # Exploit: choose best action + # valid_q_values = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] != None] + # if valid_q_values: + # # Get max Q-value among valid actions + # best_action = max(valid_q_values, key=lambda x: x[1])[0] + # return best_action + # else: + # return 0 + -def max_q(q, s_new, labyrinth, depth=0, max_depth=2): +def max_q(q, s_new, labyrinth, depth=0, max_depth=4): """Calculate Q-values for all possible actions in state s_new and return the maximum""" - q_max = 0.01 + q_max = 0 for a in range(4): - if q[s_new][a] > 0: # Only consider valid (non-blocked) actions + if q[s_new][a] != None and s_new in q: # Only consider valid (non-blocked) actions s_test = tuple(list(s_new)[:2] + [s_new[2], s_new[3]]) # Keep ghost position s_test_list = list(s_test) if a == 0: # left @@ -147,7 +146,7 @@ """ # Reward for cookies - r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -2.0 + r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -1.0 return r