From 48a351518d716994aab918fc35f88d5ded224874 Mon Sep 17 00:00:00 2001 From: Ruben Seitz Date: Thu, 27 Nov 2025 16:09:47 +0100 Subject: [PATCH] good enough i guess --- 04_pacman_rl/reinforcement_learning.py | 58 +++++++++++++------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/04_pacman_rl/reinforcement_learning.py b/04_pacman_rl/reinforcement_learning.py index 279fcb1..a747b91 100644 --- a/04_pacman_rl/reinforcement_learning.py +++ b/04_pacman_rl/reinforcement_learning.py @@ -72,43 +72,41 @@ def q_init(): # print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries return q_table -def epsilon_greedy(q, s, epsilon=0.1): +def epsilon_greedy(q, s, epsilon=0.025): """ Return which direction Pacman should move to using epsilon-greedy algorithm With probability epsilon, choose a random action. Otherwise choose the greedy action. Avoids actions that would result in collision with ghost. """ - # if np.random.random() < epsilon: - # # Explore: choose random action (excluding blocked actions with Q=0) - # valid_actions = [i for i in range(len(q[s])) if q[s][i] is not None] - # if valid_actions: - # return np.random.choice(valid_actions) - # else: - # return np.random.randint(0, len(q[s])) - # else: - # Get all valid (non-blocked) actions with their Q-values - valid_actions = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] is not None] - - # Sort by Q-value in descending order - valid_actions.sort(key=lambda x: x[1], reverse=True) - - # Try each action starting from highest Q-value - for a, q_val in valid_actions: - s_test = list(s) - if a == 0: # left - s_test[0] -= 1 - elif a == 1: # right - s_test[0] += 1 - elif a == 2: # up - s_test[1] -= 1 - elif a == 3: # down - s_test[1] += 1 + if np.random.random() < epsilon: + # Explore: choose random action (excluding blocked actions with Q=0) + valid_actions = [i for i in range(len(q[s])) if q[s][i] is not None] + return np.random.choice(valid_actions) - # Check if this action would cause collision - if s_test[0] == s[2] and s_test[1] == s[3]: - continue # Skip this action, try next highest Q-value + else: + # Get all valid (non-blocked) actions with their Q-values + valid_actions = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] is not None] - return a + # Sort by Q-value in descending order + valid_actions.sort(key=lambda x: x[1], reverse=True) + + # Try each action starting from highest Q-value + for a, q_val in valid_actions: + s_test = list(s) + if a == 0: # left + s_test[0] -= 1 + elif a == 1: # right + s_test[0] += 1 + elif a == 2: # up + s_test[1] -= 1 + elif a == 3: # down + s_test[1] += 1 + + # Check if this action would cause collision + if s_test[0] == s[2] and s_test[1] == s[3]: + continue # Skip this action, try next highest Q-value + + return a def max_q(q, s_new, labyrinth, depth=0, max_depth=2): """Calculate Q-values for all possible actions in state s_new and return the maximum"""