diff --git a/04_pacman_rl/pacman.py b/04_pacman_rl/pacman.py
index ce62ffc..492c32a 100644
--- a/04_pacman_rl/pacman.py
+++ b/04_pacman_rl/pacman.py
@@ -168,8 +168,8 @@ def main():
         while s_not_terminal and iteration < max_iterations:
             iteration += 1
 
-            print("s: " + str(s)) # debugging
-            print("q[s] before action: " + str(q[s])) # debugging
+            # print("s: " + str(s)) # debugging
+            # print("q[s] before action: " + str(q[s])) # debugging
 
             a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
             s_new, r, labyrinth_copy = rl.take_action(s, a, labyrinth_copy)
@@ -179,19 +179,14 @@
 
             if all("." not in row for row in labyrinth_copy):
                 s_not_terminal = False
                 q[s][a] = 10.0
-
-            # Check for collisions (game over if ghost catches pacman)
-            if s[0] == s[2] and s[1] == s[3]:
-                s_not_terminal = False
-                q[s][a] = -10.0
-                # print("Collision at s!!! s: " + str(s)) # debugging
-                print("Crashed values now q[s]: " + str(q[s])) # debugging
+                print("There is a parallel universe with victory")
+
             s = s_new
             time.sleep(0.025)
 
         if iteration >= max_iterations:
-            print(f"Max iterations reached breaking out of loop")
+            print("Max iterations reached for this loop")
 
         s = (pacman.x, pacman.y, ghost.x, ghost.y) # as a tuple so the state becomes hashable
         a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
diff --git a/04_pacman_rl/reinforcement_learning.py b/04_pacman_rl/reinforcement_learning.py
index f2ac958..279fcb1 100644
--- a/04_pacman_rl/reinforcement_learning.py
+++ b/04_pacman_rl/reinforcement_learning.py
@@ -72,36 +72,47 @@ def q_init():
     # print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries
     return q_table
 
-def epsilon_greedy(q, s, epsilon=0.2):
+def epsilon_greedy(q, s, epsilon=0.1):
     """
     Return which direction Pacman should move to using epsilon-greedy algorithm
     With probability epsilon, choose a random action. Otherwise choose the greedy action.
-    If multiple actions have the same max Q-value, prefer actions different from a_prev.
-    Never allows Pacman to move backwards (opposite direction).
+    Avoids actions that would result in a collision with the ghost.
+    (The exploration branch is currently disabled; see the commented-out block below.)
     """
-
-    q_max = max(x for x in q[s] if isinstance(x, (int, float)))
-    a = q[s].index(q_max)
-
-    return a
-    # if np.random.random() < epsilon:
-    #     # Explore: choose random action (excluding blocked actions with Q=0)
-    #     valid_actions = [i for i in range(len(q[s])) if q[s][i] != None]
+    # # Explore: choose random action (excluding blocked actions, marked None)
+    # valid_actions = [i for i in range(len(q[s])) if q[s][i] is not None]
     # if valid_actions:
     #     return np.random.choice(valid_actions)
     # else:
     #     return np.random.randint(0, len(q[s]))
     # else:
-    #     # Exploit: choose best action
-    #     valid_q_values = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] != None]
-    #     if valid_q_values:
-    #         # Get max Q-value among valid actions
-    #         best_action = max(valid_q_values, key=lambda x: x[1])[0]
-    #         return best_action
-    #     else:
-    #         return 0
+    # Get all valid (non-blocked) actions with their Q-values
+    valid_actions = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] is not None]
+    # Sort by Q-value in descending order
+    valid_actions.sort(key=lambda x: x[1], reverse=True)
+
+    # Try each action starting from the highest Q-value
+    for a, _ in valid_actions:
+        s_test = list(s)
+        if a == 0:   # left
+            s_test[0] -= 1
+        elif a == 1: # right
+            s_test[0] += 1
+        elif a == 2: # up
+            s_test[1] -= 1
+        elif a == 3: # down
+            s_test[1] += 1
+
+        # Skip any action that would step onto the ghost's square
+        if s_test[0] == s[2] and s_test[1] == s[3]:
+            continue # try the next-highest Q-value instead
+
+        return a
+
+    # Fallback: every valid action collides with the ghost, so take the best one anyway
+    return valid_actions[0][0]
 
-def max_q(q, s_new, labyrinth, depth=0, max_depth=4):
+def max_q(q, s_new, labyrinth, depth=0, max_depth=2):
     """Calculate Q-values for all possible actions in state s_new and return the maximum"""
     q_max = 0
     for a in range(4):
@@ -127,24 +138,7 @@
     return q_max
 
 def calc_reward(s_new, labyrinth):
-    """
-    # consider new distance between Pacman and Ghost using actual pathfinding
-    pacman_pos_new = (s_new[0], s_new[1])
-    ghost_pos = (s_new[2], s_new[3])
-
-    # distance_old = bfs_distance((s[0], s[1]), ghost_pos, labyrinth)
-    distance_new = bfs_distance(pacman_pos_new, ghost_pos, labyrinth)
-
-    r = 0
-
-    if distance_new < 3:
-        r = -3
-    elif distance_new == 4:
-        r = 1.0
-    elif distance_new > 4:
-        r = 2.0
-    """
-
+    # Reward: +1.0 for stepping onto a cookie ("."), otherwise a -1.0 step penalty
+    r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -1.0