diff --git a/04_pacman_rl/pacman.py b/04_pacman_rl/pacman.py
index 2c6ed61..75bd7b5 100644
--- a/04_pacman_rl/pacman.py
+++ b/04_pacman_rl/pacman.py
@@ -125,6 +125,9 @@ def main():
     # Initialize Pacman and Ghost positions
     pacman = Pacman(1, 1)
     ghost = Ghost(COLS - 2, ROWS - 2)
+
+    s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
+    a_prev = 4
     q = rl.q_init()
     gamma = 0.9
     alpha = 0.8
@@ -140,17 +143,6 @@ def main():
             if event.type == pygame.QUIT:
                 running = False
 
-        # Handle Pacman movement
-        keys = pygame.key.get_pressed()
-        if keys[pygame.K_LEFT]:
-            pacman.move(-1, 0)
-        if keys[pygame.K_RIGHT]:
-            pacman.move(1, 0)
-        if keys[pygame.K_UP]:
-            pacman.move(0, -1)
-        if keys[pygame.K_DOWN]:
-            pacman.move(0, 1)
-
         if iter%3==0:
             # Ghost moves towards Pacman
             ghost.move_towards_pacman(pacman)
@@ -170,20 +162,24 @@ def main():
                 running = False
 
         # Start of my code
-
-        s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
-
-        #s_not_terminal = True
+        # s_not_terminal = True
         # while s_not_terminal:
-        a = rl.epsilon_greedy(q, s)  # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
+        print("s: " + str(s))
+        print("q[s] before action: " + str(q[s]))
+
+
+        a = rl.epsilon_greedy(q, s, a_prev)  # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
         s_new, r = rl.take_action(s, a, labyrinth)
         move_pacman(pacman, a)
-        print("state: " + str(s_new) + " r: " + str(r))
 
         q[s][a] += round(alpha * (r + gamma * max(q[s_new]) - q[s][a]), 2)
-        print(q[s])
+        print("s_new: " + str(s_new))
+        print("q[s] after action with manipulated a: " + str(q[s]))
+        print("q[s_new] after action: " + str(q[s_new]))
+        print()
 
         s = s_new
+        a_prev = a
 
         time.sleep(0.5)
         #gamma *= gamma
diff --git a/04_pacman_rl/reinforcement_learning.py b/04_pacman_rl/reinforcement_learning.py
index 113787f..c704a69 100644
--- a/04_pacman_rl/reinforcement_learning.py
+++ b/04_pacman_rl/reinforcement_learning.py
@@ -12,7 +12,7 @@ def q_init():
 
     # Configuration
     NUM_ACTIONS = 4
-    INITIAL_Q_VALUE = 1.0  # Small value for initialization
+    INITIAL_Q_VALUE = 2.0  # Small value for initialization
 
     # Labyrinth layout
     labyrinth = [
@@ -70,18 +70,43 @@ def q_init():
     # print(list(q_table.items())[:5])  # Uncomment to see the first 5 entries
     return q_table
 
 
-def epsilon_greedy(q, s, epsilon=0.2):
+def epsilon_greedy(q, s, a_prev, epsilon=0.2):
     """
     Return which direction Pacman should move to using epsilon-greedy algorithm
     With probability epsilon, choose a random action.
     Otherwise choose the greedy action.
+    If multiple actions have the same max Q-value, prefer actions different from a_prev.
+    Never allows Pacman to move backwards (opposite direction).
     """
-    """
+
+    opposite_action = {0: 1, 1: 0, 2: 3, 3: 2}
+
     q_max = max(q[s])
     a = q[s].index(q_max)
-
-    return a
+    """
+    # Find all actions with the maximum Q-value
+    max_actions = [a for a in range(4) if q[s][a] == q_max]
+    # Exclude the opposite action (going backwards)
+    if a_prev in opposite_action:
+        backward_action = opposite_action[a_prev]
+        if backward_action in max_actions:
+            max_actions.remove(backward_action)
+
+    # If no actions left after removing backward action, allow it (no choice)
+    if not max_actions:
+        max_actions = [a for a in range(4) if q[s][a] == q_max]
+        if a_prev in opposite_action:
+            backward_action = opposite_action[a_prev]
+            if backward_action in max_actions:
+                max_actions.remove(backward_action)
+
+    # Return the first valid action
+    a = max_actions[0] if max_actions else 0
+    """
+    return a
+
+    """
     if np.random.random() < epsilon:
         # Explore: choose random action (excluding blocked actions with Q=0)
         valid_actions = [i for i in range(len(q[s])) if q[s][i] > 0]
@@ -98,6 +123,11 @@ def epsilon_greedy(q, s, epsilon=0.2):
             return best_action
         else:
             return 0
+    """
+
+
+def max_q(q, s_new):
+    pass
 
 
 def bfs_distance(start, end, labyrinth):
@@ -143,20 +173,21 @@ def take_action(s, a, labyrinth):
         s_new[1] += 1
 
     # consider if there is a point on the field
-    r = 3 if labyrinth[s_new[1]][s_new[0]] == "." else -1
+    r = 2.0 if labyrinth[s_new[1]][s_new[0]] == "." else -5.0
 
     # consider new distance between Pacman and Ghost using actual pathfinding
     pacman_pos = (s[0], s[1])
     ghost_pos = (s[2], s[3])
     pacman_pos_new = (s_new[0], s_new[1])
 
-    distance = bfs_distance(pacman_pos, ghost_pos, labyrinth)
     distance_new = bfs_distance(pacman_pos_new, ghost_pos, labyrinth)
 
-    # Reward for increasing distance from ghost (moving away is good)
-    if distance_new > distance:
-        r += 0.5  # Bonus for moving away
-    elif distance_new < distance:
-        r -= 1.0  # Penalty for moving closer
+    # Reward based on distance from ghost (closer distance = worse reward)
+    if distance_new >= 4:
+        r += 2.0  # Good reward for being far away
+    elif distance_new >= 2:
+        r += 1.0  # Small reward for being moderately far
+    elif distance_new == 1:
+        r -= 10.0  # Large penalty for being adjacent to ghost
 
     return tuple(s_new), r
\ No newline at end of file