diff --git a/04_pacman_rl/pacman.py b/04_pacman_rl/pacman.py
index e49d3fb..2c6ed61 100644
--- a/04_pacman_rl/pacman.py
+++ b/04_pacman_rl/pacman.py
@@ -2,6 +2,7 @@ import pygame
 import random
 import math
 import reinforcement_learning as rl
+import time
 
 # Initialize pygame
 pygame.init()
@@ -124,6 +125,9 @@ def main():
     # Initialize Pacman and Ghost positions
     pacman = Pacman(1, 1)
     ghost = Ghost(COLS - 2, ROWS - 2)
+    q = rl.q_init()
+    gamma = 0.9
+    alpha = 0.8
 
     # Game loop
     running = True
@@ -166,23 +170,23 @@
                 running = False
 
         # Start of my code
-        alpha = 0.8
-        gamma = 0.9
-
+        s = (pacman.x, pacman.y, ghost.x, ghost.y) # as a tuple so the state becomes hashable
 
 
-        s_not_terminal = True
-        q = rl.q_init()
+        #s_not_terminal = True
+        # while s_not_terminal:
+        a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right; 2 = Up; 3 = Down
+        s_new, r = rl.take_action(s, a, labyrinth)
+        move_pacman(pacman, a)
+        print("state: " + str(s_new) + " r: " + str(r))
 
-        while s_not_terminal:
-            a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
-            s_new, r = rl.take_action(s, a, labyrinth)
-            move_pacman(pacman, a)
-
-            q[s][a] += alpha * (r + gamma * max(q[s_new]) - q[s][a])
-            print(q[s][a])
-
-            s = s_new
+        q[s][a] += round(alpha * (r + gamma * max(q[s_new]) - q[s][a]), 2)
+        print(q[s])
+
+        s = s_new
+        time.sleep(0.5)
+
+        #gamma *= gamma
 
         # Draw the labyrinth, pacman, and ghost
         draw_labyrinth()
diff --git a/04_pacman_rl/reinforcement_learning.py b/04_pacman_rl/reinforcement_learning.py
index bfdd344..113787f 100644
--- a/04_pacman_rl/reinforcement_learning.py
+++ b/04_pacman_rl/reinforcement_learning.py
@@ -14,6 +14,15 @@ def q_init():
     NUM_ACTIONS = 4
     INITIAL_Q_VALUE = 1.0 # Small value for initialization
 
+    # Labyrinth layout
+    labyrinth = [
+        "##########",
+        "#........#",
+        "#.##..##.#",
+        "#........#",
+        "##########"
+    ]
+
     s0_range = range(1, 9)
     s1_range = range(1, 4)
     s2_range = range(1, 9)
@@ -33,43 +42,121 @@
             if s1 == 2 and s0 not in s_constrained_values:
                 continue
             if s3 == 2 and s2 not in s_constrained_values:
-                continue
+                continue
+            if s0 == s2 and s1 == s3:
+                continue
 
             # Assign all possible states a tuple of values
             state_key = (s0, s1, s2, s3)
-            q_table[state_key] = [INITIAL_Q_VALUE] * NUM_ACTIONS
+            q_values = [INITIAL_Q_VALUE] * NUM_ACTIONS
+
+            # Check which actions are blocked by walls
+            # Action 0: move left (s0 - 1)
+            if labyrinth[s1][s0 - 1] == "#":
+                q_values[0] = 0
+            # Action 1: move right (s0 + 1)
+            if labyrinth[s1][s0 + 1] == "#":
+                q_values[1] = 0
+            # Action 2: move up (s1 - 1)
+            if labyrinth[s1 - 1][s0] == "#":
+                q_values[2] = 0
+            # Action 3: move down (s1 + 1)
+            if labyrinth[s1 + 1][s0] == "#":
+                q_values[3] = 0
+
+            q_table[state_key] = q_values
 
-    print(f"Total number of valid states initialized: {len(q_table)}") # debugging
+    # print(f"Total number of valid states initialized: {len(q_table)}") # debugging
     # print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries
 
     return q_table
 
 
-def epsilon_greedy(q, s, epsilon=0.9):
+def epsilon_greedy(q, s, epsilon=0.2):
     """
-    Return which direction Pacman should move to
-    epsilon-greedy algorithm TBD
+    Return which direction Pacman should move, using the epsilon-greedy algorithm.
+    With probability epsilon, choose a random action; otherwise choose the greedy action.
+    """
     """
     q_max = max(q[s])
     a = q[s].index(q_max)
     return a
+    """
+
+    if np.random.random() < epsilon:
+        # Explore: choose random action (excluding blocked actions with Q=0)
+        valid_actions = [i for i in range(len(q[s])) if q[s][i] > 0]
+        if valid_actions:
+            return np.random.choice(valid_actions)
+        else:
+            return np.random.randint(0, len(q[s]))
+    else:
+        # Exploit: choose best action
+        valid_q_values = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] > 0]
+        if valid_q_values:
+            # Get max Q-value among valid actions
+            best_action = max(valid_q_values, key=lambda x: x[1])[0]
+            return best_action
+        else:
+            return 0
+
+
+def bfs_distance(start, end, labyrinth):
+    """
+    Calculate shortest path distance between two points using BFS.
+    Returns the distance or infinity if no path exists.
+    """
+    from collections import deque
+
+    if start == end:
+        return 0
+
+    queue = deque([(start, 0)]) # (position, distance)
+    visited = {start}
+
+    while queue:
+        (x, y), dist = queue.popleft()
+
+        # Check all 4 directions
+        for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
+            nx, ny = x + dx, y + dy
+
+            if (nx, ny) == end:
+                return dist + 1
+
+            if 0 <= ny < len(labyrinth) and 0 <= nx < len(labyrinth[0]):
+                if (nx, ny) not in visited and labyrinth[ny][nx] != "#":
+                    visited.add((nx, ny))
+                    queue.append(((nx, ny), dist + 1))
+
+    return float('inf') # No path found
 
 def take_action(s, a, labyrinth):
     s_new = list(s)
-    if a == 0:
+    if a == 0: # left
         s_new[0] -= 1
-    if a == 1:
+    if a == 1: # right
         s_new[0] += 1
-    if a == 2:
-        s_new[1] += 1
-    if a == 3:
+    if a == 2: # up
         s_new[1] -= 1
+    if a == 3: # down
+        s_new[1] += 1
 
     # consider if there is a point on the field
-    r = 1 if labyrinth[s_new[0]][s_new[1]] == "." else 0
-    # consider new distance between Pacman and Ghost
-    distance = abs(s[0] - s[2]) + abs(s[1] - s[3])
-    distance_new = abs(s_new[0] - s_new[2]) + abs(s_new[1] - s_new[3])
-    r += distance_new - distance # adjust this value if necessary
+    r = 3 if labyrinth[s_new[1]][s_new[0]] == "." else -1
+
+    # consider new distance between Pacman and Ghost using actual pathfinding
+    pacman_pos = (s[0], s[1])
+    ghost_pos = (s[2], s[3])
+    pacman_pos_new = (s_new[0], s_new[1])
+
+    distance = bfs_distance(pacman_pos, ghost_pos, labyrinth)
+    distance_new = bfs_distance(pacman_pos_new, ghost_pos, labyrinth)
+
+    # Reward for increasing distance from ghost (moving away is good)
+    if distance_new > distance:
+        r += 0.5 # Bonus for moving away
+    elif distance_new < distance:
+        r -= 1.0 # Penalty for moving closer
 
     return tuple(s_new), r
\ No newline at end of file