diff --git a/04_pacman_rl/pacman.py b/04_pacman_rl/pacman.py index 9bdb4fd..e49d3fb 100644 --- a/04_pacman_rl/pacman.py +++ b/04_pacman_rl/pacman.py @@ -107,6 +107,16 @@ def draw_labyrinth(): elif cell == ".": pygame.draw.circle(screen, WHITE, (x * CELL_SIZE + CELL_SIZE // 2, y * CELL_SIZE + CELL_SIZE // 2), 5) +def move_pacman(pacman, a): + if a == 0: # left + pacman.move(-1, 0) + if a == 1: # right + pacman.move(1, 0) + if a == 2: # up + pacman.move(0, -1) + if a == 3: # down + pacman.move(0, 1) + # Main game function def main(): clock = pygame.time.Clock() @@ -159,20 +169,20 @@ def main(): alpha = 0.8 gamma = 0.9 - s = [pacman.x, pacman.y, ghost.x, ghost.y] + s = (pacman.x, pacman.y, ghost.x, ghost.y) # as a tuple so the state becomes hashable s_not_terminal = True q = rl.q_init() while s_not_terminal: a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down - s_new, r = rl.take_action(s, a) + s_new, r = rl.take_action(s, a, labyrinth) + move_pacman(pacman, a) - q[s][a] += alpha * (r + gamma * max_q(q, s_new) - q[s][a]) + q[s][a] += alpha * (r + gamma * max(q[s_new]) - q[s][a]) + print(q[s][a]) s = s_new - pass - # Draw the labyrinth, pacman, and ghost draw_labyrinth() diff --git a/04_pacman_rl/reinforcement_learning.py b/04_pacman_rl/reinforcement_learning.py index 5265bd2..bfdd344 100644 --- a/04_pacman_rl/reinforcement_learning.py +++ b/04_pacman_rl/reinforcement_learning.py @@ -12,31 +12,31 @@ def q_init(): # Configuration NUM_ACTIONS = 4 - INITIAL_Q_VALUE = 0.0 # Small value for initialization + INITIAL_Q_VALUE = 1.0 # Small value for initialization - s1_range = range(1, 9) - s2_range = range(1, 4) - s3_range = range(1, 9) - s4_range = range(1, 4) + s0_range = range(1, 9) + s1_range = range(1, 4) + s2_range = range(1, 9) + s3_range = range(1, 4) s_constrained_values = {1, 4, 5, 8} # The Q-Table dictionary q_table = {} - # Iterate through all possible combinations of s1, s2, s3, s4 - for s1 in s1_range: - for s2 in s2_range: - for s3 in s3_range: - for s4 in s4_range: + # Iterate through all possible combinations of s0, s1, s2, s3 + for s0 in s0_range: + for s1 in s1_range: + for s2 in s2_range: + for s3 in s3_range: # Skip impossible states - if s2 == 2 and s1 not in s_constrained_values: + if s1 == 2 and s0 not in s_constrained_values: continue - if s4 == 2 and s3 not in s_constrained_values: + if s3 == 2 and s2 not in s_constrained_values: continue # Assign all possible states a tuple of values - state_key = (s1, s2, s3, s4) + state_key = (s0, s1, s2, s3) q_table[state_key] = [INITIAL_Q_VALUE] * NUM_ACTIONS print(f"Total number of valid states initialized: {len(q_table)}") # debugging @@ -48,14 +48,14 @@ def epsilon_greedy(q, s, epsilon=0.9): Return which direction Pacman should move to epsilon-greedy algorithm TBD """ - a_val = max(q[s]) - a = q[s].index(a_val) + q_max = max(q[s]) + a = q[s].index(q_max) return a -def take_action(s, a): - s_new = s +def take_action(s, a, labyrinth): + s_new = list(s) if a == 0: s_new[0] -= 1 if a == 1: @@ -65,9 +65,11 @@ def take_action(s, a): if a == 3: s_new[1] -= 1 - # Calculate fucking r - # include if there is a point on the field - r = 0 + # consider if there is a point on the field + r = 1 if labyrinth[s_new[0]][s_new[1]] == "." else 0 + # consider new distance between Pacman and Ghost + distance = abs(s[0] - s[2]) + abs(s[1] - s[3]) + distance_new = abs(s_new[0] - s_new[2]) + abs(s_new[1] - s_new[3]) + r += distance_new - distance # adjust this value if necessary - return s_new, r - + return tuple(s_new), r \ No newline at end of file