diff --git a/04_pacman_rl/pacman.py b/04_pacman_rl/pacman.py
index 1064c9e..9e3cf9a 100644
--- a/04_pacman_rl/pacman.py
+++ b/04_pacman_rl/pacman.py
@@ -1,8 +1,6 @@
 import pygame
-import random
 import math
 import reinforcement_learning as rl
-import time
 
 # Initialize pygame
 pygame.init()
@@ -125,19 +123,14 @@ def move_pacman(pacman, a):
 # Main game function
 def main():
     global labyrinth
-    clock = pygame.time.Clock()
-
-    # Initialize Pacman and Ghost positions
-    pacman = Pacman(1, 1)
-    ghost = Ghost(COLS - 2, ROWS - 2)
-
-    s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
     q = rl.q_init()
+
+    clock = pygame.time.Clock()
 
     # Game loop
     not_won = True
-    running = True
-    iter = 0
+    outer_iter = 0
+
     while not_won:
         labyrinth = [
@@ -147,20 +140,30 @@ def main():
             "#........#",
             "##########"
         ]
+        running = True
+        iter = 0
+
+        # Initialize Pacman and Ghost positions
+        pacman = Pacman(1, 1)
+        ghost = Ghost(COLS - 2, ROWS - 2)
+        s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
 
         # Handle events
         for event in pygame.event.get():
            if event.type == pygame.QUIT:
                 not_won = False
-
-        while running:
+
+        print(outer_iter)
+        while running or iter < 100:
             screen.fill(BLACK)
             iter = iter + 1
 
             # Check for collisions (game over if ghost catches pacman)
             if pacman.x == ghost.x and pacman.y == ghost.y:
                 print("Game Over! The ghost caught Pacman.")
+                outer_iter = outer_iter + 1
                 running = False
+                break
 
             # Eat cookies
             if labyrinth[pacman.y][pacman.x] == ".":
@@ -171,36 +174,42 @@
                 print("You Win! Pacman ate all the cookies.")
                 running = False
                 not_won = False
+                break
 
-            # Start of my code ######################################################################
-
-            labyrinth_copy = [list(row) for row in labyrinth]  # Create proper deep copy
+            # Q-Learning part ############################################################################
 
             a = rl.epsilon_greedy(q, s)  # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
 
-            s_new, r, labyrinth_copy = rl.take_action(s, a, labyrinth_copy)
+            s_new, r, labyrinth = rl.take_action(s, a, labyrinth)
+            # print(s)  # debugging
+            # print(q[s])  # debugging
 
-            q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth_copy) - q[s][a])
+            q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth) - q[s][a])
 
             s = s_new
 
-            # zumindest angeben wo der nächste punkt ist, ohne geist im zustand s.
-            # After everything was calculated; just move Pacman according to highest action a in Q-Table q.
+
             move_pacman(pacman, a)
 
-            if iter%3==0:
+            if iter % 3 == 0:
                 # Ghost moves towards Pacman
                 ghost.move_towards_pacman(pacman)
 
+            # Update state
+            s = (pacman.x, pacman.y, ghost.x, ghost.y)
+
+            # End of Q-Learning part ######################################################################
 
             # Draw the labyrinth, pacman, and ghost
             draw_labyrinth()
             pacman.draw()
             ghost.draw()
-
+
             # Update display
             pygame.display.flip()
-
+
             # Cap the frame rate
-            clock.tick(5)
+            # tick_speed = 100
+            tick_speed = 5 if outer_iter % 20 == 0 else 100
+            clock.tick(tick_speed)
 
     pygame.quit()
diff --git a/04_pacman_rl/reinforcement_learning.py b/04_pacman_rl/reinforcement_learning.py
index ee5c340..8b248a7 100644
--- a/04_pacman_rl/reinforcement_learning.py
+++ b/04_pacman_rl/reinforcement_learning.py
@@ -6,7 +6,7 @@ ausweicht und somit vermeidet gefressen zu werden.
""" import numpy as np -from collections import deque +import random GAMMA = 0.90 ALPHA = 0.2 @@ -16,7 +16,8 @@ def q_init(): # Configuration NUM_ACTIONS = 4 - INITIAL_Q_VALUE = 2.0 # Small value for initialization + RAND_Q_VALUES = [random.uniform(-0.1, 0.1), random.uniform(-0.1, 0.1), random.uniform(-0.1, 0.1), random.uniform(-0.1, 0.1)] + # print(RAND_Q_VALUES) # debugging # Labyrinth layout labyrinth = [ @@ -50,7 +51,7 @@ def q_init(): # Assign all possible states a tuple of values state_key = (s0, s1, s2, s3) - q_values = [INITIAL_Q_VALUE] * NUM_ACTIONS + q_values = RAND_Q_VALUES.copy() # Create a copy for each state # Check which actions are blocked by walls # Action 0: move left (s0 - 1) @@ -72,7 +73,7 @@ def q_init(): # print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries return q_table -def epsilon_greedy(q, s, epsilon=0.025): +def epsilon_greedy(q, s, epsilon=0.1): """ Return which direction Pacman should move to using epsilon-greedy algorithm With probability epsilon, choose a random action. Otherwise choose the greedy action. @@ -102,44 +103,15 @@ def epsilon_greedy(q, s, epsilon=0.025): elif a == 3: # down s_test[1] += 1 - # Check if this action would cause collision - if s_test[0] == s[2] and s_test[1] == s[3]: - continue # Skip this action, try next highest Q-value - return a -def max_q(q, s_new, labyrinth, depth=0, max_depth=1): - """Calculate Q-values for all possible actions in state s_new and return the maximum""" - q_max = 0 - for a in range(4): - if q[s_new][a] != None and s_new in q: # Only consider valid (non-blocked) actions - s_test = tuple(list(s_new)[:2] + [s_new[2], s_new[3]]) # Keep ghost position - s_test_list = list(s_test) - if a == 0: # left - s_test_list[0] -= 1 - elif a == 1: # right - s_test_list[0] += 1 - elif a == 2: # up - s_test_list[1] -= 1 - elif a == 3: # down - s_test_list[1] += 1 - s_test = tuple(s_test_list) - - if s_test in q and depth < max_depth: - q[s_new][a] += ALPHA * (calc_reward(s_test, labyrinth) + GAMMA * max_q(q, s_test, labyrinth, depth + 1, max_depth) - q[s_new][a]) - q_max = max(q_max, q[s_new][a]) - - return q_max - def calc_reward(s_new, labyrinth): - # Reward for cookies; punish for not eating cookies r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -1.0 return r def take_action(s, a, labyrinth): - # Use the labyrinth parameter (already updated from previous iterations) s_new = list(s) if a == 0: # left s_new[0] -= 1 @@ -150,10 +122,30 @@ def take_action(s, a, labyrinth): if a == 3: # down s_new[1] += 1 + # Check if action caused gameover (Pacman caught by ghost) + if s_new[0] == s_new[2] and s_new[1] == s_new[3]: + r = -100.0 + print("Invalid action type shit") + else: + r = calc_reward(tuple(s_new), labyrinth) + # Mark new Pacman position as eaten (if it's a cookie) if labyrinth[s_new[1]][s_new[0]] == ".": - labyrinth[s_new[1]][s_new[0]] = " " + # Convert string row to list, modify it, then convert back to string + row_list = list(labyrinth[s_new[1]]) + row_list[s_new[0]] = " " + labyrinth[s_new[1]] = "".join(row_list) - r = calc_reward(tuple(s_new), labyrinth) + return tuple(s_new), r, labyrinth + +def max_q(q, s_new, labyrinth): + """Return the maximum Q-value among valid actions in state s_new""" + if s_new not in q: + return 0 - return tuple(s_new), r, labyrinth \ No newline at end of file + q_max = 0 + for a in range(4): + if q[s_new][a] is not None: # Only consider valid (non-blocked) actions + q_max = max(q_max, q[s_new][a]) + + return q_max \ No newline at end of file