removed impossible states; better distance consideration; bug fixing

master
Ruben-FreddyLoafers 2025-11-18 14:42:43 +01:00
parent a7b43c9037
commit 24714fca0e
2 changed files with 121 additions and 30 deletions

View File

@@ -2,6 +2,7 @@ import pygame
 import random
 import math
 import reinforcement_learning as rl
+import time

 # Initialize pygame
 pygame.init()
@@ -124,6 +125,9 @@ def main():
     # Initialize Pacman and Ghost positions
     pacman = Pacman(1, 1)
     ghost = Ghost(COLS - 2, ROWS - 2)
+    q = rl.q_init()
+    gamma = 0.9
+    alpha = 0.8

     # Game loop
     running = True
@@ -166,23 +170,23 @@ def main():
                 running = False

         # Start of my code
-        alpha = 0.8
-        gamma = 0.9
         s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
-        s_not_terminal = True
-        q = rl.q_init()
+        #s_not_terminal = True

-        while s_not_terminal:
-            a = rl.epsilon_greedy(q, s)  # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
-            s_new, r = rl.take_action(s, a, labyrinth)
-            move_pacman(pacman, a)
-
-            q[s][a] += alpha * (r + gamma * max(q[s_new]) - q[s][a])
-            print(q[s][a])
-            s = s_new
+        # while s_not_terminal:
+        a = rl.epsilon_greedy(q, s)  # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
+        s_new, r = rl.take_action(s, a, labyrinth)
+        move_pacman(pacman, a)
+        print("state: " + str(s_new) + " r: " + str(r))
+
+        q[s][a] += round(alpha * (r + gamma * max(q[s_new]) - q[s][a]), 2)
+        print(q[s])
+        s = s_new
+        time.sleep(0.5)
+        #gamma *= gamma

         # Draw the labyrinth, pacman, and ghost
         draw_labyrinth()
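
For reference, the in-place update in this hunk is the standard tabular Q-learning rule. A minimal, self-contained sketch of the same rule, using the repository's alpha = 0.8 and gamma = 0.9 (the function name and the toy states are illustrative, not from this codebase):

    def q_learning_step(q, s, a, r, s_new, alpha=0.8, gamma=0.9):
        """One tabular Q-learning update toward r + gamma * max_a' Q(s', a')."""
        td_target = r + gamma * max(q[s_new])      # best value reachable from s_new
        q[s][a] += alpha * (td_target - q[s][a])   # move Q(s, a) toward the target

    # Toy usage with two states and two actions:
    q = {"s0": [1.0, 1.0], "s1": [1.0, 1.0]}
    q_learning_step(q, "s0", 0, r=3, s_new="s1")   # td_target = 3 + 0.9 * 1.0 = 3.9
    print([round(v, 2) for v in q["s0"]])          # [3.32, 1.0]

One detail worth noting: wrapping the whole update in round(..., 2), as the new game-loop code does, truncates the stored Q-value on every frame; rounding only at print time would keep full precision in the table.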

View File

@@ -14,6 +14,15 @@ def q_init():
     NUM_ACTIONS = 4
     INITIAL_Q_VALUE = 1.0  # Small value for initialization

+    # Labyrinth layout
+    labyrinth = [
+        "##########",
+        "#........#",
+        "#.##..##.#",
+        "#........#",
+        "##########"
+    ]
+
     s0_range = range(1, 9)
     s1_range = range(1, 4)
     s2_range = range(1, 9)
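
Worth keeping in mind when reading the wall checks in the next hunk: the layout is indexed row first, so the cell at column x and row y is labyrinth[y][x]. A quick standalone illustration on the grid hard-coded above (the is_wall helper is hypothetical, added only to show the ordering):

    labyrinth = [
        "##########",
        "#........#",
        "#.##..##.#",
        "#........#",
        "##########",
    ]

    def is_wall(x, y):
        # Row first (y), then column (x)
        return labyrinth[y][x] == "#"

    print(is_wall(2, 2))  # True: the left "##" block in row 2
    print(is_wall(1, 1))  # False: open corridor inside the outer wall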
@ -33,43 +42,121 @@ def q_init():
if s1 == 2 and s0 not in s_constrained_values: if s1 == 2 and s0 not in s_constrained_values:
continue continue
if s3 == 2 and s2 not in s_constrained_values: if s3 == 2 and s2 not in s_constrained_values:
continue continue
if s0 == s2 and s1 == s3:
continue
# Assign all possible states a tuple of values # Assign all possible states a tuple of values
state_key = (s0, s1, s2, s3) state_key = (s0, s1, s2, s3)
q_table[state_key] = [INITIAL_Q_VALUE] * NUM_ACTIONS q_values = [INITIAL_Q_VALUE] * NUM_ACTIONS
# Check which actions are blocked by walls
# Action 0: move left (s0 - 1)
if labyrinth[s1][s0 - 1] == "#":
q_values[0] = 0
# Action 1: move right (s0 + 1)
if labyrinth[s1][s0 + 1] == "#":
q_values[1] = 0
# Action 2: move up (s1 - 1)
if labyrinth[s1 - 1][s0] == "#":
q_values[2] = 0
# Action 3: move down (s1 + 1)
if labyrinth[s1 + 1][s0] == "#":
q_values[3] = 0
q_table[state_key] = q_values
print(f"Total number of valid states initialized: {len(q_table)}") # debugging # print(f"Total number of valid states initialized: {len(q_table)}") # debugging
# print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries # print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries
return q_table return q_table

-def epsilon_greedy(q, s, epsilon=0.9):
+def epsilon_greedy(q, s, epsilon=0.2):
     """
-    Return which direction Pacman should move to
-    epsilon-greedy algorithm TBD
+    Return which direction Pacman should move to using epsilon-greedy algorithm
+    With probability epsilon, choose a random action. Otherwise choose the greedy action.
     """
     q_max = max(q[s])
     a = q[s].index(q_max)
     return a
+    """
+    if np.random.random() < epsilon:
+        # Explore: choose random action (excluding blocked actions with Q=0)
+        valid_actions = [i for i in range(len(q[s])) if q[s][i] > 0]
+        if valid_actions:
+            return np.random.choice(valid_actions)
+        else:
+            return np.random.randint(0, len(q[s]))
+    else:
+        # Exploit: choose best action
+        valid_q_values = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] > 0]
+        if valid_q_values:
+            # Get max Q-value among valid actions
+            best_action = max(valid_q_values, key=lambda x: x[1])[0]
+            return best_action
+        else:
+            return 0
+    """

+def bfs_distance(start, end, labyrinth):
+    """
+    Calculate shortest path distance between two points using BFS.
+    Returns the distance or infinity if no path exists.
+    """
+    from collections import deque
+
+    if start == end:
+        return 0
+
+    queue = deque([(start, 0)])  # (position, distance)
+    visited = {start}
+
+    while queue:
+        (x, y), dist = queue.popleft()
+        # Check all 4 directions
+        for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
+            nx, ny = x + dx, y + dy
+            if (nx, ny) == end:
+                return dist + 1
+            if 0 <= ny < len(labyrinth) and 0 <= nx < len(labyrinth[0]):
+                if (nx, ny) not in visited and labyrinth[ny][nx] != "#":
+                    visited.add((nx, ny))
+                    queue.append(((nx, ny), dist + 1))
+
+    return float('inf')  # No path found

 def take_action(s, a, labyrinth):
     s_new = list(s)
-    if a == 0:
+    if a == 0:  # left
         s_new[0] -= 1
-    if a == 1:
+    if a == 1:  # right
         s_new[0] += 1
-    if a == 2:
-        s_new[1] += 1
-    if a == 3:
+    if a == 2:  # up
         s_new[1] -= 1
+    if a == 3:  # down
+        s_new[1] += 1

     # consider if there is a point on the field
-    r = 1 if labyrinth[s_new[0]][s_new[1]] == "." else 0
+    r = 3 if labyrinth[s_new[1]][s_new[0]] == "." else -1

-    # consider new distance between Pacman and Ghost
-    distance = abs(s[0] - s[2]) + abs(s[1] - s[3])
-    distance_new = abs(s_new[0] - s_new[2]) + abs(s_new[1] - s_new[3])
-    r += distance_new - distance  # adjust this value if necessary
+    # consider new distance between Pacman and Ghost using actual pathfinding
+    pacman_pos = (s[0], s[1])
+    ghost_pos = (s[2], s[3])
+    pacman_pos_new = (s_new[0], s_new[1])
+    distance = bfs_distance(pacman_pos, ghost_pos, labyrinth)
+    distance_new = bfs_distance(pacman_pos_new, ghost_pos, labyrinth)
+
+    # Reward for increasing distance from ghost (moving away is good)
+    if distance_new > distance:
+        r += 0.5  # Bonus for moving away
+    elif distance_new < distance:
+        r -= 1.0  # Penalty for moving closer

     return tuple(s_new), r
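
A side note on the disabled exploration branch in epsilon_greedy above: it calls np.random, but numpy is never imported in this file. If that branch is re-enabled, a standard-library version of the same idea might look like the sketch below (an illustration under this file's convention that Q == 0 marks a wall-blocked action, not code from the repository):

    import random

    def epsilon_greedy(q, s, epsilon=0.2):
        """With probability epsilon explore a legal action, otherwise exploit the best one."""
        valid = [i for i, v in enumerate(q[s]) if v > 0]   # actions not blocked by walls
        if not valid:                                      # fully walled in; should not occur
            return random.randrange(len(q[s]))
        if random.random() < epsilon:
            return random.choice(valid)                    # explore
        return max(valid, key=lambda i: q[s][i])           # exploit

One caveat with the Q == 0 convention: the new reward of -1 for empty cells can drive a legal action's value to zero or below, at which point the v > 0 filter would misread it as blocked; an explicit blocked-action mask per state would be more robust.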