removed impossible states; better distance consideration; bug fixing

2025-11-18 14:42:43 +01:00 · 2025-11-18 14:42:43 +01:00 · 24714fca0e
parent a7b43c9037
commit 24714fca0e
2 changed files with 121 additions and 30 deletions
--- a/04_pacman_rl/pacman.py
+++ b/04_pacman_rl/pacman.py
@ -2,6 +2,7 @@ import pygame
 import random
 import math
 import reinforcement_learning as rl
+import time

 # Initialize pygame
 pygame.init()
@ -124,6 +125,9 @@ def main():
    # Initialize Pacman and Ghost positions
    pacman = Pacman(1, 1)
    ghost = Ghost(COLS - 2, ROWS - 2)
+    q = rl.q_init()
+    gamma = 0.9
+    alpha = 0.8

    # Game loop
    running = True
@ -166,23 +170,23 @@ def main():
            running = False

        # Start of my code
-        alpha = 0.8
-        gamma = 0.9
                
        s = (pacman.x, pacman.y, ghost.x, ghost.y) # as a tuple so the state becomes hashable
-        s_not_terminal = True
        
-        q = rl.q_init()
+        #s_not_terminal = True
+        # while s_not_terminal:
+        a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
+        s_new, r = rl.take_action(s, a, labyrinth)
+        move_pacman(pacman, a)
+        print("state: " + str(s_new) + " r: " + str(r))
        
-        while s_not_terminal:
-            a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
-            s_new, r = rl.take_action(s, a, labyrinth)
-            move_pacman(pacman, a)
+        q[s][a] += round(alpha * (r + gamma * max(q[s_new]) - q[s][a]), 2)
+        print(q[s])
        
-            q[s][a] += alpha * (r + gamma * max(q[s_new]) - q[s][a])
-            print(q[s][a])
+        s = s_new
+        time.sleep(0.5)
        
-            s = s_new
+        #gamma *= gamma

        # Draw the labyrinth, pacman, and ghost
        draw_labyrinth()
--- a/04_pacman_rl/reinforcement_learning.py
+++ b/04_pacman_rl/reinforcement_learning.py
@ -14,6 +14,15 @@ def q_init():
    NUM_ACTIONS = 4
    INITIAL_Q_VALUE = 1.0 # Small value for initialization

+    # Labyrinth layout
+    labyrinth = [
+        "##########",
+        "#........#",
+        "#.##..##.#",
+        "#........#",
+        "##########"
+    ]
+
    s0_range = range(1, 9)
    s1_range = range(1, 4)
    s2_range = range(1, 9)
@ -34,42 +43,120 @@ def q_init():
                        continue
                    if s3 == 2 and s2 not in s_constrained_values:
                        continue
+                    if s0 == s2 and s1 == s3:
+                        continue

                    # Assign all possible states a tuple of values
                    state_key = (s0, s1, s2, s3)
-                    q_table[state_key] = [INITIAL_Q_VALUE] * NUM_ACTIONS
+                    q_values = [INITIAL_Q_VALUE] * NUM_ACTIONS
                    
-    print(f"Total number of valid states initialized: {len(q_table)}") # debugging
+                    # Check which actions are blocked by walls
+                    # Action 0: move left (s0 - 1)
+                    if labyrinth[s1][s0 - 1] == "#":
+                        q_values[0] = 0
+                    # Action 1: move right (s0 + 1)
+                    if labyrinth[s1][s0 + 1] == "#":
+                        q_values[1] = 0
+                    # Action 2: move up (s1 - 1)
+                    if labyrinth[s1 - 1][s0] == "#":
+                        q_values[2] = 0
+                    # Action 3: move down (s1 + 1)
+                    if labyrinth[s1 + 1][s0] == "#":
+                        q_values[3] = 0
+                    
+                    q_table[state_key] = q_values
+
+    # print(f"Total number of valid states initialized: {len(q_table)}") # debugging
    # print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries
    return q_table

-def epsilon_greedy(q, s, epsilon=0.9):
+def epsilon_greedy(q, s, epsilon=0.2):
+    """ 
+    Return the direction Pacman should move in, chosen by the epsilon-greedy algorithm.
+    With probability epsilon, choose a random action; otherwise choose the greedy action.
    """
-    Return which direction Pacman should move to 
-    epsilon-greedy algorithm TBD
    """
    q_max = max(q[s])
    a = q[s].index(q_max)
    
    return a
+    """
+    
+    if np.random.random() < epsilon:
+        # Explore: choose random action (only actions with Q > 0; wall-blocked ones start at 0)
+        valid_actions = [i for i in range(len(q[s])) if q[s][i] > 0]
+        if valid_actions:
+            return np.random.choice(valid_actions)
+        else:
+            return np.random.randint(0, len(q[s]))
+    else:
+        # Exploit: choose best action
+        valid_q_values = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] > 0]
+        if valid_q_values:
+            # Get max Q-value among valid actions
+            best_action = max(valid_q_values, key=lambda x: x[1])[0]
+            return best_action
+        else:
+            return 0
+
+
+def bfs_distance(start, end, labyrinth):
+    """
+    Calculate shortest path distance between two points using BFS.
+    Returns the distance or infinity if no path exists.
+    """
+    from collections import deque
+    
+    if start == end:
+        return 0
+    
+    queue = deque([(start, 0)])  # (position, distance)
+    visited = {start}
+    
+    while queue:
+        (x, y), dist = queue.popleft()
+        
+        # Check all 4 directions
+        for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
+            nx, ny = x + dx, y + dy
+            
+            if (nx, ny) == end:
+                return dist + 1
+            
+            if 0 <= ny < len(labyrinth) and 0 <= nx < len(labyrinth[0]):
+                if (nx, ny) not in visited and labyrinth[ny][nx] != "#":
+                    visited.add((nx, ny))
+                    queue.append(((nx, ny), dist + 1))
+    
+    return float('inf')  # No path found


 def take_action(s, a, labyrinth):
    s_new = list(s)
-    if a == 0:
+    if a == 0:  # left
        s_new[0] -= 1
-    if a == 1:
+    if a == 1:  # right
        s_new[0] += 1
-    if a == 2:
-        s_new[1] += 1
-    if a == 3:
+    if a == 2:  # up
        s_new[1] -= 1
+    if a == 3:  # down
+        s_new[1] += 1
    
    # consider if there is a point on the field 
-    r = 1 if labyrinth[s_new[0]][s_new[1]] == "." else 0
-    # consider new distance between Pacman and Ghost
-    distance = abs(s[0] - s[2]) + abs(s[1] - s[3])
-    distance_new = abs(s_new[0] - s_new[2]) + abs(s_new[1] - s_new[3])
-    r += distance_new - distance # adjust this value if necessary
+    r = 3 if labyrinth[s_new[1]][s_new[0]] == "." else -1
+    
+    # consider new distance between Pacman and Ghost using actual pathfinding
+    pacman_pos = (s[0], s[1])
+    ghost_pos = (s[2], s[3])
+    pacman_pos_new = (s_new[0], s_new[1])
+    
+    distance = bfs_distance(pacman_pos, ghost_pos, labyrinth)
+    distance_new = bfs_distance(pacman_pos_new, ghost_pos, labyrinth)
+    
+    # Reward for increasing distance from ghost (moving away is good)
+    if distance_new > distance:
+        r += 0.5  # Bonus for moving away
+    elif distance_new < distance:
+        r -= 1.0  # Penalty for moving closer
    
    return tuple(s_new), r