Tried a couple of things out; balancing the reward system

2025-11-19 13:59:41 +01:00 · 2025-11-19 13:59:41 +01:00 · ee04e00627
parent 24714fca0e
commit ee04e00627
2 changed files with 57 additions and 30 deletions
--- a/04_pacman_rl/pacman.py
+++ b/04_pacman_rl/pacman.py
@ -125,6 +125,9 @@ def main():
    # Initialize Pacman and Ghost positions
    pacman = Pacman(1, 1)
    ghost = Ghost(COLS - 2, ROWS - 2)
+
+    s = (pacman.x, pacman.y, ghost.x, ghost.y) # as a tuple so the state becomes hashable
+    a_prev = 4
    q = rl.q_init()
    gamma = 0.9
    alpha = 0.8
@ -140,17 +143,6 @@ def main():
            if event.type == pygame.QUIT:
                running = False

-        # Handle Pacman movement
-        keys = pygame.key.get_pressed()
-        if keys[pygame.K_LEFT]:
-            pacman.move(-1, 0)
-        if keys[pygame.K_RIGHT]:
-            pacman.move(1, 0)
-        if keys[pygame.K_UP]:
-            pacman.move(0, -1)
-        if keys[pygame.K_DOWN]:
-            pacman.move(0, 1)
-
        if iter%3==0:
            # Ghost moves towards Pacman
            ghost.move_towards_pacman(pacman)
@ -170,20 +162,24 @@ def main():
            running = False

        # Start of my code
-                
-        s = (pacman.x, pacman.y, ghost.x, ghost.y) # as a tuple so the state becomes hashable
-        
-        #s_not_terminal = True
+        # s_not_terminal = True
        # while s_not_terminal:
-        a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
+        print("s: " + str(s))
+        print("q[s] before action: " + str(q[s]))
+        
+
+        a = rl.epsilon_greedy(q, s, a_prev) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
        s_new, r = rl.take_action(s, a, labyrinth)
        move_pacman(pacman, a)
-        print("state: " + str(s_new) + " r: " + str(r))
        
        q[s][a] += round(alpha * (r + gamma * max(q[s_new]) - q[s][a]), 2)
-        print(q[s])
+        print("s_new: " + str(s_new))
+        print("q[s] after action with manipulated a: " + str(q[s]))
+        print("q[s_new] after action: " + str(q[s_new]))
+        print()
        
        s = s_new
+        a_prev = a
        time.sleep(0.5)
        
        #gamma *= gamma
--- a/04_pacman_rl/reinforcement_learning.py
+++ b/04_pacman_rl/reinforcement_learning.py
@ -12,7 +12,7 @@ def q_init():

    # Configuration
    NUM_ACTIONS = 4
-    INITIAL_Q_VALUE = 1.0 # Small value for initialization
+    INITIAL_Q_VALUE = 2.0 # Small value for initialization

    # Labyrinth layout
    labyrinth = [
@ -70,18 +70,43 @@ def q_init():
    # print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries
    return q_table

-def epsilon_greedy(q, s, epsilon=0.2):
+def epsilon_greedy(q, s, a_prev, epsilon=0.2):
    """ 
    Return which direction Pacman should move to using epsilon-greedy algorithm
    With probability epsilon, choose a random action. Otherwise choose the greedy action.
+    Intended: among actions tied for the max Q-value, skip the one that reverses a_prev (never move backwards).
+    NOTE: that tie-breaking logic is currently disabled (wrapped in a string literal); the first max-Q action is returned.
    """
-    """
+
+    opposite_action = {0: 1, 1: 0, 2: 3, 3: 2}
+    
    q_max = max(q[s])
    a = q[s].index(q_max)

-    return a
    """
+    # Find all actions with the maximum Q-value
+    max_actions = [a for a in range(4) if q[s][a] == q_max]
    
+    # Exclude the opposite action (going backwards)
+    if a_prev in opposite_action:
+        backward_action = opposite_action[a_prev]
+        if backward_action in max_actions:
+            max_actions.remove(backward_action)
+    
+    # If no actions left after removing backward action, allow it (no choice)
+    if not max_actions:
+        max_actions = [a for a in range(4) if q[s][a] == q_max]
+        if a_prev in opposite_action:
+            backward_action = opposite_action[a_prev]
+            if backward_action in max_actions:
+                max_actions.remove(backward_action)
+    
+    # Return the first valid action
+    a = max_actions[0] if max_actions else 0
+    """
+    return a
+    
+    """
    if np.random.random() < epsilon:
        # Explore: choose random action (excluding blocked actions with Q=0)
        valid_actions = [i for i in range(len(q[s])) if q[s][i] > 0]
@ -98,6 +123,11 @@ def epsilon_greedy(q, s, epsilon=0.2):
            return best_action
        else:
            return 0
+    """
+
+
+def max_q(q, s_new):
+    pass


 def bfs_distance(start, end, labyrinth):
@ -143,20 +173,21 @@ def take_action(s, a, labyrinth):
        s_new[1] += 1
    
    # consider if there is a point on the field 
-    r = 3 if labyrinth[s_new[1]][s_new[0]] == "." else -1
+    r = 2.0 if labyrinth[s_new[1]][s_new[0]] == "." else -5.0
    
    # consider new distance between Pacman and Ghost using actual pathfinding
    pacman_pos = (s[0], s[1])
    ghost_pos = (s[2], s[3])
    pacman_pos_new = (s_new[0], s_new[1])
    
-    distance = bfs_distance(pacman_pos, ghost_pos, labyrinth)
    distance_new = bfs_distance(pacman_pos_new, ghost_pos, labyrinth)
    
-    # Reward for increasing distance from ghost (moving away is good)
-    if distance_new > distance:
-        r += 0.5  # Bonus for moving away
-    elif distance_new < distance:
-        r -= 1.0  # Penalty for moving closer
+    # Reward based on distance from ghost (closer distance = worse reward)
+    if distance_new >= 4:
+        r += 2.0  # Good reward for being far away
+    elif distance_new >= 2:
+        r += 1.0  # Small reward for being moderately far
+    elif distance_new == 1:
+        r -= 10.0  # Large penalty for being adjacent to ghost
    
    return tuple(s_new), r