diff --git a/04_pacman_rl/pacman.py b/04_pacman_rl/pacman.py
index 2c6ed61..75bd7b5 100644
--- a/04_pacman_rl/pacman.py
+++ b/04_pacman_rl/pacman.py
@@ -125,6 +125,9 @@ def main():
     # Initialize Pacman and Ghost positions
     pacman = Pacman(1, 1)
     ghost = Ghost(COLS - 2, ROWS - 2)
+
+    s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
+    a_prev = 4
     q = rl.q_init()
     gamma = 0.9
     alpha = 0.8
@@ -140,17 +143,6 @@ def main():
             if event.type == pygame.QUIT:
                 running = False
 
-        # Handle Pacman movement
-        keys = pygame.key.get_pressed()
-        if keys[pygame.K_LEFT]:
-            pacman.move(-1, 0)
-        if keys[pygame.K_RIGHT]:
-            pacman.move(1, 0)
-        if keys[pygame.K_UP]:
-            pacman.move(0, -1)
-        if keys[pygame.K_DOWN]:
-            pacman.move(0, 1)
-
         if iter%3==0:
             # Ghost moves towards Pacman
             ghost.move_towards_pacman(pacman)
@@ -170,20 +162,24 @@ def main():
                 running = False
 
         # Start of my code
-
-        s = (pacman.x, pacman.y, ghost.x, ghost.y)  # as a tuple so the state becomes hashable
-
-        #s_not_terminal = True
+        # s_not_terminal = True
         # while s_not_terminal:
-        a = rl.epsilon_greedy(q, s)  # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
+        print("s: " + str(s))
+        print("q[s] before action: " + str(q[s]))
+
+
+        a = rl.epsilon_greedy(q, s, a_prev)  # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
         s_new, r = rl.take_action(s, a, labyrinth)
         move_pacman(pacman, a)
-        print("state: " + str(s_new) + " r: " + str(r))
 
         q[s][a] += round(alpha * (r + gamma * max(q[s_new]) - q[s][a]), 2)
-        print(q[s])
+        print("s_new: " + str(s_new))
+        print("q[s] after action with manipulated a: " + str(q[s]))
+        print("q[s_new] after action: " + str(q[s_new]))
+        print()
 
         s = s_new
+        a_prev = a
 
         time.sleep(0.5)
         #gamma *= gamma
diff --git a/04_pacman_rl/reinforcement_learning.py b/04_pacman_rl/reinforcement_learning.py
index 113787f..c704a69 100644
--- a/04_pacman_rl/reinforcement_learning.py
+++ b/04_pacman_rl/reinforcement_learning.py
@@ -12,7 +12,7 @@ def q_init():
 
     # Configuration
     NUM_ACTIONS = 4
-    INITIAL_Q_VALUE = 1.0  # Small value for initialization
+    INITIAL_Q_VALUE = 2.0  # Small value for initialization
 
     # Labyrinth layout
     labyrinth = [
@@ -70,18 +70,43 @@ def q_init():
     # print(list(q_table.items())[:5])  # Uncomment to see the first 5 entries
     return q_table
 
 
-def epsilon_greedy(q, s, epsilon=0.2):
+def epsilon_greedy(q, s, a_prev, epsilon=0.2):
     """
     Return which direction Pacman should move to using epsilon-greedy algorithm
     With probability epsilon, choose a random action.
     Otherwise choose the greedy action.
+    If multiple actions have the same max Q-value, prefer actions different from a_prev.
+    Never allows Pacman to move backwards (opposite direction).
     """
-    """
+
+    opposite_action = {0: 1, 1: 0, 2: 3, 3: 2}
+
     q_max = max(q[s])
     a = q[s].index(q_max)
-
-    return a
+    """
+    # Find all actions with the maximum Q-value
+    max_actions = [a for a in range(4) if q[s][a] == q_max]
+    # Exclude the opposite action (going backwards)
+    if a_prev in opposite_action:
+        backward_action = opposite_action[a_prev]
+        if backward_action in max_actions:
+            max_actions.remove(backward_action)
+
+    # If no actions left after removing backward action, allow it (no choice)
+    if not max_actions:
+        max_actions = [a for a in range(4) if q[s][a] == q_max]
+        if a_prev in opposite_action:
+            backward_action = opposite_action[a_prev]
+            if backward_action in max_actions:
+                max_actions.remove(backward_action)
+
+    # Return the first valid action
+    a = max_actions[0] if max_actions else 0
+    """
+    return a
+
+    """
     if np.random.random() < epsilon:
         # Explore: choose random action (excluding blocked actions with Q=0)
         valid_actions = [i for i in range(len(q[s])) if q[s][i] > 0]
@@ -98,6 +123,11 @@ def epsilon_greedy(q, s, epsilon=0.2):
             return best_action
         else:
             return 0
+    """
+
+
+def max_q(q, s_new):
+    pass
 
 
 def bfs_distance(start, end, labyrinth):
@@ -143,20 +173,21 @@ def take_action(s, a, labyrinth):
         s_new[1] += 1
 
     # consider if there is a point on the field
-    r = 3 if labyrinth[s_new[1]][s_new[0]] == "." else -1
+    r = 2.0 if labyrinth[s_new[1]][s_new[0]] == "." else -5.0
 
     # consider new distance between Pacman and Ghost using actual pathfinding
     pacman_pos = (s[0], s[1])
     ghost_pos = (s[2], s[3])
     pacman_pos_new = (s_new[0], s_new[1])
 
-    distance = bfs_distance(pacman_pos, ghost_pos, labyrinth)
     distance_new = bfs_distance(pacman_pos_new, ghost_pos, labyrinth)
 
-    # Reward for increasing distance from ghost (moving away is good)
-    if distance_new > distance:
-        r += 0.5  # Bonus for moving away
-    elif distance_new < distance:
-        r -= 1.0  # Penalty for moving closer
+    # Reward based on distance from ghost (closer distance = worse reward)
+    if distance_new >= 4:
+        r += 2.0  # Good reward for being far away
+    elif distance_new >= 2:
+        r += 1.0  # Small reward for being moderately far
+    elif distance_new == 1:
+        r -= 10.0  # Large penalty for being adjacent to ghost
 
     return tuple(s_new), r
\ No newline at end of file