tried a couple things out; Balancing reward system

master
Ruben Seitz 2025-11-19 13:59:41 +01:00
parent 24714fca0e
commit ee04e00627
2 changed files with 57 additions and 30 deletions

View File

@@ -125,6 +125,9 @@ def main():
 # Initialize Pacman and Ghost positions
 pacman = Pacman(1, 1)
 ghost = Ghost(COLS - 2, ROWS - 2)
+s = (pacman.x, pacman.y, ghost.x, ghost.y) # as a tuple so the state becomes hashable
+a_prev = 4
 q = rl.q_init()
 gamma = 0.9
 alpha = 0.8
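
Note on the new state: s is built as a tuple rather than a list so that it can be used directly as a dictionary key in the Q-table (lists are unhashable). A minimal illustration of that point, with made-up coordinates that are not taken from the commit:

q = {}
state_list = [1, 1, 18, 13]     # pacman x, y and ghost x, y
# q[state_list] = [2.0] * 4     # TypeError: unhashable type: 'list'
state = tuple(state_list)       # tuples are hashable
q[state] = [2.0] * 4            # usable as a Q-table key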
@@ -140,17 +143,6 @@ def main():
 if event.type == pygame.QUIT:
 running = False
-# Handle Pacman movement
-keys = pygame.key.get_pressed()
-if keys[pygame.K_LEFT]:
-pacman.move(-1, 0)
-if keys[pygame.K_RIGHT]:
-pacman.move(1, 0)
-if keys[pygame.K_UP]:
-pacman.move(0, -1)
-if keys[pygame.K_DOWN]:
-pacman.move(0, 1)
 if iter%3==0:
 # Ghost moves towards Pacman
 ghost.move_towards_pacman(pacman)
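
With the keyboard handling removed, Pacman is now moved only by the learning agent through move_pacman(pacman, a), which is called further down in this diff but whose body is not shown. A plausible sketch, assuming Pacman keeps its move(dx, dy) method and the action encoding 0 = Left, 1 = Right, 2 = Up, 3 = Down from the comments:

# Assumed helper (not part of the commit): map an action index to a grid step.
ACTION_DELTAS = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

def move_pacman(pacman, a):
    dx, dy = ACTION_DELTAS[a]
    pacman.move(dx, dy)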
@@ -170,20 +162,24 @@ def main():
 running = False
 # Start of my code
-# s_not_terminal = True
-s = (pacman.x, pacman.y, ghost.x, ghost.y) # as a tuple so the state becomes hashable
+#s_not_terminal = True
 # while s_not_terminal:
-a = rl.epsilon_greedy(q, s) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
+print("s: " + str(s))
+print("q[s] before action: " + str(q[s]))
+a = rl.epsilon_greedy(q, s, a_prev) # 0 = Left; 1 = Right ; 2 = Up ; 3 = Down
 s_new, r = rl.take_action(s, a, labyrinth)
 move_pacman(pacman, a)
-print("state: " + str(s_new) + " r: " + str(r))
 q[s][a] += round(alpha * (r + gamma * max(q[s_new]) - q[s][a]), 2)
-print(q[s])
+print("s_new: " + str(s_new))
+print("q[s] after action with manipulated a: " + str(q[s]))
+print("q[s_new] after action: " + str(q[s_new]))
+print()
 s = s_new
+a_prev = a
 time.sleep(0.5)
 #gamma *= gamma
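
The update line q[s][a] += round(alpha * (r + gamma * max(q[s_new]) - q[s][a]), 2) is the standard tabular Q-learning rule Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)), with the increment rounded to two decimals. A self-contained sketch of one such step with made-up values, only to show the arithmetic:

alpha, gamma = 0.8, 0.9

def q_update(q, s, a, r, s_new):
    # One tabular Q-learning step: move Q(s, a) toward the TD target
    # r + gamma * max over a' of Q(s_new, a').
    td_target = r + gamma * max(q[s_new])
    q[s][a] += round(alpha * (td_target - q[s][a]), 2)

q = {(1, 1, 5, 5): [2.0, 2.0, 2.0, 2.0],
     (2, 1, 5, 5): [2.0, 2.0, 2.0, 2.0]}
q_update(q, (1, 1, 5, 5), a=1, r=2.0, s_new=(2, 1, 5, 5))
print(q[(1, 1, 5, 5)])   # [2.0, 3.44, 2.0, 2.0]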

View File

@@ -12,7 +12,7 @@ def q_init():
 # Configuration
 NUM_ACTIONS = 4
-INITIAL_Q_VALUE = 1.0 # Small value for initialization
+INITIAL_Q_VALUE = 2.0 # Small value for initialization
 # Labyrinth layout
 labyrinth = [
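
Raising INITIAL_Q_VALUE from 1.0 to 2.0 makes the table start more optimistic, so untried actions stay competitive with actions that have already collected small rewards. The body of q_init is not part of this hunk; a rough sketch, assuming the table is keyed by (pacman_x, pacman_y, ghost_x, ghost_y) tuples as in the first file:

NUM_ACTIONS = 4
INITIAL_Q_VALUE = 2.0   # optimistic start value

def q_init_sketch(cols, rows):
    # Assumed shape: one list of action values per (pacman, ghost) position pair.
    q_table = {}
    for px in range(cols):
        for py in range(rows):
            for gx in range(cols):
                for gy in range(rows):
                    q_table[(px, py, gx, gy)] = [INITIAL_Q_VALUE] * NUM_ACTIONS
    return q_table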
@@ -70,18 +70,43 @@ def q_init():
 # print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries
 return q_table
-def epsilon_greedy(q, s, epsilon=0.2):
+def epsilon_greedy(q, s, a_prev, epsilon=0.2):
 """
 Return which direction Pacman should move to using epsilon-greedy algorithm
 With probability epsilon, choose a random action. Otherwise choose the greedy action.
+If multiple actions have the same max Q-value, prefer actions different from a_prev.
+Never allows Pacman to move backwards (opposite direction).
 """
-"""
+opposite_action = {0: 1, 1: 0, 2: 3, 3: 2}
 q_max = max(q[s])
 a = q[s].index(q_max)
-return a
 """
+# Find all actions with the maximum Q-value
+max_actions = [a for a in range(4) if q[s][a] == q_max]
+# Exclude the opposite action (going backwards)
+if a_prev in opposite_action:
+backward_action = opposite_action[a_prev]
+if backward_action in max_actions:
+max_actions.remove(backward_action)
+# If no actions left after removing backward action, allow it (no choice)
+if not max_actions:
+max_actions = [a for a in range(4) if q[s][a] == q_max]
+if a_prev in opposite_action:
+backward_action = opposite_action[a_prev]
+if backward_action in max_actions:
+max_actions.remove(backward_action)
+# Return the first valid action
+a = max_actions[0] if max_actions else 0
+"""
+return a
+"""
 if np.random.random() < epsilon:
 # Explore: choose random action (excluding blocked actions with Q=0)
 valid_actions = [i for i in range(len(q[s])) if q[s][i] > 0]
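
In the committed version the new backward-exclusion logic sits inside a triple-quoted string, so the active code reduces to a plain greedy pick of the highest-valued action. A cleaned-up sketch of the exclusion idea in case it gets enabled later (it reuses the names from the commented block; the simplified fallback is an assumption):

def greedy_no_backtrack(q, s, a_prev):
    # Pick among the best-valued actions, preferring not to reverse direction.
    opposite_action = {0: 1, 1: 0, 2: 3, 3: 2}
    q_max = max(q[s])
    max_actions = [a for a in range(4) if q[s][a] == q_max]
    backward = opposite_action.get(a_prev)   # None for the initial a_prev = 4
    if backward in max_actions and len(max_actions) > 1:
        max_actions.remove(backward)         # only drop the reverse move if an alternative exists
    return max_actions[0]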
@@ -98,6 +123,11 @@ def epsilon_greedy(q, s, epsilon=0.2):
 return best_action
 else:
 return 0
+"""
+def max_q(q, s_new):
+pass
 def bfs_distance(start, end, labyrinth):
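
Only the signature of bfs_distance appears in this diff; the reward changes below rely on it returning the shortest walkable path length between two cells. A minimal sketch of such a function, assuming walls are marked with "#" and the grid is indexed as labyrinth[y][x]:

from collections import deque

def bfs_distance_sketch(start, end, labyrinth):
    # Breadth-first search over walkable cells; returns the number of steps
    # from start to end, or -1 if end is unreachable.
    queue = deque([(start, 0)])
    visited = {start}
    while queue:
        (x, y), dist = queue.popleft()
        if (x, y) == end:
            return dist
        for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1)):
            nx, ny = x + dx, y + dy
            if (0 <= ny < len(labyrinth) and 0 <= nx < len(labyrinth[ny])
                    and labyrinth[ny][nx] != "#" and (nx, ny) not in visited):
                visited.add((nx, ny))
                queue.append(((nx, ny), dist + 1))
    return -1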
@@ -143,20 +173,21 @@ def take_action(s, a, labyrinth):
 s_new[1] += 1
 # consider if there is a point on the field
-r = 3 if labyrinth[s_new[1]][s_new[0]] == "." else -1
+r = 2.0 if labyrinth[s_new[1]][s_new[0]] == "." else -5.0
 # consider new distance between Pacman and Ghost using actual pathfinding
 pacman_pos = (s[0], s[1])
 ghost_pos = (s[2], s[3])
 pacman_pos_new = (s_new[0], s_new[1])
-distance = bfs_distance(pacman_pos, ghost_pos, labyrinth)
 distance_new = bfs_distance(pacman_pos_new, ghost_pos, labyrinth)
-# Reward for increasing distance from ghost (moving away is good)
-if distance_new > distance:
-r += 0.5 # Bonus for moving away
-elif distance_new < distance:
-r -= 1.0 # Penalty for moving closer
+# Reward based on distance from ghost (closer distance = worse reward)
+if distance_new >= 4:
+r += 2.0 # Good reward for being far away
+elif distance_new >= 2:
+r += 1.0 # Small reward for being moderately far
+elif distance_new == 1:
+r -= 10.0 # Large penalty for being adjacent to ghost
 return tuple(s_new), r
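
The rebalanced reward drops the relative moved-away/moved-closer bonus and uses absolute distance bands instead: +2.0 for landing on a pellet and -5.0 otherwise, then +2.0 when the BFS distance to the ghost is at least 4, +1.0 when it is 2 or 3, and -10.0 when the ghost is adjacent. A compact sketch of that shaping as a standalone function (the helper name and arguments are assumptions; the thresholds and values come from the hunk above):

def shaped_reward(has_pellet, distance_to_ghost):
    # Base reward: pellet on the target cell vs. an empty step.
    r = 2.0 if has_pellet else -5.0
    # Distance shaping: the closer the ghost, the worse the reward.
    if distance_to_ghost >= 4:
        r += 2.0       # comfortably far away
    elif distance_to_ghost >= 2:
        r += 1.0       # moderately far
    elif distance_to_ghost == 1:
        r -= 10.0      # adjacent to the ghost
    return r

print(shaped_reward(True, 5))    # 4.0
print(shaped_reward(False, 1))   # -15.0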