removed 0's; set q[s][a]=-10 at the right place
parent
ad40c248d3
commit
8049bfe29f
|
|
@ -175,8 +175,6 @@ def main():
|
||||||
s_new, r, labyrinth_copy = rl.take_action(s, a, labyrinth_copy)
|
s_new, r, labyrinth_copy = rl.take_action(s, a, labyrinth_copy)
|
||||||
|
|
||||||
q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth_copy) - q[s][a])
|
q[s][a] += ALPHA * (r + GAMMA * rl.max_q(q, s_new, labyrinth_copy) - q[s][a])
|
||||||
|
|
||||||
s = s_new
|
|
||||||
|
|
||||||
if all("." not in row for row in labyrinth_copy):
|
if all("." not in row for row in labyrinth_copy):
|
||||||
s_not_terminal = False
|
s_not_terminal = False
|
||||||
|
|
@ -185,11 +183,11 @@ def main():
|
||||||
# Check for collisions (game over if ghost catches pacman)
|
# Check for collisions (game over if ghost catches pacman)
|
||||||
if s[0] == s[2] and s[1] == s[3]:
|
if s[0] == s[2] and s[1] == s[3]:
|
||||||
s_not_terminal = False
|
s_not_terminal = False
|
||||||
q[s][a] = 0.01
|
q[s][a] = -10.0
|
||||||
print("There was just a collision!!!")
|
# print("Collision at s!!! s: " + str(s)) # debugging
|
||||||
print("s: " + str(s))
|
print("Crashed values now q[s]: " + str(q[s])) # debugging
|
||||||
print("Crashed values now q[s]: " + str(q[s]))
|
|
||||||
|
|
||||||
|
s = s_new
|
||||||
time.sleep(0.025)
|
time.sleep(0.025)
|
||||||
|
|
||||||
if iteration >= max_iterations:
|
if iteration >= max_iterations:
|
||||||
|
|
|
||||||
|
|
@ -55,16 +55,16 @@ def q_init():
|
||||||
# Check which actions are blocked by walls
|
# Check which actions are blocked by walls
|
||||||
# Action 0: move left (s0 - 1)
|
# Action 0: move left (s0 - 1)
|
||||||
if labyrinth[s1][s0 - 1] == "#":
|
if labyrinth[s1][s0 - 1] == "#":
|
||||||
q_values[0] = 0
|
q_values[0] = None
|
||||||
# Action 1: move right (s0 + 1)
|
# Action 1: move right (s0 + 1)
|
||||||
if labyrinth[s1][s0 + 1] == "#":
|
if labyrinth[s1][s0 + 1] == "#":
|
||||||
q_values[1] = 0
|
q_values[1] = None
|
||||||
# Action 2: move up (s1 - 1)
|
# Action 2: move up (s1 - 1)
|
||||||
if labyrinth[s1 - 1][s0] == "#":
|
if labyrinth[s1 - 1][s0] == "#":
|
||||||
q_values[2] = 0
|
q_values[2] = None
|
||||||
# Action 3: move down (s1 + 1)
|
# Action 3: move down (s1 + 1)
|
||||||
if labyrinth[s1 + 1][s0] == "#":
|
if labyrinth[s1 + 1][s0] == "#":
|
||||||
q_values[3] = 0
|
q_values[3] = None
|
||||||
|
|
||||||
q_table[state_key] = q_values
|
q_table[state_key] = q_values
|
||||||
|
|
||||||
|
|
@ -72,7 +72,7 @@ def q_init():
|
||||||
# print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries
|
# print(list(q_table.items())[:5]) # Uncomment to see the first 5 entries
|
||||||
return q_table
|
return q_table
|
||||||
|
|
||||||
def epsilon_greedy(q, s, epsilon=0.1):
|
def epsilon_greedy(q, s, epsilon=0.2):
|
||||||
"""
|
"""
|
||||||
Return which direction Pacman should move to using epsilon-greedy algorithm
|
Return which direction Pacman should move to using epsilon-greedy algorithm
|
||||||
With probability epsilon, choose a random action. Otherwise choose the greedy action.
|
With probability epsilon, choose a random action. Otherwise choose the greedy action.
|
||||||
|
|
@ -80,35 +80,34 @@ def epsilon_greedy(q, s, epsilon=0.1):
|
||||||
Never allows Pacman to move backwards (opposite direction).
|
Never allows Pacman to move backwards (opposite direction).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""
|
q_max = max(x for x in q[s] if isinstance(x, (int, float)))
|
||||||
q_max = max(q[s])
|
|
||||||
a = q[s].index(q_max)
|
a = q[s].index(q_max)
|
||||||
|
|
||||||
return a
|
return a
|
||||||
"""
|
|
||||||
|
|
||||||
if np.random.random() < epsilon:
|
# if np.random.random() < epsilon:
|
||||||
# Explore: choose random action (excluding blocked actions with Q=0)
|
# # Explore: choose random action (excluding blocked actions with Q=0)
|
||||||
valid_actions = [i for i in range(len(q[s])) if q[s][i] > 0]
|
# valid_actions = [i for i in range(len(q[s])) if q[s][i] != None]
|
||||||
if valid_actions:
|
# if valid_actions:
|
||||||
return np.random.choice(valid_actions)
|
# return np.random.choice(valid_actions)
|
||||||
else:
|
# else:
|
||||||
return np.random.randint(0, len(q[s]))
|
# return np.random.randint(0, len(q[s]))
|
||||||
else:
|
# else:
|
||||||
# Exploit: choose best action
|
# # Exploit: choose best action
|
||||||
valid_q_values = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] > 0]
|
# valid_q_values = [(i, q[s][i]) for i in range(len(q[s])) if q[s][i] != None]
|
||||||
if valid_q_values:
|
# if valid_q_values:
|
||||||
# Get max Q-value among valid actions
|
# # Get max Q-value among valid actions
|
||||||
best_action = max(valid_q_values, key=lambda x: x[1])[0]
|
# best_action = max(valid_q_values, key=lambda x: x[1])[0]
|
||||||
return best_action
|
# return best_action
|
||||||
else:
|
# else:
|
||||||
return 0
|
# return 0
|
||||||
|
|
||||||
|
|
||||||
def max_q(q, s_new, labyrinth, depth=0, max_depth=2):
|
def max_q(q, s_new, labyrinth, depth=0, max_depth=4):
|
||||||
"""Calculate Q-values for all possible actions in state s_new and return the maximum"""
|
"""Calculate Q-values for all possible actions in state s_new and return the maximum"""
|
||||||
q_max = 0.01
|
q_max = 0
|
||||||
for a in range(4):
|
for a in range(4):
|
||||||
if q[s_new][a] > 0: # Only consider valid (non-blocked) actions
|
if q[s_new][a] != None and s_new in q: # Only consider valid (non-blocked) actions
|
||||||
s_test = tuple(list(s_new)[:2] + [s_new[2], s_new[3]]) # Keep ghost position
|
s_test = tuple(list(s_new)[:2] + [s_new[2], s_new[3]]) # Keep ghost position
|
||||||
s_test_list = list(s_test)
|
s_test_list = list(s_test)
|
||||||
if a == 0: # left
|
if a == 0: # left
|
||||||
|
|
@ -147,7 +146,7 @@ def calc_reward(s_new, labyrinth):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Reward for cookies
|
# Reward for cookies
|
||||||
r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -2.0
|
r = 1.0 if labyrinth[s_new[1]][s_new[0]] == "." else -1.0
|
||||||
|
|
||||||
return r
|
return r
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue