"""
Develop a reinforcement learning (RL) agent that effectively collects points
in a minimalist Pacman game (provided on my homepage) while evading the
ghost, thereby avoiding being eaten.
"""
import numpy as np


def q_init():
    """Create the Q-table and seed every (state, action) pair.

    States are tuples (s0, s1, s2, s3) where (s0, s1) is Pacman's position
    and (s2, s3) is the ghost's position (see take_action). Positions whose
    second coordinate equals 2 only exist when the first coordinate is in
    {1, 4, 5, 8} -- presumably the layout of the labyrinth's corridors;
    TODO confirm against the actual maze.

    Returns:
        dict: state tuple -> list of NUM_ACTIONS Q-values, all set to
        INITIAL_Q_VALUE (optimistic initialization encourages exploration).
    """
    # Configuration
    NUM_ACTIONS = 4
    INITIAL_Q_VALUE = 1.0  # optimistic initial estimate for every action

    s0_range = range(1, 9)
    s1_range = range(1, 4)
    s2_range = range(1, 9)
    s3_range = range(1, 4)
    s_constrained_values = {1, 4, 5, 8}

    # The Q-Table dictionary
    q_table = {}

    # Enumerate every (s0, s1, s2, s3) combination, skipping positions
    # that cannot occur in the maze.
    for s0 in s0_range:
        for s1 in s1_range:
            for s2 in s2_range:
                for s3 in s3_range:
                    # Skip impossible states
                    if s1 == 2 and s0 not in s_constrained_values:
                        continue
                    if s3 == 2 and s2 not in s_constrained_values:
                        continue
                    state_key = (s0, s1, s2, s3)
                    q_table[state_key] = [INITIAL_Q_VALUE] * NUM_ACTIONS

    print(f"Total number of valid states initialized: {len(q_table)}")
    # debugging
    # print(list(q_table.items())[:5])  # Uncomment to see the first 5 entries
    return q_table


def epsilon_greedy(q, s, epsilon=0.9):
    """Select an action for state ``s`` with an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action is returned
    (exploration); otherwise the index of the highest Q-value is returned
    (exploitation, ties broken by the lowest index).

    Args:
        q: Q-table mapping state tuples to lists of action values.
        s: current state tuple (a key of ``q``).
        epsilon: exploration probability in [0, 1].

    Returns:
        int: chosen action index in range(len(q[s])).
    """
    # Bug fix: the original ignored epsilon entirely and always acted
    # greedily, so the agent never explored.
    if np.random.random() < epsilon:
        return int(np.random.randint(len(q[s])))
    q_max = max(q[s])
    return q[s].index(q_max)


def take_action(s, a, labyrinth):
    """Apply action ``a`` in state ``s`` and return (new_state, reward).

    Args:
        s: state tuple (s0, s1, s2, s3); (s0, s1) is Pacman's position,
            (s2, s3) the ghost's (the ghost does not move here).
        a: action index -- 0/1 decrement/increment s0, 3/2 decrement/
            increment s1 (axis orientation depends on the maze encoding).
        labyrinth: 2-D indexable grid; a "." cell holds a collectable point.

    Returns:
        tuple: (new state tuple, reward). Reward is +1 for landing on a
        point, plus the change in Manhattan distance to the ghost (moving
        away from the ghost is rewarded, moving toward it is penalized).

    NOTE(review): no wall/bounds check is performed here -- callers are
    assumed to pass only legal actions; TODO confirm.
    """
    s_new = list(s)
    # Actions are mutually exclusive, so an elif chain is sufficient.
    if a == 0:
        s_new[0] -= 1
    elif a == 1:
        s_new[0] += 1
    elif a == 2:
        s_new[1] += 1
    elif a == 3:
        s_new[1] -= 1

    # consider if there is a point on the field
    r = 1 if labyrinth[s_new[0]][s_new[1]] == "." else 0

    # consider new distance between Pacman and Ghost (Manhattan metric)
    distance = abs(s[0] - s[2]) + abs(s[1] - s[3])
    distance_new = abs(s_new[0] - s_new[2]) + abs(s_new[1] - s_new[3])
    r += distance_new - distance  # adjust this value if necessary

    return tuple(s_new), r