# Source code for mushroom_rl.environments.generators.taxi

from itertools import product

import numpy as np
from sklearn.utils.extmath import cartesian

from mushroom_rl.environments.finite_mdp import FiniteMDP


def generate_taxi(grid, prob=.9, rew=(0, 1, 3, 15), gamma=.99, horizon=np.inf):
    """
    This Taxi generator requires a .txt file to specify the shape of the grid
    world and the cells. There are five types of cells: 'S' is a starting
    cell of the agent; 'G' is the goal state; '.' is a normal cell; 'F' is a
    passenger, when the agent steps on a passenger cell, it picks the
    passenger up; '#' is a wall, when the agent is supposed to step on a
    wall, it actually remains in its current state. The initial states
    distribution is uniform among all the initial states provided. The
    episode terminates when the agent reaches the goal state. The reward is
    always 0, except for the goal state where it depends on the number of
    collected passengers. Each action has a certain probability of success
    and, if it fails, the agent goes in a perpendicular direction from the
    supposed one.

    The grid is expected to be rectangular.

    This problem is inspired from:
    "Bayesian Q-Learning". Dearden R. et al.. 1998.

    Args:
        grid (str): the path of the file containing the grid structure;
        prob (float, .9): probability of success of an action;
        rew (tuple, (0, 1, 3, 15)): rewards obtained in goal states, indexed
            by the number of collected passengers; it must have exactly one
            entry more than the number of passenger cells in the grid;
        gamma (float, .99): discount factor;
        horizon (int, np.inf): the horizon.

    Returns:
        A FiniteMDP object built with the provided parameters.

    Raises:
        AssertionError: if the length of ``rew`` does not match the number
            of passengers in the grid.

    """
    grid_map, cell_list, passenger_list = parse_grid(grid)

    # The passenger list is what sizes the state space and what the reward
    # tuple is indexed by, so validate against it directly.
    assert len(rew) == len(passenger_list) + 1, \
        'rew must contain one reward per possible number of collected ' \
        'passengers.'

    p = compute_probabilities(grid_map, cell_list, passenger_list, prob)
    r = compute_reward(grid_map, cell_list, passenger_list, rew)
    mu = compute_mu(grid_map, cell_list, passenger_list)

    return FiniteMDP(p, r, mu, gamma, horizon)


def parse_grid(grid):
    """
    Parse the grid file.

    Args:
        grid (str): the path of the file containing the grid structure.

    Returns:
        A tuple of three lists: the grid structure, as a list of rows where
        each row is the list of the markers found on that line; the
        [row, column] coordinates of every non-wall cell; the [row, column]
        coordinates of every passenger cell.

    Raises:
        AssertionError: if the file contains no 'S' or no 'G' marker;
        ValueError: if the file contains a character that is neither a
            known marker nor a newline.

    """
    with open(grid, 'r') as f:
        m = f.read()

    assert 'S' in m and 'G' in m, \
        "The grid must contain at least one 'S' and one 'G' cell."

    lines = m.split('\n')
    # A text ending with a newline yields one spurious empty chunk after the
    # split; dropping only that chunk keeps the last row of files that do
    # NOT end with a newline, which would otherwise be lost.
    if lines[-1] == '':
        lines.pop()

    grid_map = list()
    cell_list = list()
    passenger_list = list()
    for row_idx, line in enumerate(lines):
        row = list()
        for col_idx, marker in enumerate(line):
            if marker not in ('#', '.', 'S', 'G', 'F'):
                raise ValueError('Unknown marker.')

            row.append(marker)
            # Every marker except the wall is a cell the agent can occupy.
            if marker != '#':
                cell_list.append([row_idx, col_idx])
                if marker == 'F':
                    passenger_list.append([row_idx, col_idx])
        grid_map.append(row)

    return grid_map, cell_list, passenger_list


def compute_probabilities(grid_map, cell_list, passenger_list, prob):
    """
    Compute the transition probability matrix.

    A state is a pair (cell, passenger configuration) and its index is
    ``cell index + len(cell_list) * configuration index``. A configuration
    is a tuple with one 0/1 flag per passenger (1 means collected);
    configurations are enumerated with the flag of the last passenger
    varying fastest. The four actions move the agent, in order, to
    row - 1, row + 1, column - 1 and column + 1.

    The rows of goal cells, and of passenger cells paired with a
    configuration where that passenger is not collected, are left at zero.

    Args:
        grid_map (list): list containing the grid structure;
        cell_list (list): list of non-wall cells;
        passenger_list (list): list of passenger cells;
        prob (float): probability of success of an action.

    Returns:
        The transition probability matrix, an array of shape
        (n_states, 4, n_states).

    """
    g = np.array(grid_map)
    n_cells = len(cell_list)
    passenger_states = list(product((0, 1), repeat=len(passenger_list)))
    n_states = n_cells * len(passenger_states)
    p = np.zeros((n_states, 4, n_states))
    directions = [(-1, 0), (1, 0), (0, -1), (0, 1)]

    # Lookup tables replacing repeated linear scans of the lists.
    cells = [tuple(int(v) for v in cell) for cell in cell_list]
    cell_index = {cell: k for k, cell in enumerate(cells)}
    passenger_index = {tuple(int(v) for v in cell): k
                       for k, cell in enumerate(passenger_list)}
    config_index = {config: k for k, config in enumerate(passenger_states)}

    def reached_state(i, idx, target):
        """
        Index of the state reached from state ``i`` (whose passenger
        configuration index is ``idx``) when the agent moves onto ``target``.

        """
        if target not in cell_index:
            # Wall or outside of the grid: the agent does not move.
            return i

        config = passenger_states[idx]
        if g[target] == 'F':
            k = passenger_index.get(target)
            if k is not None and not config[k]:
                # Stepping on a passenger not yet collected picks it up.
                config = config[:k] + (1,) + config[k + 1:]

        return cell_index[target] + n_cells * config_index[config]

    slip_prob = (1. - prob) * .5

    for i in range(n_states):
        idx, cell_idx = divmod(i, n_cells)
        state = cells[cell_idx]
        marker = g[state]

        if marker not in ('.', 'S', 'F'):
            # Goal cells have no outgoing transition.
            continue
        if marker == 'F':
            k = passenger_index.get(state)
            if k is None or not passenger_states[idx][k]:
                # Standing on a passenger cell without having collected
                # that passenger cannot happen: leave the row at zero.
                continue

        row, col = state
        for a, (d_row, d_col) in enumerate(directions):
            p[i, a, reached_state(i, idx, (row + d_row, col + d_col))] = prob

            # On failure the agent moves along one of the two directions
            # perpendicular to the intended one, with equal probability.
            side_row, side_col = 1 - abs(d_row), 1 - abs(d_col)
            for sign in (1, -1):
                slip = (row + sign * side_row, col + sign * side_col)
                p[i, a, reached_state(i, idx, slip)] += slip_prob

    return p


def compute_reward(grid_map, cell_list, passenger_list, rew):
    """
    Compute the reward matrix.

    A state is a pair (cell, passenger configuration) and its index is
    ``cell index + len(cell_list) * configuration index``, where the
    configurations are the 0/1 tuples over the passengers enumerated with
    the flag of the last passenger varying fastest.

    For every goal cell and every action, the entry going from the cell
    adjacent to the goal, with the action pointing at the goal, to the goal
    itself is set to ``rew[number of collected passengers]``. All the other
    entries are zero.

    Args:
        grid_map (list): list containing the grid structure;
        cell_list (list): list of non-wall cells;
        passenger_list (list): list of passenger cells;
        rew (tuple): rewards obtained in goal states.

    Returns:
        The reward matrix, an array of shape (n_states, 4, n_states).

    """
    g = np.array(grid_map)
    n_cells = len(cell_list)
    passenger_states = list(product((0, 1), repeat=len(passenger_list)))
    n_states = n_cells * len(passenger_states)
    r = np.zeros((n_states, 4, n_states))
    directions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
    cell_index = {tuple(int(v) for v in cell): k
                  for k, cell in enumerate(cell_list)}

    for goal_row, goal_col in np.argwhere(g == 'G').tolist():
        goal_cell = cell_index.get((goal_row, goal_col))
        if goal_cell is None:
            continue

        for a, (d_row, d_col) in enumerate(directions):
            # Cell from which action ``a`` leads onto the goal.
            prev_cell = cell_index.get((goal_row - d_row, goal_col - d_col))
            if prev_cell is None:
                continue

            # NOTE(review): only the action pointing at the goal is
            # rewarded; a transition reaching the goal through a slip of
            # another action keeps a zero reward - confirm this is intended.
            for idx, config in enumerate(passenger_states):
                offset = n_cells * idx
                r[prev_cell + offset, a, goal_cell + offset] = rew[
                    sum(config)]

    return r


def compute_mu(grid_map, cell_list, passenger_list):
    """
    Compute the initial states distribution.

    The probability mass is split evenly among the 'S' cells of the grid.
    The position of a start cell in ``cell_list`` is used directly as state
    index, so only the first ``len(cell_list)`` entries of the returned
    vector can be different from zero.

    Args:
        grid_map (list): list containing the grid structure;
        cell_list (list): list of non-wall cells;
        passenger_list (list): list of passenger cells.

    Returns:
        The initial states distribution, an array with one entry per state.

    """
    g = np.array(grid_map)
    c = np.array(cell_list)
    n_states = len(cell_list) * 2**len(passenger_list)
    mu = np.zeros(n_states)
    starts = np.argwhere(g == 'S')

    for s in starts:
        # Position of the start cell inside the list of non-wall cells.
        i = np.flatnonzero((c == s).all(axis=1))
        mu[i] = 1. / len(starts)

    return mu