Source code for mushroom_rl.environments.cart_pole

import numpy as np
from scipy.integrate import odeint

from mushroom_rl.core import Environment, MDPInfo
from mushroom_rl.rl_utils import spaces
from mushroom_rl.utils.angles import normalize_angle
from mushroom_rl.utils.viewer import Viewer


class CartPole(Environment):
    """
    The Inverted Pendulum on a Cart environment as presented in:
    "Least-Squares Policy Iteration". Lagoudakis M. G. and Parr R.. 2003.

    """
    def __init__(self, m=2., M=8., l=.5, g=9.8, mu=1e-2, max_u=50.,
                 noise_u=10., horizon=3000, gamma=.95):
        """
        Constructor.

        Args:
            m (float, 2.0): mass of the pendulum;
            M (float, 8.0): mass of the cart;
            l (float, .5): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant;
            max_u (float, 50.): maximum allowed input force;
            noise_u (float, 10.): maximum noise on the action;
            horizon (int, 3000): horizon of the problem;
            gamma (float, .95): discount factor.

        """
        # MDP parameters
        self._m = m
        self._M = M
        self._l = l
        self._g = g
        self._alpha = 1 / (self._m + self._M)
        self._mu = mu
        self._max_u = max_u
        self._noise_u = noise_u
        high = np.array([np.inf, np.inf])

        # MDP properties
        dt = .1
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(3)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon, dt)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None
        self._state = None

        super().__init__(mdp_info)
    def reset(self, state=None):
        if state is None:
            angle = np.random.uniform(-np.pi / 8., np.pi / 8.)

            self._state = np.array([angle, 0.])
        else:
            self._state = state
            self._state[0] = normalize_angle(self._state[0])

        self._last_u = 0

        return self._state, {}
    def step(self, action):
        if action == 0:
            u = -self._max_u
        elif action == 1:
            u = 0.
        else:
            u = self._max_u

        self._last_u = u

        u += np.random.uniform(-self._noise_u, self._noise_u)
        new_state = odeint(self._dynamics, self._state, [0, self.info.dt],
                           (u,))

        self._state = np.array(new_state[-1])
        self._state[0] = normalize_angle(self._state[0])

        if np.abs(self._state[0]) > np.pi * .5:
            reward = -1.
            absorbing = True
        else:
            reward = 0.
            absorbing = False

        return self._state, reward, absorbing, {}
    def render(self, record=False):
        start = 1.25 * self._l * np.ones(2)
        end = 1.25 * self._l * np.ones(2)

        end[0] += self._l * np.sin(self._state[0])
        end[1] += self._l * np.cos(self._state[0])

        self._viewer.line(start, end)
        self._viewer.square(start, 0, self._l / 10)
        self._viewer.circle(end, self._l / 20)

        direction = -np.sign(self._last_u) * np.array([1, 0])
        value = np.abs(self._last_u)
        self._viewer.force_arrow(start, direction, value, self._max_u,
                                 self._l / 5)

        frame = self._viewer.get_frame() if record else None

        self._viewer.display(self.info.dt)

        return frame
    def stop(self):
        self._viewer.close()
    def _dynamics(self, state, t, u):
        # State is [theta, omega]; returns the time derivatives used by odeint.
        theta = state[0]
        omega = state[1]

        d_theta = omega
        d_omega = (self._g * np.sin(theta) - self._alpha * self._m * self._l *
                   .5 * d_theta ** 2 * np.sin(2 * theta) * .5 -
                   self._alpha * np.cos(theta) * u) / (
                       2 / 3 * self._l - self._alpha * self._m * self._l *
                       .5 * np.cos(theta) ** 2)

        return d_theta, d_omega
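For readers who prefer the dynamics in equation form, the expression computed by _dynamics above (with alpha = 1 / (m + M) as set in the constructor) can be transcribed directly from the code as:

\[
\dot{\theta} = \omega, \qquad
\dot{\omega} = \frac{g \sin\theta \;-\; \alpha\, m\, \tfrac{l}{2}\, \omega^{2}\, \tfrac{\sin(2\theta)}{2} \;-\; \alpha \cos(\theta)\, u}{\tfrac{2}{3}\, l \;-\; \alpha\, m\, \tfrac{l}{2} \cos^{2}\theta},
\qquad \alpha = \frac{1}{m + M}
\]

This is a transcription of the code, not an independent derivation; the class docstring cites Lagoudakis and Parr (2003) for the underlying model.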
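A minimal usage sketch, not part of the module: it assumes MushroomRL is installed and imports CartPole from the module shown above, and drives the environment with random actions.

import numpy as np

from mushroom_rl.environments.cart_pole import CartPole

# Sketch: random-action rollout on the default LSPI-style cart-pole.
env = CartPole()
state, _ = env.reset()

for _ in range(env.info.horizon):
    # Actions 0, 1, 2 map to forces -max_u, 0 and +max_u (noise is added inside step).
    action = np.random.randint(3)
    state, reward, absorbing, _ = env.step(action)
    # env.render()  # optional: opens the Viewer window
    if absorbing:
        # The pole fell beyond +/- pi/2, so the episode ends.
        state, _ = env.reset()

env.stop()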