import numpy as np
from mushroom_rl.core import Agent
from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.rl_utils.parameters import to_parameter

class StochasticAC(Agent):
    """
    Stochastic actor-critic in the episodic setting, as presented in:
    "Model-Free Reinforcement Learning with Continuous Action in Practice".
    Degris T. et al., 2012.

    An illustrative usage sketch is given after the class definition.

    """
    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9,
                 value_function_features=None):
        """
        Constructor.

        Args:
            alpha_theta ([float, Parameter]): learning rate for the policy update;
            alpha_v ([float, Parameter]): learning rate for the value function;
            lambda_par ([float, Parameter], .9): trace decay parameter;
            value_function_features (Features, None): features used by the
                value function approximator.

        """
        self._psi = value_function_features

        self._alpha_theta = to_parameter(alpha_theta)
        self._alpha_v = to_parameter(alpha_v)

        self._lambda = to_parameter(lambda_par)

        super().__init__(mdp_info, policy)

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        # Linear critic over the value-function features (or the raw state).
        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        # Eligibility traces of the critic and of the actor.
        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        self._add_save_attr(
            _psi='pickle',
            _alpha_theta='mushroom',
            _alpha_v='mushroom',
            _lambda='mushroom',
            _V='mushroom',
            _e_v='numpy',
            _e_theta='numpy'
        )

    def episode_start(self, initial_state, episode_info):
        # Reset the eligibility traces at the beginning of each episode.
        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        return super().episode_start(initial_state, episode_info)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            # Value-function features of the current and next state.
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            # Bootstrapped value of the next state (zero at absorbing states).
            v_next = self._V(ss_psi) if not absorbing else 0

            delta = self._compute_td_n_traces(s, a, r, v_next, s_psi)

            # Update value function
            delta_v = self._alpha_v(s, a) * delta * self._e_v
            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

            # Update policy
            delta_theta = self._alpha_theta(s, a) * delta * self._e_theta
            theta_new = self.policy.get_weights() + delta_theta
            self.policy.set_weights(theta_new)

    def _compute_td_n_traces(self, s, a, r, v_next, s_psi):
        # Compute TD error
        delta = r + self.mdp_info.gamma * v_next - self._V(s_psi)

        # Update traces
        self._e_v = self.mdp_info.gamma * self._lambda() * self._e_v + s_psi
        self._e_theta = self.mdp_info.gamma * self._lambda() * self._e_theta + \
            self.policy.diff_log(s, a)

        return delta
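
# --------------------------------------------------------------------------- #
# Illustrative usage sketch (not part of the original module): one possible way
# to wire StochasticAC to a continuous-action task. It assumes MushroomRL's
# LQR environment, StateStdGaussianPolicy and Core; the learning rates, the
# initial standard-deviation weights and the number of episodes are arbitrary
# placeholders, not tuned values.
def _stochastic_ac_usage_sketch():
    from mushroom_rl.core import Core
    from mushroom_rl.environments import LQR
    from mushroom_rl.policy import StateStdGaussianPolicy

    mdp = LQR.generate(dimensions=1)

    # Linear mean and standard-deviation models of the Gaussian policy, both
    # defined directly on the raw state.
    mu = Regressor(LinearApproximator,
                   input_shape=mdp.info.observation_space.shape,
                   output_shape=mdp.info.action_space.shape)
    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)
    sigma.set_weights(np.ones(sigma.weights_size))
    policy = StateStdGaussianPolicy(mu, sigma)

    # With value_function_features=None the critic is linear in the raw state.
    agent = StochasticAC(mdp.info, policy, alpha_theta=1e-3, alpha_v=1e-2,
                         lambda_par=.9)

    core = Core(agent, mdp)
    core.learn(n_episodes=100, n_episodes_per_fit=1)

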
class StochasticAC_AVG(StochasticAC):
    """
    Stochastic actor-critic in the average reward setting, as presented in:
    "Model-Free Reinforcement Learning with Continuous Action in Practice".
    Degris T. et al., 2012.

    An illustrative usage sketch is given at the end of the module.

    """
    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, alpha_r,
                 lambda_par=.9, value_function_features=None):
        """
        Constructor.

        Args:
            alpha_r ([float, Parameter]): learning rate for the reward trace.

        """
        super().__init__(mdp_info, policy, alpha_theta, alpha_v, lambda_par,
                         value_function_features)

        self._alpha_r = to_parameter(alpha_r)
        self._r_bar = 0

        self._add_save_attr(_alpha_r='mushroom', _r_bar='primitive')

    def _compute_td_n_traces(self, s, a, r, v_next, s_psi):
        # Compute TD error
        delta = r - self._r_bar + v_next - self._V(s_psi)

        # Update traces
        self._r_bar += self._alpha_r() * delta
        self._e_v = self._lambda() * self._e_v + s_psi
        self._e_theta = self._lambda() * self._e_theta + \
            self.policy.diff_log(s, a)

        return delta
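
# --------------------------------------------------------------------------- #
# Illustrative usage sketch (not part of the original module): the average
# reward variant only adds the alpha_r learning rate for the running reward
# estimate _r_bar; the rest of the setup mirrors the episodic sketch above.
# It additionally passes tile-coding features for the critic through
# value_function_features. The environment, the tiling range and all
# hyperparameter values are arbitrary placeholders, assuming the same
# MushroomRL components (LQR, Tiles, Features, StateStdGaussianPolicy, Core).
if __name__ == '__main__':
    from mushroom_rl.core import Core
    from mushroom_rl.environments import LQR
    from mushroom_rl.features import Features
    from mushroom_rl.features.tiles import Tiles
    from mushroom_rl.policy import StateStdGaussianPolicy

    mdp = LQR.generate(dimensions=1)

    # Tile-coding features for the critic: 5 tilings of 10 tiles over a
    # placeholder state range.
    tilings = Tiles.generate(5, [10], np.array([-10.]), np.array([10.]))
    psi = Features(tilings=tilings)

    mu = Regressor(LinearApproximator,
                   input_shape=mdp.info.observation_space.shape,
                   output_shape=mdp.info.action_space.shape)
    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)
    sigma.set_weights(np.ones(sigma.weights_size))
    policy = StateStdGaussianPolicy(mu, sigma)

    agent = StochasticAC_AVG(mdp.info, policy, alpha_theta=1e-3, alpha_v=1e-2,
                             alpha_r=1e-2, lambda_par=.9,
                             value_function_features=psi)

    core = Core(agent, mdp)
    core.learn(n_episodes=100, n_episodes_per_fit=1)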