Source code for mushroom_rl.algorithms.actor_critic.deep_actor_critic.a2c

import torch

from mushroom_rl.algorithms.actor_critic.deep_actor_critic import DeepAC
from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import TorchApproximator
from mushroom_rl.utils.value_functions import compute_advantage_montecarlo
from mushroom_rl.utils.dataset import parse_dataset
from mushroom_rl.utils.torch import to_float_tensor

from copy import deepcopy


class A2C(DeepAC):
    """
    Advantage Actor Critic algorithm (A2C).
    Synchronous version of the A3C algorithm.

    "Asynchronous Methods for Deep Reinforcement Learning".
    Mnih V. et al. 2016.

    """
    def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
                 ent_coeff, max_grad_norm=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the
                algorithm;
            actor_optimizer (dict): parameters to specify the actor
                optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            ent_coeff (float, 0): coefficient for the entropy penalty;
            max_grad_norm (float, None): maximum norm for gradient clipping.
                If None, no clipping will be performed, unless specified
                otherwise in actor_optimizer;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._entropy_coeff = ent_coeff

        self._V = Regressor(TorchApproximator, **critic_params)

        # Add gradient clipping to the actor optimizer configuration, unless
        # a clipping method is already specified there.
        if 'clipping' not in actor_optimizer and max_grad_norm is not None:
            actor_optimizer = deepcopy(actor_optimizer)
            clipping_params = dict(max_norm=max_grad_norm, norm_type=2)
            actor_optimizer['clipping'] = dict(
                method=torch.nn.utils.clip_grad_norm_,
                params=clipping_params)

        self._add_save_attr(
            _critic_fit_params='pickle',
            _entropy_coeff='primitive',
            _V='mushroom'
        )

        super().__init__(mdp_info, policy, actor_optimizer,
                         policy.parameters())
    def fit(self, dataset):
        state, action, reward, next_state, absorbing, _ = parse_dataset(dataset)

        # Monte Carlo estimate of the value targets and of the advantages
        # over the collected dataset.
        v, adv = compute_advantage_montecarlo(self._V, state, next_state,
                                              reward, absorbing,
                                              self.mdp_info.gamma)
        # Fit the critic on the Monte Carlo value targets.
        self._V.fit(state, v, **self._critic_fit_params)

        # Update the actor with the entropy-regularized policy gradient loss.
        loss = self._loss(state, action, adv)
        self._optimize_actor_parameters(loss)
    def _loss(self, state, action, adv):
        use_cuda = self.policy.use_cuda

        s = to_float_tensor(state, use_cuda)
        a = to_float_tensor(action, use_cuda)
        adv_t = to_float_tensor(adv, use_cuda)

        # Policy gradient term: -E[log pi(a|s) * A(s, a)].
        gradient_loss = -torch.mean(self.policy.log_prob_t(s, a) * adv_t)
        # Entropy bonus: minimizing -H(pi) encourages exploration.
        entropy_loss = -self.policy.entropy_t(s)

        return gradient_loss + self._entropy_coeff * entropy_loss
    def _post_load(self):
        if self._optimizer is not None:
            self._parameters = list(self.policy.parameters())
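
A minimal usage sketch (not part of the module source above): it wires this A2C class into the usual MushroomRL workflow with a Gym environment, a Gaussian torch policy and a Core loop. The Network class, the 'Pendulum-v1' environment name and all hyperparameters (learning rates, ent_coeff, max_grad_norm, number of steps) are illustrative assumptions, not values prescribed by this module.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.algorithms.actor_critic import A2C
from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.policy import GaussianTorchPolicy


class Network(nn.Module):
    # Small two-hidden-layer MLP, reused for both actor and critic (assumed).
    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        super().__init__()
        self._h1 = nn.Linear(input_shape[-1], n_features)
        self._h2 = nn.Linear(n_features, n_features)
        self._h3 = nn.Linear(n_features, output_shape[0])

    def forward(self, state, **kwargs):
        features = torch.tanh(self._h1(state.float()))
        features = torch.tanh(self._h2(features))
        return self._h3(features)


mdp = Gym('Pendulum-v1', horizon=200, gamma=0.99)

policy = GaussianTorchPolicy(Network,
                             mdp.info.observation_space.shape,
                             mdp.info.action_space.shape,
                             std_0=1., n_features=64)

# Optimizer configuration dicts, in the format expected by DeepAC and
# TorchApproximator; values are illustrative.
actor_optimizer = {'class': optim.RMSprop, 'params': {'lr': 7e-4, 'eps': 1e-5}}
critic_params = dict(network=Network,
                     optimizer={'class': optim.RMSprop,
                                'params': {'lr': 7e-4, 'eps': 1e-5}},
                     loss=F.mse_loss,
                     n_features=64,
                     input_shape=mdp.info.observation_space.shape,
                     output_shape=(1,))

agent = A2C(mdp.info, policy, actor_optimizer, critic_params,
            ent_coeff=0.01, max_grad_norm=0.5)

core = Core(agent, mdp)
core.learn(n_steps=30000, n_steps_per_fit=5)

Here n_steps_per_fit controls how many environment steps are collected before each call to fit, i.e. the length of the rollout used for the Monte Carlo advantage estimate.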