How to make a deep RL experiment¶
The usual script to run a deep RL experiment does not significantly differ from
the one for a shallow RL experiment.
This tutorial shows how to solve Atari games in MushroomRL using DQN, and how
to solve MuJoCo tasks using DDPG. This
tutorial will not explain some technicalities that are already described in the
previous tutorials, and will only briefly explain how to run deep RL experiments.
Be sure to read the previous tutorials before starting this one.
Solving Atari with DQN¶
This script runs the experiment to solve the Atari Breakout game as described in the DQN paper “Human-level control through deep reinforcement learning” (Mnih V. et al., 2015). We start creating the neural network to learn the action-value function:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from mushroom_rl.algorithms.value import DQN
from mushroom_rl.approximators.parametric import TorchApproximator
from mushroom_rl.core import Core
from mushroom_rl.environments import Atari
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.dataset import compute_metrics
from mushroom_rl.utils.parameters import LinearParameter, Parameter
class Network(nn.Module):
    """Nature-DQN architecture: three conv layers followed by a two-layer
    fully-connected Q-value head.
    """

    # Width of the fully-connected hidden layer.
    n_features = 512

    def __init__(self, input_shape, output_shape, **kwargs):
        """Build the layers and apply Xavier initialization.

        input_shape: (channels, height, width) of the stacked frames.
        output_shape: (n_actions,).
        """
        super().__init__()

        in_channels = input_shape[0]
        n_actions = output_shape[0]

        self._h1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self._h2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self._h3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self._h4 = nn.Linear(3136, self.n_features)
        self._h5 = nn.Linear(self.n_features, n_actions)

        # Xavier init: ReLU gain for every hidden layer, linear gain for
        # the output head.
        relu_gain = nn.init.calculate_gain('relu')
        for layer in (self._h1, self._h2, self._h3, self._h4):
            nn.init.xavier_uniform_(layer.weight, gain=relu_gain)
        nn.init.xavier_uniform_(self._h5.weight,
                                gain=nn.init.calculate_gain('linear'))

    def forward(self, state, action=None):
        """Return the Q-values of every action, or only of `action`.

        state: batch of stacked frames; pixel values are rescaled from
        [0, 255] to [0, 1] before the convolutions.
        action: optional (batch, 1) tensor of action indices.
        """
        x = F.relu(self._h1(state.float() / 255.))
        x = F.relu(self._h2(x))
        x = F.relu(self._h3(x))
        x = F.relu(self._h4(x.view(-1, 3136)))
        q = self._h5(x)

        if action is None:
            return q
        # Select, per sample, the Q-value of the performed action.
        return torch.squeeze(q.gather(1, action.long()))
Note that the forward function may return all the action-values of state, or
only the one for the provided action. This network will be used later in the
script.
Now, we define useful functions, set some hyperparameters, and create the mdp
and the policy pi:
def print_epoch(epoch):
    """Print a visual banner announcing the start of the given epoch."""
    header = '################################################################'
    footer = '----------------------------------------------------------------'
    print(header)
    print('Epoch: ', epoch)
    print(footer)
def get_stats(dataset):
    """Compute the evaluation metrics of a dataset, print and return them.

    The printed line shows min/max/mean reward and the number of
    completed games, as produced by compute_metrics.
    """
    score = compute_metrics(dataset)
    template = ('min_reward: %f, max_reward: %f, mean_reward: %f,'
                ' games_completed: %d')
    print(template % score)
    return score
# Per-epoch evaluation metrics collected via get_stats.
scores = list()
# Optimizer specification: class plus constructor keyword arguments,
# resolved by the approximator when the network is built.
optimizer = dict()
optimizer['class'] = optim.Adam
optimizer['params'] = dict(lr=.00025)
# Settings
width = 84  # frame width after preprocessing, in pixels
height = 84  # frame height after preprocessing, in pixels
history_length = 4  # number of consecutive frames stacked as one state
train_frequency = 4  # environment steps between fits
evaluation_frequency = 250000  # learning steps between evaluation phases
target_update_frequency = 10000  # in environment steps; converted to fits below
initial_replay_size = 50000  # random samples collected before learning starts
max_replay_size = 500000  # replay memory capacity
test_samples = 125000  # steps per evaluation phase
max_steps = 50000000  # total learning steps of the experiment
# MDP
mdp = Atari('BreakoutDeterministic-v4', width, height, ends_at_life=True,
            history_length=history_length, max_no_op_actions=30)
# Policy
# Exploration rate annealed linearly from 1 to .1 over the first 1M steps.
epsilon = LinearParameter(value=1.,
                          threshold_value=.1,
                          n=1000000)
epsilon_test = Parameter(value=.05)  # small fixed exploration for evaluation
epsilon_random = Parameter(value=1)  # fully random, used to fill the replay memory
pi = EpsGreedy(epsilon=epsilon_random)
Differently from the literature, we use Adam as the optimizer. Then, we create
the approximator:
# Approximator
# States are stacks of frames: (history_length, height, width).
input_shape = (history_length, height, width)
approximator_params = dict(
    network=Network,
    input_shape=input_shape,
    output_shape=(mdp.info.action_space.n,),  # one Q-value per action
    n_actions=mdp.info.action_space.n,
    n_features=Network.n_features,
    optimizer=optimizer,
    loss=F.smooth_l1_loss  # Huber-style loss, as in the DQN paper
)
# Note: the class itself (not an instance) is handed to the agent below,
# together with approximator_params.
approximator = TorchApproximator
Finally, the agent and the core:
# Agent
algorithm_params = dict(
    batch_size=32,
    # target_update_frequency above counts environment steps, while the
    # agent counts fits, hence the division by train_frequency.
    target_update_frequency=target_update_frequency // train_frequency,
    replay_memory=None,  # None: presumably the agent builds its own memory — see DQN docs
    initial_replay_size=initial_replay_size,
    max_replay_size=max_replay_size
)
agent = DQN(mdp.info, pi, approximator,
            approximator_params=approximator_params,
            **algorithm_params)
# Algorithm
# The core couples agent and environment and drives learning/evaluation.
core = Core(agent, mdp)
Eventually, the learning loop is performed. As done in literature, learning and evaluation steps are alternated:
# RUN
# Fill replay memory with random dataset
print_epoch(0)
# n_steps_per_fit == n_steps: collect every sample before the first fit.
core.learn(n_steps=initial_replay_size,
           n_steps_per_fit=initial_replay_size)
# Evaluate initial policy
pi.set_epsilon(epsilon_test)
mdp.set_episode_end(False)
dataset = core.evaluate(n_steps=test_samples)
scores.append(get_stats(dataset))
# Alternate learning and evaluation phases until max_steps is reached.
for n_epoch in range(1, max_steps // evaluation_frequency + 1):
    print_epoch(n_epoch)
    print('- Learning:')
    # learning step: annealed exploration, episodes end at life loss
    pi.set_epsilon(epsilon)
    mdp.set_episode_end(True)
    core.learn(n_steps=evaluation_frequency,
               n_steps_per_fit=train_frequency)
    print('- Evaluation:')
    # evaluation step: fixed small exploration, full games
    pi.set_epsilon(epsilon_test)
    mdp.set_episode_end(False)
    dataset = core.evaluate(n_steps=test_samples)
    scores.append(get_stats(dataset))
Solving MuJoCo with DDPG¶
This script runs the experiment to solve the Walker-Stand task, as implemented
in the DeepMind Control Suite. As with DQN, we start creating the neural
networks. For DDPG, we need an actor and a critic network:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from mushroom_rl.algorithms.actor_critic import DDPG
from mushroom_rl.core import Core
from mushroom_rl.environments.dm_control_env import DMControl
from mushroom_rl.policy import OrnsteinUhlenbeckPolicy
from mushroom_rl.utils.dataset import compute_J
class CriticNetwork(nn.Module):
    """Action-value network Q(s, a): a three-layer MLP over [state, action]."""

    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        """Build the layers and apply Xavier initialization.

        input_shape: (state_dim + action_dim,) — the concatenated input size.
        output_shape: (1,) — a single Q-value.
        n_features: width of both hidden layers.
        """
        super().__init__()

        in_size = input_shape[-1]
        out_size = output_shape[0]

        self._h1 = nn.Linear(in_size, n_features)
        self._h2 = nn.Linear(n_features, n_features)
        self._h3 = nn.Linear(n_features, out_size)

        # ReLU gain on the hidden layers, linear gain on the output.
        relu_gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self._h1.weight, gain=relu_gain)
        nn.init.xavier_uniform_(self._h2.weight, gain=relu_gain)
        nn.init.xavier_uniform_(self._h3.weight,
                                gain=nn.init.calculate_gain('linear'))

    def forward(self, state, action):
        """Concatenate state and action and return the squeezed Q estimate."""
        joint = torch.cat((state.float(), action.float()), dim=1)
        hidden = F.relu(self._h2(F.relu(self._h1(joint))))
        return torch.squeeze(self._h3(hidden))
class ActorNetwork(nn.Module):
    """Deterministic policy network mapping a state to an action vector."""

    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        """Build the layers and apply Xavier initialization.

        input_shape: (state_dim,).
        output_shape: (action_dim,).
        n_features: width of both hidden layers.
        """
        super(ActorNetwork, self).__init__()

        in_size = input_shape[-1]
        out_size = output_shape[0]

        self._h1 = nn.Linear(in_size, n_features)
        self._h2 = nn.Linear(n_features, n_features)
        self._h3 = nn.Linear(n_features, out_size)

        # ReLU gain on the hidden layers, linear gain on the output.
        relu_gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self._h1.weight, gain=relu_gain)
        nn.init.xavier_uniform_(self._h2.weight, gain=relu_gain)
        nn.init.xavier_uniform_(self._h3.weight,
                                gain=nn.init.calculate_gain('linear'))

    def forward(self, state):
        """Return the action for `state`, dropping a singleton second axis."""
        flat = torch.squeeze(state, 1).float()
        hidden = F.relu(self._h2(F.relu(self._h1(flat))))
        return self._h3(hidden)
We create the mdp, the policy, and set some hyperparameters:
# MDP
horizon = 500  # maximum episode length, in steps
gamma = 0.99  # discount factor used for learning
gamma_eval = 1.  # undiscounted return used by compute_J at evaluation time
mdp = DMControl('walker', 'stand', horizon, gamma)
# Policy
# Class and parameters only: DDPG instantiates the policy itself.
policy_class = OrnsteinUhlenbeckPolicy
policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)
# Settings
initial_replay_size = 500  # random samples collected before learning starts
max_replay_size = 5000  # replay memory capacity
batch_size = 200  # minibatch size per fit
n_features = 80  # hidden-layer width of both actor and critic
tau = .001  # soft-update coefficient for the target networks
Note that the policy is not instantiated in the script, since in DDPG the instantiation is done inside the algorithm constructor.
We create the actor and the critic approximators:
# Approximator
actor_input_shape = mdp.info.observation_space.shape
actor_params = dict(network=ActorNetwork,
                    n_features=n_features,
                    input_shape=actor_input_shape,
                    output_shape=mdp.info.action_space.shape)
actor_optimizer = {'class': optim.Adam,
                   'params': {'lr': 1e-5}}
# The critic consumes the concatenation of state and action, so its input
# size is the sum of the two dimensionalities.
critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
critic_params = dict(network=CriticNetwork,
                     optimizer={'class': optim.Adam,
                                'params': {'lr': 1e-3}},
                     loss=F.mse_loss,
                     n_features=n_features,
                     input_shape=critic_input_shape,
                     output_shape=(1,))  # a single Q-value
Finally, we create the agent and the core:
# Agent
# DDPG receives the policy class/params and the actor/critic specifications,
# and builds everything internally.
agent = DDPG(mdp.info, policy_class, policy_params,
             actor_params, actor_optimizer, critic_params,
             batch_size, initial_replay_size, max_replay_size,
             tau)
# Algorithm
# The core couples agent and environment and drives learning/evaluation.
core = Core(agent, mdp)
As in DQN, we alternate learning and evaluation steps:
# Fill the replay memory with random samples
# n_steps_per_fit == n_steps: collect every sample before the first fit.
core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)
# RUN
n_epochs = 40  # number of training epochs
n_steps = 1000  # learning steps per epoch
n_steps_test = 2000  # evaluation steps per epoch
# Evaluate the initial (untrained) policy.
dataset = core.evaluate(n_steps=n_steps_test, render=False)
J = compute_J(dataset, gamma_eval)  # per-episode returns, discounted by gamma_eval
print('Epoch: 0')
print('J: ', np.mean(J))
for n in range(n_epochs):
print('Epoch: ', n+1)
core.learn(n_steps=n_steps, n_steps_per_fit=1)
dataset = core.evaluate(n_steps=n_steps_test, render=False)