import numpy as np
from tqdm import trange
from .fqi import FQI
class DoubleFQI(FQI):
    """
    Double Fitted Q-Iteration algorithm.
    "Estimating the Maximum Expected Value in Continuous Reinforcement Learning
    Problems". D'Eramo C. et al.. 2017.

    Two models are kept in the approximator (``n_models = 2``). Each model is
    trained on one half of the dataset; when building the target for model
    ``i``, the greedy action is selected with model ``i`` and its value is
    read from the other model (``1 - i``).

    """
    def __init__(self, mdp_info, policy, approximator, n_iterations,
                 approximator_params=None, fit_params=None, quiet=False):
        """
        Constructor.

        Args:
            mdp_info: forwarded unchanged to ``FQI.__init__``;
            policy: forwarded unchanged to ``FQI.__init__``;
            approximator: forwarded unchanged to ``FQI.__init__``;
            n_iterations: forwarded unchanged to ``FQI.__init__``;
            approximator_params (dict, None): parameters for the approximator.
                ``None`` is treated as an empty dict. The ``'n_models'`` entry
                is always forced to 2; the caller's dict is not modified;
            fit_params: forwarded unchanged to ``FQI.__init__``;
            quiet (bool, False): forwarded unchanged to ``FQI.__init__``.

        """
        # Work on a private copy: the default is None (which cannot be
        # indexed) and the caller's dict should not be altered as a side
        # effect of building the agent.
        if approximator_params is None:
            approximator_params = dict()
        else:
            approximator_params = dict(approximator_params)
        approximator_params['n_models'] = 2

        super().__init__(mdp_info, policy, approximator, n_iterations,
                         approximator_params, fit_params, quiet)

    def fit(self, dataset):
        """
        Fit both models for ``self._n_iterations()`` iterations.

        The dataset is split into two contiguous halves of ``len(dataset) //
        2`` samples each; half ``i`` is used to fit model ``i``. If the
        dataset length is odd, the trailing sample is not used.

        Args:
            dataset: the samples to fit on. It must support ``len``, slicing,
                and the slices must provide ``parse(to='numpy')`` returning a
                six-element tuple (state, action, reward, next state,
                absorbing flag, and one ignored item).

        """
        # The split does not depend on the iteration, so it is done once.
        half = len(dataset) // 2

        state = list()
        action = list()
        reward = list()
        next_state = list()
        absorbing = list()
        for i in range(2):
            s, a, r, ss, ab, _ = dataset[i * half:(i + 1) * half].parse(
                to='numpy')
            state.append(s)
            action.append(a)
            reward.append(r)
            next_state.append(ss)
            absorbing.append(ab)

        for _ in trange(self._n_iterations(), dynamic_ncols=True,
                        disable=self._quiet, leave=False):
            if self._target is None:
                # Shallow copy: the entries of self._target are replaced
                # below, and that must not overwrite the entries of reward.
                self._target = list(reward)
            else:
                for i in range(2):
                    # Greedy action according to model i ...
                    q_i = self.approximator.predict(next_state[i], idx=i)
                    amax_q = np.expand_dims(np.argmax(q_i, axis=1), axis=1)
                    # ... evaluated with the other model.
                    max_q = self.approximator.predict(next_state[i], amax_q,
                                                      idx=1 - i)
                    if np.any(absorbing[i]):
                        # No future value after an absorbing transition.
                        max_q *= 1 - absorbing[i]
                    self._target[i] = reward[i] + self.mdp_info.gamma * max_q

            for i in range(2):
                self.approximator.fit(state[i], action[i], self._target[i],
                                      idx=i, **self._fit_params)