import numpy as np
def compute_lqr_feedback_gain(lqr, max_iterations=100):
    """
    Computes the optimal gain matrix K of the LQR problem.

    Args:
        lqr (LQR): LQR environment;
        max_iterations (int): max iterations for convergence.

    Returns:
        Feedback gain matrix K.

    """
    A, B, Q, R, gamma = _parse_lqr(lqr)

    P = np.eye(Q.shape[0])
    K = _compute_riccati_gain(P, A, B, R, gamma)

    # Iterate the discounted Riccati recursion, recomputing the greedy gain
    # after every update of P.
    it = 0
    while it < max_iterations:
        P = _compute_riccati_rhs(A, B, Q, R, gamma, K, P)
        K = _compute_riccati_gain(P, A, B, R, gamma)

        it += 1

    return K


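# Example (hypothetical sketch, not part of the library): `_parse_lqr` below
# only reads the attributes `A`, `B`, `Q`, `R` and `info.gamma`, so a minimal
# stand-in object is enough to exercise the solver without constructing a
# full `LQR` environment:
#
#     from types import SimpleNamespace
#
#     lqr = SimpleNamespace(A=np.eye(2), B=np.eye(2), Q=np.eye(2),
#                           R=.1 * np.eye(2),
#                           info=SimpleNamespace(gamma=.9))
#     K = compute_lqr_feedback_gain(lqr)

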
def compute_lqr_P(lqr, K):
    """
    Computes the P matrix for a given gain matrix K.

    Args:
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix.

    Returns:
        The P matrix of the value function.

    """
    A, B, Q, R, gamma = _parse_lqr(lqr)
    L, M = _compute_lqr_intermediate_results(K, A, B, Q, R, gamma)

    # Solve the linear system M vec(P) = vec(L) for the vectorized P.
    vec_P = np.linalg.solve(M, L.reshape(-1))

    return vec_P.reshape(Q.shape)


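# Sanity check (sketch, reusing `lqr` and `K` from the example above): the
# returned P is the fixed point of the policy-evaluation equation
# P = Q + K^T R K + gamma * (A - B K)^T P (A - B K), so applying
# `_compute_riccati_rhs` to it should leave it unchanged:
#
#     P = compute_lqr_P(lqr, K)
#     A, B, Q, R, gamma = _parse_lqr(lqr)
#     assert np.allclose(P, _compute_riccati_rhs(A, B, Q, R, gamma, K, P))

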
def compute_lqr_V(s, lqr, K):
    """
    Computes the value function at a state s, with the given gain matrix K.

    Args:
        s (np.ndarray): state;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix.

    Returns:
        The value function at s.

    """
    if s.ndim == 1:
        s = s.reshape((1, -1))

    P = compute_lqr_P(lqr, K)

    # V(s) = -s^T P s, evaluated for every state in the batch.
    return -1. * np.einsum('...k,kl,...l->...', s, P, s).reshape(-1, 1)


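# Example (sketch, reusing `lqr` and `K` from above): states can be passed as
# a single vector or as a batch of rows; the result has shape (batch_size, 1):
#
#     s = np.array([[1., 0.], [0., 1.], [1., 1.]])
#     V = compute_lqr_V(s, lqr, K)  # shape (3, 1), non-positive entries

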
def compute_lqr_V_gaussian_policy(s, lqr, K, Sigma):
    """
    Computes the value function at a state s, with the given gain matrix K and
    covariance Sigma.

    Args:
        s (np.ndarray): state;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix;
        Sigma (np.ndarray): covariance matrix.

    Returns:
        The value function at s.

    """
    b = _compute_lqr_V_gaussian_policy_additional_term(lqr, K, Sigma)

    return compute_lqr_V(s, lqr, K) - b


def compute_lqr_Q(s, a, lqr, K):
    """
    Computes the state-action value function Q at a state-action pair (s, a),
    with the given gain matrix K.

    Args:
        s (np.ndarray): state;
        a (np.ndarray): action;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix.

    Returns:
        The Q function at (s, a).

    """
    if s.ndim == 1:
        s = s.reshape((1, -1))
    if a.ndim == 1:
        a = a.reshape((1, -1))

    sa = np.hstack((s, a))
    M = _compute_lqr_Q_matrix(lqr, K)

    # Q(s, a) = -[s; a]^T M [s; a], evaluated for every pair in the batch.
    return -1. * np.einsum('...k,kl,...l->...', sa, M, sa).reshape(-1, 1)


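# Consistency check (sketch, reusing `lqr` and `K` from above): for the
# deterministic policy a = -K s, plugging the action into the quadratic form
# of Q recovers the fixed-point equation defining P, so Q(s, -K s) = V(s):
#
#     s = np.array([[1., 2.]])
#     a = -(K @ s.T).T
#     assert np.allclose(compute_lqr_Q(s, a, lqr, K),
#                        compute_lqr_V(s, lqr, K))

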
def compute_lqr_Q_gaussian_policy(s, a, lqr, K, Sigma):
    """
    Computes the state-action value function Q at a state-action pair (s, a),
    with the given gain matrix K and covariance Sigma.

    Args:
        s (np.ndarray): state;
        a (np.ndarray): action;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix;
        Sigma (np.ndarray): covariance matrix.

    Returns:
        The Q function at (s, a).

    """
    b = _compute_lqr_Q_gaussian_policy_additional_term(lqr, K, Sigma)

    return compute_lqr_Q(s, a, lqr, K) - b


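# Relation to the state value function (hypothetical Monte Carlo sketch,
# reusing `lqr`, `K` and `s` from above): averaging Q over actions drawn from
# the Gaussian policy a ~ N(-K s, Sigma) recovers
# `compute_lqr_V_gaussian_policy`:
#
#     rng = np.random.default_rng(0)
#     Sigma = .01 * np.eye(2)
#     a = rng.multivariate_normal((-K @ s.T).ravel(), Sigma, size=100000)
#     q = compute_lqr_Q_gaussian_policy(np.repeat(s, 100000, axis=0), a,
#                                       lqr, K, Sigma)
#     # q.mean() approximates compute_lqr_V_gaussian_policy(s, lqr, K, Sigma)
#     # up to Monte Carlo error.

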
def compute_lqr_V_gaussian_policy_gradient_K(s, lqr, K, Sigma):
    """
    Computes the gradient of the objective function J (equal to the value
    function V) at state s, w.r.t. the controller matrix K, for the current
    policy parameters K and Sigma. J(s, K, Sigma) = ValueFunction(s, K, Sigma).

    Args:
        s (np.ndarray): state;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix;
        Sigma (np.ndarray): covariance matrix.

    Returns:
        The gradient of J w.r.t. K.

    """
    if s.ndim == 1:
        s = s.reshape((1, -1))
    batch_size = s.shape[0]

    A, B, Q, R, gamma = _parse_lqr(lqr)
    L, M = _compute_lqr_intermediate_results(K, A, B, Q, R, gamma)

    Minv = np.linalg.inv(M)

    n_elems = K.shape[0] * K.shape[1]
    dJ = np.zeros((batch_size, n_elems))

    for i in range(n_elems):
        dLi, dMi = _compute_lqr_intermediate_results_diff(K, A, B, R, gamma, i)

        # d(vec P)/dK_i, obtained by differentiating M vec(P) = vec(L).
        vec_dPi = -Minv @ dMi @ Minv @ L.reshape(-1) + np.linalg.solve(
            M, dLi.reshape(-1)
        )
        dPi = vec_dPi.reshape(Q.shape)

        # The einsum yields a (batch_size,) vector; the original trailing
        # reshape(-1, 1) broke the assignment into the i-th column.
        dJ[:, i] = np.einsum('...k,kl,...l->...', s, dPi, s) \
            + gamma * np.trace(Sigma @ B.T @ dPi @ B) / (1.0 - gamma)

    return -dJ


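# Numerical check (hypothetical sketch, reusing `lqr`, `K`, `s` and `Sigma`
# from above): the analytic gradient can be validated entry by entry against
# a central finite difference of `compute_lqr_V_gaussian_policy`:
#
#     eps = 1e-6
#     dJ = compute_lqr_V_gaussian_policy_gradient_K(s, lqr, K, Sigma)
#     for i in range(K.size):
#         dK = np.zeros(K.size)
#         dK[i] = eps
#         dK = dK.reshape(K.shape)
#         num = (compute_lqr_V_gaussian_policy(s, lqr, K + dK, Sigma)
#                - compute_lqr_V_gaussian_policy(s, lqr, K - dK, Sigma)
#                ) / (2. * eps)
#         assert np.isclose(dJ[0, i], num.item(), atol=1e-4)

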
def compute_lqr_Q_gaussian_policy_gradient_K(s, a, lqr, K, Sigma):
    """
    Computes the gradient of the state-action value function Q at a
    state-action pair (s, a), w.r.t. the controller matrix K, for the current
    policy parameters K and Sigma.

    Args:
        s (np.ndarray): state;
        a (np.ndarray): action;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix;
        Sigma (np.ndarray): covariance matrix.

    Returns:
        The gradient of Q w.r.t. K.

    """
    if s.ndim == 1:
        s = s.reshape((1, -1))
    if a.ndim == 1:
        a = a.reshape((1, -1))

    # K only affects Q(s, a) through the value of the next state
    # s' = A s + B a, so the gradient is gamma times the gradient of V at s'.
    s_next = (lqr.A @ s.T).T + (lqr.B @ a.T).T

    return lqr.info.gamma * compute_lqr_V_gaussian_policy_gradient_K(
        s_next, lqr, K, Sigma
    )


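# Example (hypothetical sketch): the analytic gradients above support plain
# gradient ascent on the gain matrix, reusing `lqr`, `K`, `s` and `Sigma`
# from the previous examples (`alpha` is an arbitrary step size):
#
#     alpha = 1e-2
#     for _ in range(100):
#         grad = compute_lqr_V_gaussian_policy_gradient_K(s, lqr, K, Sigma)
#         K = K + alpha * grad.reshape(K.shape)  # ascent on J(s, K, Sigma)

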
def _parse_lqr(lqr):
    return lqr.A, lqr.B, lqr.Q, lqr.R, lqr.info.gamma


def _compute_riccati_rhs(A, B, Q, R, gamma, K, P):
    # Policy-evaluation update: Q + K^T R K + gamma * (A - B K)^T P (A - B K),
    # with the quadratic term expanded.
    return Q + gamma * (
        A.T @ P @ A - K.T @ B.T @ P @ A - A.T @ P @ B @ K +
        K.T @ B.T @ P @ B @ K) + K.T @ R @ K


def _compute_riccati_gain(P, A, B, R, gamma):
    # Greedy gain w.r.t. P: K = gamma * (R + gamma * B^T P B)^{-1} B^T P A.
    return gamma * np.linalg.inv(R + gamma * B.T @ P @ B) @ B.T @ P @ A


def _compute_lqr_intermediate_results(K, A, B, Q, R, gamma):
    size = Q.shape[0] ** 2

    L = Q + K.T @ R @ K

    # Vectorize P = L + gamma * (A - B K)^T P (A - B K) using the row-major
    # identity vec(C X C^T) = (C kron C) vec(X) with C = (A - B K)^T, so that
    # M vec(P) = vec(L). The kron product is expanded into its four terms.
    kb = K.T @ B.T
    M = np.eye(size, size) - gamma * (np.kron(A.T, A.T) - np.kron(A.T, kb) -
                                      np.kron(kb, A.T) + np.kron(kb, kb))

    return L, M


def _compute_lqr_intermediate_results_diff(K, A, B, R, gamma, i):
    n_elems = K.shape[0] * K.shape[1]

    # Perturbation direction: dK_i has a one in the i-th entry of K
    # (row-major order) and zeros elsewhere.
    vec_dKi = np.zeros(n_elems)
    vec_dKi[i] = 1
    dKi = vec_dKi.reshape(K.shape)

    kb = K.T @ B.T
    dkb = dKi.T @ B.T

    # Derivatives of L = Q + K^T R K and of M w.r.t. the i-th entry of K.
    dL = dKi.T @ R @ K + K.T @ R @ dKi
    dM = gamma * (np.kron(A.T, dkb) + np.kron(dkb, A.T) - np.kron(dkb, kb) -
                  np.kron(kb, dkb))

    return dL, dM


def _compute_lqr_Q_matrix(lqr, K):
    A, B, Q, R, gamma = _parse_lqr(lqr)
    P = compute_lqr_P(lqr, K)

    M = np.block([[Q + gamma * A.T @ P @ A, gamma * A.T @ P @ B],
                  [gamma * B.T @ P @ A, R + gamma * B.T @ P @ B]])

    return M


def _compute_lqr_V_gaussian_policy_additional_term(lqr, K, Sigma):
    A, B, Q, R, gamma = _parse_lqr(lqr)
    P = compute_lqr_P(lqr, K)

    # Correction for the exploration noise, accumulated over the discounted
    # infinite horizon.
    b = np.trace(Sigma @ (R + gamma * B.T @ P @ B)) / (1.0 - gamma)

    return b


def _compute_lqr_Q_gaussian_policy_additional_term(lqr, K, Sigma):
    A, B, Q, R, gamma = _parse_lqr(lqr)
    P = compute_lqr_P(lqr, K)

    # Same correction as above, but the first action is given, so the noise
    # only enters from the next step on (hence the extra factor gamma).
    b = gamma / (1. - gamma) * np.trace(Sigma @ (R + gamma * B.T @ P @ B))

    return b