Source code for mushroom_rl.solvers.lqr

import numpy as np


def compute_lqr_feedback_gain(lqr, max_iterations=100):
    """
    Computes the optimal gain matrix K.

    Args:
        lqr (LQR): LQR environment;
        max_iterations (int): max iterations for convergence.

    Returns:
        Feedback gain matrix K.

    """
    A, B, Q, R, gamma = _parse_lqr(lqr)

    P = np.eye(Q.shape[0])
    K = _compute_riccati_gain(P, A, B, R, gamma)

    it = 0
    while it < max_iterations:
        P = _compute_riccati_rhs(A, B, Q, R, gamma, K, P)
        K = _compute_riccati_gain(P, A, B, R, gamma)

        it += 1

    return K
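A minimal usage sketch (not part of the module): the solver only relies on the attributes that _parse_lqr below reads, namely A, B, Q, R and info.gamma, so any object exposing them will do. The toy double-integrator system here is purely illustrative; the later sketches continue from it.

import numpy as np
from types import SimpleNamespace

# Illustrative stand-in for an LQR environment (hypothetical, not the MushroomRL class).
lqr = SimpleNamespace(
    A=np.array([[1.0, 0.1],
                [0.0, 1.0]]),
    B=np.array([[0.0],
                [0.1]]),
    Q=np.eye(2),
    R=np.array([[0.1]]),
    info=SimpleNamespace(gamma=0.9),
)

K = compute_lqr_feedback_gain(lqr)  # shape (1, 2); the induced controller is a = -K @ s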
def compute_lqr_P(lqr, K):
    """
    Computes the P matrix for a given gain matrix K.

    Args:
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix.

    Returns:
        The P matrix of the value function.

    """
    A, B, Q, R, gamma = _parse_lqr(lqr)

    L, M = _compute_lqr_intermediate_results(K, A, B, Q, R, gamma)

    vec_P = np.linalg.solve(M, L.reshape(-1))

    return vec_P.reshape(Q.shape)
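Continuing the toy sketch above: for any gain K, the returned P is the exact solution of the policy-evaluation equation P = Q + K.T R K + gamma (A - B K).T P (A - B K), which the check below verifies.

P = compute_lqr_P(lqr, K)
C = lqr.A - lqr.B @ K                      # closed-loop dynamics under a = -K @ s
assert np.allclose(P, lqr.Q + K.T @ lqr.R @ K
                   + lqr.info.gamma * C.T @ P @ C)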
def compute_lqr_V(s, lqr, K):
    """
    Computes the value function at a state s, with the given gain matrix K.

    Args:
        s (np.ndarray): state;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix.

    Returns:
        The value function at s.

    """
    if s.ndim == 1:
        s = s.reshape((1, -1))

    P = compute_lqr_P(lqr, K)

    return -1. * np.einsum('...k,kl,...l->...', s, P, s).reshape(-1, 1)
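A quick sketch, continuing from above: states can be passed one at a time or as a batch (one row per state), and each returned entry is simply -s_i' P s_i. The states here are arbitrary illustrative values.

s = np.array([[1.0, 0.0],
              [0.0, 1.0],
              [1.0, -0.5]])
V = compute_lqr_V(s, lqr, K)               # shape (3, 1)
assert np.allclose(V[:, 0], [-si @ P @ si for si in s])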
def compute_lqr_V_gaussian_policy(s, lqr, K, Sigma):
    """
    Computes the value function at a state s, with the given gain matrix K and
    covariance Sigma.

    Args:
        s (np.ndarray): state;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix;
        Sigma (np.ndarray): covariance matrix.

    Returns:
        The value function at s.

    """
    b = _compute_lqr_V_gaussian_policy_additional_term(lqr, K, Sigma)

    return compute_lqr_V(s, lqr, K) - b
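Under a Gaussian policy the value only differs from the deterministic one by the state-independent term subtracted above, trace(Sigma (R + gamma B' P B)) / (1 - gamma). A short sketch, with an arbitrary illustrative covariance:

Sigma = 0.1 * np.eye(1)                    # the toy action space is 1-dimensional
gap = compute_lqr_V(s, lqr, K) - compute_lqr_V_gaussian_policy(s, lqr, K, Sigma)
assert np.allclose(gap, gap[0])            # same offset for every state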
def compute_lqr_Q(s, a, lqr, K):
    """
    Computes the state-action value function Q at a state-action pair (s, a),
    with the given gain matrix K.

    Args:
        s (np.ndarray): state;
        a (np.ndarray): action;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix.

    Returns:
        The Q function at (s, a).

    """
    if s.ndim == 1:
        s = s.reshape((1, -1))
    if a.ndim == 1:
        a = a.reshape((1, -1))

    sa = np.hstack((s, a))
    M = _compute_lqr_Q_matrix(lqr, K)

    return -1. * np.einsum('...k,kl,...l->...', sa, M, sa).reshape(-1, 1)
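Continuing the sketch: Q and V are consistent in the sense that evaluating Q at the greedy action a = -K s recovers V(s).

a_greedy = -s @ K.T                        # greedy actions, one row per state
assert np.allclose(compute_lqr_Q(s, a_greedy, lqr, K),
                   compute_lqr_V(s, lqr, K))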
def compute_lqr_Q_gaussian_policy(s, a, lqr, K, Sigma):
    """
    Computes the state-action value function Q at a state-action pair (s, a),
    with the given gain matrix K and covariance Sigma.

    Args:
        s (np.ndarray): state;
        a (np.ndarray): action;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix;
        Sigma (np.ndarray): covariance matrix.

    Returns:
        The Q function at (s, a).

    """
    b = _compute_lqr_Q_gaussian_policy_additional_term(lqr, K, Sigma)

    return compute_lqr_Q(s, a, lqr, K) - b
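As for the value function, the Gaussian-policy Q differs from the deterministic Q only by the constant gamma / (1 - gamma) * trace(Sigma (R + gamma B' P B)). A short check, with illustrative actions:

a = np.array([[0.2], [-0.1], [0.0]])
gap_q = compute_lqr_Q(s, a, lqr, K) - compute_lqr_Q_gaussian_policy(s, a, lqr, K, Sigma)
assert np.allclose(gap_q, gap_q[0])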
def compute_lqr_V_gaussian_policy_gradient_K(s, lqr, K, Sigma):
    """
    Computes the gradient of the objective function J (equal to the value
    function V) at state s, w.r.t. the controller matrix K, with the current
    policy parameters K and Sigma. J(s, K, Sigma) = ValueFunction(s, K, Sigma).

    Args:
        s (np.ndarray): state;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix;
        Sigma (np.ndarray): covariance matrix.

    Returns:
        The gradient of J w.r.t. K.

    """
    if s.ndim == 1:
        s = s.reshape((1, -1))
    batch_size = s.shape[0]

    A, B, Q, R, gamma = _parse_lqr(lqr)

    L, M = _compute_lqr_intermediate_results(K, A, B, Q, R, gamma)
    Minv = np.linalg.inv(M)

    n_elems = K.shape[0] * K.shape[1]
    dJ = np.zeros((batch_size, n_elems))
    for i in range(n_elems):
        dLi, dMi = _compute_lqr_intermediate_results_diff(K, A, B, R, gamma, i)

        vec_dPi = -Minv @ dMi @ Minv @ L.reshape(-1) \
            + np.linalg.solve(M, dLi.reshape(-1))

        dPi = vec_dPi.reshape(Q.shape)

        dJ[:, i] = np.einsum('...k,kl,...l->...', s, dPi, s) \
            + gamma * np.trace(Sigma @ B.T @ dPi @ B) / (1.0 - gamma)

    return -dJ
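The returned array has one row per state and one column per entry of K, flattened row-major (matching vec_dKi in the helpers below). A finite-difference sketch against compute_lqr_V_gaussian_policy, continuing the toy setup, is a convenient sanity check.

s0 = s[:1]                                  # a single state, kept as a (1, 2) batch
grad = compute_lqr_V_gaussian_policy_gradient_K(s0, lqr, K, Sigma)   # shape (1, 2)

eps = 1e-6
num = np.zeros_like(grad)
for i in range(K.size):
    Kp, Km = K.copy(), K.copy()
    Kp.flat[i] += eps
    Km.flat[i] -= eps
    num[:, i] = (compute_lqr_V_gaussian_policy(s0, lqr, Kp, Sigma)
                 - compute_lqr_V_gaussian_policy(s0, lqr, Km, Sigma)).ravel() / (2 * eps)

assert np.allclose(grad, num, atol=1e-4)    # loose tolerance for the finite-difference error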
def compute_lqr_Q_gaussian_policy_gradient_K(s, a, lqr, K, Sigma):
    """
    Computes the gradient of the state-action value function Q at a
    state-action pair (s, a), w.r.t. the controller matrix K, with the current
    policy parameters K and Sigma.

    Args:
        s (np.ndarray): state;
        a (np.ndarray): action;
        lqr (LQR): LQR environment;
        K (np.ndarray): controller matrix;
        Sigma (np.ndarray): covariance matrix.

    Returns:
        The gradient of Q w.r.t. K.

    """
    if s.ndim == 1:
        s = s.reshape((1, -1))
    if a.ndim == 1:
        a = a.reshape((1, -1))

    s_next = (lqr.A @ s.T).T + (lqr.B @ a.T).T

    return lqr.info.gamma * compute_lqr_V_gaussian_policy_gradient_K(
        s_next, lqr, K, Sigma)
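Since Q(s, a) = r(s, a) + gamma * V(A s + B a) and the immediate reward does not depend on K, the gradient of Q w.r.t. K is gamma times the V-gradient at the next state, which is what the function above computes. Continuing the sketch:

dQ_dK = compute_lqr_Q_gaussian_policy_gradient_K(s0, a[:1], lqr, K, Sigma)   # shape (1, K.size)
# A finite-difference check against compute_lqr_Q_gaussian_policy, analogous to the
# one sketched above for the value function, applies here as well.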
def _parse_lqr(lqr):
    return lqr.A, lqr.B, lqr.Q, lqr.R, lqr.info.gamma


def _compute_riccati_rhs(A, B, Q, R, gamma, K, P):
    return Q + gamma * (A.T @ P @ A
                        - K.T @ B.T @ P @ A
                        - A.T @ P @ B @ K
                        + K.T @ B.T @ P @ B @ K) \
        + K.T @ R @ K


def _compute_riccati_gain(P, A, B, R, gamma):
    return gamma * np.linalg.inv(R + gamma * (B.T @ P @ B)) @ B.T @ P @ A


def _compute_lqr_intermediate_results(K, A, B, Q, R, gamma):
    size = Q.shape[0] ** 2

    L = Q + K.T @ R @ K
    kb = K.T @ B.T
    M = np.eye(size, size) - gamma * (np.kron(A.T, A.T)
                                      - np.kron(A.T, kb)
                                      - np.kron(kb, A.T)
                                      + np.kron(kb, kb))

    return L, M


def _compute_lqr_intermediate_results_diff(K, A, B, R, gamma, i):
    n_elems = K.shape[0] * K.shape[1]
    vec_dKi = np.zeros(n_elems)
    vec_dKi[i] = 1
    dKi = vec_dKi.reshape(K.shape)
    kb = K.T @ B.T
    dkb = dKi.T @ B.T

    dL = dKi.T @ R @ K + K.T @ R @ dKi
    dM = gamma * (np.kron(A.T, dkb)
                  + np.kron(dkb, A.T)
                  - np.kron(dkb, kb)
                  - np.kron(kb, dkb))

    return dL, dM


def _compute_lqr_Q_matrix(lqr, K):
    A, B, Q, R, gamma = _parse_lqr(lqr)
    P = compute_lqr_P(lqr, K)

    M = np.block([[Q + gamma * A.T @ P @ A, gamma * A.T @ P @ B],
                  [gamma * B.T @ P @ A, R + gamma * B.T @ P @ B]])

    return M


def _compute_lqr_V_gaussian_policy_additional_term(lqr, K, Sigma):
    A, B, Q, R, gamma = _parse_lqr(lqr)
    P = compute_lqr_P(lqr, K)

    b = np.trace(Sigma @ (R + gamma * B.T @ P @ B)) / (1.0 - gamma)

    return b


def _compute_lqr_Q_gaussian_policy_additional_term(lqr, K, Sigma):
    A, B, Q, R, gamma = _parse_lqr(lqr)
    P = compute_lqr_P(lqr, K)

    b = gamma / (1 - gamma) * np.trace(Sigma @ (R + gamma * B.T @ P @ B))

    return b
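Two properties of these helpers are handy as a sanity check, continuing the toy sketch: since np.kron is bilinear, the matrix M built in _compute_lqr_intermediate_results equals I - gamma * kron((A - B K).T, (A - B K).T), i.e. compute_lqr_P solves exactly the policy-evaluation equation verified earlier; and at the (approximately) converged gain returned by compute_lqr_feedback_gain, K is a fixed point of _compute_riccati_gain.

A, B, Q, R, gamma = _parse_lqr(lqr)
_, M = _compute_lqr_intermediate_results(K, A, B, Q, R, gamma)
C = A - B @ K
assert np.allclose(M, np.eye(Q.size) - gamma * np.kron(C.T, C.T))
# Loose tolerance: K comes from a finite number of fixed-point iterations.
assert np.allclose(K, _compute_riccati_gain(P, A, B, R, gamma), atol=1e-4)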