From cb0c7e957cb632f8e4b1071260598751bb6a7b97 Mon Sep 17 00:00:00 2001 From: kleeman Date: Fri, 2 Feb 2024 11:36:44 -0800 Subject: [PATCH 1/3] Use PermutationMatrix instead of indices --- include/albatross/src/cereal/eigen.hpp | 23 ++++++++ include/albatross/src/cereal/gp.hpp | 16 +++--- include/albatross/src/core/declarations.hpp | 7 +++ include/albatross/src/linalg/qr_utils.hpp | 22 ++++---- include/albatross/src/linalg/spqr_utils.hpp | 11 ++-- include/albatross/src/models/sparse_gp.hpp | 59 ++++++++------------- tests/test_sparse_gp.cc | 8 +-- 7 files changed, 82 insertions(+), 64 deletions(-) diff --git a/include/albatross/src/cereal/eigen.hpp b/include/albatross/src/cereal/eigen.hpp index 346e190a..188b24ae 100644 --- a/include/albatross/src/cereal/eigen.hpp +++ b/include/albatross/src/cereal/eigen.hpp @@ -91,6 +91,29 @@ inline void load(Archive &archive, v.indices() = indices; } +template +inline void +save(Archive &archive, + const Eigen::PermutationMatrix &v, + const std::uint32_t) { + archive(cereal::make_nvp("indices", v.indices())); +} + +template +inline void +load(Archive &archive, + Eigen::PermutationMatrix &v, + const std::uint32_t) { + typename Eigen::PermutationMatrix::IndicesType indices; + archive(cereal::make_nvp("indices", indices)); + v.indices() = indices; +} + template inline void serialize(Archive &archive, Eigen::DiagonalMatrix<_Scalar, SizeAtCompileTime> &matrix, diff --git a/include/albatross/src/cereal/gp.hpp b/include/albatross/src/cereal/gp.hpp index b3ec473c..b3926397 100644 --- a/include/albatross/src/cereal/gp.hpp +++ b/include/albatross/src/cereal/gp.hpp @@ -41,8 +41,8 @@ inline void serialize(Archive &archive, Fit> &fit, archive(cereal::make_nvp("information", fit.information)); archive(cereal::make_nvp("train_covariance", fit.train_covariance)); archive(cereal::make_nvp("train_features", fit.train_features)); - archive(cereal::make_nvp("sigma_R", fit.sigma_R)); - archive(cereal::make_nvp("permutation_indices", fit.permutation_indices)); + archive(cereal::make_nvp("R", fit.R)); + archive(cereal::make_nvp("P", fit.P)); if (version > 1) { archive(cereal::make_nvp("numerical_rank", fit.numerical_rank)); } else { @@ -53,9 +53,9 @@ inline void serialize(Archive &archive, Fit> &fit, template -void save(Archive &archive, - const GaussianProcessBase &gp, - const std::uint32_t) { +inline void save(Archive &archive, + const GaussianProcessBase &gp, + const std::uint32_t) { archive(cereal::make_nvp("name", gp.get_name())); archive(cereal::make_nvp("params", gp.get_params())); archive(cereal::make_nvp("insights", gp.insights)); @@ -63,9 +63,9 @@ void save(Archive &archive, template -void load(Archive &archive, - GaussianProcessBase &gp, - const std::uint32_t version) { +inline void load(Archive &archive, + GaussianProcessBase &gp, + const std::uint32_t version) { if (version > 0) { std::string model_name; archive(cereal::make_nvp("name", model_name)); diff --git a/include/albatross/src/core/declarations.hpp b/include/albatross/src/core/declarations.hpp index 7cc8bf14..058ac3dd 100644 --- a/include/albatross/src/core/declarations.hpp +++ b/include/albatross/src/core/declarations.hpp @@ -21,6 +21,13 @@ template class variant; using mapbox::util::variant; +/* + * Permutations + */ +namespace Eigen { +using PermutationMatrixX = PermutationMatrix; +} + namespace albatross { /* diff --git a/include/albatross/src/linalg/qr_utils.hpp b/include/albatross/src/linalg/qr_utils.hpp index 6bbb9dac..0770042e 100644 --- a/include/albatross/src/linalg/qr_utils.hpp +++ 
b/include/albatross/src/linalg/qr_utils.hpp @@ -25,21 +25,23 @@ get_R(const Eigen::ColPivHouseholderQR &qr) { .template triangularView(); } +inline Eigen::PermutationMatrixX +get_P(const Eigen::ColPivHouseholderQR &qr) { + return Eigen::PermutationMatrixX( + qr.colsPermutation().indices().template cast()); +} + /* * Computes R^-T P^T rhs given R and P from a QR decomposition. */ -template +template inline Eigen::MatrixXd sqrt_solve(const Eigen::MatrixXd &R, - const PermutationIndicesType &permutation_indices, + const Eigen::PermutationMatrix &P, const MatrixType &rhs) { - - Eigen::MatrixXd sqrt(rhs.rows(), rhs.cols()); - for (Eigen::Index i = 0; i < permutation_indices.size(); ++i) { - sqrt.row(i) = rhs.row(permutation_indices.coeff(i)); - } - sqrt = R.template triangularView().transpose().solve(sqrt); - return sqrt; + return R.template triangularView().transpose().solve( + P.transpose() * rhs); } template @@ -47,7 +49,7 @@ inline Eigen::MatrixXd sqrt_solve(const Eigen::ColPivHouseholderQR &qr, const MatrixType &rhs) { const Eigen::MatrixXd R = get_R(qr); - return sqrt_solve(R, qr.colsPermutation().indices(), rhs); + return sqrt_solve(R, qr.colsPermutation(), rhs); } } // namespace albatross diff --git a/include/albatross/src/linalg/spqr_utils.hpp b/include/albatross/src/linalg/spqr_utils.hpp index b98e07bd..73aafdb4 100644 --- a/include/albatross/src/linalg/spqr_utils.hpp +++ b/include/albatross/src/linalg/spqr_utils.hpp @@ -19,19 +19,20 @@ using SparseMatrix = Eigen::SparseMatrix; using SPQR = Eigen::SPQR; -using SparsePermutationMatrix = - Eigen::PermutationMatrix; - inline Eigen::MatrixXd get_R(const SPQR &qr) { return qr.matrixR() .topLeftCorner(qr.cols(), qr.cols()) .template triangularView(); } +inline Eigen::PermutationMatrixX get_P(const SPQR &qr) { + return Eigen::PermutationMatrixX( + qr.colsPermutation().indices().template cast()); +} + template inline Eigen::MatrixXd sqrt_solve(const SPQR &qr, const MatrixType &rhs) { - return sqrt_solve(get_R(qr), qr.colsPermutation().indices(), rhs); + return sqrt_solve(get_R(qr), get_P(qr), rhs); } // Matrices with any dimension smaller than this will use a special diff --git a/include/albatross/src/models/sparse_gp.hpp b/include/albatross/src/models/sparse_gp.hpp index d83c969d..36234e4e 100644 --- a/include/albatross/src/models/sparse_gp.hpp +++ b/include/albatross/src/models/sparse_gp.hpp @@ -97,8 +97,8 @@ template struct Fit> { std::vector train_features; Eigen::SerializableLDLT train_covariance; - Eigen::MatrixXd sigma_R; - PermutationIndices permutation_indices; + Eigen::MatrixXd R; + Eigen::PermutationMatrixX P; Eigen::VectorXd information; Eigen::Index numerical_rank; @@ -106,12 +106,10 @@ template struct Fit> { Fit(const std::vector &features_, const Eigen::SerializableLDLT &train_covariance_, - const Eigen::MatrixXd &sigma_R_, - PermutationIndices &&permutation_indices_, + const Eigen::MatrixXd &R_, const Eigen::PermutationMatrixX &P_, const Eigen::VectorXd &information_, Eigen::Index numerical_rank_) - : train_features(features_), train_covariance(train_covariance_), - sigma_R(sigma_R_), permutation_indices(std::move(permutation_indices_)), - information(information_), numerical_rank(numerical_rank_) {} + : train_features(features_), train_covariance(train_covariance_), R(R_), + P(P_), information(information_), numerical_rank(numerical_rank_) {} void shift_mean(const Eigen::VectorXd &mean_shift) { ALBATROSS_ASSERT(mean_shift.size() == information.size()); @@ -120,9 +118,8 @@ template struct Fit> { bool operator==(const Fit> 
&other) const { return (train_features == other.train_features && - train_covariance == other.train_covariance && - sigma_R == other.sigma_R && - permutation_indices == other.permutation_indices && + train_covariance == other.train_covariance && R == other.R && + P.indices() == other.P.indices() && information == other.information && numerical_rank == other.numerical_rank); } @@ -325,9 +322,9 @@ class SparseGaussianProcessRegression compute_internal_components(old_fit.train_features, features, targets, &A_ldlt, &K_uu_ldlt, &K_fu, &y); - const Eigen::Index n_old = old_fit.sigma_R.rows(); + const Eigen::Index n_old = old_fit.R.rows(); const Eigen::Index n_new = A_ldlt.rows(); - const Eigen::Index k = old_fit.sigma_R.cols(); + const Eigen::Index k = old_fit.R.cols(); Eigen::MatrixXd B = Eigen::MatrixXd::Zero(n_old + n_new, k); ALBATROSS_ASSERT(n_old == k); @@ -335,10 +332,7 @@ class SparseGaussianProcessRegression // Form: // B = |R_old P_old^T| = |Q_1| R P^T // |A^{-1/2} K_fu| |Q_2| - for (Eigen::Index i = 0; i < old_fit.permutation_indices.size(); ++i) { - const Eigen::Index &pi = old_fit.permutation_indices.coeff(i); - B.col(pi).topRows(i + 1) = old_fit.sigma_R.col(i).topRows(i + 1); - } + B.topRows(old_fit.P.rows()) = old_fit.R * old_fit.P.transpose(); B.bottomRows(n_new) = A_ldlt.sqrt_solve(K_fu); const auto B_qr = QRImplementation::compute(B, Base::threads_.get()); @@ -347,13 +341,9 @@ class SparseGaussianProcessRegression // |A^{-1/2} y | ALBATROSS_ASSERT(old_fit.information.size() == n_old); Eigen::VectorXd y_augmented(n_old + n_new); - for (Eigen::Index i = 0; i < old_fit.permutation_indices.size(); ++i) { - y_augmented[i] = - old_fit.information[old_fit.permutation_indices.coeff(i)]; - } y_augmented.topRows(n_old) = - old_fit.sigma_R.template triangularView() * - y_augmented.topRows(n_old); + old_fit.R.template triangularView() * + (old_fit.P.transpose() * old_fit.information); y_augmented.bottomRows(n_new) = A_ldlt.sqrt_solve(y, Base::threads_.get()); const Eigen::VectorXd v = B_qr->solve(y_augmented); @@ -365,10 +355,9 @@ class SparseGaussianProcessRegression Eigen::VectorXd::Constant(B_qr->cols(), details::cSparseRNugget); } using FitType = Fit>; - return FitType( - old_fit.train_features, old_fit.train_covariance, R, - B_qr->colsPermutation().indices().template cast(), v, - B_qr->rank()); + + return FitType(old_fit.train_features, old_fit.train_covariance, R, + get_P(*B_qr), v, B_qr->rank()); } // Here we create the QR decomposition of: @@ -415,10 +404,7 @@ class SparseGaussianProcessRegression using InducingPointFeatureType = typename std::decay::type; using FitType = Fit>; - return FitType( - u, K_uu_ldlt, get_R(*B_qr), - B_qr->colsPermutation().indices().template cast(), v, - B_qr->rank()); + return FitType(u, K_uu_ldlt, get_R(*B_qr), get_P(*B_qr), v, B_qr->rank()); } template @@ -471,9 +457,8 @@ class SparseGaussianProcessRegression const Eigen::MatrixXd sigma_inv_sqrt = C_ldlt.sqrt_solve(K_zz); const auto B_qr = QRImplementation::compute(sigma_inv_sqrt, nullptr); - new_fit.permutation_indices = - B_qr->colsPermutation().indices().template cast(); - new_fit.sigma_R = get_R(*B_qr); + new_fit.P = get_P(*B_qr); + new_fit.R = get_R(*B_qr); new_fit.numerical_rank = B_qr->rank(); return output; @@ -519,8 +504,8 @@ class SparseGaussianProcessRegression Q_sqrt.cwiseProduct(Q_sqrt).array().colwise().sum(); marginal_variance -= Q_diag; - const Eigen::MatrixXd S_sqrt = sqrt_solve( - sparse_gp_fit.sigma_R, sparse_gp_fit.permutation_indices, cross_cov); + const Eigen::MatrixXd S_sqrt = + 
sqrt_solve(sparse_gp_fit.R, sparse_gp_fit.P, cross_cov); const Eigen::VectorXd S_diag = S_sqrt.cwiseProduct(S_sqrt).array().colwise().sum(); marginal_variance += S_diag; @@ -537,8 +522,8 @@ class SparseGaussianProcessRegression this->covariance_function_(sparse_gp_fit.train_features, features); const Eigen::MatrixXd prior_cov = this->covariance_function_(features); - const Eigen::MatrixXd S_sqrt = sqrt_solve( - sparse_gp_fit.sigma_R, sparse_gp_fit.permutation_indices, cross_cov); + const Eigen::MatrixXd S_sqrt = + sqrt_solve(sparse_gp_fit.R, sparse_gp_fit.P, cross_cov); const Eigen::MatrixXd Q_sqrt = sparse_gp_fit.train_covariance.sqrt_solve(cross_cov); diff --git a/tests/test_sparse_gp.cc b/tests/test_sparse_gp.cc index 693fc013..1dbdc539 100644 --- a/tests/test_sparse_gp.cc +++ b/tests/test_sparse_gp.cc @@ -322,10 +322,10 @@ TYPED_TEST(SparseGaussianProcessTest, test_update) { (updated_in_place_pred.covariance - full_pred.covariance).norm(); auto compute_sigma = [](const auto &fit_model) -> Eigen::MatrixXd { - const Eigen::Index n = fit_model.get_fit().sigma_R.cols(); - Eigen::MatrixXd sigma = sqrt_solve(fit_model.get_fit().sigma_R, - fit_model.get_fit().permutation_indices, - Eigen::MatrixXd::Identity(n, n)); + const Eigen::Index n = fit_model.get_fit().R.cols(); + Eigen::MatrixXd sigma = + sqrt_solve(fit_model.get_fit().R, fit_model.get_fit().P, + Eigen::MatrixXd::Identity(n, n)); return sigma.transpose() * sigma; }; From 61a4eb96360fd1182be8701720f3bc25c78380a1 Mon Sep 17 00:00:00 2001 From: kleeman Date: Fri, 2 Feb 2024 10:38:52 -0800 Subject: [PATCH 2/3] Update documentation --- doc/src/sparse-gp-details.rst | 385 ++++++++++++++++++++++++++-------- 1 file changed, 295 insertions(+), 90 deletions(-) diff --git a/doc/src/sparse-gp-details.rst b/doc/src/sparse-gp-details.rst index ce5bab0a..c2b003c4 100644 --- a/doc/src/sparse-gp-details.rst +++ b/doc/src/sparse-gp-details.rst @@ -171,159 +171,364 @@ A prediction can then be made by computing, :math:`V_a = K_{uu}^{-1/2} K_{u*}` a [f^*|f=y] \sim \mathcal{N}(K_{*u} v, K_{**} - V_a^T V_a + V_b^T V_b) ---------------------- -Adding a Group ---------------------- +------------------------- +Adding a Group (Updating) +------------------------- -Our implementation of the Sparse Gaussian Process can be efficiently updated with new groups in an online fasion. In otherwords this allows you to do: +Sparse Gaussian Processes can be efficiently updated with new groups in +an online fashion. In other words this allows you to do: -.. code-block:: c +:: - auto fit_model = model.fit(dataset_a); - fit_model.update_in_place(dataset_b); + auto fit_model = model.fit(dataset_a); + fit_model.update_in_place(dataset_b); Which will be equivalent to: -.. code-block:: c +:: + + auto fit_model = model.fit(concatenate(dataset_a, dataset_b)); + +There are some papers which describe methods for performing online +updates to sparse Gaussian processes. The paper - fit_model == model.fit(concatenate_datasets(dataset_a, dataset_b)); +:: -There are some papers which describe methods for performing online updates to sparse gaussian processes. The paper + Streaming Sparse Gaussian Process Approximations + Thang D Bui, Cuong Nguyen, and Richard E Turner. + https://arxiv.org/abs/1705.07131 - [4] Streaming Sparse Gaussian Process Approximations - Thang D Bui, Cuong Nguyen, and Richard E Turner. 
- https://arxiv.org/abs/1705.07131 +describes a way of both adding online observations and updating the +inducing points for the the Variational Free Energy (VFE) approach +(which is closely related to FITC). And, -describes a way of both adding online observations and updating the inducing points for the the Variational Free Energy (VFE) approach (which is closely related to FITC). +:: - [4] Online sparse Gaussian process regression using FITC and PITCapproximations - Hildo Bijl, Jan-Willem van Wingerden, Thomas B. Sch ̈on, and Michel Ver-haegen. - https://hildobijl.com/Downloads/OnlineSparseGP.pdf + Online sparse Gaussian process regression using FITC and PITC approximations + Hildo Bijl, Jan-Willem van Wingerden, Thomas B. Sch ̈on, and Michel Ver-haegen. + https://hildobijl.com/Downloads/OnlineSparseGP.pdf -describes an approach to performing online updates to FITC and PITC but focuses on rank one updates in which the entire covariance is stored. Here we describe how to update FITC and PITC with new batches of data. These batches may contain a single observation (FITC) or a new group (PITC) and are used to update the QR decomposition (rather than the full dense covariances) used in the direct fit. +describes an approach to performing online updates to FITC and PITC but +focuses on rank one updates in which the entire covariance is stored. +Here we describe how to update FITC and PITC with new batches of data. +These batches may contain a single observation (FITC) or a new group +(PITC) and are used to update the QR decomposition (rather than the full +dense covariances) used in the direct fit. -Consider the situation where we are first given a set of observation :math:`y_a`, fit the model, then want to update the model with new observations :math:`y_b`. The existing model will consist of, :math:`v_a`, :math:`R_a`, :math:`P_a`, and :math:`L_{uu}` such that: +Consider the situation where we are first given a set of observations +:math:`y_a`, fit the model, then want to update the model with new +observations :math:`y_b`. The existing model will consist of, +:math:`v_a`, :math:`R_a`, :math:`P_a`, and :math:`L_{uu}` such that, .. math:: - - \Sigma_a^{-1} &= \left(K_{uu} + K_{ua} \Lambda_a^{-1} K_{au}\right) \\ - &= P_a R_a^T R_a P_a^T \\ - v_a &= \Sigma_a K_{ua} \Lambda_{a}^{-1} y_a \\ - &= P_a R_a^{-1} Q_a1^T \Lambda_a^{-1/2} y_a \\ - K_{uu} &= L_{uu} L_{uu}^T - + \begin{aligned} + \Sigma_a^{-1} &= \left(K_{uu} + K_{ua} \Lambda_a^{-1} K_{au}\right) \\ + &= P_a R_a^T R_a P_a^T \\ + v_a &= \Sigma_a K_{ua} \Lambda_{a}^{-1} y_a \\ + &= P_a R_a^{-1} Q_a1^T \Lambda_a^{-1/2} y_a \\ + K_{uu} &= L_{uu} L_{uu}^T + \end{aligned} -We'll be given a new group in the form of raw observations: +We’ll be given new observations and can use the same low rank prior, .. math:: - - y_b \sim \mathcal{N}\left(y_b, \Lambda_b + K_{bu} K_{uu}^{-1} K_{ub}\right) + \begin{aligned} + y_b \sim \mathcal{N}\left(m_b, \Lambda_b + K_{bu} K_{uu}^{-1} K_{ub}\right). + \end{aligned} -And we wish to produce a new :math:`\hat{v}`, :math:`\hat{R}` and :math:`\hat{P}` which produce the same predictions as we would have gotten if we'd fit to :math:`\hat{y} = \begin{bmatrix} y_a \\ y_b \end{bmatrix}` directly. +And we wish to produce a new :math:`\hat{v}`, :math:`\hat{R}` and +:math:`\hat{P}` which produce the same predictions as we would have +gotten if we’d fit to +:math:`\hat{y} = \begin{bmatrix} y_a \\ y_b \end{bmatrix}` directly. -To do so we start by explicitly writing out the components we would get with all groups available. 
We'll use a hat, :math:`\hat{a}` to indicate quantities which correspond to a full fit. Starting with :math:`\hat{\Sigma}`, +To do so we start by explicitly writing out the components we would get +with all groups available. We’ll use a hat, :math:`\hat{a}`, to indicate +quantities which correspond to a full fit. Starting with +:math:`\hat{\Sigma}`, .. math:: - - \hat{\Sigma} &= \left(K_{uu} + K_{uf} \hat{\Lambda}^{-1} K_{fu}\right)^{-1} \\ - &=\left(K_{uu} + - \begin{bmatrix} K_{ua} & K_{ub} \end{bmatrix} - \begin{bmatrix} \Lambda_a & 0 \\ 0 & \Lambda_b \end{bmatrix}^{-1} \begin{bmatrix} K_{au} \\ K_{bu} \end{bmatrix} - \right)^{-1} \\ - &= \left(\Sigma_a^{-1} + K_{ub} \Lambda_b^{-1} K_{bu} - \right)^{-1} - + \begin{aligned} + \hat{\Sigma} &= \left(K_{uu} + K_{uf} \hat{\Lambda}^{-1} K_{fu}\right)^{-1} \\ + &=\left(K_{uu} + + \begin{bmatrix} K_{ua} & K_{ub} \end{bmatrix} + \begin{bmatrix} \Lambda_a & 0 \\ 0 & \Lambda_b \end{bmatrix}^{-1} \begin{bmatrix} K_{au} \\ K_{bu} \end{bmatrix} + \right)^{-1} \\ + &= \left(K_{uu} + K_{ua} \Lambda_a^{-1} K_{au} + K_{ub} \Lambda_b^{-1} K_{bu} + \right)^{-1} \\ + &= \left(\Sigma_a^{-1} + K_{ub} \Lambda_b^{-1} K_{bu} + \right)^{-1} + \end{aligned} + +We can then prepare for a similar QR approach to the one we used when +fitting and find a :math:`\hat{B}` such that +:math:`\hat{\Sigma} = \left(\hat{B}^T \hat{B}\right)^{-1}`. We can see +that by setting, + +.. math:: + + \begin{aligned} + \hat{B} &= \begin{bmatrix} R_a P_a^T \\ \Lambda_{b}^{-1/2} K_{bu} \end{bmatrix} + \end{aligned} -We can then find a :math:`\hat{B}` such that :math:`\hat{\Sigma} = \left(\hat{B}^T \hat{B}\right)^{-1}` using the same approach as Equation~\ref{eq:B_qr}. In particular we can see that by setting, +We can recover :math:`\hat{\Sigma}` by, .. math:: - - \hat{B} &= \begin{bmatrix} R_a P_a^T \\ \Lambda_{b}^{-1/2} K_{bu} \end{bmatrix} - + \begin{aligned} + \hat{\Sigma} &= \left(\hat{B}^T \hat{B}\right)^{-1} \\ + &= \left(P_a R_a^T R_a P_a^T + K_{ub} \Lambda_b^{-1} K_{bu} + \right)^{-1}\\ + &= \left(\Sigma_a^{-1} + K_{ub} \Lambda_b^{-1} K_{bu} + \right)^{-1} + \end{aligned} -We can then represent :math:`\hat{\Sigma}` by, +So by solving for the QR decomposition, .. math:: - - \hat{\Sigma} &= \left(\hat{B}^T \hat{B}\right)^{-1} \\ - &= \left(P_a R_a^T R_a P_a^T + K_{ub} \Lambda_b^{-1} K_{bu} - \right)^{-1}\\ - &= \left(\Sigma_a^{-1} + K_{ub} \Lambda_b^{-1} K_{bu} - \right)^{-1} - + \begin{aligned} + \hat{B} &= \begin{bmatrix} R_a P_a^T \\ \Lambda_{b}^{-1/2} K_{bu} \end{bmatrix} \\ + &= \hat{Q} \hat{R} \hat{P}^T + \end{aligned} + +We get the new updated values for :math:`\hat{P}` and :math:`\hat{R}`. -We then need to update the existing QR decomposition to get :math:`\hat{P}` and :math:`\hat{R}`, +Now we need to figure out how to update the information vector +:math:`\hat{v}`. If we had fit the model all at once the information +vector would be, .. math:: - - \hat{B} &= \begin{bmatrix} R_a P_a^T \\ \Lambda_{b}^{-1/2} K_{bu} \end{bmatrix} \\ - &= \hat{Q} \hat{R} \hat{P}^T - + \begin{aligned} + \hat{v} &= \left(K_{uu} + K_{uf} \hat{\Lambda}^{-1} K_{fu}\right)^{-1} K_{uf} \hat{\Lambda}^{-1} \hat{y} \\ + &= \hat{\Sigma} K_{uf}\hat{\Lambda}^{-1} \hat{y} + \end{aligned} -Now we need to figure out how to update the information vector :math:`\hat{v}`. If we had fit the model all at once the information vector would take the form, +which we can divide into new and old observations, .. 
math:: - - \hat{v} &= \left(K_{uu} + K_{uf} \hat{\Lambda}^{-1} K_{fu}\right)^{-1} K_{uf} \hat{\Lambda}^{-1} \hat{y} \\ - &= \hat{\Sigma} \begin{bmatrix} K_{ua} \Lambda_a^{-1} & K_{ub} \Lambda_b^{-1} \end{bmatrix} \begin{bmatrix} y_a \\ y_b \end{bmatrix} - + \begin{aligned} + \hat{v} &= \hat{\Sigma} \begin{bmatrix} K_{ua} \Lambda_a^{-1} & K_{ub} \Lambda_b^{-1} \end{bmatrix} \begin{bmatrix} y_a \\ y_b \end{bmatrix} + \end{aligned} -We'll already have the QR decomposition of :math:`\hat{B}` so we can try to find the :math:`z` such that :math:`\hat{v}` is the solution to the least squares problem, :math:`\left\lVert \hat{B} \hat{v} - z\right\rVert`. Solving this system gives us, +We’ll already have the QR decomposition of :math:`\hat{B}` which we can +use to compute solutions in the form, .. math:: - - \hat{v} &= \left(\hat{B}^T \hat{B}\right)^{-1} \hat{B}^T z \\ - &= \hat{\Sigma} \begin{bmatrix} P_a R_a^T & K_{ub} \Lambda_b^{-1/2} \end{bmatrix} \begin{bmatrix} z_a \\ z_b \end{bmatrix} \\ - + \begin{aligned} + \hat{v} &= \left(\hat{B}^T \hat{B}\right)^{-1} \hat{B}^T z \\ + &= \hat{\Sigma} \hat{B}^T z + \end{aligned} -From which we can see that if we set, +so if we can find a :math:`z` such that + +.. math:: + + \begin{aligned} + \hat{B}^T z = K_{uf}\hat{\Lambda}^{-1} \hat{y} + \end{aligned} + +then we can use the QR decomposition of :math:`\hat{B}` to get +:math:`\hat{v}`. By again dividing into new and old vectors we see that +we need, + +.. math:: + + \begin{aligned} + \begin{bmatrix} P_a R_a^T & K_{ub} \Lambda_{b}^{-1/2} \end{bmatrix} \begin{bmatrix} z_a \\ z_b \end{bmatrix} &= \begin{bmatrix} K_{ua} \Lambda_a^{-1} & K_{ub} \Lambda_b^{-1} \end{bmatrix} \begin{bmatrix} y_a \\ y_b \end{bmatrix} + \end{aligned} + +We can satisfy that equality if we set, + +.. math:: + + \begin{aligned} + P_a R_a^T z_a &= K_{ua} \Lambda_a^{-1} y_a \\ + K_{ub} \Lambda_b^{-1/2} z_b &= K_{ub} \Lambda_b^{-1} y_b + \end{aligned} + +Which gives us, .. math:: - - P_a R_a^T z_a &= K_{ua} \Lambda_a^{-1} y_a \\ - &= \Sigma_a^{-1} v_a \\ - z_a &= R_a^{-T} P_a^T \Sigma_a^{-1} v_a \\ - &= R_a^{-T} P_a^T P_a R_a^T R_a P_a^T v_a \\ - &= R_a P_a^T v_a - + \begin{aligned} + z_a &= R_a^{-T} P_a^T K_{ua} \Lambda_a^{-1} y_a \\ + &= R_a^{-T} P_a^T \Sigma_a^{-1} v_a \\ + &= R_a^{-T} P_a^T P_a R_a^T R_a P_a^T v_a \\ + &= R_a P_a^T v_a + \end{aligned} and .. math:: - - z_b = \Lambda_b^{-1/2} y_b -Then the following QR solution will effectively update the information vector, + \begin{aligned} + z_b = \Lambda_b^{-1/2} y_b + \end{aligned} + +Then we can plug that into the QR solution to get, .. math:: - - \hat{v} &= \hat{P} \hat{R}^{-1} \hat{Q}^T \begin{bmatrix}R_a P_a^T v_a \\ \Lambda_b^{-1/2} y_b \end{bmatrix} - -After an update the only term which changes in the posterior covariance in Equation~\ref{eq:posterior} is the computation of the explained covariance, + \begin{aligned} + \hat{v} &= \left(\hat{B}^T \hat{B}\right)^{-1} \hat{B}^T z \\ + &=\hat{P} \hat{R}^{-1} \hat{Q}^T \begin{bmatrix}R_a P_a^T v_a \\ \Lambda_b^{-1/2} y_b \end{bmatrix} + \end{aligned} + +In summary, updating an existing sparse Gaussian process (where any +added observations are considered independent of existing ones) can be +done by, + +- Computing :math:`\Lambda_b^{-1/2}`. + +- Computing the QR decomposition, + :math:`\hat{Q} \hat{R} \hat{P}^T = \begin{bmatrix}R_a P_a^T \\ \Lambda_b^{-1/2} K_{bu} \end{bmatrix}`. 
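+
+  As a minimal sketch (using Eigen directly; the variable names here are
+  illustrative rather than taken from the library), this step amounts to
+  stacking the two blocks and re-factorizing::
+
+      Eigen::MatrixXd B(n_old + n_new, k);
+      B.topRows(n_old) = R_a * P_a.transpose();        // R_a P_a^T
+      B.bottomRows(n_new) = Lambda_b_sqrt_inv * K_bu;  // Lambda_b^{-1/2} K_bu
+      Eigen::ColPivHouseholderQR<Eigen::MatrixXd> B_qr(B);
+      // B_qr.matrixR() and B_qr.colsPermutation() give R_hat and P_hat.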
+ +- Setting + :math:`\hat{v} = \hat{P} \hat{R}^{-1}\hat{Q}^T \begin{bmatrix} R_a P_a^T v_a \\ \Lambda_b^{-1/2} y_b \end{bmatrix}` + +Note: As long as the new datasets you add consist of different groups +you can continuously update the sparse Gaussian process retaining the +same model you’d get if you had fit everything at once. For FITC this is +always the case (since each observation is treated independently), for +PITC care needs to be taken that you don’t update with a dataset +containing groups which overlap with previous updates, the result would +be over-confident predictions. + +------------------------ +Rebasing Inducing Points +------------------------ + +Consider a problem which is temporal in nature and you’d like to be able +to fit a model with some observations, make predictions, then update +that model with more recent observations and repeat. You could do this +using a dense Gaussian process by simply concatenating previous and +new observations into a larger dataset, but the result would be +unbounded growth in the model size and it would quickly become too large to +manage. + +Instead, you could use a sparse Gaussian process, in which case you may +fit with observations on time, :math:`t_p`, then update the model with +new observations on time :math:`t_n`, and repeat. This would keep the +computation costs bounded, but if your problem is temporal in nature, +the time associated with the inducing points may diverge from the +time of the observations, causing the inducing point approximation to +degrade. + +Here we describe how to take a model which has already been fit using +inducing points valid for some previous times, :math:`p`, and then +advance them forward in time. This temporal example here is just that; +an example. These operations do not require a temporal problem, they +more generally describe how to take a model based on some previous set +of inducing points, :math:`p`, and find an equivalent model based on new +inducing points, :math:`n`. The result opens up a style of algorithms +which involve, updating a model with observations, advancing the model +forward in time, updating with new observations and repeating. + +Details +------- + +Consider the situation where you’ve already fit a model using +observations :math:`f` with inducing points :math:`p` and you then want +to rebase the model on inducing points :math:`n`. This means you will have +computed, .. math:: - - E_{**} = Q_{**} - K_{*u} \hat{\Sigma} K_{u*} -And since we've already computed :math:`\hat{\Sigma} = \left(\hat{B}^T \hat{B}\right)^{-1}` we don't need to do any further work. + \begin{aligned} + \Sigma_p &= B_p^T B_p \\ + &= K_{pp} + K_{pf} \Lambda^{-1} K_{fp} + \end{aligned} + +and would like to find the equivalent for the new inducing points, + +.. math:: + + \begin{aligned} + \Sigma_n &= B_n^T B_n \\ + &= K_{nn} + K_{nf} \Lambda^{-1} K_{fn} + \end{aligned} + +We don’t have :math:`K_{nf}` because when we fit the model using +observations, :math:`f`, we didn’t know the next inducing points, +:math:`n`. But we could use the Nystrom approximation +:math:`K_{nf} = K_{np} K_{pp}^{-1} K_{pf}` which can be interpreted as +saying: the only information we can capture in the new inducing points +is information that was already captured using the previous ones. + +.. 
math:: + + \begin{aligned} + \Sigma_n &= K_{nn} + K_{nf} \Lambda^{-1} K_{fn} \\ + &\approx K_{nn} + K_{np} K_{pp}^{-1} K_{pf} \Lambda^{-1} K_{fp} K_{pp}^{-1} K_{pn} + \end{aligned} + +Now we can add and subtract a :math:`K_{np}K_{pp}^{-1}K_{pn}` term and +do some rearranging, + +.. math:: + + \begin{aligned} + \Sigma_n &= K_{nn} + K_{np} K_{pp}^{-1} K_{pf} \Lambda^{-1} K_{fp} K_{pp}^{-1} K_{pn} + K_{np}K_{pp}^{-1}K_{pn} - K_{np}K_{pp}^{-1}K_{pn} \\ + &= K_{nn} + K_{np} K_{pp}^{-1} \left(K_{pp} + K_{pf} \Lambda^{-1} K_{fp} \right) K_{pp}^{-1} K_{pn} - K_{np}K_{pp}^{-1}K_{pn} + \end{aligned} + +Remember that +:math:`P_p R_p^T R_p P_p^T = K_{pp} + K_{pf} \Lambda^{-1} K_{fp}` and if +we solve for :math:`\hat{L}_{nn}` such that +:math:`\hat{L}_{nn} \hat{L}_{nn}^T = K_{nn} - K_{np}K_{pp}^{-1}K_{pn}`, +then we have, + +.. math:: + + \begin{aligned} + \Sigma_n &= \left(R_{p} P_p^T K_{pp}^{-1} K_{pn}\right)^T \left(R_{p}P_p^T K_{pp}^{-1} K_{pn}\right) + \hat{L}_{nn} \hat{L}_{nn}^T + \end{aligned} + +we can write this as the symmetric product, +:math:`\Sigma_n = \hat{B}_n^T \hat{B}_n`, where + +.. math:: + + \begin{aligned} + \hat{B}_n &= \begin{bmatrix} \hat{L}_{nn}^T \\ R_{p} P_p^T K_{pp}^{-1} K_{pn} \end{bmatrix}. + \end{aligned} + +Then by solving for :math:`\hat{Q}_n \hat{R}_n = \hat{B}_n \hat{P}_n` we +have, + +.. math:: + + \begin{aligned} + \Sigma_n &\approx \hat{B}_n^T \hat{B}_n \\ + &= \hat{P}_n \hat{R}_n^T \hat{R}_n \hat{P}_n^T \\ + &= K_{nn} + K_{nf} \Lambda^{-1} K_{fn} + \end{aligned} + +Summary +------- + +Starting with :math:`L_{pp}` and :math:`R_p` we can rebase onto new +inducing points by, + +- Building the matrices :math:`K_{pn}` and :math:`K_{nn}`. + +- Computing :math:`A = L_{pp}^{-1}K_{pn}`. + +- Solving for :math:`\hat{L}_{nn}` such that + :math:`\hat{L}_{nn} \hat{L}_{nn}^T= K_{nn} - A^T A`. -In summary, updating an existing sparse Gaussian process (where any added observations are considered independent of existing ones) can be done by, +- Solving for + :math:`\hat{Q}_n\hat{R}_n = \begin{bmatrix}\hat{L}_{nn}^T \\ R_p P_p^T L_{pp}^{-T} A \end{bmatrix}`. -- Computing :math:`\Lambda_b^{-1/2}`. -- Computing (or updating) the QR decomposition, :math:`\hat{Q} \hat{R} \hat{P}^T = \begin{bmatrix}R_a P_a^T \\ \Lambda_b^{-1/2} K_{bu} \end{bmatrix}`. -- Setting :math:`\hat{v} = \hat{P} \hat{R}^{-1}\hat{Q}^T \begin{bmatrix} R_a P_a^T v_a \\ \Lambda_b^{-1/2} y_b \end{bmatrix}` +- Solving for :math:`L_{nn} = \mbox{chol}(K_{nn})`. -Note: As long as the new datasets you add consist of different groups you can continuously update the sparse Gaussian process retaining the same model you'd get if you had fit everything at once. For FITC (each observation is treated independently) this is always the case, for PITC care needs to be taken that you don't update with a dataset containing groups which overlap with previous updates, the result would be over-confident predictions. 
+- Solving for :math:`v_n = K_{nn}^{-1} K_{np} v_p` --------------------- Alternative Approach From 33f3f78a82c5f85501f47b11310e29be4d0f2714 Mon Sep 17 00:00:00 2001 From: kleeman Date: Fri, 2 Feb 2024 10:39:11 -0800 Subject: [PATCH 3/3] New Rebase Approach --- include/albatross/src/models/sparse_gp.hpp | 79 ++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/include/albatross/src/models/sparse_gp.hpp b/include/albatross/src/models/sparse_gp.hpp index 36234e4e..9b757d97 100644 --- a/include/albatross/src/models/sparse_gp.hpp +++ b/include/albatross/src/models/sparse_gp.hpp @@ -703,15 +703,86 @@ class SparseGaussianProcessRegression // rebase_inducing_points takes a Sparse GP which was fit using some set of // inducing points and creates a new fit relative to new inducing points. +// // Note that this will NOT be the equivalent to having fit the model with // the new inducing points since some information may have been lost in // the process. -template +// +// For example, consider the extreme case where your first fit +// doesn't have any inducing points at all, all the information from the first +// observations will have been lost, and when you rebase on new inducing points +// you'd have the prior for those new points. +// +// For implementation details see the online documentation. +// +// The summary involves: +// - Compute K_nn = cov(new, new) +// - Compute K_pn = cov(prev, new) +// - Compute A = L_pp^-1 K_pn +// - Solve for Lhat_nn = chol(K_nn - A^T A) +// - Solve for QRP^T = [Lat_nn +// R_p P_p^T L_pp^-T A] +// - Solve for L_nn = chol(K_nn) +// - Solve for v_n = K_nn^-1 K_np v_p +// +template auto rebase_inducing_points( - const FitModel>> &fit_model, + const FitModel, + Fit>> &fit_model, const std::vector &new_inducing_points) { - return fit_model.get_model().fit_from_prediction( - new_inducing_points, fit_model.predict(new_inducing_points).joint()); + + const auto &cov = fit_model.get_model().get_covariance(); + // Compute K_nn = cov(new, new) + const Eigen::MatrixXd K_nn = + cov(new_inducing_points, fit_model.get_model().threads_.get()); + + // Compute K_pn = cov(prev, new) + const Fit> &prev_fit = fit_model.get_fit(); + const auto &prev_inducing_points = prev_fit.train_features; + const Eigen::MatrixXd K_pn = cov(prev_inducing_points, new_inducing_points, + fit_model.get_model().threads_.get()); + // A = L_pp^-1 K_pn + const Eigen::MatrixXd A = prev_fit.train_covariance.sqrt_solve(K_pn); + const Eigen::Index p = K_pn.rows(); + const Eigen::Index n = K_nn.rows(); + Eigen::MatrixXd B = Eigen::MatrixXd::Zero(n + p, n); + + // B[upper] = R P^T L_pp^-T A + const auto LTiA = prev_fit.train_covariance.sqrt_transpose_solve(A); + B.topRows(p) = prev_fit.R.template triangularView() * + (prev_fit.P.transpose() * LTiA); + + // B[lower] = chol(K_nn - A^T A)^T + Eigen::MatrixXd S_nn = K_nn - A.transpose() * A; + // This cholesky operation here is the most likely to experience numerical + // instability because of the A^T A subtraction involved, so we add a nugget. 
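+  // In exact arithmetic S_nn equals K_nn - K_np K_pp^-1 K_pn (the prior
+  // covariance of the new inducing points conditional on the previous ones,
+  // i.e. the Lhat_nn Lhat_nn^T term in the documentation), which is positive
+  // semi-definite; rounding in the A^T A product can still leave slightly
+  // negative eigenvalues, which the nugget below absorbs.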
+ const double nugget = + fit_model.get_model().get_params()[details::inducing_nugget_name()].value; + assert(nugget >= 0); + S_nn.diagonal() += Eigen::VectorXd::Constant(S_nn.rows(), nugget); + B.bottomRows(n) = Eigen::SerializableLDLT(S_nn).sqrt_transpose(); + + const auto B_qr = + QRImplementation::compute(B, fit_model.get_model().threads_.get()); + + Fit> new_fit; + new_fit.train_features = new_inducing_points; + new_fit.train_covariance = Eigen::SerializableLDLT(K_nn); + // v_n = K_nn^-1 K_np v_p + new_fit.information = new_fit.train_covariance.solve( + fit_model.predict(new_inducing_points).mean()); + new_fit.P = get_P(*B_qr); + new_fit.R = get_R(*B_qr); + new_fit.numerical_rank = B_qr->rank(); + + return FitModel< + SparseGaussianProcessRegression, + Fit>>(fit_model.get_model(), std::move(new_fit)); } template