From c91e4813fa6b588aae83dd7ac1446eb0ace46aa7 Mon Sep 17 00:00:00 2001
From: Filippo Airaldi
Date: Sat, 16 Dec 2023 00:10:21 +0100
Subject: [PATCH] added V to regressor forward

---
 examples_bt/myopic_acquisitions_examples.py |  2 +-
 src/globopt/regression.py                   | 44 +++++++++++++--------
 tests/test_myopic.py                        |  2 +-
 3 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/examples_bt/myopic_acquisitions_examples.py b/examples_bt/myopic_acquisitions_examples.py
index 3325f14..66c3af6 100644
--- a/examples_bt/myopic_acquisitions_examples.py
+++ b/examples_bt/myopic_acquisitions_examples.py
@@ -40,7 +40,7 @@
 
 # predict the (normal) posterior over the whole domain via the fitted model
 X = torch.linspace(lb, ub, 1000).view(1, -1, 1)
-y_hat, s, W_sum_recipr = mdl(X)
+y_hat, s, W_sum_recipr, _ = mdl(X)
 
 # compute acquisition function by components
 z = _idw_distance(W_sum_recipr)
diff --git a/src/globopt/regression.py b/src/globopt/regression.py
index 6b35dac..18a1e9a 100644
--- a/src/globopt/regression.py
+++ b/src/globopt/regression.py
@@ -18,8 +18,11 @@
 # Conventions
 # ------------
 # botorch uses the convention `b0 x b1 x ... x q x d`, where
-# * `b-i` is the number of batches of candidates to evaluate in parallel
-# * `q` is the number of candidates to consider jointly
+# * `b-i` is the number of batches of candidates to evaluate in parallel; minimizing
+#   the acquisition function amounts to minimizing its sum over the batches and
+#   finally keeping the best batch
+# * `q` is the number of candidates to consider jointly per batch; often, only the
+#   best of these is kept per batch
 # * `d` is the dimension of the design space of each `q`-th candidate.
 # Note that while there might be more than one batch dimension, usually we need just one
 # in important methods.
@@ -38,13 +41,17 @@
 # Interfacing
 # -----------
 # Here we distinguish between the myopic and the non-myopic case.
-# * myopic case: for the simplest cases, i.e., the analytical acquisition functions, we
-#   expect `q = 1`. This means that the `b` dimension is botorch can be swapped in
-#   second place and used as the `m`. Usually, we call it `n` to distinguish the `m`
-#   training points from the `n` prediction points.
-#   For the Monte Carlo myopic case, TODO
-# * non-myopic case: TODO: would `b x q x 1 x d` work for regressors as repeated as
-#   `b x q x m x d`?
+# * myopic case:
+#   * `MyopicAcquisitionFunction`: here `q = 1`, so the `b` dimension in botorch is
+#     automatically swapped into second place and used as the prediction-points
+#     dimension (we use `n` for prediction points and `m` for training points).
+#   * `MyopicAcquisitionFunctionInExpectation`: TODO
+#   * `qMcMyopicAcquisitionFunction`: here, `q > 1` is the number of points considered
+#     in parallel, while `b` is the number of batches of these. The acquisition
+#     function is minimized over the sum of its batches, and for each batch the best
+#     candidate out of the `q` is taken.
+# * non-myopic case:
+#   TODO: would `b x q x 1 x d` work for regressors as repeated as `b x q x m x d`?
 
 from typing import Any, Optional, Union
 
@@ -89,7 +96,7 @@ def posterior(self, X: Tensor, **_: Any) -> GPyTorchPosterior:
         self.eval()
         # NOTE: do not modify input/output shapes here. It is the responsibility of the
         # acquisition function calling this method to do so.
-        mean, scale, W_sum_recipr = self.forward(X)
+        mean, scale, W_sum_recipr, V = self.forward(X)
         # NOTE: it's a bit sketchy, but `W_sum_recipr` is needed by the acquisition
         # functions. It is first computed here, so it is convenient to attach it to
         # the posterior for later re-use.
@@ -99,6 +106,7 @@ def posterior(self, X: Tensor, **_: Any) -> GPyTorchPosterior:
         posterior = GPyTorchPosterior(distribution)
         posterior._scale = scale
         posterior._W_sum_recipr = W_sum_recipr
+        posterior._V = V
         return posterior
 
 
@@ -125,20 +133,20 @@ def _idw_scale(Y: Tensor, train_Y: Tensor, V: Tensor) -> Tensor:
 
 def _idw_predict(
     train_X: Tensor, train_Y: Tensor, X: Tensor
-) -> tuple[Tensor, Tensor, Tensor]:
+) -> tuple[Tensor, Tensor, Tensor, Tensor]:
     """Mean and scale for IDW regression."""
     W = torch.cdist(X, train_X).square().clamp_min(DELTA).reciprocal()
     W_sum_recipr = W.sum(-1, keepdim=True).reciprocal()
     V = W.mul(W_sum_recipr)
     mean = V.matmul(train_Y)
     std = _idw_scale(mean, train_Y, V)
-    return mean, std, W_sum_recipr
+    return mean, std, W_sum_recipr, V
 
 
 class Idw(BaseRegression):
     """Inverse Distance Weighting regression model in Global Optimization."""
 
-    def forward(self, X: Tensor) -> tuple[Tensor, Tensor, Tensor]:
+    def forward(self, X: Tensor) -> tuple[Tensor, Tensor, Tensor, Tensor]:
         """Computes the IDW regression model.
 
         Parameters
@@ -155,6 +163,8 @@
             - the mean estimate `(b0 x b1 x ...) x n x 1`
             - the standard deviation of the estimate `(b0 x b1 x ...) x n x 1`
             - the reciprocal of the sum of the IDW weights `(b0 x b1 x ...) x n x 1`
+            - the normalized IDW weights `(b0 x b1 x ...) x n x m`, where `m` is
+              the number of training points.
         """
         return _idw_predict(self.train_X, self.train_Y, X)
 
@@ -205,7 +215,7 @@ def _rbf_partial_fit(
 
 def _rbf_predict(
     train_X: Tensor, train_Y: Tensor, eps: Tensor, coeffs: Tensor, X: Tensor
-) -> tuple[Tensor, Tensor, Tensor]:
+) -> tuple[Tensor, Tensor, Tensor, Tensor]:
     """Predicts mean and scale for RBF regression."""
     # NOTE: here, we do not use `KernelLinearOperator` so as to avoid computing the
     # distance from `X` to `train_X` twice, once in the linear operator and once in the
@@ -216,7 +226,7 @@ def _rbf_predict(
     W_sum_recipr = W.sum(-1, keepdim=True).reciprocal()
     V = W.mul(W_sum_recipr)
     std = _idw_scale(mean, train_Y, V)
-    return mean, std, W_sum_recipr
+    return mean, std, W_sum_recipr, V
 
 
 class Rbf(BaseRegression):
@@ -268,7 +278,7 @@ def Minv_and_coeffs(self) -> tuple[Tensor, Tensor]:
         coefficients. Use this to partially fit a new regressor (see `__init__`)"""
         return self.Minv, self.coeffs
 
-    def forward(self, X: Tensor) -> tuple[Tensor, Tensor, Tensor]:
+    def forward(self, X: Tensor) -> tuple[Tensor, Tensor, Tensor, Tensor]:
         """Computes the RBF regression model.
 
         Parameters
@@ -285,5 +295,7 @@
             - the mean estimate `(b0 x b1 x ...) x n x 1`
             - the standard deviation of the estimate `(b0 x b1 x ...) x n x 1`
             - the reciprocal of the sum of the IDW weights `(b0 x b1 x ...) x n x 1`
+            - the normalized IDW weights `(b0 x b1 x ...) x n x m`, where `m` is
+              the number of training points.
""" return _rbf_predict(self.train_X, self.train_Y, self.eps, self.coeffs, X) diff --git a/tests/test_myopic.py b/tests/test_myopic.py index bfae960..077f28e 100644 --- a/tests/test_myopic.py +++ b/tests/test_myopic.py @@ -26,7 +26,7 @@ def test__methods__returns_correct_values(self): MAF = MyopicAcquisitionFunction(mdl, 1.0, 0.5) a1 = MAF(x.transpose(1, 0)).squeeze().neg() - y_hat, s, W_sum_recipr = mdl(x) + y_hat, s, W_sum_recipr, _ = mdl(x) dym = Y.amax(-2) - Y.amin(-2) z = _idw_distance(W_sum_recipr) a2 = acquisition_function(y_hat, s, dym, W_sum_recipr, MAF.c1, MAF.c2).neg()