
Surrogate scaling #315

Merged 26 commits on Jul 24, 2024 (changes shown from 22 commits).

Commits
`d9aefe5` Remove current scaling functionality (AdrianSosic, Jul 9, 2024)
`369da45` Make to_tensor also handle numpy arrays (AdrianSosic, Jul 16, 2024)
`0ede1cc` Replace param_bounds_comp with comp_rep_bounds (AdrianSosic, Jul 16, 2024)
`00c40ae` Draft input scaling mechanism (AdrianSosic, Jul 17, 2024)
`79f8f44` Introduce ScalerProtocol class (AdrianSosic, Jul 19, 2024)
`24f2c49` Make transformation return a dataframe (AdrianSosic, Jul 19, 2024)
`2938c48` Update streamlit dev script (AdrianSosic, Jul 19, 2024)
`ae1a366` Fix handling of dropped columns in ColumnTransformer (AdrianSosic, Jul 19, 2024)
`5068148` Remove obsolete TODO note (AdrianSosic, Jul 19, 2024)
`fb14927` Make surrogate scaling work with continuous parameters (AdrianSosic, Jul 19, 2024)
`c3a4cc6` Rename _get_parameter_scaler to _make_parameter_scaler (AdrianSosic, Jul 19, 2024)
`64b5450` Draft output scaling mechanism (AdrianSosic, Jul 22, 2024)
`6dad04a` Silence warning by allowing extra columns (AdrianSosic, Jul 22, 2024)
`25e356a` Improve signatures (AdrianSosic, Jul 22, 2024)
`2a2849b` Harmonize terminology (AdrianSosic, Jul 22, 2024)
`920b079` Update test for empty bounds (AdrianSosic, Jul 22, 2024)
`cdf6688` Fix import order (AdrianSosic, Jul 22, 2024)
`6e052f7` Decide for transformation approach (AdrianSosic, Jul 22, 2024)
`ef84a35` Update docstrings (AdrianSosic, Jul 22, 2024)
`2b3dcab` Remove separate scaling logic from GPs (AdrianSosic, Jul 23, 2024)
`161bddb` Rename ScalerProtocol to ParameterScalerProtocol (AdrianSosic, Jul 23, 2024)
`e7f3f67` Update CHANGELOG.md (AdrianSosic, Jul 23, 2024)
`21953d4` Replace literal return type with None (AdrianSosic, Jul 23, 2024)
`536a3a8` Implement workaround to circumvent ColumnTransformer limitations (AdrianSosic, Jul 24, 2024)
`b88b3ba` Improve code grouping (AdrianSosic, Jul 24, 2024)
`1619bd7` Remove register_custom_architecture decorator (AdrianSosic, Jul 24, 2024)
11 changes: 8 additions & 3 deletions CHANGELOG.md
@@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `Surrogate` models now operate on dataframes in experimental representation instead of
tensors in computational representation
- `Surrogate.posterior` now returns a `Posterior` object
- `param_bounds_comp` of `SearchSpace`, `SubspaceDiscrete` and `SubspaceContinuous` has
been replaced with `comp_rep_bounds`, which returns a dataframe

### Added
- `Surrogate` base class now exposes a `to_botorch` method
@@ -33,6 +35,10 @@ - `_optional` subpackage for managing optional dependencies
- `transform` methods of `SearchSpace`, `SubspaceDiscrete` and `SubspaceContinuous`
now take additional `allow_missing` and `allow_extra` keyword arguments
- `GaussianSurrogate` base class for surrogate models with Gaussian posteriors
- `comp_rep_columns` property for `Parameter`, `SearchSpace`, `SubspaceDiscrete`
and `SubspaceContinuous` classes
- Reworked mechanisms for surrogate input/output scaling configurable per class
- `ParameterScalerProtocol` class for enabling user-defined input scaling mechanisms
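The `ParameterScalerProtocol` entry above suggests a structural-typing approach to user-defined input scaling. The sketch below is a hypothetical illustration (the `ScalerLike` name and method signatures are assumptions, not taken from the library) of how a scaler protocol plus a trivial conforming implementation could look:

```python
from typing import Protocol, runtime_checkable

import pandas as pd


@runtime_checkable
class ScalerLike(Protocol):
    """Hypothetical sketch of an sklearn-style scaler protocol (names assumed)."""

    def fit(self, df: pd.DataFrame) -> None: ...

    def transform(self, df: pd.DataFrame) -> pd.DataFrame: ...


class IdentityScaler:
    """Trivial conforming implementation: returns the input unchanged."""

    def fit(self, df: pd.DataFrame) -> None:
        pass  # nothing to learn

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        return df
```

Because the protocol is structural, any object exposing `fit` and `transform` would qualify, without subclassing anything from the library.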

### Changed
- Passing an `Objective` to `Campaign` is now optional
@@ -44,11 +50,12 @@ - `_optional` subpackage for managing optional dependencies
- Context information required by `Surrogate` models is now cleanly encapsulated into
a `context` object passed to `Surrogate._fit`
- Fallback models created by `catch_constant_targets` are stored outside of surrogate
- `to_tensor` now also handles `numpy` arrays
- `GaussianProcessSurrogate` no longer uses a separate scaling approach
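The `to_tensor` entry above says the helper now accepts `numpy` arrays in addition to dataframes. A hedged, torch-free sketch of that kind of normalization (the real helper returns torch tensors; `to_array` is an invented name used here only to keep the sketch dependency-light):

```python
import numpy as np
import pandas as pd


def to_array(x) -> np.ndarray:
    """Normalize dataframes and array-likes to a float64 numpy array."""
    if isinstance(x, pd.DataFrame):
        x = x.values  # unwrap the dataframe's underlying array
    return np.asarray(x, dtype=np.float64)
```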

### Removed
- Support for Python 3.9 removed due to new [BoTorch requirements](https://github.com/pytorch/botorch/pull/2293)
and guidelines from [Scientific Python](https://scientific-python.org/specs/spec-0000/)
- `register_custom_architecture` decorator
- `Scalar` and `DefaultScaler` classes

### Fixed
@@ -64,8 +71,6 @@ - `_optional` subpackage for managing optional dependencies
- Passing a dataframe via the `data` argument to the `transform` methods of
`SearchSpace`, `SubspaceDiscrete` and `SubspaceContinuous` is no longer possible.
The dataframe must now be passed as positional argument.
- Role of `register_custom_architecture` has been taken over by
`baybe.surrogates.base.SurrogateProtocol`

## [0.9.1] - 2024-06-04
### Changed
2 changes: 1 addition & 1 deletion baybe/acquisition/base.py
@@ -52,7 +52,7 @@ def to_botorch(
params_dict = filter_attributes(object=self, callable_=acqf_cls.__init__)

train_x = surrogate.transform_inputs(measurements)
-    train_y = surrogate.transform_targets(measurements)
+    train_y = surrogate.transform_outputs(measurements)

signature_params = signature(acqf_cls).parameters
additional_params = {}
19 changes: 15 additions & 4 deletions baybe/parameters/base.py
@@ -48,10 +48,6 @@ def is_in_range(self, item: Any) -> bool:
``True`` if the item is within the parameter range, ``False`` otherwise.
"""

-    @abstractmethod
-    def summary(self) -> dict:
-        """Return a custom summarization of the parameter."""

def __str__(self) -> str:
return str(self.summary())

@@ -65,6 +61,15 @@ def is_discrete(self) -> bool:
"""Boolean indicating if this is a discrete parameter."""
return isinstance(self, DiscreteParameter)

+    @property
+    @abstractmethod
+    def comp_rep_columns(self) -> tuple[str, ...]:
+        """The columns spanning the computational representation."""
+
+    @abstractmethod
+    def summary(self) -> dict:
+        """Return a custom summarization of the parameter."""


@define(frozen=True, slots=False)
class DiscreteParameter(Parameter, ABC):
@@ -84,8 +89,14 @@ def values(self) -> tuple:
@cached_property
@abstractmethod
def comp_df(self) -> pd.DataFrame:
# TODO: Should be renamed to `comp_rep`
"""Return the computational representation of the parameter."""

@property
def comp_rep_columns(self) -> tuple[str, ...]: # noqa: D102
# See base class.
return tuple(self.comp_df.columns)

def is_in_range(self, item: Any) -> bool: # noqa: D102
# See base class.
return item in self.values
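The new `comp_rep_columns` property for discrete parameters simply exposes the columns of the parameter's `comp_df`. A minimal sketch of that relationship, using an invented two-column encoding:

```python
import pandas as pd

# Hypothetical stand-in for a discrete parameter's computational representation:
# one row per parameter value, one column per encoding feature.
comp_df = pd.DataFrame({"enc_1": [0.1, 0.2], "enc_2": [1.0, 0.0]})

# The columns spanning the computational representation, as in the diff above.
comp_rep_columns = tuple(comp_df.columns)
```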
5 changes: 5 additions & 0 deletions baybe/parameters/numerical.py
@@ -132,6 +132,11 @@ def is_in_range(self, item: float) -> bool:  # noqa: D102

return self.bounds.contains(item)

@property
def comp_rep_columns(self) -> tuple[str, ...]: # noqa: D102
# See base class.
return (self.name,)

def summary(self) -> dict: # noqa: D102
# See base class.
param_dict = dict(
4 changes: 2 additions & 2 deletions baybe/recommenders/pure/bayesian/botorch.py
@@ -156,7 +156,7 @@ def _recommend_continuous(

points, _ = optimize_acqf(
acq_function=self._botorch_acqf,
-        bounds=torch.from_numpy(subspace_continuous.param_bounds_comp),
+        bounds=torch.from_numpy(subspace_continuous.comp_rep_bounds.values),
q=batch_size,
num_restarts=5, # TODO make choice for num_restarts
raw_samples=10, # TODO make choice for raw_samples
@@ -244,7 +244,7 @@ def _recommend_hybrid(
# Actual call of the BoTorch optimization routine
points, _ = optimize_acqf_mixed(
acq_function=self._botorch_acqf,
-        bounds=torch.from_numpy(searchspace.param_bounds_comp),
+        bounds=torch.from_numpy(searchspace.comp_rep_bounds.values),
q=batch_size,
num_restarts=5, # TODO make choice for num_restarts
raw_samples=10, # TODO make choice for raw_samples
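Both call sites pass the bounds to BoTorch's optimizers as a tensor built from the dataframe's underlying array. A small sketch (made-up parameter names, torch omitted to keep the sketch dependency-light) of the expected 2 x d layout, where row 0 holds the lower and row 1 the upper bounds:

```python
import pandas as pd

# Bounds frame in the new format: one column per parameter, rows "min"/"max".
bounds_df = pd.DataFrame({"x": (0.0, 1.0), "y": (-1.0, 1.0)}, index=["min", "max"])

# `.values` yields the (2, d) array that `torch.from_numpy(...)` would wrap.
bounds_array = bounds_df.values
```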
132 changes: 0 additions & 132 deletions baybe/scaler.py

This file was deleted.

23 changes: 14 additions & 9 deletions baybe/searchspace/continuous.py
@@ -30,7 +30,6 @@
from baybe.serialization import SerialMixin, converter, select_constructor_hook
from baybe.utils.basic import to_tuple
from baybe.utils.dataframe import pretty_print_df
-from baybe.utils.numerical import DTypeFloatNumpy

if TYPE_CHECKING:
from baybe.searchspace.core import SearchSpace
@@ -211,11 +210,17 @@ def param_names(self) -> tuple[str, ...]:
return tuple(p.name for p in self.parameters)

@property
-    def param_bounds_comp(self) -> np.ndarray:
-        """Return bounds as numpy array."""
-        if not self.parameters:
-            return np.empty((2, 0), dtype=DTypeFloatNumpy)
-        return np.stack([p.bounds.to_ndarray() for p in self.parameters]).T
+    def comp_rep_columns(self) -> tuple[str, ...]:
+        """The columns spanning the computational representation."""
+        return tuple(chain.from_iterable(p.comp_rep_columns for p in self.parameters))
+
+    @property
+    def comp_rep_bounds(self) -> pd.DataFrame:
+        """The minimum and maximum values of the computational representation."""
+        return pd.DataFrame(
+            {p.name: p.bounds.to_tuple() for p in self.parameters},
+            index=["min", "max"],
+        )

def _drop_parameters(self, parameter_names: Collection[str]) -> SubspaceContinuous:
"""Create a copy of the subspace with certain parameters removed.
@@ -324,10 +329,10 @@ def sample_uniform(self, batch_size: int = 1) -> pd.DataFrame:
and len(self.constraints_lin_ineq) == 0
and len(self.constraints_cardinality) == 0
):
-            return self._sample_from_bounds(batch_size, self.param_bounds_comp)
+            return self._sample_from_bounds(batch_size, self.comp_rep_bounds.values)

if len(self.constraints_cardinality) == 0:
-            return self._sample_from_polytope(batch_size, self.param_bounds_comp)
+            return self._sample_from_polytope(batch_size, self.comp_rep_bounds.values)

return self._sample_from_polytope_with_cardinality_constraints(batch_size)

@@ -453,7 +458,7 @@ def sample_from_full_factorial(self, batch_size: int = 1) -> pd.DataFrame:
def full_factorial(self) -> pd.DataFrame:
"""Get the full factorial of the continuous space."""
index = pd.MultiIndex.from_product(
-            self.param_bounds_comp.T.tolist(), names=self.param_names
+            self.comp_rep_bounds.values.T.tolist(), names=self.param_names
)

return pd.DataFrame(index=index).reset_index()
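The continuous subspace's `comp_rep_columns` property above flattens the per-parameter column tuples into a single tuple via `chain.from_iterable`. A minimal sketch of that pattern with invented column names:

```python
from itertools import chain

# One tuple of computational-representation columns per parameter: a continuous
# parameter contributes its own name, an encoded parameter several columns.
per_param_columns = [("x",), ("mol_enc_1", "mol_enc_2")]

# Flatten into a single tuple spanning the whole subspace.
all_columns = tuple(chain.from_iterable(per_param_columns))
```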
14 changes: 9 additions & 5 deletions baybe/searchspace/core.py
@@ -7,7 +7,6 @@
from enum import Enum
from typing import cast

-import numpy as np
import pandas as pd
from attr import define, field

@@ -244,10 +243,15 @@ def contains_rdkit(self) -> bool:
)

@property
-    def param_bounds_comp(self) -> np.ndarray:
-        """Return bounds as tensor."""
-        return np.hstack(
-            [self.discrete.param_bounds_comp, self.continuous.param_bounds_comp]
-        )
+    def comp_rep_columns(self) -> tuple[str, ...]:
+        """The columns spanning the computational representation."""
+        return self.discrete.comp_rep_columns + self.continuous.comp_rep_columns
+
+    @property
+    def comp_rep_bounds(self) -> pd.DataFrame:
+        """The minimum and maximum values of the computational representation."""
+        return pd.concat(
+            [self.discrete.comp_rep_bounds, self.continuous.comp_rep_bounds], axis=1
+        )

@property
28 changes: 11 additions & 17 deletions baybe/searchspace/discrete.py
@@ -537,27 +537,21 @@ def is_empty(self) -> bool:
return len(self.parameters) == 0

@property
-    def param_bounds_comp(self) -> np.ndarray:
-        """Return bounds as tensor.
-
-        Take bounds from the parameter definitions, but discards bounds belonging to
-        columns that were filtered out during the creation of the space.
-        """
-        if not self.parameters:
-            return np.empty((2, 0))
-        bounds = np.hstack(
-            [
-                np.vstack([p.comp_df[col].min(), p.comp_df[col].max()])
-                for p in self.parameters
-                for col in p.comp_df
-                if col in self.comp_rep.columns
-            ]
-        )
-        return bounds
+    def comp_rep_columns(self) -> tuple[str, ...]:
+        """The columns spanning the computational representation."""
+        # We go via `comp_rep` here instead of using the columns of the individual
+        # parameters because the search space potentially uses only a subset of the
+        # columns due to decorrelation
+        return tuple(self.comp_rep.columns)
+
+    @property
+    def comp_rep_bounds(self) -> pd.DataFrame:
+        """The minimum and maximum values of the computational representation."""
+        return pd.DataFrame({"min": self.comp_rep.min(), "max": self.comp_rep.max()}).T

@staticmethod
def estimate_product_space_size(
-        parameters: Sequence[DiscreteParameter]
+        parameters: Sequence[DiscreteParameter],
) -> MemorySize:
"""Estimate an upper bound for the memory size of a product space.

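The discrete `comp_rep_bounds` shown above derives the bounds directly from the (possibly decorrelated) `comp_rep` frame rather than from the individual parameters. A sketch of the column-wise min/max pattern with toy data:

```python
import pandas as pd

# Toy computational representation: rows are candidates, columns are features.
comp_rep = pd.DataFrame({"a": [1.0, 3.0, 2.0], "b": [0.0, -1.0, 4.0]})

# Column-wise min/max, transposed so rows are "min"/"max" and columns features.
bounds = pd.DataFrame({"min": comp_rep.min(), "max": comp_rep.max()}).T
```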