Draft input scaling mechanism
AdrianSosic committed Jul 17, 2024
1 parent 5bb9f59 commit c3ade11
Showing 3 changed files with 80 additions and 2 deletions.
36 changes: 34 additions & 2 deletions baybe/surrogates/base.py
@@ -19,6 +19,7 @@

from baybe.exceptions import ModelNotTrainedError
from baybe.objectives.base import Objective
from baybe.parameters.base import Parameter
from baybe.searchspace import SearchSpace
from baybe.serialization.core import (
    converter,
@@ -27,13 +28,14 @@
)
from baybe.serialization.mixin import SerialMixin
from baybe.utils.dataframe import to_tensor
from baybe.utils.scaling import ScalingMethod, make_scaler

if TYPE_CHECKING:
    from botorch.models.model import Model
    from botorch.posteriors import GPyTorchPosterior, Posterior
    from sklearn.compose import ColumnTransformer
    from torch import Tensor


_ONNX_ENCODING = "latin-1"
"""Constant signifying the encoding for onnx byte strings in pretrained models.
@@ -83,6 +85,32 @@ def to_botorch(self) -> Model:

        return AdapterModel(self)

    @staticmethod
    def _get_parameter_scaling(parameter: Parameter) -> ScalingMethod:
        """Return the scaling method to be used for the given parameter."""
        return ScalingMethod.MINMAX

    def _make_input_scaler(
        self, searchspace: SearchSpace, measurements: pd.DataFrame
    ) -> ColumnTransformer:
        """Make a scaler to be used for transforming computational dataframes."""
        from sklearn.compose import make_column_transformer

        # Create the composite scaler from the parameter-wise scaler objects
        # TODO: Filter down to columns that actually remain in the comp rep of the
        #   searchspace, since the transformer can break down otherwise.
        transformers = [
            (make_scaler(self._get_parameter_scaling(p)), p.comp_df.columns)
            for p in searchspace.parameters
        ]
        scaler = make_column_transformer(*transformers)

        # TODO: Decide whether scaler is to be fit to parameter bounds and/or
        #   extreme points in the given measurement data
        scaler.fit(searchspace.comp_rep_bounds)

        return scaler

    def transform_inputs(self, data: pd.DataFrame) -> pd.DataFrame:
        """Transform an experimental parameter dataframe."""
        if self._input_transform is None:
Expand Down Expand Up @@ -148,8 +176,12 @@ def fit(
"Continuous search spaces are currently only supported by GPs."
)

input_scaler = self._make_input_scaler(searchspace, measurements)

# Store context-specific transformations
self._input_transform = lambda x: searchspace.transform(x, allow_missing=True)
self._input_transform = lambda x: input_scaler.transform(
searchspace.transform(x, allow_missing=True)
)
self._target_transform = lambda x: objective.transform(x)

# Transform and fit
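To make the new mechanism concrete, here is a minimal, self-contained sketch (not part of the commit) of the pattern _make_input_scaler uses: one scaler per parameter's computational columns, combined via sklearn's make_column_transformer and fit to the parameter bounds rather than to the measurements. The column names, bounds, and data below are invented for illustration.

import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler

# Hypothetical computational representation: two parameters, one column each
bounds = pd.DataFrame({"x1": [0.0, 10.0], "x2": [-5.0, 5.0]})  # rows: lower, upper
measurements = pd.DataFrame({"x1": [2.0, 7.5], "x2": [-1.0, 3.0]})

# One (scaler, columns) pair per parameter, mirroring _make_input_scaler
scaler = make_column_transformer(
    (MinMaxScaler(), ["x1"]),
    (MinMaxScaler(), ["x2"]),
)

# Fitting to the bounds means [0, 1] reflects the search space, not the data
scaler.fit(bounds)
print(scaler.transform(measurements))
# [[0.2  0.4 ]
#  [0.75 0.8 ]]

The stored _input_transform then chains this with the search space transform: raw experimental data -> computational representation -> scaled representation.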
9 changes: 9 additions & 0 deletions baybe/surrogates/gaussian_process/core.py
@@ -8,6 +8,7 @@
from attrs.validators import instance_of

from baybe.objective import Objective
from baybe.parameters.base import Parameter
from baybe.searchspace.core import SearchSpace
from baybe.surrogates.base import Surrogate
from baybe.surrogates.gaussian_process.kernel_factory import (
@@ -22,6 +23,7 @@
    DefaultKernelFactory,
    _default_noise_factory,
)
from baybe.utils.scaling import ScalingMethod

if TYPE_CHECKING:
    from botorch.models.model import Model
@@ -108,6 +110,13 @@ def to_botorch(self) -> Model:  # noqa: D102

        return self._model

    @staticmethod
    def _get_parameter_scaling(parameter: Parameter) -> ScalingMethod:
        # See base class.

        # For GPs, we use botorch's built-in machinery for scaling.
        return ScalingMethod.IDENTITY

    @staticmethod
    def _get_model_context(
        searchspace: SearchSpace, objective: Objective
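For contrast, a short sketch (column name invented) of what the IDENTITY override means downstream: make_scaler maps it to the string "passthrough", which ColumnTransformer interprets as "leave these columns untouched", so GP inputs reach botorch unscaled and botorch's built-in input transforms handle scaling instead.

import pandas as pd
from sklearn.compose import make_column_transformer

df = pd.DataFrame({"x1": [0.0, 10.0]})
scaler = make_column_transformer(("passthrough", ["x1"]))

# Values pass through unchanged: [[0.], [10.]]
print(scaler.fit_transform(df))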
37 changes: 37 additions & 0 deletions baybe/utils/scaling.py
@@ -0,0 +1,37 @@
"""Scaling utilities."""

from __future__ import annotations

from enum import Enum
from typing import TYPE_CHECKING, Literal, TypeAlias

if TYPE_CHECKING:
from sklearn.base import BaseEstimator, TransformerMixin

Scaler: TypeAlias = BaseEstimator | TransformerMixin


class ScalingMethod(Enum):
"""Available scaling methods."""

IDENTITY = "IDENTITY"
"""Identity transformation (no scaling applied)."""

MINMAX = "MINMAX"
"""Min-max scaling, mapping the observed value range to [0, 1]."""

MAXABS = "MAXABS"
"""Max-abs scaling, scaling by the largest observed absolute (applies no shift)."""


def make_scaler(method: ScalingMethod, /) -> Scaler | Literal["passthrough"]:
"""Create a scaler object based on the specified method."""
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler

match method:
case ScalingMethod.IDENTITY:
return "passthrough"
case ScalingMethod.MINMAX:
return MinMaxScaler()
case ScalingMethod.MAXABS:
return MaxAbsScaler()
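A usage sketch for the new utility (assuming the module is importable as shown; example values invented):

from baybe.utils.scaling import ScalingMethod, make_scaler

assert make_scaler(ScalingMethod.IDENTITY) == "passthrough"

scaler = make_scaler(ScalingMethod.MINMAX)  # a fresh MinMaxScaler instance
print(scaler.fit_transform([[-5.0], [0.0], [5.0]]))  # [[0. ] [0.5] [1. ]]

Note that the match statement has no default case, so a hypothetical future ScalingMethod member without a matching branch would make make_scaler silently return None.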
