From c3ade118244d0d51fe5c5c28bbf77aaf0885045d Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 17 Jul 2024 09:53:14 +0200 Subject: [PATCH] Draft input scaling mechanism --- baybe/surrogates/base.py | 36 ++++++++++++++++++++-- baybe/surrogates/gaussian_process/core.py | 9 ++++++ baybe/utils/scaling.py | 37 +++++++++++++++++++++++ 3 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 baybe/utils/scaling.py diff --git a/baybe/surrogates/base.py b/baybe/surrogates/base.py index aa7ffc5821..2b7029a21a 100644 --- a/baybe/surrogates/base.py +++ b/baybe/surrogates/base.py @@ -19,6 +19,7 @@ from baybe.exceptions import ModelNotTrainedError from baybe.objectives.base import Objective +from baybe.parameters.base import Parameter from baybe.searchspace import SearchSpace from baybe.serialization.core import ( converter, @@ -27,13 +28,14 @@ ) from baybe.serialization.mixin import SerialMixin from baybe.utils.dataframe import to_tensor +from baybe.utils.scaling import ScalingMethod, make_scaler if TYPE_CHECKING: from botorch.models.model import Model from botorch.posteriors import GPyTorchPosterior, Posterior + from sklearn.compose import ColumnTransformer from torch import Tensor - _ONNX_ENCODING = "latin-1" """Constant signifying the encoding for onnx byte strings in pretrained models. 
@@ -83,6 +85,32 @@ def to_botorch(self) -> Model: return AdapterModel(self) + @staticmethod + def _get_parameter_scaling(parameter: Parameter) -> ScalingMethod: + """Return the scaling method to be used for the given parameter.""" + return ScalingMethod.MINMAX + + def _make_input_scaler( + self, searchspace: SearchSpace, measurements: pd.DataFrame + ) -> ColumnTransformer: + """Make a scaler to be used for transforming computational dataframes.""" + from sklearn.compose import make_column_transformer + + # Create the composite scaler from the parameter-wise scaler objects + # TODO: Filter down to columns that actually remain in the comp rep of the + # searchspace, since the transformer can break down otherwise. + transformers = [ + (make_scaler(self._get_parameter_scaling(p)), p.comp_df.columns) + for p in searchspace.parameters + ] + scaler = make_column_transformer(*transformers) + + # TODO: Decide whether scaler is to be fit to parameter bounds and/or + # extreme points in the given measurement data + scaler.fit(searchspace.comp_rep_bounds) + + return scaler + def transform_inputs(self, data: pd.DataFrame) -> pd.DataFrame: """Transform an experimental parameter dataframe.""" if self._input_transform is None: @@ -148,8 +176,12 @@ def fit( "Continuous search spaces are currently only supported by GPs." 
) + input_scaler = self._make_input_scaler(searchspace, measurements) + # Store context-specific transformations - self._input_transform = lambda x: searchspace.transform(x, allow_missing=True) + self._input_transform = lambda x: input_scaler.transform( + searchspace.transform(x, allow_missing=True) + ) self._target_transform = lambda x: objective.transform(x) # Transform and fit diff --git a/baybe/surrogates/gaussian_process/core.py b/baybe/surrogates/gaussian_process/core.py index 6a2a2afa68..f9fba30172 100644 --- a/baybe/surrogates/gaussian_process/core.py +++ b/baybe/surrogates/gaussian_process/core.py @@ -8,6 +8,7 @@ from attrs.validators import instance_of from baybe.objective import Objective +from baybe.parameters.base import Parameter from baybe.searchspace.core import SearchSpace from baybe.surrogates.base import Surrogate from baybe.surrogates.gaussian_process.kernel_factory import ( @@ -22,6 +23,7 @@ DefaultKernelFactory, _default_noise_factory, ) +from baybe.utils.scaling import ScalingMethod if TYPE_CHECKING: from botorch.models.model import Model @@ -108,6 +110,13 @@ def to_botorch(self) -> Model: # noqa: D102 return self._model + @staticmethod + def _get_parameter_scaling(parameter: Parameter) -> ScalingMethod: + # See base class. + + # For GPs, we use botorch's built-in machinery for scaling. 
+ return ScalingMethod.IDENTITY + @staticmethod + def _get_model_context( searchspace: SearchSpace, objective: Objective diff --git a/baybe/utils/scaling.py b/baybe/utils/scaling.py new file mode 100644 index 0000000000..51d8ef437c --- /dev/null +++ b/baybe/utils/scaling.py @@ -0,0 +1,37 @@ +"""Scaling utilities.""" + +from __future__ import annotations + +from enum import Enum +from typing import TYPE_CHECKING, Literal, TypeAlias + +if TYPE_CHECKING: + from sklearn.base import BaseEstimator, TransformerMixin + + Scaler: TypeAlias = BaseEstimator | TransformerMixin + + +class ScalingMethod(Enum): + """Available scaling methods.""" + + IDENTITY = "IDENTITY" + """Identity transformation (no scaling applied).""" + + MINMAX = "MINMAX" + """Min-max scaling, mapping the observed value range to [0, 1].""" + + MAXABS = "MAXABS" + """Max-abs scaling, scaling by the largest observed absolute value (applies no shift).""" + + +def make_scaler(method: ScalingMethod, /) -> Scaler | Literal["passthrough"]: + """Create a scaler object based on the specified method.""" + from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler + + match method: + case ScalingMethod.IDENTITY: + return "passthrough" + case ScalingMethod.MINMAX: + return MinMaxScaler() + case ScalingMethod.MAXABS: + return MaxAbsScaler()