diff --git a/baybe/campaign.py b/baybe/campaign.py index c2167d33f..b59dbf69c 100644 --- a/baybe/campaign.py +++ b/baybe/campaign.py @@ -3,7 +3,7 @@ from __future__ import annotations import json -from typing import List +from typing import List, Literal import cattrs import numpy as np @@ -159,7 +159,7 @@ def validate_config(cls, config_json: str) -> None: def add_measurements( self, data: pd.DataFrame, - numerical_measurements_must_be_within_tolerance: bool = True, + on_tolerance_violation: Literal["raise", "warn", "ignore"] = "raise", ) -> None: """Add results from a dataframe to the internal database. @@ -172,8 +172,12 @@ def add_measurements( Args: data: The data to be added (with filled values for targets). Preferably created via :func:`baybe.campaign.Campaign.recommend`. - numerical_measurements_must_be_within_tolerance: Flag indicating if - numerical parameters need to be within their tolerances. + on_tolerance_violation: The mode determining how to handle the attempt + of adding numerical data that violates parameter tolerances. Unless + set to ``raise``, the measurements will be added to the database + despite potential violations. However, note that values lying + significantly outside the convex hull of numerical parameters can + lead to scaling problems in model training. 
Raises: ValueError: If one of the targets has missing values or NaNs in the provided @@ -211,9 +215,7 @@ def add_measurements( # Update meta data # TODO: refactor responsibilities - self.searchspace.discrete.mark_as_measured( - data, numerical_measurements_must_be_within_tolerance - ) + self.searchspace.discrete.mark_as_measured(data, on_tolerance_violation) # Read in measurements and add them to the database self.n_batches_done += 1 @@ -226,12 +228,13 @@ def add_measurements( ) # Telemetry + # TODO: Code is inefficient because of unnecessary second fuzzy matching telemetry_record_value(TELEM_LABELS["COUNT_ADD_RESULTS"], 1) telemetry_record_recommended_measurement_percentage( self._cached_recommendation, data, self.parameters, - numerical_measurements_must_be_within_tolerance, + on_tolerance_violation, ) def recommend( diff --git a/baybe/parameters/numerical.py b/baybe/parameters/numerical.py index 867a80d03..e9ec4318c 100644 --- a/baybe/parameters/numerical.py +++ b/baybe/parameters/numerical.py @@ -7,12 +7,11 @@ import numpy as np import pandas as pd from attrs import define, field -from attrs.validators import min_len +from attrs.validators import ge, min_len -from baybe.exceptions import NumericalUnderflowError from baybe.parameters.base import DiscreteParameter, Parameter from baybe.parameters.validation import validate_is_finite, validate_unique_values -from baybe.utils import DTypeFloatNumpy, InfiniteIntervalError, Interval, convert_bounds +from baybe.utils import InfiniteIntervalError, Interval, convert_bounds @define(frozen=True, slots=False) @@ -24,7 +23,7 @@ class NumericalDiscreteParameter(DiscreteParameter): # See base class. # object variables - # NOTE: The parameter values are assumed to be sorted by the tolerance validator. + # NOTE: The values are assumed to be sorted by the tolerance default method. _values: Tuple[float, ...] 
= field( # FIXME[typing]: https://github.com/python-attrs/cattrs/issues/111 converter=lambda x: sorted(cattrs.structure(x, Tuple[float, ...])), # type: ignore @@ -37,43 +36,16 @@ class NumericalDiscreteParameter(DiscreteParameter): ) """The values the parameter can take.""" - tolerance: float = field(default=0.0) - """The absolute tolerance used for deciding whether a value is in range. A tolerance - larger than half the minimum distance between parameter values is not allowed - because that could cause ambiguity when inputting data points later.""" + tolerance: float = field(validator=ge(0.0)) + """The absolute tolerance used for deciding whether a value is considered in range. + A value is considered in range if its distance to the closest parameter value + is smaller than the specified tolerance.""" - @tolerance.validator - def _validate_tolerance( # noqa: DOC101, DOC103 - self, _: Any, tolerance: float - ) -> None: - """Validate that the given tolerance is safe. - - The tolerance is the allowed experimental uncertainty when - reading in measured values. A tolerance larger than half the minimum - distance between parameter values is not allowed because that could cause - ambiguity when inputting data points later. - - Raises: - ValueError: If the tolerance is not safe. - """ - # For zero tolerance, the only left requirement is that all parameter values - # are distinct, which is already ensured by the corresponding validator. - if tolerance == 0.0: - return - - min_dist = np.diff(self.values).min() - if min_dist == (eps := np.nextafter(0, 1, dtype=DTypeFloatNumpy)): - raise NumericalUnderflowError( - f"The distance between any two parameter values must be at least " - f"twice the size of the used floating point resolution of {eps}." 
- ) - - if tolerance >= (max_tol := min_dist / 2.0): - raise ValueError( - f"Parameter '{self.name}' is initialized with tolerance {tolerance} " - f"but due to the given parameter values {self.values}, the specified " - f"tolerance must be smaller than {max_tol} to avoid ambiguity." - ) + @tolerance.default + def default_tolerance(self) -> float: + """Set the tolerance to a fraction of the smallest value distance.""" + fraction = 0.1 + return fraction * np.diff(self.values).min().item() @property def values(self) -> tuple: # noqa: D102 diff --git a/baybe/searchspace/discrete.py b/baybe/searchspace/discrete.py index c3ca65678..e4887a12b 100644 --- a/baybe/searchspace/discrete.py +++ b/baybe/searchspace/discrete.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Any, Collection, Iterable, List, Optional, Tuple, cast +from typing import Any, Collection, Iterable, List, Literal, Optional, Tuple, cast import numpy as np import pandas as pd @@ -252,21 +252,20 @@ def param_bounds_comp(self) -> torch.Tensor: def mark_as_measured( self, measurements: pd.DataFrame, - numerical_measurements_must_be_within_tolerance: bool, + on_tolerance_violation: Literal["raise", "warn", "ignore"], ) -> None: """Mark the given elements of the space as measured. Args: measurements: A dataframe containing parameter settings that should be marked as measured. - numerical_measurements_must_be_within_tolerance: See - :func:`baybe.utils.dataframe.fuzzy_row_match`. + on_tolerance_violation: See :func:`baybe.utils.dataframe.fuzzy_row_match`. 
""" inds_matched = fuzzy_row_match( self.exp_rep, measurements, self.parameters, - numerical_measurements_must_be_within_tolerance, + on_tolerance_violation, ) self.metadata.loc[inds_matched, "was_measured"] = True diff --git a/baybe/telemetry.py b/baybe/telemetry.py index f2983dc53..68a7149e5 100644 --- a/baybe/telemetry.py +++ b/baybe/telemetry.py @@ -78,7 +78,7 @@ import logging import os import socket -from typing import TYPE_CHECKING, Dict, List, Union +from typing import TYPE_CHECKING, Dict, List, Literal, Union from urllib.parse import urlparse import pandas as pd @@ -275,7 +275,7 @@ def telemetry_record_recommended_measurement_percentage( cached_recommendation: pd.DataFrame, measurements: pd.DataFrame, parameters: List[Parameter], - numerical_measurements_must_be_within_tolerance: bool, + on_tolerance_violation: Literal["raise", "warn", "ignore"] = "raise", ) -> None: """Submit the percentage of added measurements. @@ -293,10 +293,7 @@ def telemetry_record_recommended_measurement_percentage( measurements: The measurements which are supposed to be checked against cached recommendations. parameters: The list of parameters spanning the entire search space. - numerical_measurements_must_be_within_tolerance: If ``True``, numerical - parameter entries are matched with the reference elements only if there is - a match within the parameter tolerance. If ``False``, the closest match - is considered, irrespective of the distance. + on_tolerance_violation: See :func:`baybe.utils.dataframe.fuzzy_row_match`. 
""" if is_enabled(): if len(cached_recommendation) > 0: @@ -306,7 +303,7 @@ def telemetry_record_recommended_measurement_percentage( cached_recommendation, measurements, parameters, - numerical_measurements_must_be_within_tolerance, + on_tolerance_violation, ) ) / len(cached_recommendation) diff --git a/baybe/utils/dataframe.py b/baybe/utils/dataframe.py index 3ff97bc1e..9109adc32 100644 --- a/baybe/utils/dataframe.py +++ b/baybe/utils/dataframe.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import warnings from typing import TYPE_CHECKING, Dict, Iterable, List, Literal, Optional, Tuple, Union import numpy as np @@ -321,7 +322,7 @@ def fuzzy_row_match( left_df: pd.DataFrame, right_df: pd.DataFrame, parameters: List[Parameter], - numerical_measurements_must_be_within_tolerance: bool, + on_tolerance_violation: Literal["raise", "warn", "ignore"], ) -> pd.Index: """Match row of the right dataframe to the rows of the left dataframe. @@ -337,10 +338,8 @@ def fuzzy_row_match( dataframe. parameters: List of baybe parameter objects that are needed to identify potential tolerances. - numerical_measurements_must_be_within_tolerance: If ``True``, numerical - parameters are matched with the search space elements only if there is a - match within the parameter tolerance. If ``False``, the closest match is - considered, irrespective of the distance. + on_tolerance_violation: The mode determining how to handle a missing + match due to parameter tolerance violation. Returns: The index of the matching rows in ``left_df``. @@ -349,6 +348,13 @@ def fuzzy_row_match( ValueError: If some rows are present in the right but not in the left dataframe. ValueError: If the input data has invalid values. 
""" + # Assert that the passed violation mode is valid + if on_tolerance_violation not in ["raise", "warn", "ignore"]: + raise ValueError( + """Argument passed to `on_tolerance_violation` must be one """ + """of '["raise", "warn", "ignore"]'.""" + ) + # Assert that all parameters appear in the given dataframe if not all(col in right_df.columns for col in left_df.columns): raise ValueError( @@ -360,25 +366,31 @@ def fuzzy_row_match( # Iterate over all input rows for ind, row in right_df.iterrows(): - # Check if the row represents a valid input - valid = True + # Check if all values of the row are in the respective parameter ranges for param in parameters: - if param.is_numeric: - if numerical_measurements_must_be_within_tolerance: - valid &= param.is_in_range(row[param.name]) - else: - valid &= param.is_in_range(row[param.name]) - if not valid: - raise ValueError( - f"Input data on row with the index {row.name} has invalid " - f"values in parameter '{param.name}'. " - f"For categorical parameters, values need to exactly match a " - f"valid choice defined in your config. " - f"For numerical parameters, a match is accepted only if " - f"the input value is within the specified tolerance/range. Set " - f"the flag 'numerical_measurements_must_be_within_tolerance' " - f"to 'False' to disable this behavior." - ) + if not param.is_in_range((val := row[param.name])): + if param.is_numeric and on_tolerance_violation == "ignore": + break + if param.is_numeric and on_tolerance_violation == "warn": + warnings.warn( + f"The value '{val}' is outside the range of parameter " + f"'{param.name}'. " + f"If you expected a match between your input " + f"and the parameter, consider increasing the parameter's " + f"tolerance value or adding more parameter values. " + f"You can silence this warning using the 'ignore' mode.", + UserWarning, + ) + break + else: + raise ValueError( + f"The value '{val}' is outside the range of parameter " + f"'{param.name}'. 
" + f"If you expected a match between your input " + f"and the parameter, consider increasing the parameter's " + f"tolerance value or adding more parameter values. " + f"You can bypass this check using the 'ignore' or 'warn' mode." + ) # Differentiate category-like and discrete numerical parameters cat_cols = [p.name for p in parameters if not p.is_numeric] diff --git a/tests/hypothesis_strategies/parameters.py b/tests/hypothesis_strategies/parameters.py index 1c6d7822e..7226235e9 100644 --- a/tests/hypothesis_strategies/parameters.py +++ b/tests/hypothesis_strategies/parameters.py @@ -1,7 +1,6 @@ """Hypothesis strategies for parameters.""" import hypothesis.strategies as st -import numpy as np from hypothesis.extra.pandas import columns, data_frames from baybe.parameters.categorical import ( @@ -88,18 +87,7 @@ def numerical_discrete_parameter( unique=True, ) ) - max_tolerance = np.diff(np.sort(values)).min() / 2 - if max_tolerance == 0.0: - tolerance = 0.0 - else: - tolerance = draw( - st.floats( - min_value=0.0, - max_value=max_tolerance, - allow_nan=False, - exclude_max=True, - ) - ) + tolerance = draw(st.floats(min_value=0.0)) return NumericalDiscreteParameter(name=name, values=values, tolerance=tolerance)