diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7d357d9..d8ddb01 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: - id: check-ast - id: check-added-large-files - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.4 # Ruff version. + rev: v0.4.0 # Ruff version. hooks: - id: ruff # Run the linter. args: [--fix, timebasedcv, tests] diff --git a/README.md b/README.md index abc58ac..4b62f2e 100644 --- a/README.md +++ b/README.md @@ -24,41 +24,60 @@ This codebase is experimental and is working for my use cases. It is very probab The current implementation of [scikit-learn TimeSeriesSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html) lacks the flexibility of having multiple samples within the same time period/unit. -This codebase addresses such problem by providing a cross validation strategy based on a time period rather than the number of samples. This is useful when the data is time dependent, and the model should be trained on past data and tested on future data, independently from the number of observations present within a given time period. +This codebase addresses such problem by providing a cross validation strategy based on a **time period** rather than the number of samples. This is useful when the data is time dependent, and the model should be trained on past data and tested on future data, independently from the number of observations present within a given time period. + +Temporal data leakage is an issue and we want to prevent that from happening! We introduce two main classes: -- [`TimeBasedSplit`](https://fbruzzesi.github.io/timebasedcv/api/timebasedsplit/#timebasedcv.timebasedsplit.TimeBasedSplit): a class that allows to define a time based split with a given frequency, train size, test size, gap, stride and window type. 
It's core method `split` requires to pass a time series as input to create the boolean masks for train and test from the instance information defined above. Therefore it is not compatible with [scikit-learn CV Splitters](https://scikit-learn.org/stable/common_pitfalls.html#id3). -- [`TimeBasedCVSplitter`](https://fbruzzesi.github.io/timebasedcv/api/timebasedsplit/#timebasedcv.timebasedsplit.TimeBasedCVSplitter): a class that conforms with scikit-learn CV Splitters but requires to pass the time series as input to the instance. That is because a CV Splitter needs to know a priori the number of splits and the `split` method shouldn't take any extra arguments as input other than the arrays to split. +- [`TimeBasedSplit`](https://fbruzzesi.github.io/timebasedcv/api/timebasedsplit/#timebasedcv.timebasedsplit.TimeBasedSplit) allows to define a time based split with a given frequency, train size, test size, gap, stride and window type. Its core method `split` requires to pass a time series as input to create the boolean masks for train and test from the instance information defined above. Therefore it is not compatible with [scikit-learn CV Splitters](https://scikit-learn.org/stable/common_pitfalls.html#id3). +- [`TimeBasedCVSplitter`](https://fbruzzesi.github.io/timebasedcv/api/timebasedsplit/#timebasedcv.timebasedsplit.TimeBasedCVSplitter) conforms with scikit-learn CV Splitters but requires to pass the time series as input to the instance. That is because a CV Splitter needs to know a priori the number of splits, and the `split` method shouldn't take any extra arguments as input other than the arrays to split. ## Installation **timebasedcv** is a published Python package on [pypi](https://pypi.org/), therefore it can be installed directly via pip, as well as from source using pip and git, or with a local clone: -- **pip** (suggested): +
+ + pip (suggested) + +```bash +python -m pip install timebasedcv +``` + +
+ +
+ + pip + source/git - ```bash - python -m pip install timebasedcv - ``` +```bash +python -m pip install git+https://github.com/FBruzzesi/timebasedcv.git +``` + +
-- **pip + source/git**: +
- ```bash - python -m pip install git+https://github.com/FBruzzesi/timebasedcv.git - ``` + local clone + +```bash +git clone https://github.com/FBruzzesi/timebasedcv.git +cd timebasedcv +python -m pip install . +``` -- **local clone**: +
- ```bash - python -m pip install . - ``` +## Dependencies + +As of **timebasedcv v0.1.0**, the only two dependencies are [`numpy`](https://numpy.org/doc/stable/index.html) and [`narwhals>=0.7.15`](https://marcogorelli.github.io/narwhals/). + +The latter provides a compatibility layer between polars, pandas and other dataframe libraries. Therefore, as long as narwhals supports a given dataframe object, we will as well. ## Quickstart -As a **quickstart**, you can use the following code snippet to get started. -Consider checkout out the [Getting Started](https://fbruzzesi.github.io/timebasedcv/getting-started/) section of for a detailed guide on how to use the library. +The following code snippet is all you need to get started, yet consider checking out the [Getting Started](https://fbruzzesi.github.io/timebasedcv/getting-started/) section of the documentation for a detailed guide on how to use the library. First let's generate some data with different number of points per day: @@ -80,13 +99,15 @@ df = pd.concat([ time_series, X = df["time"], df["value"] df.set_index("time").resample("D").count().head(5) +``` -# time value -# 2023-01-01 14 -# 2023-01-02 2 -# 2023-01-03 22 -# 2023-01-04 11 -# 2023-01-05 1 +```terminal +time value +2023-01-01 14 +2023-01-02 2 +2023-01-03 22 +2023-01-04 11 +2023-01-05 1 ``` Now let's run the split with a given frequency, train size, test size, gap, stride and window type: diff --git a/docs/getting-started.md b/docs/getting-started.md index 9177fc1..6111d3a 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -52,11 +52,12 @@ print(f"Number of splits: {tbs.n_splits_of(time_series=time_series)}") for X_train, X_forecast, y_train, y_forecast in tbs.split(X, y, time_series=time_series): print(f"Train: {X_train.shape}, Forecast: {X_forecast.shape}") - -# Train: (30, 2), Forecast: (7, 2) -# Train: (30, 2), Forecast: (7, 2) -# ... 
-# Train: (30, 2), Forecast: (7, 2) +``` +```terminal +Train: (30, 2), Forecast: (7, 2) +Train: (30, 2), Forecast: (7, 2) +... +Train: (30, 2), Forecast: (7, 2) ``` Another optional parameter that can be passed to the `split` method is `return_splitstate`. If `True`, the method will return a [`SplitState`](api/splitstate.md) dataclass which contains the "split" points for training and test, namely `train_start`, `train_end`, `forecast_start` and `forecast_end`. These can be useful if a particular logic needs to be applied to the data before training and/or forecasting. @@ -106,7 +107,10 @@ random_search_cv = RandomizedSearchCV( ).fit(X, y) random_search_cv.best_params_ -# {'positive': True, 'fit_intercept': False, 'alpha': 0.1} +``` + +```terminal +{'positive': True, 'fit_intercept': False, 'alpha': 0.1} ``` ## Examples of Cross Validation @@ -133,13 +137,15 @@ df = pd.concat([ time_series, X = df["time"], df["value"] df.set_index("time").resample("D").count().head(5) +``` -# time value -# 2023-01-01 14 -# 2023-01-02 2 -# 2023-01-03 22 -# 2023-01-04 11 -# 2023-01-05 1 +```terminal +time value +2023-01-01 14 +2023-01-02 2 +2023-01-03 22 +2023-01-04 11 +2023-01-05 1 ``` As we can see every day has a different number of points. diff --git a/docs/index.md b/docs/index.md index 1711d39..9fd0a11 100644 --- a/docs/index.md +++ b/docs/index.md @@ -25,18 +25,20 @@ This codebase is experimental and is working for my use cases. It is very probab The current implementation of [scikit-learn TimeSeriesSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html) lacks the flexibility of having multiple samples within the same time period/unit. -This codebase addresses such problem by providing a cross validation strategy based on a time period rather than the number of samples. 
 This is useful when the data is time dependent, and the model should be trained on past data and tested on future data, independently from the number of observations present within a given time period. +This codebase addresses such problem by providing a cross validation strategy based on a **time period** rather than the number of samples. This is useful when the data is time dependent, and the model should be trained on past data and tested on future data, independently from the number of observations present within a given time period. + +Temporal data leakage is an issue and we want to prevent that from happening! We introduce two main classes: -- [`TimeBasedSplit`](api/timebasedsplit.md#timebasedcv.timebasedsplit.TimeBasedSplit): a class that allows to define a time based split with a given frequency, train size, test size, gap, stride and window type. It's core method `split` requires to pass a time series as input to create the boolean masks for train and test from the instance information defined above. Therefore it is not compatible with [scikit-learn CV Splitters](https://scikit-learn.org/stable/common_pitfalls.html#id3). -- [`TimeBasedCVSplitter`](api/timebasedsplit.md#timebasedcv.timebasedsplit.TimeBasedCVSplitter): a class that conforms with scikit-learn CV Splitters but requires to pass the time series as input to the instance. That is because a CV Splitter needs to know a priori the number of splits and the `split` method shouldn't take any extra arguments as input other than the arrays to split. +- [`TimeBasedSplit`](api/timebasedsplit.md#timebasedcv.timebasedsplit.TimeBasedSplit) allows to define a time based split with a given frequency, train size, test size, gap, stride and window type. Its core method `split` requires to pass a time series as input to create the boolean masks for train and test from the instance information defined above. 
Therefore it is not compatible with [scikit-learn CV Splitters](https://scikit-learn.org/stable/common_pitfalls.html#id3). +- [`TimeBasedCVSplitter`](api/timebasedsplit.md#timebasedcv.timebasedsplit.TimeBasedCVSplitter) conforms with scikit-learn CV Splitters but requires to pass the time series as input to the instance. That is because a CV Splitter needs to know a priori the number of splits, and the `split` method shouldn't take any extra arguments as input other than the arrays to split. ## Installation **timebasedcv** is a published Python package on [pypi](https://pypi.org/), therefore it can be installed directly via pip, as well as from source using pip and git, or with a local clone: -=== "pip" +=== "pip (suggested)" ```bash python -m pip install timebasedcv diff --git a/pyproject.toml b/pyproject.toml index 2ad86c9..de2feee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "timebasedcv" -version = "0.0.2" +version = "0.1.0" description = "Time based cross validation" license = {file = "LICENSE"} @@ -45,7 +45,7 @@ dev = [ ] lint = [ - "ruff>=0.1.6" + "ruff>=0.4.0" ] docs = [ @@ -76,7 +76,15 @@ packages = ["timebasedcv"] line-length = 120 [tool.ruff.lint] -extend-select = ["I"] +extend-select = [ + "E", + "F", + "I", + # "N", # pep8-naming + "W", + "PERF", + "RUF", +] ignore = [ "E731", # do not assign a `lambda` expression, use a `def` ] diff --git a/tests/utils/backends_test.py b/tests/utils/backends_test.py index 180b513..9d0a8b4 100644 --- a/tests/utils/backends_test.py +++ b/tests/utils/backends_test.py @@ -1,9 +1,9 @@ from contextlib import nullcontext as does_not_raise +import narwhals as nw import numpy as np import pandas as pd import pytest -import narwhals as nw from timebasedcv.utils._backends import ( BACKEND_TO_INDEXING_METHOD, diff --git a/timebasedcv/timebasedsplit.py b/timebasedcv/timebasedsplit.py index 00dd69e..32edcfc 100644 --- a/timebasedcv/timebasedsplit.py +++ 
b/timebasedcv/timebasedsplit.py @@ -2,8 +2,8 @@ from datetime import timedelta from itertools import chain from typing import Generator, Tuple, Union, get_args -import narwhals as nw +import narwhals as nw import numpy as np from timebasedcv.splitstate import SplitState @@ -371,7 +371,7 @@ def split( ts_shape = time_series.shape if len(ts_shape) != 1: raise ValueError(f"Time series must be 1-dimensional. Got {len(ts_shape)} dimensions.") - + arrays = tuple([nw.from_native(array, eager_only=True, allow_series=True, strict=False) for array in arrays]) time_series = nw.from_native(time_series, series_only=True, strict=False) a0 = arrays[0] @@ -396,7 +396,10 @@ def split( train_forecast_arrays = tuple( chain.from_iterable( - (nw.to_native(_idx_method(_arr, train_mask), strict=False), nw.to_native(_idx_method(_arr, forecast_mask), strict=False)) + ( + nw.to_native(_idx_method(_arr, train_mask), strict=False), + nw.to_native(_idx_method(_arr, forecast_mask), strict=False), + ) for _arr, _idx_method in zip(arrays, _index_methods) ) ) diff --git a/timebasedcv/utils/_backends.py b/timebasedcv/utils/_backends.py index ecd4ec0..d7cee41 100644 --- a/timebasedcv/utils/_backends.py +++ b/timebasedcv/utils/_backends.py @@ -1,7 +1,7 @@ from typing import Callable, Dict, TypeVar -import numpy as np import narwhals as nw +import numpy as np def default_indexing_method(arr, mask): diff --git a/timebasedcv/utils/_types.py b/timebasedcv/utils/_types.py index d3fb96f..98ef20b 100644 --- a/timebasedcv/utils/_types.py +++ b/timebasedcv/utils/_types.py @@ -2,7 +2,7 @@ import sys from datetime import date, datetime -from typing import Literal, Protocol, Tuple, TypeVar, Union, TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, Protocol, Tuple, TypeVar, Union if sys.version_info >= (3, 10): from typing import TypeAlias # pragma: no cover @@ -45,23 +45,17 @@ def max(self: Self) -> T: ... @property - def shape(self: Self) -> Tuple[int]: - ... + def shape(self: Self) -> Tuple[int]: ... 
- def __lt__(self: Self, other: Union[T, SeriesLike[T]]) -> SeriesLike[bool]: - ... + def __lt__(self: Self, other: Union[T, SeriesLike[T]]) -> SeriesLike[bool]: ... - def __gt__(self: Self, other: Union[T, SeriesLike[T]]) -> SeriesLike[bool]: - ... + def __gt__(self: Self, other: Union[T, SeriesLike[T]]) -> SeriesLike[bool]: ... - def __le__(self: Self, other: Union[T, SeriesLike[T]]) -> SeriesLike[bool]: - ... + def __le__(self: Self, other: Union[T, SeriesLike[T]]) -> SeriesLike[bool]: ... - def __ge__(self: Self, other: Union[T, SeriesLike[T]]) -> SeriesLike[bool]: - ... + def __ge__(self: Self, other: Union[T, SeriesLike[T]]) -> SeriesLike[bool]: ... - def __and__(self: SeriesLike[bool], other: SeriesLike[bool]) -> SeriesLike[bool]: - ... + def __and__(self: SeriesLike[bool], other: SeriesLike[bool]) -> SeriesLike[bool]: ... T_co = TypeVar("T_co", covariant=True) @@ -76,5 +70,4 @@ class TensorLike(Protocol[T_co]): """ @property - def shape(self: Self) -> Tuple[int, ...]: - ... + def shape(self: Self) -> Tuple[int, ...]: ...