diff --git a/kats/detectors/prophet_detector.py b/kats/detectors/prophet_detector.py index 979dfaf50..ca9df708b 100644 --- a/kats/detectors/prophet_detector.py +++ b/kats/detectors/prophet_detector.py @@ -37,6 +37,8 @@ PROPHET_YHAT_COLUMN = "yhat" PROPHET_YHAT_LOWER_COLUMN = "yhat_lower" PROPHET_YHAT_UPPER_COLUMN = "yhat_upper" +HOLIDAY_NAMES_COLUMN_NAME = "holiday" +HOLIDAY_DATES_COLUMN_NAME = "ds" import os import sys @@ -177,6 +179,9 @@ class SeasonalityTypes(Enum): WEEKEND = 3 +USER_HOLIDAY_NAME = "user_provided_holiday" + + def to_seasonality(seasonality: Union[str, SeasonalityTypes]) -> SeasonalityTypes: if isinstance(seasonality, str): try: @@ -316,6 +321,7 @@ def __init__( ] ] = None, countries_holidays: Optional[List[str]] = None, + holidays_list: Optional[Union[List[str], Dict[str, List[str]]]] = None, ) -> None: """ Initializartion of Prophet @@ -333,6 +339,8 @@ def __init__( If argument SeasonalityTypes, List[SeasonalityTypes], than mentioned seasonilities will be used in Prophet. If argument Dict[SeasonalityTypes, bool] - each seasonality can be setted directly (True - means used it, False - not to use, 'auto' according to Prophet.). SeasonalityTypes enum values: DAY, WEEK , YEAR, WEEKEND Daily, Weekly, Yearly seasonlities used as "auto" by default. + countries_holidays: Optional[List[str]]: List of countries for which holidays should be added to the model. + holidays_list: Optional[Union[List[str], Dict[str, List[str]]]] : List of holiday dates to be added to the model, like ["2022-01-01","2022-03-31"], or a dict of lists if we have different holiday patterns, for example {"ds":["2022-01-01","2022-03-31"], "holiday":["playoff","superbowl"]} """ if serialized_model: @@ -368,6 +376,7 @@ def __init__( if countries_holidays is None: countries_holidays = [] self.countries_holidays: List[str] = countries_holidays + self.holidays_list = holidays_list def serialize(self) -> bytes: """Serialize the model into a json. 
@@ -449,6 +458,26 @@ def fit( additional_seasonalities = [] if self.seasonalities_to_fit[SeasonalityTypes.WEEKEND]: additional_seasonalities = prophet_weekend_masks(data_df) + holidays = self.holidays_list + if holidays is not None and len(holidays) > 0: + if isinstance(holidays, List): + if isinstance(holidays[0], str): + holidays = { + HOLIDAY_DATES_COLUMN_NAME: self.holidays_list, + HOLIDAY_NAMES_COLUMN_NAME: ["holiday"] * len(holidays), + } + else: + raise ValueError( + "holidays_list should be a list of str or dict of list of str" + ) + if not isinstance(holidays, Dict): + raise ValueError( + "holidays_list should be a list of str or dict of list of str" + ) + # we use default lower and upper bound for holidays + + holidays = pd.DataFrame(holidays) + # No incremental training. Create a model and train from scratch model = Prophet( interval_width=self.scoring_confidence_interval, @@ -456,6 +485,7 @@ def fit( daily_seasonality=self.seasonalities_to_fit[SeasonalityTypes.DAY], yearly_seasonality=self.seasonalities_to_fit[SeasonalityTypes.YEAR], weekly_seasonality=self.seasonalities_to_fit[SeasonalityTypes.WEEK], + holidays=holidays, ) for country in self.countries_holidays: model.add_country_holidays(country) diff --git a/kats/tests/detectors/test_prophet_detector.py b/kats/tests/detectors/test_prophet_detector.py index b205afd8f..d28b8da98 100644 --- a/kats/tests/detectors/test_prophet_detector.py +++ b/kats/tests/detectors/test_prophet_detector.py @@ -25,13 +25,15 @@ from kats.utils.simulator import Simulator from parameterized.parameterized import parameterized +START_DATE_TEST_DATA = "2018-01-01" + class TestProphetDetector(TestCase): def create_random_ts( self, seed: int, length: int, magnitude: float, slope_factor: float ) -> TimeSeriesData: np.random.seed(seed) - sim = Simulator(n=length, freq="1D", start=pd.to_datetime("2020-01-01")) + sim = Simulator(n=length, freq="1D", start=pd.to_datetime(START_DATE_TEST_DATA)) sim.add_trend(magnitude=magnitude * 
np.random.rand() * slope_factor) sim.add_seasonality( @@ -51,7 +53,7 @@ def create_ts( freq: str = "1D", ) -> TimeSeriesData: np.random.seed(seed) - sim = Simulator(n=length, freq=freq, start=pd.to_datetime("2020-01-01")) + sim = Simulator(n=length, freq=freq, start=pd.to_datetime(START_DATE_TEST_DATA)) sim.add_seasonality(magnitude, period=timedelta(days=7)) sim.add_noise(magnitude=signal_to_noise_ratio * magnitude) @@ -68,7 +70,7 @@ def create_multi_seasonality_ts( ) -> TimeSeriesData: np.random.seed(seed) - sim = Simulator(n=length, freq=freq, start=pd.to_datetime("2020-01-01")) + sim = Simulator(n=length, freq=freq, start=pd.to_datetime(START_DATE_TEST_DATA)) magnitude = (max_val - min_val) / 2 sim.add_trend(-0.2 * magnitude) @@ -153,7 +155,9 @@ def add_smooth_anomaly( # start time and freq don't matter, since we only care about the values np.random.seed(seed) - anomaly_sim = Simulator(n=length, freq="1D", start=pd.to_datetime("2020-01-01")) + anomaly_sim = Simulator( + n=length, freq="1D", start=pd.to_datetime(START_DATE_TEST_DATA) + ) anomaly_sim.add_seasonality(magnitude, period=timedelta(days=2 * length)) # anomaly_sim.add_noise(magnitude=0.3 * magnitude * np.random.rand()) @@ -170,7 +174,7 @@ def add_trend_shift( self, ts: TimeSeriesData, length: int, freq: str, magnitude: float ) -> None: ts_df = ts.to_dataframe() - sim = Simulator(n=length, freq=freq, start=pd.to_datetime("2020-01-01")) + sim = Simulator(n=length, freq=freq, start=pd.to_datetime(START_DATE_TEST_DATA)) elevation = sim.trend_shift_sim( cp_arr=[0, 1], trend_arr=[0, 0, 0], @@ -215,7 +219,7 @@ def add_multi_event( event_relative_magnitude: float, ) -> TimeSeriesData: np.random.seed(seed) - sim = Simulator(n=length, freq=freq, start=pd.to_datetime("2020-01-01")) + sim = Simulator(n=length, freq=freq, start=pd.to_datetime(START_DATE_TEST_DATA)) event_start = int(length * event_start_ratio) event_end = int(length * event_end_ratio) @@ -653,14 +657,14 @@ def 
test_heteroskedastic_noise_signal(self) -> None: verifies that anomalies in low-noise segments have higher z-scores than those in high-noise segments. This occurs because low noise segments will have lower standard deviations, which result in higher z-scores. - With call ProphetDetectorMopdel without weekend seasonaluty this taest fails + This test fails when ProphetDetectorModel is called without weekend seasonality """ ts = self.create_ts(length=100 * 24, signal_to_noise_ratio=0.05, freq="1h") # add heteroskedastic noise to the data ts.value *= ( - (ts.time - pd.to_datetime("2020-01-01")) % timedelta(days=7) + (ts.time - pd.to_datetime(START_DATE_TEST_DATA)) % timedelta(days=7) > timedelta(days=3.5) ) * np.random.rand(100 * 24) * 2.5 + 0.5 @@ -684,6 +688,7 @@ def test_heteroskedastic_noise_signal_with_holidays(self) -> None: verifies that anomalies in low-noise segments have higher z-scores than those in high-noise segments. This occurs because low noise segments will have lower standard deviations, which result in higher z-scores. + We are adding a holiday to check that the param works With call ProphetDetectorMopdel without weekend seasonaluty this taest fails """ ts = self.create_ts(length=100 * 24, signal_to_noise_ratio=0.05, freq="1h") @@ -691,7 +696,7 @@ def test_heteroskedastic_noise_signal_with_holidays(self) -> None: # add heteroskedastic noise to the data ts.value *= ( - (ts.time - pd.to_datetime("2020-01-01")) % timedelta(days=7) + (ts.time - pd.to_datetime(START_DATE_TEST_DATA)) % timedelta(days=7) > timedelta(days=3.5) ) * np.random.rand(100 * 24) * 2.5 + 0.5 @@ -709,6 +714,58 @@ def test_heteroskedastic_noise_signal_with_holidays(self) -> None: response.scores.value[13 * 24], response.scores.value[16 * 24] ) + def test_heteroskedastic_noise_signal_with_specific_holidays(self) -> None: + """Tests the z-score strategy on signals with heteroskedastic noise + + This test creates synthetic data with heteroskedastic noise. 
Then, it adds + anomalies of identical magnitudes to segments with different noise. Finally, it + verifies that anomalies in low-noise segments have higher z-scores than those + in high-noise segments. This occurs because low noise segments will have lower + standard deviations, which result in higher z-scores. + We also add an abnormal value on the first day, which shouldn't affect the outcome because that day is a holiday + This test fails when ProphetDetectorModel is called without weekend seasonality + """ + ts = self.create_ts(length=100 * 24, signal_to_noise_ratio=0.05, freq="1h") + + # add heteroskedastic noise to the data + playoffs = [ + START_DATE_TEST_DATA, + (pd.to_datetime(START_DATE_TEST_DATA) + pd.Timedelta(days=4)).strftime( + "%Y-%m-%d" + ), + ] + ts.value *= ( + (ts.time - pd.to_datetime(START_DATE_TEST_DATA)) % timedelta(days=7) + > timedelta(days=3.5) + ) * np.random.rand(100 * 24) * 2.5 + 0.5 + ts.value[0] += 1000 + ts.value[93 * 24] += 100 + ts.value[96 * 24] += 100 + + model = ProphetDetectorModel( + score_func="z_score", + seasonalities={SeasonalityTypes.WEEKEND: True}, + countries_holidays=["US", "UK"], + holidays_list=playoffs, + ) + response = model.fit_predict(ts[80 * 24 :], ts[: 80 * 24]) + + self.assertGreater( + response.scores.value[13 * 24], response.scores.value[16 * 24] + ) + + model = ProphetDetectorModel( + score_func="z_score", + seasonalities={SeasonalityTypes.WEEKEND: True}, + countries_holidays=["US", "UK"], + holidays_list={"ds": playoffs, "holiday": ["playoff"] * len(playoffs)}, + ) + response = model.fit_predict(ts[80 * 24 :], ts[: 80 * 24]) + + self.assertGreater( + response.scores.value[13 * 24], response.scores.value[16 * 24] + ) + def test_weekend_seasonality_noise_signal(self) -> None: """Tests the accuracy with heteroskedastic series and noise