From 3799ce24c9ed4693ae853146555808444d878878 Mon Sep 17 00:00:00 2001 From: pavle-martinovic_data Date: Fri, 23 Jan 2026 17:21:45 +0100 Subject: [PATCH] Fix generating custom cutoffs for quarters --- .../forecast/prophet/diagnostics.py | 8 +++--- .../automl_runtime/forecast/utils.py | 4 +-- .../forecast/prophet/diagnostics_test.py | 27 +++++++++++++++++++ .../automl_runtime/forecast/utils_test.py | 7 +++++ 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/prophet/diagnostics.py b/runtime/databricks/automl_runtime/forecast/prophet/diagnostics.py index 0dd0651e..e7baf3aa 100644 --- a/runtime/databricks/automl_runtime/forecast/prophet/diagnostics.py +++ b/runtime/databricks/automl_runtime/forecast/prophet/diagnostics.py @@ -106,10 +106,10 @@ def map(self, func, *iterables): # add validation of the cutoff to make sure that the min cutoff is strictly greater than the min date in the history if min(cutoffs) <= df['ds'].min(): raise ValueError("Minimum cutoff value is not strictly greater than min date in history") - # max value of cutoffs is <= (end date minus horizon) - end_date_minus_horizon = df['ds'].max() - horizon - if max(cutoffs) > end_date_minus_horizon: - raise ValueError("Maximum cutoff value is greater than end date minus horizon, no value for cross-validation remaining") + # max cutoff plus horizon must be <= max date in history + end_date = df['ds'].max() + if max(cutoffs) + horizon > end_date: + raise ValueError("Maximum cutoff plus horizon exceeds end date, no value for cross-validation remaining") initial = cutoffs[0] - df['ds'].min() # Check if the initial window diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py index 496f3862..e16fd4fc 100644 --- a/runtime/databricks/automl_runtime/forecast/utils.py +++ b/runtime/databricks/automl_runtime/forecast/utils.py @@ -219,8 +219,8 @@ def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, frequency_unit: str, # First cutoff is the cutoff bewteen splits cutoff = split_cutoff result = [] - max_cutoff = max(df["ds"]) - horizon_dateoffset - while cutoff <= max_cutoff: + max_cutoff = max(df["ds"]) + while cutoff + horizon_dateoffset <= max_cutoff: # If data does not exist in data range (cutoff, cutoff + horizon_dateoffset] if (not (((df["ds"] > cutoff) & (df["ds"] <= cutoff + horizon_dateoffset)).any())): # Next cutoff point is "next date after cutoff in data - horizon_dateoffset" diff --git a/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py b/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py index 7b7b9245..b69b9de9 100644 --- a/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py +++ b/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py @@ -64,3 +64,30 @@ def test_cross_validation_success(self): df_cv = cross_validation(model, horizon=horizon, cutoffs=cutoffs) self.assertEqual(df_cv["ds"].tolist(), expected_ds.tolist()) self.assertEqual(set(df_cv.columns), set(expected_cols)) + + def test_cross_validation_month_end_cutoff(self): + df = pd.DataFrame({ + "ds": pd.to_datetime([ + "2019-03-31", + "2019-06-30", + "2019-09-30", + "2019-12-31", + "2020-03-31", + "2020-06-30", + "2020-09-30", + ]), + "y": range(7), + }) + model = Prophet( + yearly_seasonality=False, + weekly_seasonality=False, + daily_seasonality=False, + ) + model.fit(df) + + cutoffs = [pd.Timestamp("2020-03-31")] + horizon = pd.DateOffset(months=6) + df_cv = cross_validation(model, horizon=horizon, cutoffs=cutoffs) + + expected_ds = df[(df["ds"] > cutoffs[0]) & (df["ds"] <= cutoffs[0] + horizon)]["ds"] + self.assertEqual(df_cv["ds"].tolist(), expected_ds.tolist()) diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py index c043da9a..d2ab3095 100644 --- a/runtime/tests/automl_runtime/forecast/utils_test.py +++ b/runtime/tests/automl_runtime/forecast/utils_test.py @@ -284,6 +284,13 @@ def test_generate_custom_cutoffs_success_quaterly(self): cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="QS", split_cutoff=pd.Timestamp('2020-07-12 00:00:00')) self.assertEqual([pd.Timestamp('2020-07-12 00:00:00'), pd.Timestamp('2020-10-12 00:00:00')], cutoffs) + def test_generate_custom_cutoffs_success_quaterly_end(self): + df = pd.DataFrame( + pd.date_range(start="2020-03-31", periods=3, freq=pd.DateOffset(months=3)), columns=["ds"] + ).rename_axis("y").reset_index() + cutoffs = generate_custom_cutoffs(df, horizon=2, frequency_unit="QS", split_cutoff=pd.Timestamp('2020-03-31 00:00:00')) + self.assertEqual([pd.Timestamp('2020-03-31 00:00:00')], cutoffs) + def test_generate_custom_cutoffs_success_annualy(self): df = pd.DataFrame( pd.date_range(start="2012-07-14", periods=10, freq=pd.DateOffset(years=1)), columns=["ds"]