Source code for climagrid.forecasting.baselines

"""
Baseline forecasters.

Every ML forecast must beat these to justify itself; the backtest reports the
skill score of the trained model relative to both. They are deliberately
simple and parameter-free.

- ``PersistenceForecaster``: tomorrow looks like today. Predicts the origin
  value for every horizon. A strong baseline for smooth, autocorrelated series.
- ``ClimatologyForecaster``: predicts the historical day-of-year average of the
  target (optionally restricted to a recent window, as a hedge against climate
  drift). Captures seasonality but models neither trend nor autocorrelation.

Both expose ``fit(frame, target)`` and ``predict(frame, horizon)``; ``predict``
returns a point forecast as a NumPy array aligned to the rows of ``frame``.
``frame`` is a supervised frame from ``dataset.build_supervised_frame`` (it has
``asset_id``, ``date`` and ``y_t`` columns). Neither baseline uses the ``target``
argument: both read the target values from the ``y_t`` column.
"""

from __future__ import annotations

import numpy as np
import pandas as pd

from climagrid.forecasting.config import ForecastConfig


class PersistenceForecaster:
    """Baseline that repeats the origin value ``y_t`` at every horizon."""

    def __init__(self, config: ForecastConfig):
        # Stored on the instance; no method of this class reads it.
        self._config = config

    def fit(self, frame: pd.DataFrame, target: str) -> PersistenceForecaster:
        """Return ``self`` unchanged; there is nothing to estimate.

        Neither ``frame`` nor ``target`` is inspected.
        """
        return self

    def predict(self, frame: pd.DataFrame, horizon: int) -> np.ndarray:
        """Return the ``y_t`` column of ``frame`` as a float array.

        The result has one entry per row of ``frame``; ``horizon`` does not
        affect it.
        """
        origin_values = frame["y_t"]
        return origin_values.to_numpy(dtype=float)
class ClimatologyForecaster:
    """Predicts the historical day-of-year mean of the target.

    ``fit`` averages ``y_t`` by day of year (per asset when
    ``config.per_asset`` is truthy, pooled across assets otherwise), optionally
    restricted to the trailing ``config.climatology_window_years`` years of the
    training rows. ``predict`` looks up that mean for each row's target date
    and falls back to the overall training mean when no entry exists.
    """

    def __init__(self, config: ForecastConfig):
        self._config = config
        # Mean of all training ``y_t`` values; fallback for keys missing from
        # the table. NaN until ``fit`` has seen at least one usable row.
        self._global_mean: float = float("nan")
        # Day-of-year means, indexed by ``doy`` (pooled) or by
        # ``(asset_id, doy)`` (per asset). Empty until fitted.
        self._table: pd.Series = pd.Series(dtype=float)

    def fit(self, frame: pd.DataFrame, target: str) -> ClimatologyForecaster:
        """Build the day-of-year mean table from the training rows.

        Args:
            frame: Training frame with ``asset_id``, ``date`` and ``y_t``
                columns. It is not modified.
            target: Unused; the target values are read from ``y_t``.

        Returns:
            ``self``, so calls can be chained.
        """
        df = frame[["asset_id", "date", "y_t"]].dropna(subset=["y_t"])

        window = self._config.climatology_window_years
        if window is not None and not df.empty:
            # Keep rows dated on or after (latest date - window years).
            cutoff = df["date"].max() - pd.DateOffset(years=window)
            df = df[df["date"] >= cutoff]

        if df.empty:
            # Nothing to learn from: reset to the unfitted state.
            self._global_mean = float("nan")
            self._table = pd.Series(dtype=float)
            return self

        # ``assign`` returns a new frame, so no column is ever written into
        # the filtered selection above (which would raise
        # SettingWithCopyWarning on pandas without copy-on-write).
        df = df.assign(doy=df["date"].dt.dayofyear)

        self._global_mean = float(df["y_t"].mean())
        group_keys = ["asset_id", "doy"] if self._config.per_asset else "doy"
        self._table = df.groupby(group_keys)["y_t"].mean()
        return self

    def predict(self, frame: pd.DataFrame, horizon: int) -> np.ndarray:
        """Return the day-of-year mean for each row's target date (origin + h).

        Args:
            frame: Frame with a ``date`` column and, in per-asset mode, an
                ``asset_id`` column.
            horizon: Number of days added to ``date`` to get the target date.

        Returns:
            A float array with one value per row of ``frame``. Rows whose key
            is missing from the fitted table get the overall training mean.
        """
        target_dates = frame["date"] + pd.to_timedelta(horizon, unit="D")
        day_of_year = target_dates.dt.dayofyear

        if self._config.per_asset:
            if not isinstance(self._table.index, pd.MultiIndex):
                # Table is empty (unfitted or fitted on no rows) or was not
                # built per asset: no (asset_id, doy) key can match.
                return np.full(len(frame), self._global_mean, dtype=float)
            # One vectorised lookup instead of a Python-level ``get`` per row;
            # keys missing from the table come back as NaN.
            keys = pd.MultiIndex.from_arrays(
                [frame["asset_id"].to_numpy(), day_of_year.to_numpy()]
            )
            looked_up = self._table.reindex(keys).to_numpy(dtype=float)
        else:
            looked_up = day_of_year.map(self._table).to_numpy(dtype=float)

        return np.where(np.isnan(looked_up), self._global_mean, looked_up)