Source code for climagrid.forecasting.baselines
"""
Baseline forecasters.
Every ML forecast must beat these to justify itself; the backtest reports the
skill score of the trained model relative to both. They are deliberately
simple and parameter-free.
- ``PersistenceForecaster``: tomorrow looks like today. Predicts the origin
value for every horizon. A strong baseline for smooth, autocorrelated series.
- ``ClimatologyForecaster``: predicts the historical day-of-year average of the
target (optionally restricted to a recent window, a hedge against climate
drift). Captures seasonality with no trend or autocorrelation.
Both expose ``fit(frame, target)`` and ``predict(frame, horizon)`` returning a
point forecast as a NumPy array aligned to the rows of ``frame``. ``frame`` is
a supervised frame from ``dataset.build_supervised_frame`` (it has ``asset_id``,
``date`` and ``y_t`` columns).
"""
from __future__ import annotations
import numpy as np
import pandas as pd
from climagrid.forecasting.config import ForecastConfig
[docs]
class PersistenceForecaster:
    """Naive "no change" baseline.

    The forecast for every horizon is simply the value observed at the
    forecast origin, read from the ``y_t`` column of the supplied frame.
    """

    def __init__(self, config: ForecastConfig):
        # Stored for interface parity with the other forecasters; nothing in
        # this class reads it.
        self._config = config

    def fit(self, frame: pd.DataFrame, target: str) -> PersistenceForecaster:
        """Do nothing and return ``self``.

        Both arguments are accepted and ignored: there is no state to learn.
        """
        return self

    def predict(self, frame: pd.DataFrame, horizon: int) -> np.ndarray:
        """Return the ``y_t`` column as a float array, one value per row.

        ``horizon`` is ignored: the same origin value is returned whatever
        lead time is requested.
        """
        origin_values = frame["y_t"]
        return origin_values.to_numpy(dtype=float)
[docs]
class ClimatologyForecaster:
    """Predicts the historical day-of-year mean of the target.

    ``fit`` averages ``y_t`` per calendar day-of-year (per asset when
    ``config.per_asset`` is truthy, pooled over all assets otherwise),
    optionally keeping only the most recent
    ``config.climatology_window_years`` years of rows. ``predict`` looks up
    the mean for each row's target date and falls back to the overall
    training mean wherever the table has no usable entry.
    """

    def __init__(self, config: ForecastConfig):
        self._config = config
        # Overall mean of the training target; fallback for missing keys.
        self._global_mean: float = float("nan")
        # Day-of-year means, indexed by ``doy`` or by ``(asset_id, doy)``.
        self._table: pd.Series = pd.Series(dtype=float)

    def fit(self, frame: pd.DataFrame, target: str) -> ClimatologyForecaster:
        """Build the day-of-year mean table from the training rows.

        Args:
            frame: Training rows with ``asset_id``, ``date`` and ``y_t``
                columns. Rows whose ``y_t`` is missing are ignored. The frame
                itself is not modified.
            target: Accepted for interface parity; not read here (the values
                always come from the ``y_t`` column).

        Returns:
            ``self``. If no usable rows remain, the table is reset to empty
            and the global mean to NaN.
        """
        rows = frame[["asset_id", "date", "y_t"]].dropna(subset=["y_t"])

        window = self._config.climatology_window_years
        if window is not None and not rows.empty:
            # Keep rows dated on or after (latest date - window years).
            cutoff = rows["date"].max() - pd.DateOffset(years=window)
            rows = rows[rows["date"] >= cutoff]

        if rows.empty:
            self._global_mean = float("nan")
            self._table = pd.Series(dtype=float)
            return self

        # ``assign`` returns a new frame, so neither the caller's frame nor a
        # filtered intermediate is mutated in place.
        rows = rows.assign(doy=rows["date"].dt.dayofyear)
        self._global_mean = float(rows["y_t"].mean())
        group_keys = ["asset_id", "doy"] if self._config.per_asset else "doy"
        self._table = rows.groupby(group_keys)["y_t"].mean()
        return self

    def predict(self, frame: pd.DataFrame, horizon: int) -> np.ndarray:
        """Return the day-of-year mean for each row's target date (origin + h).

        Args:
            frame: Rows to forecast; needs a ``date`` column and, in per-asset
                mode, an ``asset_id`` column.
            horizon: Lead time in days added to ``date`` to get the target
                date whose day-of-year is looked up.

        Returns:
            Float array with one value per row of ``frame``. Rows whose key is
            absent from the fitted table, or whose table entry is NaN, get the
            global training mean (NaN if the model was never fitted on data).
        """
        target_dates = frame["date"] + pd.to_timedelta(horizon, unit="D")
        day_of_year = target_dates.dt.dayofyear

        if not self._config.per_asset:
            looked_up = day_of_year.map(self._table).to_numpy(dtype=float)
        elif isinstance(self._table.index, pd.MultiIndex):
            # One vectorized lookup on (asset_id, doy) instead of a Python
            # loop calling ``Series.get`` per row; unmatched keys become NaN.
            keys = pd.MultiIndex.from_arrays(
                [frame["asset_id"].to_numpy(), day_of_year.to_numpy()]
            )
            looked_up = self._table.reindex(keys).to_numpy(dtype=float)
        else:
            # No per-asset table available (e.g. never fitted, or fitted on
            # empty data): every row takes the fallback below.
            looked_up = np.full(len(frame), np.nan)

        # Same fallback rule for both modes: any NaN lookup -> global mean.
        return np.where(np.isnan(looked_up), self._global_mean, looked_up)