Source code for climagrid.schema

"""
Column contract for climagrid's EnvironmentalDataFrame.

Every adapter and feature function must conform to these names, dtypes, and units.
Adding a column? Define it here first.
"""

from __future__ import annotations

import pandas as pd
from pydantic import BaseModel, ConfigDict

# ---------------------------------------------------------------------------
# Column descriptors
# ---------------------------------------------------------------------------


class ColumnSpec(BaseModel):
    """Descriptor for a single column of the EnvironmentalDataFrame.

    Each instance records the column's name, its expected dtype (as a string),
    its units, a human-readable description, the data source it belongs to,
    and whether null values are allowed. The model is configured as frozen.
    """

    model_config = ConfigDict(frozen=True)

    # Column name as it appears in the DataFrame.
    name: str
    # Expected dtype, spelled as a string (e.g. "float64", "bool").
    dtype: str
    # Physical units, or "n/a" where units do not apply.
    units: str
    # Short human-readable explanation of the column.
    description: str
    # "index" | "noaa_hrrr" | "nasa_power" | "noaa_ncei" | "usda_nrcs" | "usfs_wfigs" | "feature"
    source: str
    # Whether nulls are permitted; defaults to True.
    nullable: bool = True
# ---------------------------------------------------------------------------
# Index columns (always present, never null)
# ---------------------------------------------------------------------------

INDEX_COLUMNS: list[ColumnSpec] = [
    ColumnSpec(
        name="asset_id", dtype="str", units="n/a", source="index", nullable=False,
        description="Utility asset identifier from AssetRegistry",
    ),
    ColumnSpec(
        name="timestamp", dtype="datetime64[ns, UTC]", units="n/a", source="index", nullable=False,
        description="UTC timestamp of the observation or forecast hour",
    ),
    ColumnSpec(
        name="lat", dtype="float64", units="degrees", source="index", nullable=False,
        description="Asset latitude (WGS-84)",
    ),
    ColumnSpec(
        name="lon", dtype="float64", units="degrees", source="index", nullable=False,
        description="Asset longitude (WGS-84)",
    ),
]

# ---------------------------------------------------------------------------
# NOAA HRRR columns (3 km CONUS analysis/forecast)
# ---------------------------------------------------------------------------

NOAA_HRRR_COLUMNS: list[ColumnSpec] = [
    ColumnSpec(
        name="hrrr_temperature_2m", dtype="float64", units="°C", source="noaa_hrrr",
        description="2-metre air temperature",
    ),
    ColumnSpec(
        name="hrrr_wind_speed_10m", dtype="float64", units="m/s", source="noaa_hrrr",
        description="10-metre wind speed (magnitude)",
    ),
    ColumnSpec(
        name="hrrr_wind_direction_10m", dtype="float64", units="degrees", source="noaa_hrrr",
        description="10-metre wind direction (met convention)",
    ),
    ColumnSpec(
        name="hrrr_relative_humidity_2m", dtype="float64", units="%", source="noaa_hrrr",
        description="2-metre relative humidity",
    ),
    ColumnSpec(
        name="hrrr_precipitation_rate", dtype="float64", units="mm/hr", source="noaa_hrrr",
        description="Hourly accumulated precipitation",
    ),
    ColumnSpec(
        name="hrrr_solar_irradiance_ghi", dtype="float64", units="W/m²", source="noaa_hrrr",
        description="Downward short-wave radiation at surface",
    ),
    ColumnSpec(
        name="hrrr_snow_depth", dtype="float64", units="m", source="noaa_hrrr",
        description="Snow depth at surface",
    ),
]

# ---------------------------------------------------------------------------
# NASA POWER columns (satellite-derived surface meteorology)
# ---------------------------------------------------------------------------

NASA_POWER_COLUMNS: list[ColumnSpec] = [
    ColumnSpec(
        name="nasa_temperature_2m", dtype="float64", units="°C", source="nasa_power",
        description="2-metre air temperature (MERRA-2 based)",
    ),
    ColumnSpec(
        name="nasa_wind_speed_10m", dtype="float64", units="m/s", source="nasa_power",
        description="10-metre wind speed",
    ),
    ColumnSpec(
        name="nasa_solar_irradiance_ghi", dtype="float64", units="W/m²", source="nasa_power",
        description="Global horizontal irradiance",
    ),
    ColumnSpec(
        name="nasa_relative_humidity_2m", dtype="float64", units="%", source="nasa_power",
        description="Relative humidity at 2 m",
    ),
    ColumnSpec(
        name="nasa_precipitation", dtype="float64", units="mm", source="nasa_power",
        description="Precipitation (daily or sub-daily)",
    ),
]

# ---------------------------------------------------------------------------
# NOAA NCEI columns (surface station observations)
# ---------------------------------------------------------------------------

NOAA_NCEI_COLUMNS: list[ColumnSpec] = [
    ColumnSpec(
        name="ncei_temperature_max", dtype="float64", units="°C", source="noaa_ncei",
        description="Daily maximum temperature",
    ),
    ColumnSpec(
        name="ncei_temperature_min", dtype="float64", units="°C", source="noaa_ncei",
        description="Daily minimum temperature",
    ),
    ColumnSpec(
        name="ncei_wind_speed", dtype="float64", units="m/s", source="noaa_ncei",
        description="Average daily wind speed",
    ),
    ColumnSpec(
        name="ncei_precipitation_daily", dtype="float64", units="mm", source="noaa_ncei",
        description="Daily liquid-equivalent precipitation",
    ),
    ColumnSpec(
        name="ncei_relative_humidity", dtype="float64", units="%", source="noaa_ncei",
        description="Average daily relative humidity",
    ),
]

# ---------------------------------------------------------------------------
# USDA NRCS columns (SCAN / SNOTEL soil sensors)
# ---------------------------------------------------------------------------

USDA_NRCS_COLUMNS: list[ColumnSpec] = [
    ColumnSpec(
        name="nrcs_soil_moisture_pct", dtype="float64", units="%", source="usda_nrcs",
        description="Volumetric soil moisture (nearest SCAN station)",
    ),
    ColumnSpec(
        name="nrcs_soil_temperature", dtype="float64", units="°C", source="usda_nrcs",
        description="Soil temperature at 2-inch depth",
    ),
    ColumnSpec(
        name="nrcs_snow_water_equivalent", dtype="float64", units="mm", source="usda_nrcs",
        description="Snow water equivalent (nearest SNOTEL)",
    ),
    ColumnSpec(
        name="nrcs_station_distance_km", dtype="float64", units="km", source="usda_nrcs",
        description="Distance from asset to nearest NRCS station",
    ),
]

# ---------------------------------------------------------------------------
# USFS / NIFC WFIGS columns (wildfire perimeters)
# ---------------------------------------------------------------------------

USFS_WFIGS_COLUMNS: list[ColumnSpec] = [
    ColumnSpec(
        name="wfigs_nearest_fire_km", dtype="float64", units="km", source="usfs_wfigs",
        description="Distance to nearest active fire perimeter edge",
    ),
    ColumnSpec(
        name="wfigs_fire_active", dtype="bool", units="n/a", source="usfs_wfigs",
        description="True if any active fire within 50 km",
    ),
    ColumnSpec(
        name="wfigs_fire_area_ha", dtype="float64", units="ha", source="usfs_wfigs",
        description="Area of nearest active fire in hectares",
    ),
]

# ---------------------------------------------------------------------------
# Feature (stress index) columns: computed by climagrid.features
# ---------------------------------------------------------------------------

FEATURE_COLUMNS: list[ColumnSpec] = [
    ColumnSpec(
        name="feat_thermal_aging_factor", dtype="float64", units="per-unit", source="feature",
        description="Arrhenius FAA relative to 110°C reference (IEEE C57.91)",
    ),
    ColumnSpec(
        name="feat_heat_hours_above_35c", dtype="float64", units="hours", source="feature",
        description="Cumulative hours with temperature > 35°C in rolling window",
    ),
    ColumnSpec(
        name="feat_freeze_thaw_cycles", dtype="float64", units="count", source="feature",
        description="Freeze-thaw transition count over the rolling window",
    ),
    ColumnSpec(
        name="feat_ice_loading_risk", dtype="float64", units="0-1", source="feature",
        description="Normalized ice accretion risk (simplified, ASCE 7-22 based)",
    ),
    ColumnSpec(
        name="feat_soil_saturation_index", dtype="float64", units="0-1", source="feature",
        description="Normalized soil saturation proxy for ground stability",
    ),
    ColumnSpec(
        name="feat_wildfire_proximity", dtype="float64", units="0-1", source="feature",
        description="Normalized wildfire proximity score (0 = far, 1 = adjacent)",
    ),
    ColumnSpec(
        name="feat_conductor_sag_index", dtype="float64", units="0-1", source="feature",
        description="Normalized thermal sag index (IEEE 738 simplified)",
    ),
]

# ---------------------------------------------------------------------------
# Master registry and helpers
# ---------------------------------------------------------------------------

# Every declared column, in group order: index first, then each data source,
# then the computed feature columns.
ALL_COLUMNS: list[ColumnSpec] = [
    *INDEX_COLUMNS,
    *NOAA_HRRR_COLUMNS,
    *NASA_POWER_COLUMNS,
    *NOAA_NCEI_COLUMNS,
    *USDA_NRCS_COLUMNS,
    *USFS_WFIGS_COLUMNS,
    *FEATURE_COLUMNS,
]

# Lookup from column name to its spec.
COLUMN_MAP: dict[str, ColumnSpec] = {spec.name: spec for spec in ALL_COLUMNS}
def validate_dataframe(df: pd.DataFrame, required_sources: list[str] | None = None) -> list[str]:
    """Return a list of validation errors (empty = valid).

    Checks performed, in order:

    1. Every index column is present and contains no nulls.
    2. If ``required_sources`` is given, every schema column whose ``source``
       is listed there is present in ``df``.
    3. Every column of ``df`` that the schema declares as ``float64`` actually
       has a float dtype.

    Args:
        df: The frame to validate.
        required_sources: Source names (as used in ``ColumnSpec.source``) whose
            columns must all be present. ``None`` or an empty list skips this
            check.

    Returns:
        Human-readable error messages; an empty list means the frame is valid.
    """
    errors: list[str] = []

    # Index columns must exist and must not contain nulls.
    for spec in INDEX_COLUMNS:
        if spec.name not in df.columns:
            errors.append(f"Missing required column: {spec.name}")
        elif df[spec.name].isna().any():
            errors.append(f"Null values in non-nullable column: {spec.name}")

    # Enforce caller-declared required sources. Index columns are skipped here
    # because the loop above already reports them when missing.
    if required_sources:
        wanted = set(required_sources)
        for spec in ALL_COLUMNS:
            if spec.source == "index" or spec.source not in wanted:
                continue
            if spec.name not in df.columns:
                errors.append(
                    f"Missing column for required source {spec.source}: {spec.name}"
                )

    # Loose dtype check: only float64 columns are verified. Bool columns are
    # sometimes stored as object, so a mismatch there is deliberately not an error.
    for col_name in df.columns:
        if col_name not in COLUMN_MAP:
            continue
        if COLUMN_MAP[col_name].dtype == "float64" and not pd.api.types.is_float_dtype(df[col_name]):
            actual = str(df[col_name].dtype)
            errors.append(f"Column {col_name} expected float64, got {actual}")

    return errors
def empty_dataframe() -> pd.DataFrame:
    """Return an empty DataFrame with all index columns present.

    Each column is created with the dtype declared in its ``ColumnSpec``
    rather than the default object dtype, so that the float64 index columns
    pass the float dtype check in ``validate_dataframe``.
    """
    return pd.DataFrame(
        {spec.name: pd.Series(dtype=spec.dtype) for spec in INDEX_COLUMNS}
    )
def schema_summary() -> pd.DataFrame:
    """Return a human-readable summary of the full schema.

    The result has one row per entry of ``ALL_COLUMNS`` with the columns
    ``column``, ``dtype``, ``units``, ``source``, ``nullable`` and
    ``description``, in that order.
    """
    # Output header -> ColumnSpec attribute; insertion order fixes column order.
    attr_for_header = {
        "column": "name",
        "dtype": "dtype",
        "units": "units",
        "source": "source",
        "nullable": "nullable",
        "description": "description",
    }
    records = [
        {header: getattr(spec, attr) for header, attr in attr_for_header.items()}
        for spec in ALL_COLUMNS
    ]
    return pd.DataFrame(records)