From ddd1b632cb958cc95c72ca23ac0565e4145587df Mon Sep 17 00:00:00 2001 From: aclerc Date: Tue, 30 Jun 2026 10:59:39 +0100 Subject: [PATCH 1/3] add study_power_model_compare.py --- .../baselines/study_power_model_compare.py | 292 ++++++++++++++++++ docs/v1/findings.md | 88 ++++++ 2 files changed, 380 insertions(+) create mode 100644 benchmarking/baselines/study_power_model_compare.py diff --git a/benchmarking/baselines/study_power_model_compare.py b/benchmarking/baselines/study_power_model_compare.py new file mode 100644 index 0000000..30d605c --- /dev/null +++ b/benchmarking/baselines/study_power_model_compare.py @@ -0,0 +1,292 @@ +"""Re-run ``power_model`` over the overnight cases and compare it against the existing v0 + naive. + +Iterative-development helper. The overnight v0 runs are very slow, so they are computed once +(:mod:`benchmarking.baselines.study_overnight_prepost` / ``study_overnight_toggle`` with +``include_v0=True``) and kept on disk. As ``power_model`` is improved you only want to re-run the +cheap method and re-draw the comparison plots, reusing the frozen v0 (and naive) numbers. That is +what this script does: + +1. **Run** ``power_model`` **only** over the *exact* current overnight prepost + toggle configs — + same seven :func:`~benchmarking.baselines.overnight_profiles.overnight_profiles`, same campaign + grids, ``n_replicates=4``, ``seed=0`` — so every ``(profile, turbine, treatment-start, campaign)`` + case matches the frozen overnight run. v0 and naive are *not* recomputed (v0 is very slow and + both are already frozen in the reference run). +2. **Merge** each fresh ``power_model`` result with the ``v0_binned`` and ``naive_ratio`` rows + pulled from the reference overnight directory. +3. **Plot** a three-method campaign-length comparison (``naive_ratio``, ``v0_binned``, + ``power_model``) per profile, and write merged per-profile + all-profiles leaderboards. + +An alignment guard checks that the method-independent ground ``truth`` of every fresh case equals +the reference run's truth for the same key; a mismatch means the configs have drifted and the +merge would be comparing different cases, so it fails loudly rather than plotting nonsense. (The +``truth`` column the harness attaches to every ``power_model`` row is itself the cross-check, so +re-running the oracle/naive anchors purely to validate the line-up is unnecessary.) + +Run from the repo root:: + + uv run python -m benchmarking.baselines.study_power_model_compare \ + --reference-dir "~/temp/wind-up-benchmarking/badass overnight runs 30 June" + +Use ``--skip-run`` to only re-merge/re-plot from a previous ``power_model`` run under +``--output-dir`` (e.g. to tweak plotting without re-fitting). Use ``--modes prepost`` / +``--modes toggle`` to restrict to one mode. +""" + +from __future__ import annotations + +import argparse +import logging +from functools import partial +from pathlib import Path + +import numpy as np +import pandas as pd + +from benchmarking.baselines.example_prepost_study import ( + DEFAULT_END_DT_EXCL, + DEFAULT_START_DT, + DEFAULT_TREATMENT_START_RANGE, + DEFAULT_TURBINE_SUBSET, + DEFAULT_WTG_NUMBERS, + MIN_PRE_MONTHS, + save_per_method_curve, +) +from benchmarking.baselines.example_toggle_study import DEFAULT_TOGGLE_PERIOD +from benchmarking.baselines.hot_context import build_hot_v0_context +from benchmarking.baselines.overnight_profiles import overnight_profiles +from benchmarking.baselines.power_model import PowerModelMethod +from benchmarking.harness import StudyConfig, leaderboard, plot_campaign_curves, score_study +from benchmarking.synthetic import HOT_COLUMNS +from benchmarking.synthetic.sources.hill_of_towie import load_hot_scada + +logger = logging.getLogger(__name__) + +# The three methods compared in the merged plots (oracle is dropped: it is only a truth anchor). +COMPARE_METHODS = ["naive_ratio", "v0_binned", "power_model"] +REUSED_METHODS = ["naive_ratio", "v0_binned"] # taken from the reference run, not recomputed + +# Match study_overnight_prepost.py / study_overnight_toggle.py exactly. +PREPOST_CAMPAIGN_MONTHS = [3, 6, 12] +TOGGLE_CAMPAIGN_MONTHS = [3, 6, 9, 12] +N_REPLICATES = 4 +SEED = 0 + +# Keys that identify one scored case independently of the method. +_CASE_KEYS = ["profile", "test_wtg", "campaign_months", "treatment_start"] +_DEFAULT_REFERENCE_DIR = Path.home() / "temp" / "wind-up-benchmarking" / "badass overnight runs 30 June" +_DEFAULT_OUTPUT_DIR = Path.home() / "temp" / "wind-up-benchmarking" / "power_model_compare" + + +def _prepost_study() -> StudyConfig: + return StudyConfig( + mode="prepost", + turbine_subset=DEFAULT_TURBINE_SUBSET, + treatment_start_range=DEFAULT_TREATMENT_START_RANGE, + min_pre_months=MIN_PRE_MONTHS, + campaign_months=PREPOST_CAMPAIGN_MONTHS, + n_replicates=N_REPLICATES, + seed=SEED, + ) + + +def _toggle_study() -> StudyConfig: + return StudyConfig( + mode="toggle", + turbine_subset=DEFAULT_TURBINE_SUBSET, + treatment_start_range=DEFAULT_TREATMENT_START_RANGE, + min_pre_months=MIN_PRE_MONTHS, + campaign_months=TOGGLE_CAMPAIGN_MONTHS, + toggle_period=DEFAULT_TOGGLE_PERIOD, + n_replicates=N_REPLICATES, + seed=SEED, + ) + + +def run_power_model(mode: str, out_dir: Path) -> pd.DataFrame: + """Score **only** ``power_model`` over the overnight cases for one mode (no v0/naive/oracle). + + Each fresh row still carries the harness's method-independent ground ``truth`` (so the merge's + alignment guard has its cross-check without recomputing any anchor). Writes a per-profile + ``results_*.csv`` and per-profile ``power_model`` curve under ``out_dir`` (the latter lets a long + run be sanity-checked profile-by-profile), and returns the concatenated tidy results. + """ + out_dir.mkdir(parents=True, exist_ok=True) + scada_df, _ = load_hot_scada( + start_dt=DEFAULT_START_DT, + end_dt_excl=DEFAULT_END_DT_EXCL, + wtg_numbers=DEFAULT_WTG_NUMBERS, + wtg_names=DEFAULT_TURBINE_SUBSET, + ) + context = build_hot_v0_context(wtg_names=DEFAULT_TURBINE_SUBSET) + study = _prepost_study() if mode == "prepost" else _toggle_study() + + all_results = [] + for profile_name, profile in overnight_profiles().items(): + method = PowerModelMethod( + active_power_col=HOT_COLUMNS.active_power, + wind_speed_col=HOT_COLUMNS.wind_speed, + availability_col=HOT_COLUMNS.availability, + era5_hourly_df=context.reanalysis_datasets[0].data, + out_dir=out_dir / "power_model_runs", + ) + logger.info("Scoring %s profile %s with power_model", mode, profile_name) + results = score_study( + scada_df, + profile=profile, + methods=[method], + study=study, + profile_name=profile_name, + on_method_complete=partial(save_per_method_curve, out_dir, profile_name), + ) + results.to_csv(out_dir / f"results_{profile_name}.csv", index=False) + all_results.append(results) + return pd.concat(all_results, ignore_index=True) + + +def _load_fresh_results(mode_out_dir: Path) -> pd.DataFrame: + """Concatenate the per-profile ``results_*.csv`` a previous run wrote (for ``--skip-run``).""" + files = sorted(mode_out_dir.glob("results_*.csv")) + if not files: + msg = f"no results_*.csv under {mode_out_dir}; run without --skip-run first." + raise FileNotFoundError(msg) + return pd.concat([pd.read_csv(f) for f in files], ignore_index=True) + + +def _load_reference_methods(reference_mode_dir: Path, profiles: list[str], methods: list[str]) -> pd.DataFrame: + """Load the requested methods' rows for the given profiles from the reference run directory.""" + frames = [] + for profile in profiles: + path = reference_mode_dir / f"results_{profile}.csv" + if not path.exists(): + msg = f"reference results missing for profile {profile!r}: {path}" + raise FileNotFoundError(msg) + df = pd.read_csv(path) + frames.append(df[df["method"].isin(methods)]) + return pd.concat(frames, ignore_index=True) + + +def _case_key(df: pd.DataFrame) -> pd.Series: + """Build a stable per-case string key (treatment_start normalised so str/Timestamp compare equal).""" + ts = pd.to_datetime(df["treatment_start"], utc=True).dt.strftime("%Y-%m-%d %H:%M:%S%z") + return ( + df["profile"].astype(str) + + "|" + + df["test_wtg"].astype(str) + + "|" + + df["campaign_months"].astype(int).astype(str) + + "|" + + ts + ) + + +def _check_alignment(fresh: pd.DataFrame, reference: pd.DataFrame) -> None: + """Fail loudly if the fresh cases do not line up case-for-case with the reference run. + + Ground ``truth`` is method-independent and deterministic in the study config + seed, so equal + keys must carry equal truth. Mismatched keys or truths mean the configs drifted. + """ + f_overall = fresh[fresh["condition"] == "overall"].copy() + r_overall = reference[reference["condition"] == "overall"].copy() + f_truth = f_overall.assign(key=_case_key(f_overall)).groupby("key")["truth"].first() + r_truth = r_overall.assign(key=_case_key(r_overall)).groupby("key")["truth"].first() + + missing = sorted(set(f_truth.index) - set(r_truth.index)) + if missing: + msg = f"{len(missing)} fresh case(s) have no reference match, e.g. {missing[:3]}" + raise ValueError(msg) + common = f_truth.index.intersection(r_truth.index) + bad = ~np.isclose(f_truth.loc[common].to_numpy(), r_truth.loc[common].to_numpy(), rtol=1e-6, atol=1e-9) + if bad.any(): + example = common[bad][0] + msg = ( + f"{int(bad.sum())} case(s) disagree on ground truth between the fresh power_model run and the " + f"reference run (e.g. {example}: fresh={f_truth[example]:.6g} vs ref={r_truth[example]:.6g}). " + f"The overnight config has drifted from the reference run; the merge would compare different cases." + ) + raise ValueError(msg) + logger.info("Alignment OK: %d cases match the reference run on ground truth.", len(common)) + + +def merge_and_plot(mode: str, fresh: pd.DataFrame, reference_mode_dir: Path, out_dir: Path) -> pd.DataFrame: + """Merge fresh power_model with reference v0/naive, write merged tables + per-profile plots.""" + comparison_dir = out_dir / "comparison" + comparison_dir.mkdir(parents=True, exist_ok=True) + + profiles = sorted(fresh["profile"].unique()) + reference = _load_reference_methods(reference_mode_dir, profiles, REUSED_METHODS) + _check_alignment(fresh, reference) + + power_model = fresh[fresh["method"] == "power_model"] + merged = pd.concat([reference, power_model], ignore_index=True) + merged = merged[merged["method"].isin(COMPARE_METHODS)] + + merged.to_csv(comparison_dir / f"merged_results_{mode}.csv", index=False) + for profile in profiles: + prof_rows = merged[merged["profile"] == profile] + summary = leaderboard(prof_rows) + summary.to_csv(comparison_dir / f"leaderboard_{profile}.csv", index=False) + plot_campaign_curves( + summary, + save_path=comparison_dir / f"campaign_curves_{profile}.png", + title=f"{mode} - {profile} (naive vs v0 vs power_model)", + ) + + all_summary = leaderboard(merged) + all_summary.to_csv(comparison_dir / f"leaderboard_all_profiles_{mode}.csv", index=False) + logger.info( + "%s comparison (all profiles):\n%s", + mode, + all_summary[["method", "profile", "campaign_months", "bias", "spread", "score"]].to_string(index=False), + ) + logger.info("Wrote %s comparison outputs to %s", mode, comparison_dir) + return merged + + +def main() -> None: + """Run power_model over the overnight cases for each mode, then merge + plot vs v0/naive.""" + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument( + "--reference-dir", + type=Path, + default=_DEFAULT_REFERENCE_DIR, + help="overnight run dir holding prepost/ and toggle/ with the frozen v0 + naive results", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=_DEFAULT_OUTPUT_DIR, + help="where fresh power_model runs and the merged comparison are written", + ) + parser.add_argument( + "--modes", + nargs="+", + choices=["prepost", "toggle"], + default=["prepost", "toggle"], + help="which mode(s) to run (default: both)", + ) + parser.add_argument( + "--skip-run", + action="store_true", + help="reuse a previous power_model run under --output-dir; only re-merge and re-plot", + ) + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s", force=True) + reference_dir = args.reference_dir.expanduser() + output_dir = args.output_dir.expanduser() + + for mode in args.modes: + logger.info("=== %s ===", mode.upper()) + mode_out_dir = output_dir / mode + if args.skip_run: + fresh = _load_fresh_results(mode_out_dir) + logger.info("Reusing %d fresh rows from %s", len(fresh), mode_out_dir) + else: + fresh = run_power_model(mode, mode_out_dir) + merge_and_plot(mode, fresh, reference_dir / mode, mode_out_dir) + + logger.info("All done. Comparison plots under %s//comparison/", output_dir) + + +if __name__ == "__main__": + main() diff --git a/docs/v1/findings.md b/docs/v1/findings.md index 42ab2e5..ff2b0f6 100644 --- a/docs/v1/findings.md +++ b/docs/v1/findings.md @@ -7,6 +7,94 @@ from, not just conclusions. --- +## F4 — power_model beats v0 in prepost and at longer-toggle; v0's edge is only short-toggle, and v0 alone breaks on rated-power uprates + +*2026-06-30 — the three-way comparison F3 flagged as the open question (power_model vs v0 in +*both* modes). Source: `benchmarking/baselines/study_power_model_compare.py`, which re-runs +**only** `power_model` over the current overnight cases and merges it with the frozen `v0_binned` ++ `naive_ratio` rows from the overnight run (`study_overnight_{prepost,toggle}.py`, +`include_v0=True`). Seven `overnight_profiles` (`cp_minus_10pct`, `cp_0pct`, `cp_plus_3pct`, +`cp_plus_10pct`, `ws_dependent_cp`, `ti_dependent_cp`, `rated_plus_5pct`), `n_replicates=4`, +seed 0; prepost campaigns 3/6/12 mo (84 cases/method), toggle 3/6/9/12 mo (112 cases/method). The +script's alignment guard confirmed all 84 + 112 fresh cases match the reference run's +method-independent ground truth exactly, so the merge compares identical cases.* + +All numbers below are percentage points of fractional uplift (a 0.01 fraction = 1 pp). Bias = mean +signed error, spread = std of signed error, RMSE pooled over all cases for that mode/method. + +### Observation — pooled over all profiles and campaign lengths + +| mode | method | bias | spread | MAE | RMSE | within ±1pp | per-case win | +|---|---|---|---|---|---|---|---| +| prepost | naive_ratio | −1.11 | 4.62 | 3.83 | 4.73 | 17% | 0% | +| prepost | **power_model** | **−0.39** | **0.49** | **0.53** | **0.62** | **86%** | **73%** | +| prepost | v0_binned | −0.73 | 0.72 | 0.84 | 1.03 | 62% | 27% | +| toggle | naive_ratio | +0.15 | 0.16 | 0.19 | 0.22 | 100% | 39% | +| toggle | power_model | +0.16 | 0.19 | 0.20 | 0.24 | 100% | 22% | +| toggle | v0_binned | −0.01 | 0.33 | 0.23 | 0.32 | 98% | 38% | + +("per-case win" = share of the N cases where that method has the smallest |error|.) In prepost, +**power_model's |error| is smaller than v0's in 73% of cases and smaller than naive's in 100%**. + +### Observation — RMSE by campaign length (the short-data story, serves G2) + +| mode | method | 3mo | 6mo | 9mo | 12mo | +|---|---|---|---|---|---| +| prepost | naive_ratio | 7.32 | 3.19 | — | 1.84 | +| prepost | **power_model** | **0.82** | **0.46** | — | **0.53** | +| prepost | v0_binned | 1.22 | 0.94 | — | 0.89 | +| toggle | naive_ratio | **0.30** | 0.23 | 0.17 | 0.14 | +| toggle | power_model | 0.38 | **0.22** | **0.18** | **0.11** | +| toggle | v0_binned | 0.32 | 0.31 | 0.33 | 0.32 | + +The two structural facts: **(a)** in prepost power_model leads at every length, its biggest margin +at 3 months (0.82 vs v0 1.22 vs naive 7.32); **(b)** in toggle, power_model and naive both tighten +with more data (power_model 0.38 → 0.11), but **v0 does not improve with campaign length** — it +sits at ~0.32 RMSE from 3 to 12 months. So v0 only wins the shortest toggle campaign; from 6 +months on, power_model is best in toggle too. + +### Observation — profile spotlight (pooled over campaigns) + +| profile | method | bias | RMSE | max |error| | +|---|---|---|---|---| +| prepost `rated_plus_5pct` | **power_model** | **−0.38** | **0.62** | **1.05** | +| prepost `rated_plus_5pct` | v0_binned | −1.24 | 1.42 | 2.23 | +| toggle `rated_plus_5pct` | **power_model** | +0.16 | **0.25** | **0.49** | +| toggle `rated_plus_5pct` | v0_binned | −0.63 | 0.68 | 1.10 | + +power_model is **flat across all seven profiles** (prepost RMSE 0.61–0.63, bias ≈ −0.38 on every +one), whereas **v0 has a specific weak spot on the rated-power uprate** — its worst profile in both +modes (the only profile where v0's toggle RMSE, 0.68, is more than ~2× its others). A rated-power +change shifts power at high wind speeds where v0's binned power-curve has sparse, noisy bins; +power_model's continuous reference-conditioned fit has no such blind spot. On the placebo +(`cp_0pct`) all three are well-behaved (toggle v0 even edges power_model, 0.18 vs 0.24 RMSE). + +### Interpretation +- **Prepost: power_model is the better method, decisively.** Lower bias (−0.39 vs −0.73 pp), lower + spread (0.49 vs 0.72), ~40% lower RMSE than v0, and it wins the majority of cases head-to-head — + the F1 contrast lever (expected power through the references) cancelling the common-mode drift + that v0 corrects only through its detrend step. naive is not in contention (covariate shift). +- **Toggle: a near-tie that tips to power_model with data.** Naive and power_model are + near-identical and both beat v0 overall on RMSE; v0's larger spread and its failure to improve + with longer toggling are the cost of its binning. v0's only advantage is the 3-month toggle + campaign, where power_model carries slightly more bias (+0.37) before its variance collapses. +- **v0's rated-power weakness is the clearest single result.** It is the one regime where v0 is + both biased and high-variance in *both* modes, and where power_model's flatness is most valuable. + +### Implications +1. **Answers F3's open question: power_model ≥ v0 in both modes for P50** — strictly better in + prepost and at toggle ≥ 6 months, with v0 ahead only at the shortest toggle campaign. It is now + the baseline to beat (G-level), not just vs naive. +2. **Short-toggle bias is power_model's one soft spot** — the +0.37 pp at 3-month toggle is the + thing to chip at next (mirrors the residual prepost bias noted in F3 #3); candidates are the + baseline-horizon / recency weighting already on the list. +3. **Add a rated-power-uprate case to any v0 regression framing** — it is v0's worst regime and a + natural demonstrator for power_model's advantage; worth a dedicated diagnostic. +4. Reproduce/extend with `study_power_model_compare.py` (`--skip-run` to re-merge, `--modes` to + restrict); it reuses the frozen slow v0 so each power_model iteration is cheap. + +--- + ## F3 — A simple counterfactual power model halves prepost bias and spread vs naive; toggle is a wash *2026-06-29 — new method `power_model` (the simplest-possible ML method: a single LightGBM From bde599aa863b0c9b5e25179ab3802947cdd14ccd Mon Sep 17 00:00:00 2001 From: aclerc Date: Tue, 30 Jun 2026 11:56:40 +0100 Subject: [PATCH 2/3] add study_power_model_compare baseline --- .../baselines/study_power_model_compare.py | 177 +++++++- .../study_power_model_compare_baseline.json | 393 ++++++++++++++++++ 2 files changed, 567 insertions(+), 3 deletions(-) create mode 100644 benchmarking/baselines/study_power_model_compare_baseline.json diff --git a/benchmarking/baselines/study_power_model_compare.py b/benchmarking/baselines/study_power_model_compare.py index 30d605c..e456576 100644 --- a/benchmarking/baselines/study_power_model_compare.py +++ b/benchmarking/baselines/study_power_model_compare.py @@ -22,22 +22,36 @@ ``truth`` column the harness attaches to every ``power_model`` row is itself the cross-check, so re-running the oracle/naive anchors purely to validate the line-up is unnecessary.) +**Benchmark regression tracking.** ``power_model``'s bias/spread/score per +``(mode, profile, campaign_months)`` at a known-good commit is frozen in a committed JSON benchmark +(``study_power_model_compare_baseline.json``, next to this script). Every run diffs the fresh +``power_model`` against it and logs a mean-over-profiles table of the deltas (spread/score: a +negative delta is an improvement; bias: a smaller ``|bias|`` is better) plus a per-cell +``benchmark_comparison_.csv``, so an attempt to improve ``power_model`` is scored objectively +against where it stands today. Bump the benchmark **deliberately** with ``--update-baseline`` once an +improvement is accepted (it rewrites only the modes you ran; commit the new JSON). + Run from the repo root:: uv run python -m benchmarking.baselines.study_power_model_compare \ --reference-dir "~/temp/wind-up-benchmarking/badass overnight runs 30 June" -Use ``--skip-run`` to only re-merge/re-plot from a previous ``power_model`` run under -``--output-dir`` (e.g. to tweak plotting without re-fitting). Use ``--modes prepost`` / -``--modes toggle`` to restrict to one mode. +Use ``--skip-run`` to only re-merge/re-plot (and re-diff the benchmark) from a previous +``power_model`` run under ``--output-dir`` (e.g. to tweak plotting without re-fitting). Use +``--modes prepost`` / ``--modes toggle`` to restrict to one mode. Use ``--update-baseline`` to +re-record the benchmark from the current run. """ from __future__ import annotations import argparse +import json import logging +import subprocess +from datetime import datetime, timezone from functools import partial from pathlib import Path +from typing import Any import numpy as np import pandas as pd @@ -76,6 +90,18 @@ _DEFAULT_REFERENCE_DIR = Path.home() / "temp" / "wind-up-benchmarking" / "badass overnight runs 30 June" _DEFAULT_OUTPUT_DIR = Path.home() / "temp" / "wind-up-benchmarking" / "power_model_compare" +# The committed power_model benchmark: its bias/spread/score per (mode, profile, campaign) frozen at +# a known-good commit, so future power_model changes are scored against it. Lives next to this script +# (tracked) — update it deliberately with --update-baseline when an improvement is accepted. +_BASELINE_PATH = Path(__file__).resolve().parent / "study_power_model_compare_baseline.json" +_BASELINE_SCHEMA = "power_model_compare_baseline_v1" +# Per-cell metrics recorded and diffed. spread/score: lower is better; bias: |bias| nearer 0 is better. +_METRIC_COLS = ["bias", "spread", "score"] +# A delta smaller than this (fractional uplift) is floating-point noise, not a real change: an +# identical (deterministic, seed-fixed) run must read as no change, while any genuine power_model +# change moves the metrics by >= ~1e-4. 1e-7 fraction = 1e-5 pp, well below the reported resolution. +_IMPROVE_EPS = 1e-7 + def _prepost_study() -> StudyConfig: return StudyConfig( @@ -207,6 +233,129 @@ def _check_alignment(fresh: pd.DataFrame, reference: pd.DataFrame) -> None: logger.info("Alignment OK: %d cases match the reference run on ground truth.", len(common)) +def _git_commit() -> str: + """Return the short HEAD commit (``-dirty`` if the tree is modified), or ``unknown``.""" + repo = Path(__file__).resolve().parent + try: + commit = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], # noqa: S607 + cwd=repo, + capture_output=True, + text=True, + check=True, + ).stdout.strip() + dirty = subprocess.run( + ["git", "status", "--porcelain"], # noqa: S607 + cwd=repo, + capture_output=True, + text=True, + check=True, + ).stdout.strip() + except (subprocess.SubprocessError, OSError): + return "unknown" + return f"{commit}-dirty" if dirty else commit + + +def power_model_leaderboard(fresh: pd.DataFrame) -> pd.DataFrame: + """Reduce the fresh ``power_model`` rows to one ``(profile, campaign_months)`` row of bias/spread/score.""" + pm = fresh[fresh["method"] == "power_model"] + summary = leaderboard(pm) + return summary[["profile", "campaign_months", *_METRIC_COLS]].sort_values(["profile", "campaign_months"]) + + +def record_baseline(lb_by_mode: dict[str, pd.DataFrame], study_by_mode: dict[str, StudyConfig], path: Path) -> None: + """Write/refresh the committed benchmark for the given modes (other modes in the file are kept).""" + doc: dict[str, Any] = {"schema": _BASELINE_SCHEMA, "modes": {}} + if path.exists(): + doc = json.loads(path.read_text()) + doc.setdefault("modes", {}) + commit = _git_commit() + now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + for mode, lb in lb_by_mode.items(): + study = study_by_mode[mode] + doc["modes"][mode] = { + "recorded_utc": now, + "git_commit": commit, + "n_replicates": study.n_replicates, + "seed": study.seed, + "campaign_months": list(study.campaign_months), + "profiles": sorted(lb["profile"].unique()), + "cells": lb.round(8).to_dict(orient="records"), + } + path.write_text(json.dumps(doc, indent=2) + "\n") + logger.info("Recorded power_model benchmark for %s at %s (commit %s)", list(lb_by_mode), path, commit) + + +def _load_baseline_cells(mode: str, path: Path) -> tuple[pd.DataFrame, dict[str, Any]] | None: + """Load the recorded benchmark cells (+ provenance) for ``mode``, or ``None`` if not recorded.""" + if not path.exists(): + return None + entry = json.loads(path.read_text()).get("modes", {}).get(mode) + if entry is None: + return None + return pd.DataFrame(entry["cells"]), entry + + +def compare_to_benchmark(mode: str, lb: pd.DataFrame, baseline_path: Path, comparison_dir: Path) -> None: + """Diff the fresh power_model bias/spread/score against the committed benchmark and report it. + + Writes a per-cell ``benchmark_comparison_.csv`` and logs a mean-over-profiles table with the + deltas. spread/score: a negative delta is an improvement; bias: a smaller ``|bias|`` is better. + """ + loaded = _load_baseline_cells(mode, baseline_path) + if loaded is None: + logger.warning( + "No power_model benchmark recorded for %s in %s yet — run with --update-baseline to set it.", + mode, + baseline_path, + ) + return + base, prov = loaded + merged = lb.merge(base, on=["profile", "campaign_months"], how="outer", suffixes=("", "_base")) + for col in _METRIC_COLS: + merged[f"d_{col}"] = merged[col] - merged[f"{col}_base"] + merged["d_abs_bias"] = merged["bias"].abs() - merged["bias_base"].abs() + merged.sort_values(["profile", "campaign_months"]).to_csv( + comparison_dir / f"benchmark_comparison_{mode}.csv", index=False + ) + + pp = 100.0 + report = merged.groupby("campaign_months").mean(numeric_only=True) + overall = merged.mean(numeric_only=True).to_frame().T + overall.index = ["ALL"] + table = pd.concat([report, overall]) + show = pd.DataFrame( + { + "bias": table["bias"] * pp, + "Δbias": table["d_bias"] * pp, + "spread": table["spread"] * pp, + "Δspread": table["d_spread"] * pp, + "score": table["score"] * pp, + "Δscore": table["d_score"] * pp, + } + ).round(3) + n_cells = int(merged[_METRIC_COLS].notna().all(axis=1).sum()) + + def tally(delta: pd.Series) -> str: + """Count cells that moved beyond the noise floor: ``better`` (down) vs ``worse`` (up).""" + better = int((delta < -_IMPROVE_EPS).sum()) + worse = int((delta > _IMPROVE_EPS).sum()) + return f"{better} better / {worse} worse (of {n_cells})" + + logger.info( + "%s power_model vs benchmark (recorded %s, commit %s); mean over profiles [pp], " + "Δ<0 = better for spread/score:\n%s\n" + "cells: spread %s; score %s; |bias| %s", + mode, + prov.get("recorded_utc", "?"), + prov.get("git_commit", "?"), + show.to_string(), + tally(merged["d_spread"]), + tally(merged["d_score"]), + tally(merged["d_abs_bias"]), + ) + + def merge_and_plot(mode: str, fresh: pd.DataFrame, reference_mode_dir: Path, out_dir: Path) -> pd.DataFrame: """Merge fresh power_model with reference v0/naive, write merged tables + per-profile plots.""" comparison_dir = out_dir / "comparison" @@ -269,12 +418,27 @@ def main() -> None: action="store_true", help="reuse a previous power_model run under --output-dir; only re-merge and re-plot", ) + parser.add_argument( + "--baseline-path", + type=Path, + default=_BASELINE_PATH, + help="the committed power_model benchmark JSON to diff against (and to --update-baseline)", + ) + parser.add_argument( + "--update-baseline", + action="store_true", + help="overwrite the recorded benchmark for the run modes with this run (do this deliberately, " + "only when an improvement is accepted); without it, the run is diffed against the benchmark", + ) args = parser.parse_args() logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s", force=True) reference_dir = args.reference_dir.expanduser() output_dir = args.output_dir.expanduser() + baseline_path = args.baseline_path.expanduser() + lb_by_mode: dict[str, pd.DataFrame] = {} + study_by_mode: dict[str, StudyConfig] = {} for mode in args.modes: logger.info("=== %s ===", mode.upper()) mode_out_dir = output_dir / mode @@ -284,6 +448,13 @@ def main() -> None: else: fresh = run_power_model(mode, mode_out_dir) merge_and_plot(mode, fresh, reference_dir / mode, mode_out_dir) + lb_by_mode[mode] = power_model_leaderboard(fresh) + study_by_mode[mode] = _prepost_study() if mode == "prepost" else _toggle_study() + if not args.update_baseline: + compare_to_benchmark(mode, lb_by_mode[mode], baseline_path, mode_out_dir / "comparison") + + if args.update_baseline: + record_baseline(lb_by_mode, study_by_mode, baseline_path) logger.info("All done. Comparison plots under %s//comparison/", output_dir) diff --git a/benchmarking/baselines/study_power_model_compare_baseline.json b/benchmarking/baselines/study_power_model_compare_baseline.json new file mode 100644 index 0000000..798cc5d --- /dev/null +++ b/benchmarking/baselines/study_power_model_compare_baseline.json @@ -0,0 +1,393 @@ +{ + "schema": "power_model_compare_baseline_v1", + "modes": { + "prepost": { + "recorded_utc": "2026-06-30T10:04:36Z", + "git_commit": "ddd1b63-dirty", + "n_replicates": 4, + "seed": 0, + "campaign_months": [ + 3, + 6, + 12 + ], + "profiles": [ + "cp_0pct", + "cp_minus_10pct", + "cp_plus_10pct", + "cp_plus_3pct", + "rated_plus_5pct", + "ti_dependent_cp", + "ws_dependent_cp" + ], + "cells": [ + { + "profile": "cp_0pct", + "campaign_months": 3, + "bias": -0.00660529, + "spread": 0.00466565, + "score": 0.00808691 + }, + { + "profile": "cp_0pct", + "campaign_months": 6, + "bias": -0.00216502, + "spread": 0.00397077, + "score": 0.00452264 + }, + { + "profile": "cp_0pct", + "campaign_months": 12, + "bias": -0.00260598, + "spread": 0.00450127, + "score": 0.0052012 + }, + { + "profile": "cp_minus_10pct", + "campaign_months": 3, + "bias": -0.00608179, + "spread": 0.00437577, + "score": 0.00749236 + }, + { + "profile": "cp_minus_10pct", + "campaign_months": 6, + "bias": -0.00192257, + "spread": 0.00368846, + "score": 0.00415945 + }, + { + "profile": "cp_minus_10pct", + "campaign_months": 12, + "bias": -0.0023763, + "spread": 0.00423041, + "score": 0.00485213 + }, + { + "profile": "cp_plus_10pct", + "campaign_months": 3, + "bias": -0.00712802, + "spread": 0.00495523, + "score": 0.00868119 + }, + { + "profile": "cp_plus_10pct", + "campaign_months": 6, + "bias": -0.00240698, + "spread": 0.00425268, + "score": 0.0048866 + }, + { + "profile": "cp_plus_10pct", + "campaign_months": 12, + "bias": -0.00283565, + "spread": 0.00477221, + "score": 0.00555111 + }, + { + "profile": "cp_plus_3pct", + "campaign_months": 3, + "bias": -0.00676189, + "spread": 0.00475233, + "score": 0.00826485 + }, + { + "profile": "cp_plus_3pct", + "campaign_months": 6, + "bias": -0.00223764, + "spread": 0.00405543, + "score": 0.0046318 + }, + { + "profile": "cp_plus_3pct", + "campaign_months": 12, + "bias": -0.00267486, + "spread": 0.00458252, + "score": 0.00530607 + }, + { + "profile": "rated_plus_5pct", + "campaign_months": 3, + "bias": -0.00670173, + "spread": 0.00474853, + "score": 0.00821351 + }, + { + "profile": "rated_plus_5pct", + "campaign_months": 6, + "bias": -0.00218881, + "spread": 0.00403347, + "score": 0.00458909 + }, + { + "profile": "rated_plus_5pct", + "campaign_months": 12, + "bias": -0.00264277, + "spread": 0.00458772, + "score": 0.00529447 + }, + { + "profile": "ti_dependent_cp", + "campaign_months": 3, + "bias": -0.0069171, + "spread": 0.00485054, + "score": 0.00844831 + }, + { + "profile": "ti_dependent_cp", + "campaign_months": 6, + "bias": -0.00231017, + "spread": 0.00414554, + "score": 0.00474578 + }, + { + "profile": "ti_dependent_cp", + "campaign_months": 12, + "bias": -0.00274427, + "spread": 0.00467084, + "score": 0.00541736 + }, + { + "profile": "ws_dependent_cp", + "campaign_months": 3, + "bias": -0.00683324, + "spread": 0.00483019, + "score": 0.00836803 + }, + { + "profile": "ws_dependent_cp", + "campaign_months": 6, + "bias": -0.00225087, + "spread": 0.00412935, + "score": 0.00470297 + }, + { + "profile": "ws_dependent_cp", + "campaign_months": 12, + "bias": -0.00267457, + "spread": 0.00463682, + "score": 0.00535289 + } + ] + }, + "toggle": { + "recorded_utc": "2026-06-30T10:04:36Z", + "git_commit": "ddd1b63-dirty", + "n_replicates": 4, + "seed": 0, + "campaign_months": [ + 3, + 6, + 9, + 12 + ], + "profiles": [ + "cp_0pct", + "cp_minus_10pct", + "cp_plus_10pct", + "cp_plus_3pct", + "rated_plus_5pct", + "ti_dependent_cp", + "ws_dependent_cp" + ], + "cells": [ + { + "profile": "cp_0pct", + "campaign_months": 3, + "bias": 0.00362415, + "spread": 0.00084001, + "score": 0.00372022 + }, + { + "profile": "cp_0pct", + "campaign_months": 6, + "bias": 0.00129828, + "spread": 0.00178902, + "score": 0.00221046 + }, + { + "profile": "cp_0pct", + "campaign_months": 9, + "bias": 0.00086062, + "spread": 0.00160665, + "score": 0.00182263 + }, + { + "profile": "cp_0pct", + "campaign_months": 12, + "bias": 0.00056382, + "spread": 0.00088262, + "score": 0.00104734 + }, + { + "profile": "cp_minus_10pct", + "campaign_months": 3, + "bias": 0.0034594, + "spread": 0.00078816, + "score": 0.00354805 + }, + { + "profile": "cp_minus_10pct", + "campaign_months": 6, + "bias": 0.00131574, + "spread": 0.00168113, + "score": 0.0021348 + }, + { + "profile": "cp_minus_10pct", + "campaign_months": 9, + "bias": 0.00088486, + "spread": 0.0015148, + "score": 0.0017543 + }, + { + "profile": "cp_minus_10pct", + "campaign_months": 12, + "bias": 0.00061016, + "spread": 0.00085288, + "score": 0.00104867 + }, + { + "profile": "cp_plus_10pct", + "campaign_months": 3, + "bias": 0.0037889, + "spread": 0.00089431, + "score": 0.00389301 + }, + { + "profile": "cp_plus_10pct", + "campaign_months": 6, + "bias": 0.00128082, + "spread": 0.00189692, + "score": 0.00228884 + }, + { + "profile": "cp_plus_10pct", + "campaign_months": 9, + "bias": 0.00083644, + "spread": 0.00169846, + "score": 0.00189325 + }, + { + "profile": "cp_plus_10pct", + "campaign_months": 12, + "bias": 0.00051748, + "spread": 0.00091239, + "score": 0.00104892 + }, + { + "profile": "cp_plus_3pct", + "campaign_months": 3, + "bias": 0.00367357, + "spread": 0.00085606, + "score": 0.003772 + }, + { + "profile": "cp_plus_3pct", + "campaign_months": 6, + "bias": 0.00129304, + "spread": 0.00182139, + "score": 0.0022337 + }, + { + "profile": "cp_plus_3pct", + "campaign_months": 9, + "bias": 0.00085336, + "spread": 0.0016342, + "score": 0.00184359 + }, + { + "profile": "cp_plus_3pct", + "campaign_months": 12, + "bias": 0.00054992, + "spread": 0.00089155, + "score": 0.00104751 + }, + { + "profile": "rated_plus_5pct", + "campaign_months": 3, + "bias": 0.00369558, + "spread": 0.00086132, + "score": 0.00379462 + }, + { + "profile": "rated_plus_5pct", + "campaign_months": 6, + "bias": 0.00133541, + "spread": 0.00181816, + "score": 0.00225589 + }, + { + "profile": "rated_plus_5pct", + "campaign_months": 9, + "bias": 0.00088821, + "spread": 0.00163333, + "score": 0.00185922 + }, + { + "profile": "rated_plus_5pct", + "campaign_months": 12, + "bias": 0.00059381, + "spread": 0.00090885, + "score": 0.00108564 + }, + { + "profile": "ti_dependent_cp", + "campaign_months": 3, + "bias": 0.00373349, + "spread": 0.00087011, + "score": 0.00383354 + }, + { + "profile": "ti_dependent_cp", + "campaign_months": 6, + "bias": 0.00128565, + "spread": 0.0018638, + "score": 0.00226421 + }, + { + "profile": "ti_dependent_cp", + "campaign_months": 9, + "bias": 0.0008499, + "spread": 0.00167478, + "score": 0.00187809 + }, + { + "profile": "ti_dependent_cp", + "campaign_months": 12, + "bias": 0.00053768, + "spread": 0.0009069, + "score": 0.00105431 + }, + { + "profile": "ws_dependent_cp", + "campaign_months": 3, + "bias": 0.00372194, + "spread": 0.00088052, + "score": 0.00382467 + }, + { + "profile": "ws_dependent_cp", + "campaign_months": 6, + "bias": 0.00132792, + "spread": 0.00184903, + "score": 0.00227646 + }, + { + "profile": "ws_dependent_cp", + "campaign_months": 9, + "bias": 0.00087805, + "spread": 0.00165961, + "score": 0.00187757 + }, + { + "profile": "ws_dependent_cp", + "campaign_months": 12, + "bias": 0.00058349, + "spread": 0.00092061, + "score": 0.00108995 + } + ] + } + } +} From 1ccd74557456f78732f88aec316e17942a8763fb Mon Sep 17 00:00:00 2001 From: aclerc Date: Tue, 30 Jun 2026 16:02:05 +0100 Subject: [PATCH 3/3] uplift by condition WIP --- .../inspect_prepost_feature_ablation.py | 2 +- .../baselines/inspect_prepost_hard_case.py | 84 +++++++- .../baselines/power_model/features.py | 25 +++ benchmarking/baselines/power_model/method.py | 31 ++- .../baselines/study_power_model_compare.py | 63 +++++- benchmarking/harness/__init__.py | 18 +- benchmarking/harness/conditions.py | 49 +++++ benchmarking/harness/leaderboard.py | 30 +++ benchmarking/harness/method.py | 5 +- benchmarking/harness/plots.py | 38 ++++ benchmarking/harness/scoring.py | 74 +++++-- .../test_inspect_prepost_hard_case.py | 23 +++ .../baselines/test_power_model_method.py | 39 +++- .../test_study_power_model_compare.py | 193 ++++++++++++++++++ tests/benchmarking/harness/stubs.py | 41 +++- tests/benchmarking/harness/test_conditions.py | 47 +++++ .../benchmarking/harness/test_leaderboard.py | 47 ++++- tests/benchmarking/harness/test_plots.py | 21 +- tests/benchmarking/harness/test_scoring.py | 22 +- 19 files changed, 807 insertions(+), 45 deletions(-) create mode 100644 benchmarking/harness/conditions.py create mode 100644 tests/benchmarking/baselines/test_inspect_prepost_hard_case.py create mode 100644 tests/benchmarking/baselines/test_study_power_model_compare.py create mode 100644 tests/benchmarking/harness/test_conditions.py diff --git a/benchmarking/baselines/inspect_prepost_feature_ablation.py b/benchmarking/baselines/inspect_prepost_feature_ablation.py index 8aadc5e..9f100de 100644 --- a/benchmarking/baselines/inspect_prepost_feature_ablation.py +++ b/benchmarking/baselines/inspect_prepost_feature_ablation.py @@ -92,7 +92,7 @@ def inspect_prepost_feature_ablation() -> pd.DataFrame: wtg_numbers=DEFAULT_WTG_NUMBERS, wtg_names=DEFAULT_TURBINE_SUBSET, ) - _rep, mi, truth = _pin_case( + _rep, mi, truth, _window = _pin_case( scada_df, study=study, profile_name=DEFAULT_PROFILE, diff --git a/benchmarking/baselines/inspect_prepost_hard_case.py b/benchmarking/baselines/inspect_prepost_hard_case.py index 06520e7..5164b3e 100644 --- a/benchmarking/baselines/inspect_prepost_hard_case.py +++ b/benchmarking/baselines/inspect_prepost_hard_case.py @@ -48,11 +48,15 @@ from benchmarking.baselines.power_model import PowerModelMethod from benchmarking.baselines.v0_binned import V0BinnedMethod from benchmarking.harness import ( + CONDITION_BINS, + CampaignWindow, Method, MethodInput, + MethodOutput, StudyConfig, build_replicates, campaign_windows, + plot_conditional_uplift, treated_activity_mask, window_row_mask, ) @@ -106,8 +110,8 @@ def _select_replicate(replicates: list[Replicate], test_wtg: str) -> Replicate: def _pin_case( scada_df: pd.DataFrame, *, study: StudyConfig, profile_name: str, test_wtg: str, campaign_months: int -) -> tuple[Replicate, MethodInput, float]: - """Build the pinned replicate, its shared ``MethodInput`` and the ground-truth uplift.""" +) -> tuple[Replicate, MethodInput, float, CampaignWindow]: + """Build the pinned replicate, its shared ``MethodInput``, the ground-truth uplift, and the window.""" profile = overnight_profiles()[profile_name] replicates = build_replicates(scada_df, profile=profile, study=study) rep = _select_replicate(replicates, test_wtg) @@ -145,7 +149,7 @@ def _pin_case( window.activity_end, 100 * truth, ) - return rep, mi, truth + return rep, mi, truth, window def _build_methods(out_dir: Path, *, include_v0: bool) -> list[Method]: @@ -161,6 +165,7 @@ def _build_methods(out_dir: Path, *, include_v0: bool) -> list[Method]: PowerModelMethod( active_power_col=HOT_COLUMNS.active_power, wind_speed_col=HOT_COLUMNS.wind_speed, + wind_speed_sd_col=HOT_COLUMNS.wind_speed_sd, availability_col=HOT_COLUMNS.availability, era5_hourly_df=context.reanalysis_datasets[0].data, out_dir=out_dir / "power_model", @@ -199,6 +204,75 @@ def _run_methods(methods: list[Method], *, mi: MethodInput, truth: float) -> pd. return pd.DataFrame(rows) +def _plot_conditional_uplift( + rep: Replicate, mi: MethodInput, window: CampaignWindow, *, out_dir: Path, profile_name: str +) -> None: + """Re-run the power model on ``mi`` and write per-condition uplift-vs-truth plots.""" + context = build_hot_v0_context(wtg_names=DEFAULT_TURBINE_SUBSET) + power_model = PowerModelMethod( + active_power_col=HOT_COLUMNS.active_power, + wind_speed_col=HOT_COLUMNS.wind_speed, + wind_speed_sd_col=HOT_COLUMNS.wind_speed_sd, + availability_col=HOT_COLUMNS.availability, + era5_hourly_df=context.reanalysis_datasets[0].data, + out_dir=out_dir / "power_model_conditional", + ) + pm_output = power_model.estimate(mi) + if pm_output.p50_by_condition is None: + logger.warning("power_model returned no p50_by_condition; skipping conditional uplift plots") + return + + test_index = rep.synthetic_df.loc[rep.synthetic_df[HOT_COLUMNS.turbine] == rep.test_wtg].index + mask = treated_activity_mask(test_index, rep.upgrade_timing, window=window) + truth_by_condition = { + c: rep.true_uplift(mask=mask, by=c, bins=CONDITION_BINS[c]).by_condition for c in ("ws", "ti") + } + # Filter out conditions where true_uplift returned None (should not happen when bins given) + truth_by_condition_clean: dict[str, pd.DataFrame] = { + c: df for c, df in truth_by_condition.items() if df is not None + } + if not truth_by_condition_clean: + logger.warning("No per-condition truth available; skipping conditional uplift plots") + return + + frame = conditional_truth_vs_estimate(pm_output, truth_by_condition_clean, method_name=power_model.name) + for c in truth_by_condition_clean: + plot_conditional_uplift( + frame, + condition=c, + save_path=out_dir / f"conditional_uplift_{c}.png", + title=f"Conditional uplift ({c}) — profile={profile_name}, wtg={mi.test_wtg}", + ) + logger.info("Wrote conditional uplift plots to %s", out_dir) + + +def conditional_truth_vs_estimate( + output: MethodOutput, truth_by_condition: dict[str, pd.DataFrame], *, method_name: str +) -> pd.DataFrame: + """Shape a method's p50_by_condition + per-condition truth into a plot_conditional_uplift frame.""" + if output.p50_by_condition is None: + msg = "output.p50_by_condition must not be None" + raise ValueError(msg) + frames = [] + bc = output.p50_by_condition + for condition, truth in truth_by_condition.items(): + est = bc[bc["condition"] == condition].set_index("condition_bin")["p50_uplift"] + t = truth.assign(condition_bin=truth["condition_bin"].astype(str)).set_index("condition_bin")["true_uplift"] + bins = est.index.union(t.index) + frames.append( + pd.DataFrame( + { + "method": method_name, + "condition": condition, + "condition_bin": bins, + "mean_estimate": est.reindex(bins).to_numpy(), + "mean_truth": t.reindex(bins).to_numpy(), + } + ) + ) + return pd.concat(frames, ignore_index=True) + + def inspect_prepost_hard_case( *, profile_name: str = DEFAULT_PROFILE, @@ -228,12 +302,14 @@ def inspect_prepost_hard_case( wtg_names=DEFAULT_TURBINE_SUBSET, ) - _rep, mi, truth = _pin_case( + rep, mi, truth, window = _pin_case( scada_df, study=study, profile_name=profile_name, test_wtg=test_wtg, campaign_months=campaign_months ) methods = _build_methods(out_dir, include_v0=include_v0) summary = _run_methods(methods, mi=mi, truth=truth) + _plot_conditional_uplift(rep, mi, window, out_dir=out_dir, profile_name=profile_name) + summary_path = out_dir / "comparison_summary.csv" summary.to_csv(summary_path, index=False) logger.info("Wrote %s\n%s", summary_path, summary.to_string(index=False)) diff --git a/benchmarking/baselines/power_model/features.py b/benchmarking/baselines/power_model/features.py index 7dce4cc..118c7b6 100644 --- a/benchmarking/baselines/power_model/features.py +++ b/benchmarking/baselines/power_model/features.py @@ -133,6 +133,31 @@ def reference_mean_wind_speed( return pd.concat(cols, axis=1).mean(axis=1) +def test_condition_signals( + scada_df: pd.DataFrame, + *, + test_wtg: str, + turbine_col: str, + wind_speed_col: str, + wind_speed_sd_col: str | None, +) -> pd.DataFrame: + """Test turbine's MEASURED ws and ti on the unique sorted index (post-treatment, accepted §3). + + ``ti`` is omitted when no SD column is configured. + """ + index = pd.DatetimeIndex(pd.unique(scada_df.index)).sort_values() + rows = scada_df[scada_df[turbine_col] == test_wtg] + ws = pd.Series(rows[wind_speed_col].to_numpy(dtype=float), index=pd.DatetimeIndex(rows.index)) + ws = ws[~ws.index.duplicated()].reindex(index) + out = pd.DataFrame({"ws": ws}) + if wind_speed_sd_col is not None and wind_speed_sd_col in scada_df.columns: + sd = pd.Series(rows[wind_speed_sd_col].to_numpy(dtype=float), index=pd.DatetimeIndex(rows.index)) + sd = sd[~sd.index.duplicated()].reindex(index) + ws_arr = ws.to_numpy() + out["ti"] = np.divide(sd.to_numpy(), ws_arr, out=np.full(len(ws_arr), np.nan), where=ws_arr != 0) + return out + + def check_reference_only(feature_names: list[str], *, test_wtg: str) -> None: """Raise if any feature is qualified with the test turbine (violating the §3 rule).""" offenders = [f for f in feature_names if f.endswith(f"{QUALIFIER}{test_wtg}")] diff --git a/benchmarking/baselines/power_model/method.py b/benchmarking/baselines/power_model/method.py index 0e3fdeb..b96d977 100644 --- a/benchmarking/baselines/power_model/method.py +++ b/benchmarking/baselines/power_model/method.py @@ -37,9 +37,11 @@ era5_feature_frame, extract_outcome, reference_mean_wind_speed, + test_condition_signals, ) from benchmarking.baselines.rlearner.nuisance import make_outcome_model from benchmarking.diagnostics import DiagnosticContext, write_common_diagnostics, write_run_config +from benchmarking.harness.conditions import CONDITION_BINS, energy_ratio_by_bin from benchmarking.harness.method import MethodInput, MethodOutput from benchmarking.synthetic import HOT_COLUMNS, ToggleSchedule, treated_mask @@ -93,6 +95,7 @@ class PowerModelMethod: active_power_col: str availability_col: str wind_speed_col: str | None = None + wind_speed_sd_col: str | None = None era5_hourly_df: pd.DataFrame | None = None columns: ColumnSchema = HOT_COLUMNS name: str = "power_model" @@ -148,6 +151,17 @@ def estimate(self, mi: MethodInput) -> MethodOutput: sum_counter = float(fit["pred_upgraded"].sum()) uplift = sum_actual / sum_counter - 1.0 if np.isfinite(sum_counter) and sum_counter != 0 else float("nan") + by_condition: pd.DataFrame | None = None + if self.wind_speed_col is not None: + conditions = test_condition_signals( + scada, + test_wtg=mi.test_wtg, + turbine_col=mi.turbine_col, + wind_speed_col=self.wind_speed_col, + wind_speed_sd_col=self.wind_speed_sd_col, + ) + by_condition = self._conditional_uplift(conditions, upgraded_sel=upgraded_sel, fit=fit) + self._write( mi, index=index, @@ -164,7 +178,7 @@ def estimate(self, mi: MethodInput) -> MethodOutput: n_refs=n_refs, era5=era5, ) - return MethodOutput(p50_overall=uplift) + return MethodOutput(p50_overall=uplift, p50_by_condition=by_condition) def _add_era5( self, @@ -234,6 +248,20 @@ def _fit_predict( "pred_baseline_valid": pred_valid, } + def _conditional_uplift( + self, conditions: pd.DataFrame, *, upgraded_sel: np.ndarray, fit: dict[str, Any] + ) -> pd.DataFrame | None: + """Reduce the upgraded actual/counterfactual ledger to per-bin energy-ratio uplift.""" + cond_up = conditions.iloc[upgraded_sel] + actual = fit["y_upgraded"] + counterfactual = fit["pred_upgraded"] + frames = [] + for name in [c for c in ("ws", "ti") if c in cond_up.columns]: + table = energy_ratio_by_bin(cond_up[name].to_numpy(), actual, counterfactual, bins=CONDITION_BINS[name]) + table.insert(0, "condition", name) + frames.append(table[["condition", "condition_bin", "p50_uplift"]]) + return pd.concat(frames, ignore_index=True) if frames else None + def _make_model(self) -> Any: # noqa: ANN401 """Outcome model with ``seed`` plumbed into LightGBM's ``random_state`` (caller overrides win).""" return make_outcome_model(**{"random_state": self.seed, **self.model_params}) @@ -361,6 +389,7 @@ def _config_params(self) -> dict[str, Any]: "active_power_col": self.active_power_col, "availability_col": self.availability_col, "wind_speed_col": self.wind_speed_col, + "wind_speed_sd_col": self.wind_speed_sd_col, "seed": self.seed, "toggle_campaign_only": self.toggle_campaign_only, "has_era5": self.era5_hourly_df is not None, diff --git a/benchmarking/baselines/study_power_model_compare.py b/benchmarking/baselines/study_power_model_compare.py index e456576..40c51be 100644 --- a/benchmarking/baselines/study_power_model_compare.py +++ b/benchmarking/baselines/study_power_model_compare.py @@ -69,7 +69,14 @@ from benchmarking.baselines.hot_context import build_hot_v0_context from benchmarking.baselines.overnight_profiles import overnight_profiles from benchmarking.baselines.power_model import PowerModelMethod -from benchmarking.harness import StudyConfig, leaderboard, plot_campaign_curves, score_study +from benchmarking.harness import ( + StudyConfig, + conditional_leaderboard, + leaderboard, + plot_campaign_curves, + plot_conditional_uplift, + score_study, +) from benchmarking.synthetic import HOT_COLUMNS from benchmarking.synthetic.sources.hill_of_towie import load_hot_scada @@ -94,7 +101,7 @@ # a known-good commit, so future power_model changes are scored against it. Lives next to this script # (tracked) — update it deliberately with --update-baseline when an improvement is accepted. _BASELINE_PATH = Path(__file__).resolve().parent / "study_power_model_compare_baseline.json" -_BASELINE_SCHEMA = "power_model_compare_baseline_v1" +_BASELINE_SCHEMA = "power_model_compare_baseline_v2" # Per-cell metrics recorded and diffed. spread/score: lower is better; bias: |bias| nearer 0 is better. _METRIC_COLS = ["bias", "spread", "score"] # A delta smaller than this (fractional uplift) is floating-point noise, not a real change: an @@ -152,6 +159,7 @@ def run_power_model(mode: str, out_dir: Path) -> pd.DataFrame: active_power_col=HOT_COLUMNS.active_power, wind_speed_col=HOT_COLUMNS.wind_speed, availability_col=HOT_COLUMNS.availability, + wind_speed_sd_col=HOT_COLUMNS.wind_speed_sd, era5_hourly_df=context.reanalysis_datasets[0].data, out_dir=out_dir / "power_model_runs", ) @@ -257,18 +265,23 @@ def _git_commit() -> str: def power_model_leaderboard(fresh: pd.DataFrame) -> pd.DataFrame: - """Reduce the fresh ``power_model`` rows to one ``(profile, campaign_months)`` row of bias/spread/score.""" + """One row per (profile, campaign, condition, bin) of power_model bias/spread/score (overall incl.).""" pm = fresh[fresh["method"] == "power_model"] - summary = leaderboard(pm) - return summary[["profile", "campaign_months", *_METRIC_COLS]].sort_values(["profile", "campaign_months"]) + overall = leaderboard(pm).assign(condition="overall", condition_bin="overall") + conditional = conditional_leaderboard(pm) + cols = ["profile", "campaign_months", "condition", "condition_bin", *_METRIC_COLS] + stacked = pd.concat([overall[cols], conditional[cols]], ignore_index=True) + return stacked.sort_values(["profile", "campaign_months", "condition", "condition_bin"]) def record_baseline(lb_by_mode: dict[str, pd.DataFrame], study_by_mode: dict[str, StudyConfig], path: Path) -> None: """Write/refresh the committed benchmark for the given modes (other modes in the file are kept).""" doc: dict[str, Any] = {"schema": _BASELINE_SCHEMA, "modes": {}} if path.exists(): - doc = json.loads(path.read_text()) - doc.setdefault("modes", {}) + loaded = json.loads(path.read_text()) + if loaded.get("schema") == _BASELINE_SCHEMA: + doc = loaded + doc.setdefault("modes", {}) commit = _git_commit() now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") for mode, lb in lb_by_mode.items(): @@ -290,7 +303,16 @@ def _load_baseline_cells(mode: str, path: Path) -> tuple[pd.DataFrame, dict[str, """Load the recorded benchmark cells (+ provenance) for ``mode``, or ``None`` if not recorded.""" if not path.exists(): return None - entry = json.loads(path.read_text()).get("modes", {}).get(mode) + doc = json.loads(path.read_text()) + if doc.get("schema") != _BASELINE_SCHEMA: + logger.warning( + "Baseline %s has schema %r, expected %r — run --update-baseline to regenerate.", + path, + doc.get("schema"), + _BASELINE_SCHEMA, + ) + return None + entry = doc.get("modes", {}).get(mode) if entry is None: return None return pd.DataFrame(entry["cells"]), entry @@ -311,16 +333,17 @@ def compare_to_benchmark(mode: str, lb: pd.DataFrame, baseline_path: Path, compa ) return base, prov = loaded - merged = lb.merge(base, on=["profile", "campaign_months"], how="outer", suffixes=("", "_base")) + merge_keys = ["profile", "campaign_months", "condition", "condition_bin"] + merged = lb.merge(base, on=merge_keys, how="outer", suffixes=("", "_base")) for col in _METRIC_COLS: merged[f"d_{col}"] = merged[col] - merged[f"{col}_base"] merged["d_abs_bias"] = merged["bias"].abs() - merged["bias_base"].abs() - merged.sort_values(["profile", "campaign_months"]).to_csv( + merged.sort_values(["profile", "campaign_months", "condition", "condition_bin"]).to_csv( comparison_dir / f"benchmark_comparison_{mode}.csv", index=False ) pp = 100.0 - report = merged.groupby("campaign_months").mean(numeric_only=True) + report = merged.groupby(["campaign_months", "condition"]).mean(numeric_only=True) overall = merged.mean(numeric_only=True).to_frame().T overall.index = ["ALL"] table = pd.concat([report, overall]) @@ -387,6 +410,24 @@ def merge_and_plot(mode: str, fresh: pd.DataFrame, reference_mode_dir: Path, out mode, all_summary[["method", "profile", "campaign_months", "bias", "spread", "score"]].to_string(index=False), ) + + # Conditional comparison plots for power_model (v0/naive emit no conditional rows — expected). + pm_rows = merged[merged["method"] == "power_model"] + if not pm_rows.empty and "condition" in pm_rows.columns: + cond_lb = conditional_leaderboard(pm_rows) + if not cond_lb.empty: + for profile in profiles: + for condition in cond_lb["condition"].unique(): + subset = cond_lb[(cond_lb["profile"] == profile) & (cond_lb["condition"] == condition)] + if subset.empty: + continue + plot_conditional_uplift( + subset, + condition=condition, + save_path=comparison_dir / f"conditional_{profile}_{condition}.png", + title=f"{mode} - {profile} power_model vs {condition}", + ) + logger.info("Wrote %s comparison outputs to %s", mode, comparison_dir) return merged diff --git a/benchmarking/harness/__init__.py b/benchmarking/harness/__init__.py index acbe3fc..426b3ef 100644 --- a/benchmarking/harness/__init__.py +++ b/benchmarking/harness/__init__.py @@ -12,14 +12,25 @@ treated_activity_mask, window_row_mask, ) -from benchmarking.harness.leaderboard import leaderboard +from benchmarking.harness.conditions import ( + CONDITION_BINS, + CONDITIONS, + TI_BINS, + WS_BINS, + energy_ratio_by_bin, +) +from benchmarking.harness.leaderboard import conditional_leaderboard, leaderboard from benchmarking.harness.method import Method, MethodInput, MethodOutput from benchmarking.harness.metrics import ErrorSummary, summarize_errors -from benchmarking.harness.plots import plot_campaign_curves +from benchmarking.harness.plots import plot_campaign_curves, plot_conditional_uplift from benchmarking.harness.replicates import Replicate, StudyConfig, build_replicates from benchmarking.harness.scoring import score_study __all__ = [ + "CONDITIONS", + "CONDITION_BINS", + "TI_BINS", + "WS_BINS", "CampaignWindow", "ErrorSummary", "Method", @@ -29,8 +40,11 @@ "StudyConfig", "build_replicates", "campaign_windows", + "conditional_leaderboard", + "energy_ratio_by_bin", "leaderboard", "plot_campaign_curves", + "plot_conditional_uplift", "score_study", "summarize_errors", "treated_activity_mask", diff --git a/benchmarking/harness/conditions.py b/benchmarking/harness/conditions.py new file mode 100644 index 0000000..2200917 --- /dev/null +++ b/benchmarking/harness/conditions.py @@ -0,0 +1,49 @@ +"""Shared condition bins and the binned energy-ratio reducer. + +One source of truth for the wind-speed / TI bin edges, imported by both the method (to bin its +counterfactual ledger) and the harness truth path, so the two bin identically. The reducer is the +same energy ratio as the overall number, taken within each bin: ``Σactual / Σcounterfactual - 1``. +""" + +from __future__ import annotations + +import numpy as np +import numpy.typing as npt +import pandas as pd + +WS_BINS: list[float] = [float(x) for x in np.arange(0.0, 28.0, 2.0)] # 0,2,…,26 +TI_BINS: list[float] = [round(float(x), 2) for x in np.arange(0.0, 0.55, 0.05)] # 0,0.05,…,0.50 +CONDITIONS: tuple[str, ...] = ("ws", "ti") +CONDITION_BINS: dict[str, list[float]] = {"ws": WS_BINS, "ti": TI_BINS} + + +def energy_ratio_by_bin( + condition_values: npt.ArrayLike, + actual: npt.ArrayLike, + counterfactual: npt.ArrayLike, + *, + bins: list[float], +) -> pd.DataFrame: + """Energy-ratio uplift within bins of a condition signal (NaN-safe; every bin represented).""" + cond = np.asarray(condition_values, dtype=float) + act = np.asarray(actual, dtype=float) + cf = np.asarray(counterfactual, dtype=float) + finite = np.isfinite(cond) & np.isfinite(act) & np.isfinite(cf) + frame = pd.DataFrame( + { + "condition_bin": pd.cut(cond[finite], bins=bins).astype(str), + "actual": act[finite], + "counterfactual": cf[finite], + } + ) + all_bins = pd.cut([], bins=bins).categories.astype(str) + grouped = frame.groupby("condition_bin", observed=False) + table = grouped[["actual", "counterfactual"]].sum() + table["n_records"] = grouped.size() + table = table.reindex(all_bins) + table["n_records"] = table["n_records"].fillna(0).astype(int) + denom = table["counterfactual"].to_numpy() + table["p50_uplift"] = ( + np.divide(table["actual"].to_numpy(), denom, out=np.full(len(table), np.nan), where=denom != 0) - 1.0 + ) + return table.reset_index(names="condition_bin")[["condition_bin", "p50_uplift", "n_records"]] diff --git a/benchmarking/harness/leaderboard.py b/benchmarking/harness/leaderboard.py index 5d2f7fb..f76197a 100644 --- a/benchmarking/harness/leaderboard.py +++ b/benchmarking/harness/leaderboard.py @@ -12,6 +12,7 @@ from benchmarking.harness.metrics import summarize_errors _GROUP_KEYS = ["method", "profile", "campaign_months"] +_CONDITION_GROUP_KEYS = ["method", "profile", "campaign_months", "condition", "condition_bin"] def leaderboard(results_df: pd.DataFrame) -> pd.DataFrame: @@ -56,3 +57,32 @@ def leaderboard(results_df: pd.DataFrame) -> pd.DataFrame: "wall_time_s_mean", ] return pd.DataFrame(records, columns=columns) + + +def conditional_leaderboard(results_df: pd.DataFrame) -> pd.DataFrame: + """Per-(method, profile, campaign, condition, bin) bias/spread/score over the conditional rows. + + Only per-condition rows (``condition`` in ["ws", "ti"]) are summarised; overall rows + (``condition == "overall"``) are excluded. Returns one row per group with ``bias``, + ``spread``, ``score``, the mean recovered and true uplift (``mean_estimate`` / + ``mean_truth``, when those columns are present in the input), and ``n_replicates``, + sorted by method, profile, campaign length, condition, then condition_bin. + """ + cond = results_df[results_df["condition"].isin(["ws", "ti"])] if "condition" in results_df else results_df.iloc[:0] + + records = [] + for keys, group in cond.groupby(_CONDITION_GROUP_KEYS, sort=True): + summary = summarize_errors(group["signed_error"].to_numpy()) + records.append( + { + **dict(zip(_CONDITION_GROUP_KEYS, keys, strict=True)), + "bias": summary.bias, + "spread": summary.spread, + "score": summary.score, + "mean_estimate": float(group["estimate"].mean()) if "estimate" in group else float("nan"), + "mean_truth": float(group["truth"].mean()) if "truth" in group else float("nan"), + "n_replicates": summary.n, + } + ) + columns = [*_CONDITION_GROUP_KEYS, "bias", "spread", "score", "mean_estimate", "mean_truth", "n_replicates"] + return pd.DataFrame(records, columns=columns) diff --git a/benchmarking/harness/method.py b/benchmarking/harness/method.py index 483c08f..e2f26a9 100644 --- a/benchmarking/harness/method.py +++ b/benchmarking/harness/method.py @@ -44,8 +44,9 @@ class MethodOutput: """A method's P50 uplift estimate. :param p50_overall: overall P50 uplift (energy-ratio fraction) - :param p50_by_condition: optional per-condition estimates (columns ``condition_bin``, - ``p50_uplift``); ``None`` when the method produces only an overall number + :param p50_by_condition: optional per-condition estimates (columns ``condition``, + ``condition_bin``, ``p50_uplift``); ``condition`` ∈ {"ws","ti"}; ``None`` when the + method produces only an overall number """ p50_overall: float diff --git a/benchmarking/harness/plots.py b/benchmarking/harness/plots.py index 28cc07f..3f6a012 100644 --- a/benchmarking/harness/plots.py +++ b/benchmarking/harness/plots.py @@ -105,3 +105,41 @@ def _set_score_ylim(ax: plt.Axes, scores_pp: np.ndarray) -> None: span = hi - lo margin = 0.05 * span if span > 0 else max(abs(hi), 1.0) * 0.05 ax.set_ylim(bottom=min(0.0, lo) - margin) + + +def plot_conditional_uplift( + summary_df: pd.DataFrame, + *, + condition: str, + save_path: str | Path | None = None, + title: str | None = None, +) -> Figure: + """Plot mean recovered vs true uplift across bins of one condition, with a bias±spread band.""" + df = summary_df[summary_df["condition"] == condition].copy() + df["_left"] = df["condition_bin"].str.extract(r"\(([-0-9.]+),").astype(float) + df = df.sort_values("_left") + order = df.drop_duplicates("condition_bin")["condition_bin"].tolist() + x = np.arange(len(order)) + + fig, ax = plt.subplots(figsize=(9, 5)) + truth = df.drop_duplicates("condition_bin").set_index("condition_bin").reindex(order)["mean_truth"] + ax.plot(x, truth.to_numpy() * _FRACTION_TO_PP, "--", marker="s", color="k", label="true uplift") + for i, method in enumerate(sorted(df["method"].unique())): + m = df[df["method"] == method].set_index("condition_bin").reindex(order) + est = m["mean_estimate"].to_numpy() * _FRACTION_TO_PP + ax.plot(x, est, marker="o", color=f"C{i}", label=method) + if "spread" in m: + sp = m["spread"].to_numpy() * _FRACTION_TO_PP + ax.fill_between(x, est - sp, est + sp, color=f"C{i}", alpha=0.2) + ax.set_xticks(x) + ax.set_xticklabels(order, rotation=45, ha="right") + ax.set_xlabel(condition) + ax.set_ylabel("uplift [pp]") + ax.axhline(0.0, color="grey", lw=0.8) + ax.legend() + if title: + ax.set_title(title) + fig.tight_layout() + if save_path is not None: + fig.savefig(save_path, dpi=120) + return fig diff --git a/benchmarking/harness/scoring.py b/benchmarking/harness/scoring.py index fb61819..f381835 100644 --- a/benchmarking/harness/scoring.py +++ b/benchmarking/harness/scoring.py @@ -23,6 +23,7 @@ import pandas as pd from benchmarking.harness.campaign import campaign_windows, treated_activity_mask, window_row_mask +from benchmarking.harness.conditions import CONDITION_BINS from benchmarking.harness.method import MethodInput from benchmarking.harness.replicates import build_replicates from benchmarking.synthetic import HOT_COLUMNS @@ -30,6 +31,9 @@ if TYPE_CHECKING: from collections.abc import Callable + import numpy as np + import numpy.typing as npt + from benchmarking.harness.campaign import CampaignWindow from benchmarking.harness.method import Method from benchmarking.harness.replicates import Replicate, StudyConfig @@ -66,33 +70,40 @@ def score_study( instances = _materialise_instances(replicates, study, data_start=data_start, data_end=data_end) # Truth depends only on ``(replicate, window)``, so compute it once here rather than once # per method (it would otherwise carry an avoidable ``len(methods)`` multiplier). - truths = [_truth_overall(replicate, window) for replicate, window in instances] + truth_masks = [_truth_mask(r, w) for r, w in instances] + truths = [r.true_uplift(mask=m).overall for (r, _), m in zip(instances, truth_masks, strict=True)] rows = [] for method in methods: - method_rows = [] - for (replicate, window), truth in zip(instances, truths, strict=True): + method_rows: list[dict[str, object]] = [] + for (replicate, window), truth, mask in zip(instances, truths, truth_masks, strict=True): method_input = _method_input(replicate, window) start = time.perf_counter() output = method.estimate(method_input) wall_time_s = time.perf_counter() - start + base_fields: dict[str, object] = { + "method": method.name, + "profile": profile_name, + "replicate": replicate.replicate_id, + "test_wtg": replicate.test_wtg, + "campaign_months": window.months, + "treatment_start": window.treatment_start, + "baseline_start": window.baseline_start, + "activity_end": window.activity_end, + } method_rows.append( { - "method": method.name, - "profile": profile_name, - "replicate": replicate.replicate_id, - "test_wtg": replicate.test_wtg, - "campaign_months": window.months, - "treatment_start": window.treatment_start, - "baseline_start": window.baseline_start, - "activity_end": window.activity_end, + **base_fields, "condition": "overall", + "condition_bin": "overall", "estimate": output.p50_overall, "truth": truth, "signed_error": output.p50_overall - truth, "wall_time_s": wall_time_s, } ) + if output.p50_by_condition is not None: + method_rows.extend(_conditional_rows(output.p50_by_condition, replicate, window, mask, base_fields)) if on_method_complete is not None: on_method_complete(method.name, pd.DataFrame(method_rows)) rows.extend(method_rows) @@ -132,9 +143,42 @@ def _method_input(replicate: Replicate, window: CampaignWindow) -> MethodInput: ) -def _truth_overall(replicate: Replicate, window: CampaignWindow) -> float: - """Ground-truth uplift over the test turbine's treated rows within the activity window.""" +def _truth_mask(replicate: Replicate, window: CampaignWindow) -> npt.NDArray[np.bool_]: + """Return the treated-activity mask over the test turbine's rows for this window.""" synthetic = replicate.synthetic_df test_index = synthetic.loc[synthetic[replicate.dataset.columns.turbine] == replicate.test_wtg].index - mask = treated_activity_mask(test_index, replicate.upgrade_timing, window=window) - return replicate.true_uplift(mask=mask).overall + return treated_activity_mask(test_index, replicate.upgrade_timing, window=window) + + +def _conditional_rows( + by_condition: pd.DataFrame, + replicate: Replicate, + window: CampaignWindow, # noqa: ARG001 (kept for symmetry / future use) + mask: npt.NDArray[np.bool_], + base_fields: dict[str, object], +) -> list[dict[str, object]]: + """Join each method per-bin estimate to per-bin truth for every condition present.""" + rows: list[dict[str, object]] = [] + for condition, est in by_condition.groupby("condition"): + truth_df = replicate.true_uplift(mask=mask, by=condition, bins=CONDITION_BINS[str(condition)]).by_condition + if truth_df is None: # always set when by= is passed; guard for type narrowing + continue # pragma: no cover + truth_series = truth_df.assign(condition_bin=truth_df["condition_bin"].astype(str)).set_index("condition_bin")[ + "true_uplift" + ] + for _, r in est.iterrows(): + bin_label = str(r["condition_bin"]) + t = float(truth_series.get(bin_label, float("nan"))) + e = float(r["p50_uplift"]) + rows.append( + { + **base_fields, + "condition": condition, + "condition_bin": bin_label, + "estimate": e, + "truth": t, + "signed_error": e - t, + "wall_time_s": float("nan"), + } + ) + return rows diff --git a/tests/benchmarking/baselines/test_inspect_prepost_hard_case.py b/tests/benchmarking/baselines/test_inspect_prepost_hard_case.py new file mode 100644 index 0000000..c5689a1 --- /dev/null +++ b/tests/benchmarking/baselines/test_inspect_prepost_hard_case.py @@ -0,0 +1,23 @@ +"""The hard-case inspector's conditional-uplift plotting helper.""" + +from __future__ import annotations + +import pandas as pd + +from benchmarking.baselines.inspect_prepost_hard_case import conditional_truth_vs_estimate +from benchmarking.harness.method import MethodOutput + + +def test_conditional_truth_vs_estimate_merges_estimate_and_truth() -> None: + out = MethodOutput( + p50_overall=0.05, + p50_by_condition=pd.DataFrame( + {"condition": ["ws", "ws"], "condition_bin": ["(6.0, 8.0]", "(8.0, 10.0]"], "p50_uplift": [0.06, 0.04]} + ), + ) + truth = pd.DataFrame({"condition_bin": ["(6.0, 8.0]", "(8.0, 10.0]"], "true_uplift": [0.05, 0.05]}) + merged = conditional_truth_vs_estimate(out, {"ws": truth}, method_name="power_model") + row = merged[merged["condition_bin"] == "(6.0, 8.0]"].iloc[0] + assert row["mean_estimate"] == 0.06 + assert row["mean_truth"] == 0.05 + assert row["method"] == "power_model" diff --git a/tests/benchmarking/baselines/test_power_model_method.py b/tests/benchmarking/baselines/test_power_model_method.py index f1294ff..875c364 100644 --- a/tests/benchmarking/baselines/test_power_model_method.py +++ b/tests/benchmarking/baselines/test_power_model_method.py @@ -20,6 +20,7 @@ _POWER = "wtc_ActPower_mean" _AVAIL = "wtc_ScReToOp_timeon" _WS = "wtc_AcWindSp_mean" +_WS_SD = "wtc_AcWindSp_stddev" # Small/fast LightGBM so the toy data (a few thousand rows) is fit well. _FAST_PARAMS = {"n_estimators": 120, "learning_rate": 0.1, "num_leaves": 31, "min_child_samples": 20} @@ -45,7 +46,9 @@ def _toy_scada(n: int, *, uplift: float, treated: np.ndarray, seed: int = 0) -> "R3": r3, } parts = [ - pd.DataFrame({_TURBINE: name, _POWER: power, _AVAIL: 600.0, _WS: power / 100.0}, index=idx) + pd.DataFrame( + {_TURBINE: name, _POWER: power, _AVAIL: 600.0, _WS: power / 100.0, _WS_SD: power / 1000.0}, index=idx + ) for name, power in frames.items() ] return pd.concat(parts) @@ -131,3 +134,37 @@ def test_leak_bait_test_column_does_not_change_estimate(self) -> None: with_leak = method.estimate(mi_leak).p50_overall # the reference-only builder ignores test-turbine columns, so the estimate is unchanged assert with_leak == pytest.approx(baseline, abs=1e-9) + + +class TestConditionalUplift: + def test_emits_conditional_uplift_by_ws_and_ti(self) -> None: + n = 4000 + idx = pd.date_range("2019-01-01", periods=n, freq="10min", tz="UTC") + changeover = idx[n // 2] + treated = np.asarray(idx >= changeover) + scada = _toy_scada(n, uplift=0.05, treated=treated) # now includes _WS_SD + method = PowerModelMethod( + active_power_col=_POWER, + availability_col=_AVAIL, + wind_speed_col=_WS, + wind_speed_sd_col=_WS_SD, + model_params=_FAST_PARAMS, + ) + mi = MethodInput(scada_df=scada, test_wtg="T1", upgrade_timing=pd.Timestamp(changeover), turbine_col=_TURBINE) + out = method.estimate(mi) + bc = out.p50_by_condition + assert list(bc.columns) == ["condition", "condition_bin", "p50_uplift"] + assert set(bc["condition"]) == {"ws", "ti"} + + def test_ws_only_when_no_sd_column_configured(self) -> None: + n = 4000 + idx = pd.date_range("2019-01-01", periods=n, freq="10min", tz="UTC") + changeover = idx[n // 2] + treated = np.asarray(idx >= changeover) + scada = _toy_scada(n, uplift=0.05, treated=treated) + method = PowerModelMethod( + active_power_col=_POWER, availability_col=_AVAIL, wind_speed_col=_WS, model_params=_FAST_PARAMS + ) + mi = MethodInput(scada_df=scada, test_wtg="T1", upgrade_timing=pd.Timestamp(changeover), turbine_col=_TURBINE) + out = method.estimate(mi) + assert set(out.p50_by_condition["condition"]) == {"ws"} diff --git a/tests/benchmarking/baselines/test_study_power_model_compare.py b/tests/benchmarking/baselines/test_study_power_model_compare.py new file mode 100644 index 0000000..3323833 --- /dev/null +++ b/tests/benchmarking/baselines/test_study_power_model_compare.py @@ -0,0 +1,193 @@ +"""Benchmark reshaping in study_power_model_compare.""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +import pandas as pd + +if TYPE_CHECKING: + from pathlib import Path + +from benchmarking.baselines.study_power_model_compare import ( + _BASELINE_SCHEMA, + _load_baseline_cells, + power_model_leaderboard, + record_baseline, +) +from benchmarking.harness import StudyConfig + + +def test_power_model_leaderboard_includes_overall_and_conditional_cells() -> None: + fresh = pd.DataFrame( + { + "method": "power_model", + "profile": "ws_dependent_cp", + "campaign_months": 6, + "replicate": [0, 1, 0, 1], + "condition": ["overall", "overall", "ws", "ws"], + "condition_bin": ["overall", "overall", "(6.0, 8.0]", "(6.0, 8.0]"], + "estimate": [0.05, 0.05, 0.08, 0.06], + "truth": [0.05, 0.05, 0.07, 0.07], + "signed_error": [0.0, 0.0, 0.01, -0.01], + } + ) + lb = power_model_leaderboard(fresh) + keys = set(zip(lb["condition"], lb["condition_bin"], strict=True)) + assert ("overall", "overall") in keys + assert ("ws", "(6.0, 8.0]") in keys + assert {"profile", "campaign_months", "condition", "condition_bin", "bias", "spread", "score"} <= set(lb.columns) + + +def _minimal_study() -> StudyConfig: + return StudyConfig( + mode="prepost", + turbine_subset=["WT01"], + treatment_start_range=(pd.Timestamp("2020-01-01", tz="UTC"), pd.Timestamp("2020-06-01", tz="UTC")), + min_pre_months=6, + campaign_months=[6], + n_replicates=1, + seed=0, + ) + + +def _minimal_leaderboard() -> pd.DataFrame: + return pd.DataFrame( + { + "profile": ["test_profile"], + "campaign_months": [6], + "condition": ["overall"], + "condition_bin": ["overall"], + "bias": [0.01], + "spread": [0.02], + "score": [0.03], + } + ) + + +def test_load_baseline_cells_returns_none_for_wrong_schema(tmp_path: Path) -> None: + """_load_baseline_cells must return None (not crash) when the file has an old/mismatched schema.""" + path = tmp_path / "baseline.json" + doc = { + "schema": "power_model_compare_baseline_v1", + "modes": { + "prepost": { + "recorded_utc": "2025-01-01T00:00:00Z", + "git_commit": "abc1234", + "n_replicates": 1, + "seed": 0, + "campaign_months": [6], + "profiles": ["test_profile"], + "cells": [{"profile": "test_profile", "campaign_months": 6, "bias": 0.01}], + } + }, + } + path.write_text(json.dumps(doc)) + + result = _load_baseline_cells("prepost", path) + + assert result is None + + +def test_record_baseline_drops_stale_modes_on_schema_bump(tmp_path: Path) -> None: + """When on-disk schema != _BASELINE_SCHEMA, old-schema mode cells must be dropped (not inherited).""" + path = tmp_path / "baseline.json" + # Simulate a v1 file that has a toggle entry with v1-shaped cells (no condition/condition_bin). + old_doc = { + "schema": "power_model_compare_baseline_v1", + "modes": { + "toggle": { + "recorded_utc": "2025-01-01T00:00:00Z", + "git_commit": "abc1234", + "n_replicates": 1, + "seed": 0, + "campaign_months": [6], + "profiles": ["test_profile"], + "cells": [ + {"profile": "test_profile", "campaign_months": 6, "bias": 0.01, "spread": 0.02, "score": 0.03} + ], + } + }, + } + path.write_text(json.dumps(old_doc)) + + lb = _minimal_leaderboard() + study = _minimal_study() + record_baseline({"prepost": lb}, {"prepost": study}, path) + + written = json.loads(path.read_text()) + assert written["schema"] == _BASELINE_SCHEMA + # The stale v1 toggle entry must be gone — a later toggle compare must not KeyError on missing columns. + assert "toggle" not in written.get("modes", {}), "stale v1 toggle cells must be dropped after schema bump" + # Equivalently, _load_baseline_cells for toggle returns None. + assert _load_baseline_cells("toggle", path) is None + + +def test_record_baseline_preserves_sibling_modes_when_schemas_match(tmp_path: Path) -> None: + """When on-disk schema matches _BASELINE_SCHEMA, sibling modes must be preserved (incremental update).""" + path = tmp_path / "baseline.json" + # Write a v2 file that already has a toggle entry. + existing_toggle_cells = [ + { + "profile": "test_profile", + "campaign_months": 6, + "condition": "overall", + "condition_bin": "overall", + "bias": 0.05, + "spread": 0.06, + "score": 0.07, + } + ] + existing_doc = { + "schema": _BASELINE_SCHEMA, + "modes": { + "toggle": { + "recorded_utc": "2025-01-01T00:00:00Z", + "git_commit": "abc1234", + "n_replicates": 1, + "seed": 0, + "campaign_months": [6], + "profiles": ["test_profile"], + "cells": existing_toggle_cells, + } + }, + } + path.write_text(json.dumps(existing_doc)) + + lb = _minimal_leaderboard() + study = _minimal_study() + record_baseline({"prepost": lb}, {"prepost": study}, path) + + written = json.loads(path.read_text()) + assert written["schema"] == _BASELINE_SCHEMA + # The sibling toggle mode must still be present (incremental update must not drop it). + assert "toggle" in written.get("modes", {}), "matching-schema sibling mode must be preserved" + assert "prepost" in written.get("modes", {}), "newly recorded prepost mode must be present" + toggle_result = _load_baseline_cells("toggle", path) + assert toggle_result is not None + + +def test_record_baseline_stamps_current_schema_over_old_file(tmp_path: Path) -> None: + """record_baseline must overwrite 'schema' with _BASELINE_SCHEMA even when an old-schema file exists.""" + path = tmp_path / "baseline.json" + # Write a v1 stub to simulate the on-disk old file. + old_doc = { + "schema": "power_model_compare_baseline_v1", + "modes": {}, + } + path.write_text(json.dumps(old_doc)) + + lb = _minimal_leaderboard() + study = _minimal_study() + record_baseline({"prepost": lb}, {"prepost": study}, path) + + written = json.loads(path.read_text()) + assert written["schema"] == _BASELINE_SCHEMA + + # The round-trip via _load_baseline_cells must now succeed (not return None). + loaded = _load_baseline_cells("prepost", path) + assert loaded is not None + cells_df, prov = loaded + assert isinstance(cells_df, pd.DataFrame) + assert prov["n_replicates"] == 1 diff --git a/tests/benchmarking/harness/stubs.py b/tests/benchmarking/harness/stubs.py index 6ab53f2..aac4b97 100644 --- a/tests/benchmarking/harness/stubs.py +++ b/tests/benchmarking/harness/stubs.py @@ -8,16 +8,13 @@ from __future__ import annotations -from typing import TYPE_CHECKING - import numpy as np +import pandas as pd +from benchmarking.harness.conditions import CONDITION_BINS, energy_ratio_by_bin from benchmarking.harness.method import MethodInput, MethodOutput from benchmarking.synthetic import HOT_COLUMNS, treated_mask -if TYPE_CHECKING: - import pandas as pd - def oracle_overall_uplift(mi: MethodInput, original_df: pd.DataFrame) -> float: """Energy-ratio uplift over the treated test-turbine rows in ``mi``'s window.""" @@ -88,3 +85,37 @@ def estimate(self, mi: MethodInput) -> MethodOutput: ) self.seen.append(captured) return MethodOutput(p50_overall=0.0) + + +def conditional_oracle_by_condition(mi: MethodInput, original_df: pd.DataFrame) -> pd.DataFrame: + """Per-bin oracle uplift over treated rows, binned on the test turbine's measured ws/ti.""" + syn = mi.scada_df + test_rows = syn[syn[mi.turbine_col] == mi.test_wtg] + treated = treated_mask(test_rows.index, mi.upgrade_timing) + treated_rows = test_rows[treated] + orig_test = original_df[original_df[mi.turbine_col] == mi.test_wtg] + actual = treated_rows[HOT_COLUMNS.active_power].to_numpy(dtype=float) + counterfactual = orig_test.loc[treated_rows.index, HOT_COLUMNS.active_power].to_numpy(dtype=float) + ws = treated_rows[HOT_COLUMNS.wind_speed].to_numpy(dtype=float) + sd = treated_rows[HOT_COLUMNS.wind_speed_sd].to_numpy(dtype=float) + ti = np.divide(sd, ws, out=np.full_like(sd, np.nan), where=ws != 0) + frames = [] + for name, values in (("ws", ws), ("ti", ti)): + table = energy_ratio_by_bin(values, actual, counterfactual, bins=CONDITION_BINS[name]) + table.insert(0, "condition", name) + frames.append(table[["condition", "condition_bin", "p50_uplift"]]) + return pd.concat(frames, ignore_index=True) + + +class ConditionalOracleMethod: + """Oracle that also emits per-bin oracle uplift; conditional signed error should be ~0.""" + + def __init__(self, original_df: pd.DataFrame, name: str = "cond_oracle") -> None: + self._original = original_df + self.name = name + + def estimate(self, mi: MethodInput) -> MethodOutput: + return MethodOutput( + p50_overall=oracle_overall_uplift(mi, self._original), + p50_by_condition=conditional_oracle_by_condition(mi, self._original), + ) diff --git a/tests/benchmarking/harness/test_conditions.py b/tests/benchmarking/harness/test_conditions.py new file mode 100644 index 0000000..511cec1 --- /dev/null +++ b/tests/benchmarking/harness/test_conditions.py @@ -0,0 +1,47 @@ +"""Tests for the shared condition bins and the binned energy-ratio reducer.""" + +from __future__ import annotations + +import numpy as np + +from benchmarking.harness.conditions import ( + CONDITION_BINS, + CONDITIONS, + TI_BINS, + WS_BINS, + energy_ratio_by_bin, +) + + +def test_bin_edges_have_expected_width() -> None: + assert WS_BINS[0] == 0.0 + assert WS_BINS[-1] == 26.0 + assert np.allclose(np.diff(WS_BINS), 2.0) + assert TI_BINS[0] == 0.0 + assert TI_BINS[-1] == 0.5 + assert np.allclose(np.diff(TI_BINS), 0.05) + assert CONDITIONS == ("ws", "ti") + assert CONDITION_BINS == {"ws": WS_BINS, "ti": TI_BINS} + + +def test_energy_ratio_by_bin_computes_per_bin_ratio() -> None: + # two rows in (4,6], two in (6,8]; counterfactual constant so uplift = mean(actual)/cf - 1 + cond = np.array([5.0, 5.0, 7.0, 7.0]) + actual = np.array([110.0, 90.0, 150.0, 150.0]) + counterfactual = np.array([100.0, 100.0, 100.0, 100.0]) + out = energy_ratio_by_bin(cond, actual, counterfactual, bins=WS_BINS) + by = out.set_index("condition_bin") + assert by.loc["(4.0, 6.0]", "p50_uplift"] == 0.0 # (110+90)/200 - 1 + assert by.loc["(6.0, 8.0]", "p50_uplift"] == 0.5 # 300/200 - 1 + assert by.loc["(4.0, 6.0]", "n_records"] == 2 + + +def test_energy_ratio_by_bin_is_nan_safe_and_covers_all_bins() -> None: + cond = np.array([5.0, np.nan, 7.0]) + actual = np.array([100.0, 50.0, np.nan]) + counterfactual = np.array([100.0, 50.0, 100.0]) + out = energy_ratio_by_bin(cond, actual, counterfactual, bins=WS_BINS) + # one row per bin edge interval, no warnings, empty bins -> NaN uplift + assert len(out) == len(WS_BINS) - 1 + assert out["condition_bin"].is_unique + assert out.loc[out["condition_bin"] == "(6.0, 8.0]", "p50_uplift"].isna().all() diff --git a/tests/benchmarking/harness/test_leaderboard.py b/tests/benchmarking/harness/test_leaderboard.py index ddb83ab..3716cb7 100644 --- a/tests/benchmarking/harness/test_leaderboard.py +++ b/tests/benchmarking/harness/test_leaderboard.py @@ -7,7 +7,7 @@ import pandas as pd import pytest -from benchmarking.harness.leaderboard import leaderboard +from benchmarking.harness.leaderboard import conditional_leaderboard, leaderboard def _results(rows: list[dict]) -> pd.DataFrame: @@ -94,3 +94,48 @@ def test_methods_are_compared_side_by_side() -> None: by_method = summary.set_index("method") assert by_method.loc["a", "score"] == pytest.approx(0.0) assert by_method.loc["b", "score"] == pytest.approx(0.1) + + +def test_conditional_leaderboard_groups_by_condition_bin() -> None: + df = pd.DataFrame( + { + "method": "m", + "profile": "p", + "campaign_months": 6, + "condition": ["ws", "ws", "ws", "ws"], + "condition_bin": ["(6.0, 8.0]", "(6.0, 8.0]", "(8.0, 10.0]", "(8.0, 10.0]"], + "estimate": [0.11, 0.09, 0.05, 0.05], + "truth": [0.10, 0.10, 0.05, 0.05], + "signed_error": [0.01, -0.01, 0.0, 0.0], + } + ) + lb = conditional_leaderboard(df) + assert set(lb.columns) >= { + "method", + "profile", + "campaign_months", + "condition", + "condition_bin", + "bias", + "spread", + "score", + } + row = lb[lb["condition_bin"] == "(6.0, 8.0]"].iloc[0] + assert row["bias"] == 0.0 + assert row["spread"] == pytest.approx(0.01) + + +def test_conditional_leaderboard_ignores_overall_rows() -> None: + df = pd.DataFrame( + { + "method": "m", + "profile": "p", + "campaign_months": 6, + "condition": ["overall"], + "condition_bin": ["overall"], + "estimate": [0.1], + "truth": [0.1], + "signed_error": [0.0], + } + ) + assert conditional_leaderboard(df).empty diff --git a/tests/benchmarking/harness/test_plots.py b/tests/benchmarking/harness/test_plots.py index 1d53e42..0104505 100644 --- a/tests/benchmarking/harness/test_plots.py +++ b/tests/benchmarking/harness/test_plots.py @@ -8,9 +8,10 @@ mpl.use("Agg") # headless: no display needed for tests +import matplotlib.pyplot as plt import pandas as pd -from benchmarking.harness.plots import plot_campaign_curves +from benchmarking.harness.plots import plot_campaign_curves, plot_conditional_uplift if TYPE_CHECKING: from pathlib import Path @@ -91,3 +92,21 @@ def test_saves_file(tmp_path: Path) -> None: plot_campaign_curves(_summary(), save_path=save_path) assert save_path.exists() assert save_path.stat().st_size > 0 + + +def test_plot_conditional_uplift_writes_png(tmp_path: Path) -> None: + summary = pd.DataFrame( + { + "method": ["power_model", "power_model"], + "condition": ["ws", "ws"], + "condition_bin": ["(4.0, 6.0]", "(6.0, 8.0]"], + "mean_estimate": [0.09, 0.05], + "mean_truth": [0.10, 0.05], + "bias": [-0.01, 0.0], + "spread": [0.01, 0.005], + } + ) + out = tmp_path / "cond.png" + fig = plot_conditional_uplift(summary, condition="ws", save_path=out, title="ws_dependent_cp 6mo") + assert out.exists() + plt.close(fig) diff --git a/tests/benchmarking/harness/test_scoring.py b/tests/benchmarking/harness/test_scoring.py index 4e18344..8f24717 100644 --- a/tests/benchmarking/harness/test_scoring.py +++ b/tests/benchmarking/harness/test_scoring.py @@ -10,7 +10,7 @@ from benchmarking.synthetic import HOT_COLUMNS, ConstantCpChange from wind_up.constants import TIMESTAMP_COL -from .stubs import BiasedMethod, OracleMethod, RecordingMethod +from .stubs import BiasedMethod, ConditionalOracleMethod, OracleMethod, RecordingMethod PROFILE = [ConstantCpChange(delta=0.05)] @@ -180,3 +180,23 @@ def test_on_method_complete_is_optional() -> None: # anything else. drop = ["wall_time_s"] pd.testing.assert_frame_equal(with_cb.drop(columns=drop), without_cb.drop(columns=drop)) + + +def test_conditional_rows_are_emitted_with_truth_and_near_zero_error() -> None: + base = _base_scada() + results = score_study(base, profile=PROFILE, methods=[ConditionalOracleMethod(base)], study=_study(n_replicates=1)) + assert "condition_bin" in results.columns + overall = results[results["condition"] == "overall"] + assert (overall["condition_bin"] == "overall").all() + cond = results[results["condition"].isin(["ws", "ti"])] + assert set(cond["condition"]) == {"ws", "ti"} + # populated bins: a per-bin oracle must match per-bin truth + populated = cond[cond["truth"].notna() & cond["estimate"].notna()] + assert len(populated) > 0 + assert np.allclose(populated["signed_error"].to_numpy(), 0.0, atol=1e-9) + + +def test_overall_only_method_emits_no_conditional_rows() -> None: + base = _base_scada() + results = score_study(base, profile=PROFILE, methods=[OracleMethod(base)], study=_study(n_replicates=1)) + assert set(results["condition"]) == {"overall"}