From ddd1b632cb958cc95c72ca23ac0565e4145587df Mon Sep 17 00:00:00 2001
From: aclerc <alex.clerc@res-group.com>
Date: Tue, 30 Jun 2026 10:59:39 +0100
Subject: [PATCH 1/3] add study_power_model_compare.py

---
 .../baselines/study_power_model_compare.py    | 292 ++++++++++++++++++
 docs/v1/findings.md                           |  88 ++++++
 2 files changed, 380 insertions(+)
 create mode 100644 benchmarking/baselines/study_power_model_compare.py

diff --git a/benchmarking/baselines/study_power_model_compare.py b/benchmarking/baselines/study_power_model_compare.py
new file mode 100644
index 0000000..30d605c
--- /dev/null
+++ b/benchmarking/baselines/study_power_model_compare.py
@@ -0,0 +1,292 @@
+"""Re-run ``power_model`` over the overnight cases and compare it against the existing v0 + naive.
+
+Iterative-development helper. The overnight v0 runs are very slow, so they are computed once
+(:mod:`benchmarking.baselines.study_overnight_prepost` / ``study_overnight_toggle`` with
+``include_v0=True``) and kept on disk. As ``power_model`` is improved you only want to re-run the
+cheap method and re-draw the comparison plots, reusing the frozen v0 (and naive) numbers. That is
+what this script does:
+
+1. **Run** ``power_model`` **only** over the *exact* current overnight prepost + toggle configs —
+   same seven :func:`~benchmarking.baselines.overnight_profiles.overnight_profiles`, same campaign
+   grids, ``n_replicates=4``, ``seed=0`` — so every ``(profile, turbine, treatment-start, campaign)``
+   case matches the frozen overnight run. v0 and naive are *not* recomputed (v0 is very slow and
+   both are already frozen in the reference run).
+2. **Merge** each fresh ``power_model`` result with the ``v0_binned`` and ``naive_ratio`` rows
+   pulled from the reference overnight directory.
+3. **Plot** a three-method campaign-length comparison (``naive_ratio``, ``v0_binned``,
+   ``power_model``) per profile, and write merged per-profile + all-profiles leaderboards.
+
+An alignment guard checks that the method-independent ground ``truth`` of every fresh case equals
+the reference run's truth for the same key; a mismatch means the configs have drifted and the
+merge would be comparing different cases, so it fails loudly rather than plotting nonsense. (The
+``truth`` column the harness attaches to every ``power_model`` row is itself the cross-check, so
+re-running the oracle/naive anchors purely to validate the line-up is unnecessary.)
+
+Run from the repo root::
+
+    uv run python -m benchmarking.baselines.study_power_model_compare \
+        --reference-dir "~/temp/wind-up-benchmarking/badass overnight runs 30 June"
+
+Use ``--skip-run`` to only re-merge/re-plot from a previous ``power_model`` run under
+``--output-dir`` (e.g. to tweak plotting without re-fitting). Use ``--modes prepost`` /
+``--modes toggle`` to restrict to one mode.
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from functools import partial
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from benchmarking.baselines.example_prepost_study import (
+    DEFAULT_END_DT_EXCL,
+    DEFAULT_START_DT,
+    DEFAULT_TREATMENT_START_RANGE,
+    DEFAULT_TURBINE_SUBSET,
+    DEFAULT_WTG_NUMBERS,
+    MIN_PRE_MONTHS,
+    save_per_method_curve,
+)
+from benchmarking.baselines.example_toggle_study import DEFAULT_TOGGLE_PERIOD
+from benchmarking.baselines.hot_context import build_hot_v0_context
+from benchmarking.baselines.overnight_profiles import overnight_profiles
+from benchmarking.baselines.power_model import PowerModelMethod
+from benchmarking.harness import StudyConfig, leaderboard, plot_campaign_curves, score_study
+from benchmarking.synthetic import HOT_COLUMNS
+from benchmarking.synthetic.sources.hill_of_towie import load_hot_scada
+
+logger = logging.getLogger(__name__)
+
+# The three methods compared in the merged plots (oracle is dropped: it is only a truth anchor).
+COMPARE_METHODS = ["naive_ratio", "v0_binned", "power_model"]
+REUSED_METHODS = ["naive_ratio", "v0_binned"]  # taken from the reference run, not recomputed
+
+# Match study_overnight_prepost.py / study_overnight_toggle.py exactly.
+PREPOST_CAMPAIGN_MONTHS = [3, 6, 12]
+TOGGLE_CAMPAIGN_MONTHS = [3, 6, 9, 12]
+N_REPLICATES = 4
+SEED = 0
+
+# Keys that identify one scored case independently of the method.
+_CASE_KEYS = ["profile", "test_wtg", "campaign_months", "treatment_start"]
+_DEFAULT_REFERENCE_DIR = Path.home() / "temp" / "wind-up-benchmarking" / "badass overnight runs 30 June"
+_DEFAULT_OUTPUT_DIR = Path.home() / "temp" / "wind-up-benchmarking" / "power_model_compare"
+
+
+def _prepost_study() -> StudyConfig:
+    return StudyConfig(
+        mode="prepost",
+        turbine_subset=DEFAULT_TURBINE_SUBSET,
+        treatment_start_range=DEFAULT_TREATMENT_START_RANGE,
+        min_pre_months=MIN_PRE_MONTHS,
+        campaign_months=PREPOST_CAMPAIGN_MONTHS,
+        n_replicates=N_REPLICATES,
+        seed=SEED,
+    )
+
+
+def _toggle_study() -> StudyConfig:
+    return StudyConfig(
+        mode="toggle",
+        turbine_subset=DEFAULT_TURBINE_SUBSET,
+        treatment_start_range=DEFAULT_TREATMENT_START_RANGE,
+        min_pre_months=MIN_PRE_MONTHS,
+        campaign_months=TOGGLE_CAMPAIGN_MONTHS,
+        toggle_period=DEFAULT_TOGGLE_PERIOD,
+        n_replicates=N_REPLICATES,
+        seed=SEED,
+    )
+
+
+def run_power_model(mode: str, out_dir: Path) -> pd.DataFrame:
+    """Score **only** ``power_model`` over the overnight cases for one mode (no v0/naive/oracle).
+
+    Each fresh row still carries the harness's method-independent ground ``truth`` (so the merge's
+    alignment guard has its cross-check without recomputing any anchor). Writes a per-profile
+    ``results_*.csv`` and per-profile ``power_model`` curve under ``out_dir`` (the latter lets a long
+    run be sanity-checked profile-by-profile), and returns the concatenated tidy results.
+    """
+    out_dir.mkdir(parents=True, exist_ok=True)
+    scada_df, _ = load_hot_scada(
+        start_dt=DEFAULT_START_DT,
+        end_dt_excl=DEFAULT_END_DT_EXCL,
+        wtg_numbers=DEFAULT_WTG_NUMBERS,
+        wtg_names=DEFAULT_TURBINE_SUBSET,
+    )
+    context = build_hot_v0_context(wtg_names=DEFAULT_TURBINE_SUBSET)
+    study = _prepost_study() if mode == "prepost" else _toggle_study()
+
+    all_results = []
+    for profile_name, profile in overnight_profiles().items():
+        method = PowerModelMethod(
+            active_power_col=HOT_COLUMNS.active_power,
+            wind_speed_col=HOT_COLUMNS.wind_speed,
+            availability_col=HOT_COLUMNS.availability,
+            era5_hourly_df=context.reanalysis_datasets[0].data,
+            out_dir=out_dir / "power_model_runs",
+        )
+        logger.info("Scoring %s profile %s with power_model", mode, profile_name)
+        results = score_study(
+            scada_df,
+            profile=profile,
+            methods=[method],
+            study=study,
+            profile_name=profile_name,
+            on_method_complete=partial(save_per_method_curve, out_dir, profile_name),
+        )
+        results.to_csv(out_dir / f"results_{profile_name}.csv", index=False)
+        all_results.append(results)
+    return pd.concat(all_results, ignore_index=True)
+
+
+def _load_fresh_results(mode_out_dir: Path) -> pd.DataFrame:
+    """Concatenate the per-profile ``results_*.csv`` a previous run wrote (for ``--skip-run``)."""
+    files = sorted(mode_out_dir.glob("results_*.csv"))
+    if not files:
+        msg = f"no results_*.csv under {mode_out_dir}; run without --skip-run first."
+        raise FileNotFoundError(msg)
+    return pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
+
+
+def _load_reference_methods(reference_mode_dir: Path, profiles: list[str], methods: list[str]) -> pd.DataFrame:
+    """Load the requested methods' rows for the given profiles from the reference run directory."""
+    frames = []
+    for profile in profiles:
+        path = reference_mode_dir / f"results_{profile}.csv"
+        if not path.exists():
+            msg = f"reference results missing for profile {profile!r}: {path}"
+            raise FileNotFoundError(msg)
+        df = pd.read_csv(path)
+        frames.append(df[df["method"].isin(methods)])
+    return pd.concat(frames, ignore_index=True)
+
+
+def _case_key(df: pd.DataFrame) -> pd.Series:
+    """Build a stable per-case string key (treatment_start normalised so str/Timestamp compare equal)."""
+    ts = pd.to_datetime(df["treatment_start"], utc=True).dt.strftime("%Y-%m-%d %H:%M:%S%z")
+    return (
+        df["profile"].astype(str)
+        + "|"
+        + df["test_wtg"].astype(str)
+        + "|"
+        + df["campaign_months"].astype(int).astype(str)
+        + "|"
+        + ts
+    )
+
+
+def _check_alignment(fresh: pd.DataFrame, reference: pd.DataFrame) -> None:
+    """Fail loudly if the fresh cases do not line up case-for-case with the reference run.
+
+    Ground ``truth`` is method-independent and deterministic in the study config + seed, so equal
+    keys must carry equal truth. Mismatched keys or truths mean the configs drifted.
+    """
+    f_overall = fresh[fresh["condition"] == "overall"].copy()
+    r_overall = reference[reference["condition"] == "overall"].copy()
+    f_truth = f_overall.assign(key=_case_key(f_overall)).groupby("key")["truth"].first()
+    r_truth = r_overall.assign(key=_case_key(r_overall)).groupby("key")["truth"].first()
+
+    missing = sorted(set(f_truth.index) - set(r_truth.index))
+    if missing:
+        msg = f"{len(missing)} fresh case(s) have no reference match, e.g. {missing[:3]}"
+        raise ValueError(msg)
+    common = f_truth.index.intersection(r_truth.index)
+    bad = ~np.isclose(f_truth.loc[common].to_numpy(), r_truth.loc[common].to_numpy(), rtol=1e-6, atol=1e-9)
+    if bad.any():
+        example = common[bad][0]
+        msg = (
+            f"{int(bad.sum())} case(s) disagree on ground truth between the fresh power_model run and the "
+            f"reference run (e.g. {example}: fresh={f_truth[example]:.6g} vs ref={r_truth[example]:.6g}). "
+            f"The overnight config has drifted from the reference run; the merge would compare different cases."
+        )
+        raise ValueError(msg)
+    logger.info("Alignment OK: %d cases match the reference run on ground truth.", len(common))
+
+
+def merge_and_plot(mode: str, fresh: pd.DataFrame, reference_mode_dir: Path, out_dir: Path) -> pd.DataFrame:
+    """Merge fresh power_model with reference v0/naive, write merged tables + per-profile plots."""
+    comparison_dir = out_dir / "comparison"
+    comparison_dir.mkdir(parents=True, exist_ok=True)
+
+    profiles = sorted(fresh["profile"].unique())
+    reference = _load_reference_methods(reference_mode_dir, profiles, REUSED_METHODS)
+    _check_alignment(fresh, reference)
+
+    power_model = fresh[fresh["method"] == "power_model"]
+    merged = pd.concat([reference, power_model], ignore_index=True)
+    merged = merged[merged["method"].isin(COMPARE_METHODS)]
+
+    merged.to_csv(comparison_dir / f"merged_results_{mode}.csv", index=False)
+    for profile in profiles:
+        prof_rows = merged[merged["profile"] == profile]
+        summary = leaderboard(prof_rows)
+        summary.to_csv(comparison_dir / f"leaderboard_{profile}.csv", index=False)
+        plot_campaign_curves(
+            summary,
+            save_path=comparison_dir / f"campaign_curves_{profile}.png",
+            title=f"{mode} - {profile} (naive vs v0 vs power_model)",
+        )
+
+    all_summary = leaderboard(merged)
+    all_summary.to_csv(comparison_dir / f"leaderboard_all_profiles_{mode}.csv", index=False)
+    logger.info(
+        "%s comparison (all profiles):\n%s",
+        mode,
+        all_summary[["method", "profile", "campaign_months", "bias", "spread", "score"]].to_string(index=False),
+    )
+    logger.info("Wrote %s comparison outputs to %s", mode, comparison_dir)
+    return merged
+
+
+def main() -> None:
+    """Run power_model over the overnight cases for each mode, then merge + plot vs v0/naive."""
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument(
+        "--reference-dir",
+        type=Path,
+        default=_DEFAULT_REFERENCE_DIR,
+        help="overnight run dir holding prepost/ and toggle/ with the frozen v0 + naive results",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=_DEFAULT_OUTPUT_DIR,
+        help="where fresh power_model runs and the merged comparison are written",
+    )
+    parser.add_argument(
+        "--modes",
+        nargs="+",
+        choices=["prepost", "toggle"],
+        default=["prepost", "toggle"],
+        help="which mode(s) to run (default: both)",
+    )
+    parser.add_argument(
+        "--skip-run",
+        action="store_true",
+        help="reuse a previous power_model run under --output-dir; only re-merge and re-plot",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s", force=True)
+    reference_dir = args.reference_dir.expanduser()
+    output_dir = args.output_dir.expanduser()
+
+    for mode in args.modes:
+        logger.info("=== %s ===", mode.upper())
+        mode_out_dir = output_dir / mode
+        if args.skip_run:
+            fresh = _load_fresh_results(mode_out_dir)
+            logger.info("Reusing %d fresh rows from %s", len(fresh), mode_out_dir)
+        else:
+            fresh = run_power_model(mode, mode_out_dir)
+        merge_and_plot(mode, fresh, reference_dir / mode, mode_out_dir)
+
+    logger.info("All done. Comparison plots under %s/<mode>/comparison/", output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/v1/findings.md b/docs/v1/findings.md
index 42ab2e5..ff2b0f6 100644
--- a/docs/v1/findings.md
+++ b/docs/v1/findings.md
@@ -7,6 +7,94 @@ from, not just conclusions.
 
 ---
 
+## F4 — power_model beats v0 in prepost and at longer-toggle; v0's edge is only short-toggle, and v0 alone breaks on rated-power uprates
+
+*2026-06-30 — the three-way comparison F3 flagged as the open question (power_model vs v0 in
+*both* modes). Source: `benchmarking/baselines/study_power_model_compare.py`, which re-runs
+**only** `power_model` over the current overnight cases and merges it with the frozen `v0_binned`
++ `naive_ratio` rows from the overnight run (`study_overnight_{prepost,toggle}.py`,
+`include_v0=True`). Seven `overnight_profiles` (`cp_minus_10pct`, `cp_0pct`, `cp_plus_3pct`,
+`cp_plus_10pct`, `ws_dependent_cp`, `ti_dependent_cp`, `rated_plus_5pct`), `n_replicates=4`,
+seed 0; prepost campaigns 3/6/12 mo (84 cases/method), toggle 3/6/9/12 mo (112 cases/method). The
+script's alignment guard confirmed all 84 + 112 fresh cases match the reference run's
+method-independent ground truth exactly, so the merge compares identical cases.*
+
+All numbers below are percentage points of fractional uplift (a 0.01 fraction = 1 pp). Bias = mean
+signed error, spread = std of signed error, RMSE pooled over all cases for that mode/method.
+
+### Observation — pooled over all profiles and campaign lengths
+
+| mode | method | bias | spread | MAE | RMSE | within ±1pp | per-case win |
+|---|---|---|---|---|---|---|---|
+| prepost | naive_ratio | −1.11 | 4.62 | 3.83 | 4.73 | 17% | 0% |
+| prepost | **power_model** | **−0.39** | **0.49** | **0.53** | **0.62** | **86%** | **73%** |
+| prepost | v0_binned | −0.73 | 0.72 | 0.84 | 1.03 | 62% | 27% |
+| toggle | naive_ratio | +0.15 | 0.16 | 0.19 | 0.22 | 100% | 39% |
+| toggle | power_model | +0.16 | 0.19 | 0.20 | 0.24 | 100% | 22% |
+| toggle | v0_binned | −0.01 | 0.33 | 0.23 | 0.32 | 98% | 38% |
+
+("per-case win" = share of the N cases where that method has the smallest |error|.) In prepost,
+**power_model's |error| is smaller than v0's in 73% of cases and smaller than naive's in 100%**.
+
+### Observation — RMSE by campaign length (the short-data story, serves G2)
+
+| mode | method | 3mo | 6mo | 9mo | 12mo |
+|---|---|---|---|---|---|
+| prepost | naive_ratio | 7.32 | 3.19 | — | 1.84 |
+| prepost | **power_model** | **0.82** | **0.46** | — | **0.53** |
+| prepost | v0_binned | 1.22 | 0.94 | — | 0.89 |
+| toggle | naive_ratio | **0.30** | 0.23 | 0.17 | 0.14 |
+| toggle | power_model | 0.38 | **0.22** | **0.18** | **0.11** |
+| toggle | v0_binned | 0.32 | 0.31 | 0.33 | 0.32 |
+
+The two structural facts: **(a)** in prepost power_model leads at every length, its biggest margin
+at 3 months (0.82 vs v0 1.22 vs naive 7.32); **(b)** in toggle, power_model and naive both tighten
+with more data (power_model 0.38 → 0.11), but **v0 does not improve with campaign length** — it
+sits at ~0.32 RMSE from 3 to 12 months. So v0 only wins the shortest toggle campaign; from 6
+months on, power_model is best in toggle too.
+
+### Observation — profile spotlight (pooled over campaigns)
+
+| profile | method | bias | RMSE | max |error| |
+|---|---|---|---|---|
+| prepost `rated_plus_5pct` | **power_model** | **−0.38** | **0.62** | **1.05** |
+| prepost `rated_plus_5pct` | v0_binned | −1.24 | 1.42 | 2.23 |
+| toggle `rated_plus_5pct` | **power_model** | +0.16 | **0.25** | **0.49** |
+| toggle `rated_plus_5pct` | v0_binned | −0.63 | 0.68 | 1.10 |
+
+power_model is **flat across all seven profiles** (prepost RMSE 0.61–0.63, bias ≈ −0.38 on every
+one), whereas **v0 has a specific weak spot on the rated-power uprate** — its worst profile in both
+modes (the only profile where v0's toggle RMSE, 0.68, is more than ~2× its others). A rated-power
+change shifts power at high wind speeds where v0's binned power-curve has sparse, noisy bins;
+power_model's continuous reference-conditioned fit has no such blind spot. On the placebo
+(`cp_0pct`) all three are well-behaved (toggle v0 even edges power_model, 0.18 vs 0.24 RMSE).
+
+### Interpretation
+- **Prepost: power_model is the better method, decisively.** Lower bias (−0.39 vs −0.73 pp), lower
+  spread (0.49 vs 0.72), ~40% lower RMSE than v0, and it wins the majority of cases head-to-head —
+  the F1 contrast lever (expected power through the references) cancelling the common-mode drift
+  that v0 corrects only through its detrend step. naive is not in contention (covariate shift).
+- **Toggle: a near-tie that tips to power_model with data.** Naive and power_model are
+  near-identical and both beat v0 overall on RMSE; v0's larger spread and its failure to improve
+  with longer toggling are the cost of its binning. v0's only advantage is the 3-month toggle
+  campaign, where power_model carries slightly more bias (+0.37) before its variance collapses.
+- **v0's rated-power weakness is the clearest single result.** It is the one regime where v0 is
+  both biased and high-variance in *both* modes, and where power_model's flatness is most valuable.
+
+### Implications
+1. **Answers F3's open question: power_model ≥ v0 in both modes for P50** — strictly better in
+   prepost and at toggle ≥ 6 months, with v0 ahead only at the shortest toggle campaign. It is now
+   the baseline to beat (G-level), not just vs naive.
+2. **Short-toggle bias is power_model's one soft spot** — the +0.37 pp at 3-month toggle is the
+   thing to chip at next (mirrors the residual prepost bias noted in F3 #3); candidates are the
+   baseline-horizon / recency weighting already on the list.
+3. **Add a rated-power-uprate case to any v0 regression framing** — it is v0's worst regime and a
+   natural demonstrator for power_model's advantage; worth a dedicated diagnostic.
+4. Reproduce/extend with `study_power_model_compare.py` (`--skip-run` to re-merge, `--modes` to
+   restrict); it reuses the frozen slow v0 so each power_model iteration is cheap.
+
+---
+
 ## F3 — A simple counterfactual power model halves prepost bias and spread vs naive; toggle is a wash
 
 *2026-06-29 — new method `power_model` (the simplest-possible ML method: a single LightGBM

From bde599aa863b0c9b5e25179ab3802947cdd14ccd Mon Sep 17 00:00:00 2001
From: aclerc <alex.clerc@res-group.com>
Date: Tue, 30 Jun 2026 11:56:40 +0100
Subject: [PATCH 2/3] add study_power_model_compare baseline

---
 .../baselines/study_power_model_compare.py    | 177 +++++++-
 .../study_power_model_compare_baseline.json   | 393 ++++++++++++++++++
 2 files changed, 567 insertions(+), 3 deletions(-)
 create mode 100644 benchmarking/baselines/study_power_model_compare_baseline.json

diff --git a/benchmarking/baselines/study_power_model_compare.py b/benchmarking/baselines/study_power_model_compare.py
index 30d605c..e456576 100644
--- a/benchmarking/baselines/study_power_model_compare.py
+++ b/benchmarking/baselines/study_power_model_compare.py
@@ -22,22 +22,36 @@
 ``truth`` column the harness attaches to every ``power_model`` row is itself the cross-check, so
 re-running the oracle/naive anchors purely to validate the line-up is unnecessary.)
 
+**Benchmark regression tracking.** ``power_model``'s bias/spread/score per
+``(mode, profile, campaign_months)`` at a known-good commit is frozen in a committed JSON benchmark
+(``study_power_model_compare_baseline.json``, next to this script). Every run diffs the fresh
+``power_model`` against it and logs a mean-over-profiles table of the deltas (spread/score: a
+negative delta is an improvement; bias: a smaller ``|bias|`` is better) plus a per-cell
+``benchmark_comparison_<mode>.csv``, so an attempt to improve ``power_model`` is scored objectively
+against where it stands today. Bump the benchmark **deliberately** with ``--update-baseline`` once an
+improvement is accepted (it rewrites only the modes you ran; commit the new JSON).
+
 Run from the repo root::
 
     uv run python -m benchmarking.baselines.study_power_model_compare \
         --reference-dir "~/temp/wind-up-benchmarking/badass overnight runs 30 June"
 
-Use ``--skip-run`` to only re-merge/re-plot from a previous ``power_model`` run under
-``--output-dir`` (e.g. to tweak plotting without re-fitting). Use ``--modes prepost`` /
-``--modes toggle`` to restrict to one mode.
+Use ``--skip-run`` to only re-merge/re-plot (and re-diff the benchmark) from a previous
+``power_model`` run under ``--output-dir`` (e.g. to tweak plotting without re-fitting). Use
+``--modes prepost`` / ``--modes toggle`` to restrict to one mode. Use ``--update-baseline`` to
+re-record the benchmark from the current run.
 """
 
 from __future__ import annotations
 
 import argparse
+import json
 import logging
+import subprocess
+from datetime import datetime, timezone
 from functools import partial
 from pathlib import Path
+from typing import Any
 
 import numpy as np
 import pandas as pd
@@ -76,6 +90,18 @@
 _DEFAULT_REFERENCE_DIR = Path.home() / "temp" / "wind-up-benchmarking" / "badass overnight runs 30 June"
 _DEFAULT_OUTPUT_DIR = Path.home() / "temp" / "wind-up-benchmarking" / "power_model_compare"
 
+# The committed power_model benchmark: its bias/spread/score per (mode, profile, campaign) frozen at
+# a known-good commit, so future power_model changes are scored against it. Lives next to this script
+# (tracked) — update it deliberately with --update-baseline when an improvement is accepted.
+_BASELINE_PATH = Path(__file__).resolve().parent / "study_power_model_compare_baseline.json"
+_BASELINE_SCHEMA = "power_model_compare_baseline_v1"
+# Per-cell metrics recorded and diffed. spread/score: lower is better; bias: |bias| nearer 0 is better.
+_METRIC_COLS = ["bias", "spread", "score"]
+# A delta smaller than this (fractional uplift) is floating-point noise, not a real change: an
+# identical (deterministic, seed-fixed) run must read as no change, while any genuine power_model
+# change moves the metrics by >= ~1e-4. 1e-7 fraction = 1e-5 pp, well below the reported resolution.
+_IMPROVE_EPS = 1e-7
+
 
 def _prepost_study() -> StudyConfig:
     return StudyConfig(
@@ -207,6 +233,129 @@ def _check_alignment(fresh: pd.DataFrame, reference: pd.DataFrame) -> None:
     logger.info("Alignment OK: %d cases match the reference run on ground truth.", len(common))
 
 
+def _git_commit() -> str:
+    """Return the short HEAD commit (``-dirty`` if the tree is modified), or ``unknown``."""
+    repo = Path(__file__).resolve().parent
+    try:
+        commit = subprocess.run(
+            ["git", "rev-parse", "--short", "HEAD"],  # noqa: S607
+            cwd=repo,
+            capture_output=True,
+            text=True,
+            check=True,
+        ).stdout.strip()
+        dirty = subprocess.run(
+            ["git", "status", "--porcelain"],  # noqa: S607
+            cwd=repo,
+            capture_output=True,
+            text=True,
+            check=True,
+        ).stdout.strip()
+    except (subprocess.SubprocessError, OSError):
+        return "unknown"
+    return f"{commit}-dirty" if dirty else commit
+
+
+def power_model_leaderboard(fresh: pd.DataFrame) -> pd.DataFrame:
+    """Reduce the fresh ``power_model`` rows to one ``(profile, campaign_months)`` row of bias/spread/score."""
+    pm = fresh[fresh["method"] == "power_model"]
+    summary = leaderboard(pm)
+    return summary[["profile", "campaign_months", *_METRIC_COLS]].sort_values(["profile", "campaign_months"])
+
+
+def record_baseline(lb_by_mode: dict[str, pd.DataFrame], study_by_mode: dict[str, StudyConfig], path: Path) -> None:
+    """Write/refresh the committed benchmark for the given modes (other modes in the file are kept)."""
+    doc: dict[str, Any] = {"schema": _BASELINE_SCHEMA, "modes": {}}
+    if path.exists():
+        doc = json.loads(path.read_text())
+        doc.setdefault("modes", {})
+    commit = _git_commit()
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    for mode, lb in lb_by_mode.items():
+        study = study_by_mode[mode]
+        doc["modes"][mode] = {
+            "recorded_utc": now,
+            "git_commit": commit,
+            "n_replicates": study.n_replicates,
+            "seed": study.seed,
+            "campaign_months": list(study.campaign_months),
+            "profiles": sorted(lb["profile"].unique()),
+            "cells": lb.round(8).to_dict(orient="records"),
+        }
+    path.write_text(json.dumps(doc, indent=2) + "\n")
+    logger.info("Recorded power_model benchmark for %s at %s (commit %s)", list(lb_by_mode), path, commit)
+
+
+def _load_baseline_cells(mode: str, path: Path) -> tuple[pd.DataFrame, dict[str, Any]] | None:
+    """Load the recorded benchmark cells (+ provenance) for ``mode``, or ``None`` if not recorded."""
+    if not path.exists():
+        return None
+    entry = json.loads(path.read_text()).get("modes", {}).get(mode)
+    if entry is None:
+        return None
+    return pd.DataFrame(entry["cells"]), entry
+
+
+def compare_to_benchmark(mode: str, lb: pd.DataFrame, baseline_path: Path, comparison_dir: Path) -> None:
+    """Diff the fresh power_model bias/spread/score against the committed benchmark and report it.
+
+    Writes a per-cell ``benchmark_comparison_<mode>.csv`` and logs a mean-over-profiles table with the
+    deltas. spread/score: a negative delta is an improvement; bias: a smaller ``|bias|`` is better.
+    """
+    loaded = _load_baseline_cells(mode, baseline_path)
+    if loaded is None:
+        logger.warning(
+            "No power_model benchmark recorded for %s in %s yet — run with --update-baseline to set it.",
+            mode,
+            baseline_path,
+        )
+        return
+    base, prov = loaded
+    merged = lb.merge(base, on=["profile", "campaign_months"], how="outer", suffixes=("", "_base"))
+    for col in _METRIC_COLS:
+        merged[f"d_{col}"] = merged[col] - merged[f"{col}_base"]
+    merged["d_abs_bias"] = merged["bias"].abs() - merged["bias_base"].abs()
+    merged.sort_values(["profile", "campaign_months"]).to_csv(
+        comparison_dir / f"benchmark_comparison_{mode}.csv", index=False
+    )
+
+    pp = 100.0
+    report = merged.groupby("campaign_months").mean(numeric_only=True)
+    overall = merged.mean(numeric_only=True).to_frame().T
+    overall.index = ["ALL"]
+    table = pd.concat([report, overall])
+    show = pd.DataFrame(
+        {
+            "bias": table["bias"] * pp,
+            "Δbias": table["d_bias"] * pp,
+            "spread": table["spread"] * pp,
+            "Δspread": table["d_spread"] * pp,
+            "score": table["score"] * pp,
+            "Δscore": table["d_score"] * pp,
+        }
+    ).round(3)
+    n_cells = int(merged[_METRIC_COLS].notna().all(axis=1).sum())
+
+    def tally(delta: pd.Series) -> str:
+        """Count cells that moved beyond the noise floor: ``better`` (down) vs ``worse`` (up)."""
+        better = int((delta < -_IMPROVE_EPS).sum())
+        worse = int((delta > _IMPROVE_EPS).sum())
+        return f"{better} better / {worse} worse (of {n_cells})"
+
+    logger.info(
+        "%s power_model vs benchmark (recorded %s, commit %s); mean over profiles [pp], "
+        "Δ<0 = better for spread/score:\n%s\n"
+        "cells: spread %s; score %s; |bias| %s",
+        mode,
+        prov.get("recorded_utc", "?"),
+        prov.get("git_commit", "?"),
+        show.to_string(),
+        tally(merged["d_spread"]),
+        tally(merged["d_score"]),
+        tally(merged["d_abs_bias"]),
+    )
+
+
 def merge_and_plot(mode: str, fresh: pd.DataFrame, reference_mode_dir: Path, out_dir: Path) -> pd.DataFrame:
     """Merge fresh power_model with reference v0/naive, write merged tables + per-profile plots."""
     comparison_dir = out_dir / "comparison"
@@ -269,12 +418,27 @@ def main() -> None:
         action="store_true",
         help="reuse a previous power_model run under --output-dir; only re-merge and re-plot",
     )
+    parser.add_argument(
+        "--baseline-path",
+        type=Path,
+        default=_BASELINE_PATH,
+        help="the committed power_model benchmark JSON to diff against (and to --update-baseline)",
+    )
+    parser.add_argument(
+        "--update-baseline",
+        action="store_true",
+        help="overwrite the recorded benchmark for the run modes with this run (do this deliberately, "
+        "only when an improvement is accepted); without it, the run is diffed against the benchmark",
+    )
     args = parser.parse_args()
 
     logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s", force=True)
     reference_dir = args.reference_dir.expanduser()
     output_dir = args.output_dir.expanduser()
+    baseline_path = args.baseline_path.expanduser()
 
+    lb_by_mode: dict[str, pd.DataFrame] = {}
+    study_by_mode: dict[str, StudyConfig] = {}
     for mode in args.modes:
         logger.info("=== %s ===", mode.upper())
         mode_out_dir = output_dir / mode
@@ -284,6 +448,13 @@ def main() -> None:
         else:
             fresh = run_power_model(mode, mode_out_dir)
         merge_and_plot(mode, fresh, reference_dir / mode, mode_out_dir)
+        lb_by_mode[mode] = power_model_leaderboard(fresh)
+        study_by_mode[mode] = _prepost_study() if mode == "prepost" else _toggle_study()
+        if not args.update_baseline:
+            compare_to_benchmark(mode, lb_by_mode[mode], baseline_path, mode_out_dir / "comparison")
+
+    if args.update_baseline:
+        record_baseline(lb_by_mode, study_by_mode, baseline_path)
 
     logger.info("All done. Comparison plots under %s/<mode>/comparison/", output_dir)
 
diff --git a/benchmarking/baselines/study_power_model_compare_baseline.json b/benchmarking/baselines/study_power_model_compare_baseline.json
new file mode 100644
index 0000000..798cc5d
--- /dev/null
+++ b/benchmarking/baselines/study_power_model_compare_baseline.json
@@ -0,0 +1,393 @@
+{
+  "schema": "power_model_compare_baseline_v1",
+  "modes": {
+    "prepost": {
+      "recorded_utc": "2026-06-30T10:04:36Z",
+      "git_commit": "ddd1b63-dirty",
+      "n_replicates": 4,
+      "seed": 0,
+      "campaign_months": [
+        3,
+        6,
+        12
+      ],
+      "profiles": [
+        "cp_0pct",
+        "cp_minus_10pct",
+        "cp_plus_10pct",
+        "cp_plus_3pct",
+        "rated_plus_5pct",
+        "ti_dependent_cp",
+        "ws_dependent_cp"
+      ],
+      "cells": [
+        {
+          "profile": "cp_0pct",
+          "campaign_months": 3,
+          "bias": -0.00660529,
+          "spread": 0.00466565,
+          "score": 0.00808691
+        },
+        {
+          "profile": "cp_0pct",
+          "campaign_months": 6,
+          "bias": -0.00216502,
+          "spread": 0.00397077,
+          "score": 0.00452264
+        },
+        {
+          "profile": "cp_0pct",
+          "campaign_months": 12,
+          "bias": -0.00260598,
+          "spread": 0.00450127,
+          "score": 0.0052012
+        },
+        {
+          "profile": "cp_minus_10pct",
+          "campaign_months": 3,
+          "bias": -0.00608179,
+          "spread": 0.00437577,
+          "score": 0.00749236
+        },
+        {
+          "profile": "cp_minus_10pct",
+          "campaign_months": 6,
+          "bias": -0.00192257,
+          "spread": 0.00368846,
+          "score": 0.00415945
+        },
+        {
+          "profile": "cp_minus_10pct",
+          "campaign_months": 12,
+          "bias": -0.0023763,
+          "spread": 0.00423041,
+          "score": 0.00485213
+        },
+        {
+          "profile": "cp_plus_10pct",
+          "campaign_months": 3,
+          "bias": -0.00712802,
+          "spread": 0.00495523,
+          "score": 0.00868119
+        },
+        {
+          "profile": "cp_plus_10pct",
+          "campaign_months": 6,
+          "bias": -0.00240698,
+          "spread": 0.00425268,
+          "score": 0.0048866
+        },
+        {
+          "profile": "cp_plus_10pct",
+          "campaign_months": 12,
+          "bias": -0.00283565,
+          "spread": 0.00477221,
+          "score": 0.00555111
+        },
+        {
+          "profile": "cp_plus_3pct",
+          "campaign_months": 3,
+          "bias": -0.00676189,
+          "spread": 0.00475233,
+          "score": 0.00826485
+        },
+        {
+          "profile": "cp_plus_3pct",
+          "campaign_months": 6,
+          "bias": -0.00223764,
+          "spread": 0.00405543,
+          "score": 0.0046318
+        },
+        {
+          "profile": "cp_plus_3pct",
+          "campaign_months": 12,
+          "bias": -0.00267486,
+          "spread": 0.00458252,
+          "score": 0.00530607
+        },
+        {
+          "profile": "rated_plus_5pct",
+          "campaign_months": 3,
+          "bias": -0.00670173,
+          "spread": 0.00474853,
+          "score": 0.00821351
+        },
+        {
+          "profile": "rated_plus_5pct",
+          "campaign_months": 6,
+          "bias": -0.00218881,
+          "spread": 0.00403347,
+          "score": 0.00458909
+        },
+        {
+          "profile": "rated_plus_5pct",
+          "campaign_months": 12,
+          "bias": -0.00264277,
+          "spread": 0.00458772,
+          "score": 0.00529447
+        },
+        {
+          "profile": "ti_dependent_cp",
+          "campaign_months": 3,
+          "bias": -0.0069171,
+          "spread": 0.00485054,
+          "score": 0.00844831
+        },
+        {
+          "profile": "ti_dependent_cp",
+          "campaign_months": 6,
+          "bias": -0.00231017,
+          "spread": 0.00414554,
+          "score": 0.00474578
+        },
+        {
+          "profile": "ti_dependent_cp",
+          "campaign_months": 12,
+          "bias": -0.00274427,
+          "spread": 0.00467084,
+          "score": 0.00541736
+        },
+        {
+          "profile": "ws_dependent_cp",
+          "campaign_months": 3,
+          "bias": -0.00683324,
+          "spread": 0.00483019,
+          "score": 0.00836803
+        },
+        {
+          "profile": "ws_dependent_cp",
+          "campaign_months": 6,
+          "bias": -0.00225087,
+          "spread": 0.00412935,
+          "score": 0.00470297
+        },
+        {
+          "profile": "ws_dependent_cp",
+          "campaign_months": 12,
+          "bias": -0.00267457,
+          "spread": 0.00463682,
+          "score": 0.00535289
+        }
+      ]
+    },
+    "toggle": {
+      "recorded_utc": "2026-06-30T10:04:36Z",
+      "git_commit": "ddd1b63-dirty",
+      "n_replicates": 4,
+      "seed": 0,
+      "campaign_months": [
+        3,
+        6,
+        9,
+        12
+      ],
+      "profiles": [
+        "cp_0pct",
+        "cp_minus_10pct",
+        "cp_plus_10pct",
+        "cp_plus_3pct",
+        "rated_plus_5pct",
+        "ti_dependent_cp",
+        "ws_dependent_cp"
+      ],
+      "cells": [
+        {
+          "profile": "cp_0pct",
+          "campaign_months": 3,
+          "bias": 0.00362415,
+          "spread": 0.00084001,
+          "score": 0.00372022
+        },
+        {
+          "profile": "cp_0pct",
+          "campaign_months": 6,
+          "bias": 0.00129828,
+          "spread": 0.00178902,
+          "score": 0.00221046
+        },
+        {
+          "profile": "cp_0pct",
+          "campaign_months": 9,
+          "bias": 0.00086062,
+          "spread": 0.00160665,
+          "score": 0.00182263
+        },
+        {
+          "profile": "cp_0pct",
+          "campaign_months": 12,
+          "bias": 0.00056382,
+          "spread": 0.00088262,
+          "score": 0.00104734
+        },
+        {
+          "profile": "cp_minus_10pct",
+          "campaign_months": 3,
+          "bias": 0.0034594,
+          "spread": 0.00078816,
+          "score": 0.00354805
+        },
+        {
+          "profile": "cp_minus_10pct",
+          "campaign_months": 6,
+          "bias": 0.00131574,
+          "spread": 0.00168113,
+          "score": 0.0021348
+        },
+        {
+          "profile": "cp_minus_10pct",
+          "campaign_months": 9,
+          "bias": 0.00088486,
+          "spread": 0.0015148,
+          "score": 0.0017543
+        },
+        {
+          "profile": "cp_minus_10pct",
+          "campaign_months": 12,
+          "bias": 0.00061016,
+          "spread": 0.00085288,
+          "score": 0.00104867
+        },
+        {
+          "profile": "cp_plus_10pct",
+          "campaign_months": 3,
+          "bias": 0.0037889,
+          "spread": 0.00089431,
+          "score": 0.00389301
+        },
+        {
+          "profile": "cp_plus_10pct",
+          "campaign_months": 6,
+          "bias": 0.00128082,
+          "spread": 0.00189692,
+          "score": 0.00228884
+        },
+        {
+          "profile": "cp_plus_10pct",
+          "campaign_months": 9,
+          "bias": 0.00083644,
+          "spread": 0.00169846,
+          "score": 0.00189325
+        },
+        {
+          "profile": "cp_plus_10pct",
+          "campaign_months": 12,
+          "bias": 0.00051748,
+          "spread": 0.00091239,
+          "score": 0.00104892
+        },
+        {
+          "profile": "cp_plus_3pct",
+          "campaign_months": 3,
+          "bias": 0.00367357,
+          "spread": 0.00085606,
+          "score": 0.003772
+        },
+        {
+          "profile": "cp_plus_3pct",
+          "campaign_months": 6,
+          "bias": 0.00129304,
+          "spread": 0.00182139,
+          "score": 0.0022337
+        },
+        {
+          "profile": "cp_plus_3pct",
+          "campaign_months": 9,
+          "bias": 0.00085336,
+          "spread": 0.0016342,
+          "score": 0.00184359
+        },
+        {
+          "profile": "cp_plus_3pct",
+          "campaign_months": 12,
+          "bias": 0.00054992,
+          "spread": 0.00089155,
+          "score": 0.00104751
+        },
+        {
+          "profile": "rated_plus_5pct",
+          "campaign_months": 3,
+          "bias": 0.00369558,
+          "spread": 0.00086132,
+          "score": 0.00379462
+        },
+        {
+          "profile": "rated_plus_5pct",
+          "campaign_months": 6,
+          "bias": 0.00133541,
+          "spread": 0.00181816,
+          "score": 0.00225589
+        },
+        {
+          "profile": "rated_plus_5pct",
+          "campaign_months": 9,
+          "bias": 0.00088821,
+          "spread": 0.00163333,
+          "score": 0.00185922
+        },
+        {
+          "profile": "rated_plus_5pct",
+          "campaign_months": 12,
+          "bias": 0.00059381,
+          "spread": 0.00090885,
+          "score": 0.00108564
+        },
+        {
+          "profile": "ti_dependent_cp",
+          "campaign_months": 3,
+          "bias": 0.00373349,
+          "spread": 0.00087011,
+          "score": 0.00383354
+        },
+        {
+          "profile": "ti_dependent_cp",
+          "campaign_months": 6,
+          "bias": 0.00128565,
+          "spread": 0.0018638,
+          "score": 0.00226421
+        },
+        {
+          "profile": "ti_dependent_cp",
+          "campaign_months": 9,
+          "bias": 0.0008499,
+          "spread": 0.00167478,
+          "score": 0.00187809
+        },
+        {
+          "profile": "ti_dependent_cp",
+          "campaign_months": 12,
+          "bias": 0.00053768,
+          "spread": 0.0009069,
+          "score": 0.00105431
+        },
+        {
+          "profile": "ws_dependent_cp",
+          "campaign_months": 3,
+          "bias": 0.00372194,
+          "spread": 0.00088052,
+          "score": 0.00382467
+        },
+        {
+          "profile": "ws_dependent_cp",
+          "campaign_months": 6,
+          "bias": 0.00132792,
+          "spread": 0.00184903,
+          "score": 0.00227646
+        },
+        {
+          "profile": "ws_dependent_cp",
+          "campaign_months": 9,
+          "bias": 0.00087805,
+          "spread": 0.00165961,
+          "score": 0.00187757
+        },
+        {
+          "profile": "ws_dependent_cp",
+          "campaign_months": 12,
+          "bias": 0.00058349,
+          "spread": 0.00092061,
+          "score": 0.00108995
+        }
+      ]
+    }
+  }
+}

From 1ccd74557456f78732f88aec316e17942a8763fb Mon Sep 17 00:00:00 2001
From: aclerc <alex.clerc@res-group.com>
Date: Tue, 30 Jun 2026 16:02:05 +0100
Subject: [PATCH 3/3] uplift by condition WIP

---
 .../inspect_prepost_feature_ablation.py       |   2 +-
 .../baselines/inspect_prepost_hard_case.py    |  84 +++++++-
 .../baselines/power_model/features.py         |  25 +++
 benchmarking/baselines/power_model/method.py  |  31 ++-
 .../baselines/study_power_model_compare.py    |  63 +++++-
 benchmarking/harness/__init__.py              |  18 +-
 benchmarking/harness/conditions.py            |  49 +++++
 benchmarking/harness/leaderboard.py           |  30 +++
 benchmarking/harness/method.py                |   5 +-
 benchmarking/harness/plots.py                 |  38 ++++
 benchmarking/harness/scoring.py               |  74 +++++--
 .../test_inspect_prepost_hard_case.py         |  23 +++
 .../baselines/test_power_model_method.py      |  39 +++-
 .../test_study_power_model_compare.py         | 193 ++++++++++++++++++
 tests/benchmarking/harness/stubs.py           |  41 +++-
 tests/benchmarking/harness/test_conditions.py |  47 +++++
 .../benchmarking/harness/test_leaderboard.py  |  47 ++++-
 tests/benchmarking/harness/test_plots.py      |  21 +-
 tests/benchmarking/harness/test_scoring.py    |  22 +-
 19 files changed, 807 insertions(+), 45 deletions(-)
 create mode 100644 benchmarking/harness/conditions.py
 create mode 100644 tests/benchmarking/baselines/test_inspect_prepost_hard_case.py
 create mode 100644 tests/benchmarking/baselines/test_study_power_model_compare.py
 create mode 100644 tests/benchmarking/harness/test_conditions.py

diff --git a/benchmarking/baselines/inspect_prepost_feature_ablation.py b/benchmarking/baselines/inspect_prepost_feature_ablation.py
index 8aadc5e..9f100de 100644
--- a/benchmarking/baselines/inspect_prepost_feature_ablation.py
+++ b/benchmarking/baselines/inspect_prepost_feature_ablation.py
@@ -92,7 +92,7 @@ def inspect_prepost_feature_ablation() -> pd.DataFrame:
         wtg_numbers=DEFAULT_WTG_NUMBERS,
         wtg_names=DEFAULT_TURBINE_SUBSET,
     )
-    _rep, mi, truth = _pin_case(
+    _rep, mi, truth, _window = _pin_case(
         scada_df,
         study=study,
         profile_name=DEFAULT_PROFILE,
diff --git a/benchmarking/baselines/inspect_prepost_hard_case.py b/benchmarking/baselines/inspect_prepost_hard_case.py
index 06520e7..5164b3e 100644
--- a/benchmarking/baselines/inspect_prepost_hard_case.py
+++ b/benchmarking/baselines/inspect_prepost_hard_case.py
@@ -48,11 +48,15 @@
 from benchmarking.baselines.power_model import PowerModelMethod
 from benchmarking.baselines.v0_binned import V0BinnedMethod
 from benchmarking.harness import (
+    CONDITION_BINS,
+    CampaignWindow,
     Method,
     MethodInput,
+    MethodOutput,
     StudyConfig,
     build_replicates,
     campaign_windows,
+    plot_conditional_uplift,
     treated_activity_mask,
     window_row_mask,
 )
@@ -106,8 +110,8 @@ def _select_replicate(replicates: list[Replicate], test_wtg: str) -> Replicate:
 
 def _pin_case(
     scada_df: pd.DataFrame, *, study: StudyConfig, profile_name: str, test_wtg: str, campaign_months: int
-) -> tuple[Replicate, MethodInput, float]:
-    """Build the pinned replicate, its shared ``MethodInput`` and the ground-truth uplift."""
+) -> tuple[Replicate, MethodInput, float, CampaignWindow]:
+    """Build the pinned replicate, its shared ``MethodInput``, the ground-truth uplift, and the window."""
     profile = overnight_profiles()[profile_name]
     replicates = build_replicates(scada_df, profile=profile, study=study)
     rep = _select_replicate(replicates, test_wtg)
@@ -145,7 +149,7 @@ def _pin_case(
         window.activity_end,
         100 * truth,
     )
-    return rep, mi, truth
+    return rep, mi, truth, window
 
 
 def _build_methods(out_dir: Path, *, include_v0: bool) -> list[Method]:
@@ -161,6 +165,7 @@ def _build_methods(out_dir: Path, *, include_v0: bool) -> list[Method]:
         PowerModelMethod(
             active_power_col=HOT_COLUMNS.active_power,
             wind_speed_col=HOT_COLUMNS.wind_speed,
+            wind_speed_sd_col=HOT_COLUMNS.wind_speed_sd,
             availability_col=HOT_COLUMNS.availability,
             era5_hourly_df=context.reanalysis_datasets[0].data,
             out_dir=out_dir / "power_model",
@@ -199,6 +204,75 @@ def _run_methods(methods: list[Method], *, mi: MethodInput, truth: float) -> pd.
     return pd.DataFrame(rows)
 
 
+def _plot_conditional_uplift(
+    rep: Replicate, mi: MethodInput, window: CampaignWindow, *, out_dir: Path, profile_name: str
+) -> None:
+    """Re-run the power model on ``mi`` and write per-condition uplift-vs-truth plots."""
+    context = build_hot_v0_context(wtg_names=DEFAULT_TURBINE_SUBSET)
+    power_model = PowerModelMethod(
+        active_power_col=HOT_COLUMNS.active_power,
+        wind_speed_col=HOT_COLUMNS.wind_speed,
+        wind_speed_sd_col=HOT_COLUMNS.wind_speed_sd,
+        availability_col=HOT_COLUMNS.availability,
+        era5_hourly_df=context.reanalysis_datasets[0].data,
+        out_dir=out_dir / "power_model_conditional",
+    )
+    pm_output = power_model.estimate(mi)
+    if pm_output.p50_by_condition is None:
+        logger.warning("power_model returned no p50_by_condition; skipping conditional uplift plots")
+        return
+
+    test_index = rep.synthetic_df.loc[rep.synthetic_df[HOT_COLUMNS.turbine] == rep.test_wtg].index
+    mask = treated_activity_mask(test_index, rep.upgrade_timing, window=window)
+    truth_by_condition = {
+        c: rep.true_uplift(mask=mask, by=c, bins=CONDITION_BINS[c]).by_condition for c in ("ws", "ti")
+    }
+    # Filter out conditions where true_uplift returned None (should not happen when bins given)
+    truth_by_condition_clean: dict[str, pd.DataFrame] = {
+        c: df for c, df in truth_by_condition.items() if df is not None
+    }
+    if not truth_by_condition_clean:
+        logger.warning("No per-condition truth available; skipping conditional uplift plots")
+        return
+
+    frame = conditional_truth_vs_estimate(pm_output, truth_by_condition_clean, method_name=power_model.name)
+    for c in truth_by_condition_clean:
+        plot_conditional_uplift(
+            frame,
+            condition=c,
+            save_path=out_dir / f"conditional_uplift_{c}.png",
+            title=f"Conditional uplift ({c}) — profile={profile_name}, wtg={mi.test_wtg}",
+        )
+    logger.info("Wrote conditional uplift plots to %s", out_dir)
+
+
+def conditional_truth_vs_estimate(
+    output: MethodOutput, truth_by_condition: dict[str, pd.DataFrame], *, method_name: str
+) -> pd.DataFrame:
+    """Shape a method's p50_by_condition + per-condition truth into a plot_conditional_uplift frame."""
+    if output.p50_by_condition is None:
+        msg = "output.p50_by_condition must not be None"
+        raise ValueError(msg)
+    frames = []
+    bc = output.p50_by_condition
+    for condition, truth in truth_by_condition.items():
+        est = bc[bc["condition"] == condition].set_index("condition_bin")["p50_uplift"]
+        t = truth.assign(condition_bin=truth["condition_bin"].astype(str)).set_index("condition_bin")["true_uplift"]
+        bins = est.index.union(t.index)
+        frames.append(
+            pd.DataFrame(
+                {
+                    "method": method_name,
+                    "condition": condition,
+                    "condition_bin": bins,
+                    "mean_estimate": est.reindex(bins).to_numpy(),
+                    "mean_truth": t.reindex(bins).to_numpy(),
+                }
+            )
+        )
+    return pd.concat(frames, ignore_index=True)
+
+
 def inspect_prepost_hard_case(
     *,
     profile_name: str = DEFAULT_PROFILE,
@@ -228,12 +302,14 @@ def inspect_prepost_hard_case(
         wtg_names=DEFAULT_TURBINE_SUBSET,
     )
 
-    _rep, mi, truth = _pin_case(
+    rep, mi, truth, window = _pin_case(
         scada_df, study=study, profile_name=profile_name, test_wtg=test_wtg, campaign_months=campaign_months
     )
     methods = _build_methods(out_dir, include_v0=include_v0)
     summary = _run_methods(methods, mi=mi, truth=truth)
 
+    _plot_conditional_uplift(rep, mi, window, out_dir=out_dir, profile_name=profile_name)
+
     summary_path = out_dir / "comparison_summary.csv"
     summary.to_csv(summary_path, index=False)
     logger.info("Wrote %s\n%s", summary_path, summary.to_string(index=False))
diff --git a/benchmarking/baselines/power_model/features.py b/benchmarking/baselines/power_model/features.py
index 7dce4cc..118c7b6 100644
--- a/benchmarking/baselines/power_model/features.py
+++ b/benchmarking/baselines/power_model/features.py
@@ -133,6 +133,31 @@ def reference_mean_wind_speed(
     return pd.concat(cols, axis=1).mean(axis=1)
 
 
+def test_condition_signals(
+    scada_df: pd.DataFrame,
+    *,
+    test_wtg: str,
+    turbine_col: str,
+    wind_speed_col: str,
+    wind_speed_sd_col: str | None,
+) -> pd.DataFrame:
+    """Test turbine's MEASURED ws and ti on the unique sorted index (post-treatment, accepted §3).
+
+    ``ti`` is omitted when no SD column is configured.
+    """
+    index = pd.DatetimeIndex(pd.unique(scada_df.index)).sort_values()
+    rows = scada_df[scada_df[turbine_col] == test_wtg]
+    ws = pd.Series(rows[wind_speed_col].to_numpy(dtype=float), index=pd.DatetimeIndex(rows.index))
+    ws = ws[~ws.index.duplicated()].reindex(index)
+    out = pd.DataFrame({"ws": ws})
+    if wind_speed_sd_col is not None and wind_speed_sd_col in scada_df.columns:
+        sd = pd.Series(rows[wind_speed_sd_col].to_numpy(dtype=float), index=pd.DatetimeIndex(rows.index))
+        sd = sd[~sd.index.duplicated()].reindex(index)
+        ws_arr = ws.to_numpy()
+        out["ti"] = np.divide(sd.to_numpy(), ws_arr, out=np.full(len(ws_arr), np.nan), where=ws_arr != 0)
+    return out
+
+
 def check_reference_only(feature_names: list[str], *, test_wtg: str) -> None:
     """Raise if any feature is qualified with the test turbine (violating the §3 rule)."""
     offenders = [f for f in feature_names if f.endswith(f"{QUALIFIER}{test_wtg}")]
diff --git a/benchmarking/baselines/power_model/method.py b/benchmarking/baselines/power_model/method.py
index 0e3fdeb..b96d977 100644
--- a/benchmarking/baselines/power_model/method.py
+++ b/benchmarking/baselines/power_model/method.py
@@ -37,9 +37,11 @@
     era5_feature_frame,
     extract_outcome,
     reference_mean_wind_speed,
+    test_condition_signals,
 )
 from benchmarking.baselines.rlearner.nuisance import make_outcome_model
 from benchmarking.diagnostics import DiagnosticContext, write_common_diagnostics, write_run_config
+from benchmarking.harness.conditions import CONDITION_BINS, energy_ratio_by_bin
 from benchmarking.harness.method import MethodInput, MethodOutput
 from benchmarking.synthetic import HOT_COLUMNS, ToggleSchedule, treated_mask
 
@@ -93,6 +95,7 @@ class PowerModelMethod:
     active_power_col: str
     availability_col: str
     wind_speed_col: str | None = None
+    wind_speed_sd_col: str | None = None
     era5_hourly_df: pd.DataFrame | None = None
     columns: ColumnSchema = HOT_COLUMNS
     name: str = "power_model"
@@ -148,6 +151,17 @@ def estimate(self, mi: MethodInput) -> MethodOutput:
         sum_counter = float(fit["pred_upgraded"].sum())
         uplift = sum_actual / sum_counter - 1.0 if np.isfinite(sum_counter) and sum_counter != 0 else float("nan")
 
+        by_condition: pd.DataFrame | None = None
+        if self.wind_speed_col is not None:
+            conditions = test_condition_signals(
+                scada,
+                test_wtg=mi.test_wtg,
+                turbine_col=mi.turbine_col,
+                wind_speed_col=self.wind_speed_col,
+                wind_speed_sd_col=self.wind_speed_sd_col,
+            )
+            by_condition = self._conditional_uplift(conditions, upgraded_sel=upgraded_sel, fit=fit)
+
         self._write(
             mi,
             index=index,
@@ -164,7 +178,7 @@ def estimate(self, mi: MethodInput) -> MethodOutput:
             n_refs=n_refs,
             era5=era5,
         )
-        return MethodOutput(p50_overall=uplift)
+        return MethodOutput(p50_overall=uplift, p50_by_condition=by_condition)
 
     def _add_era5(
         self,
@@ -234,6 +248,20 @@ def _fit_predict(
             "pred_baseline_valid": pred_valid,
         }
 
+    def _conditional_uplift(
+        self, conditions: pd.DataFrame, *, upgraded_sel: np.ndarray, fit: dict[str, Any]
+    ) -> pd.DataFrame | None:
+        """Reduce the upgraded actual/counterfactual ledger to per-bin energy-ratio uplift."""
+        cond_up = conditions.iloc[upgraded_sel]
+        actual = fit["y_upgraded"]
+        counterfactual = fit["pred_upgraded"]
+        frames = []
+        for name in [c for c in ("ws", "ti") if c in cond_up.columns]:
+            table = energy_ratio_by_bin(cond_up[name].to_numpy(), actual, counterfactual, bins=CONDITION_BINS[name])
+            table.insert(0, "condition", name)
+            frames.append(table[["condition", "condition_bin", "p50_uplift"]])
+        return pd.concat(frames, ignore_index=True) if frames else None
+
     def _make_model(self) -> Any:  # noqa: ANN401
         """Outcome model with ``seed`` plumbed into LightGBM's ``random_state`` (caller overrides win)."""
         return make_outcome_model(**{"random_state": self.seed, **self.model_params})
@@ -361,6 +389,7 @@ def _config_params(self) -> dict[str, Any]:
             "active_power_col": self.active_power_col,
             "availability_col": self.availability_col,
             "wind_speed_col": self.wind_speed_col,
+            "wind_speed_sd_col": self.wind_speed_sd_col,
             "seed": self.seed,
             "toggle_campaign_only": self.toggle_campaign_only,
             "has_era5": self.era5_hourly_df is not None,
diff --git a/benchmarking/baselines/study_power_model_compare.py b/benchmarking/baselines/study_power_model_compare.py
index e456576..40c51be 100644
--- a/benchmarking/baselines/study_power_model_compare.py
+++ b/benchmarking/baselines/study_power_model_compare.py
@@ -69,7 +69,14 @@
 from benchmarking.baselines.hot_context import build_hot_v0_context
 from benchmarking.baselines.overnight_profiles import overnight_profiles
 from benchmarking.baselines.power_model import PowerModelMethod
-from benchmarking.harness import StudyConfig, leaderboard, plot_campaign_curves, score_study
+from benchmarking.harness import (
+    StudyConfig,
+    conditional_leaderboard,
+    leaderboard,
+    plot_campaign_curves,
+    plot_conditional_uplift,
+    score_study,
+)
 from benchmarking.synthetic import HOT_COLUMNS
 from benchmarking.synthetic.sources.hill_of_towie import load_hot_scada
 
@@ -94,7 +101,7 @@
 # a known-good commit, so future power_model changes are scored against it. Lives next to this script
 # (tracked) — update it deliberately with --update-baseline when an improvement is accepted.
 _BASELINE_PATH = Path(__file__).resolve().parent / "study_power_model_compare_baseline.json"
-_BASELINE_SCHEMA = "power_model_compare_baseline_v1"
+_BASELINE_SCHEMA = "power_model_compare_baseline_v2"
 # Per-cell metrics recorded and diffed. spread/score: lower is better; bias: |bias| nearer 0 is better.
 _METRIC_COLS = ["bias", "spread", "score"]
 # A delta smaller than this (fractional uplift) is floating-point noise, not a real change: an
@@ -152,6 +159,7 @@ def run_power_model(mode: str, out_dir: Path) -> pd.DataFrame:
             active_power_col=HOT_COLUMNS.active_power,
             wind_speed_col=HOT_COLUMNS.wind_speed,
             availability_col=HOT_COLUMNS.availability,
+            wind_speed_sd_col=HOT_COLUMNS.wind_speed_sd,
             era5_hourly_df=context.reanalysis_datasets[0].data,
             out_dir=out_dir / "power_model_runs",
         )
@@ -257,18 +265,23 @@ def _git_commit() -> str:
 
 
 def power_model_leaderboard(fresh: pd.DataFrame) -> pd.DataFrame:
-    """Reduce the fresh ``power_model`` rows to one ``(profile, campaign_months)`` row of bias/spread/score."""
+    """One row per (profile, campaign, condition, bin) of power_model bias/spread/score (overall incl.)."""
     pm = fresh[fresh["method"] == "power_model"]
-    summary = leaderboard(pm)
-    return summary[["profile", "campaign_months", *_METRIC_COLS]].sort_values(["profile", "campaign_months"])
+    overall = leaderboard(pm).assign(condition="overall", condition_bin="overall")
+    conditional = conditional_leaderboard(pm)
+    cols = ["profile", "campaign_months", "condition", "condition_bin", *_METRIC_COLS]
+    stacked = pd.concat([overall[cols], conditional[cols]], ignore_index=True)
+    return stacked.sort_values(["profile", "campaign_months", "condition", "condition_bin"])
 
 
 def record_baseline(lb_by_mode: dict[str, pd.DataFrame], study_by_mode: dict[str, StudyConfig], path: Path) -> None:
     """Write/refresh the committed benchmark for the given modes (other modes in the file are kept)."""
     doc: dict[str, Any] = {"schema": _BASELINE_SCHEMA, "modes": {}}
     if path.exists():
-        doc = json.loads(path.read_text())
-        doc.setdefault("modes", {})
+        loaded = json.loads(path.read_text())
+        if loaded.get("schema") == _BASELINE_SCHEMA:
+            doc = loaded
+            doc.setdefault("modes", {})
     commit = _git_commit()
     now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
     for mode, lb in lb_by_mode.items():
@@ -290,7 +303,16 @@ def _load_baseline_cells(mode: str, path: Path) -> tuple[pd.DataFrame, dict[str,
     """Load the recorded benchmark cells (+ provenance) for ``mode``, or ``None`` if not recorded."""
     if not path.exists():
         return None
-    entry = json.loads(path.read_text()).get("modes", {}).get(mode)
+    doc = json.loads(path.read_text())
+    if doc.get("schema") != _BASELINE_SCHEMA:
+        logger.warning(
+            "Baseline %s has schema %r, expected %r — run --update-baseline to regenerate.",
+            path,
+            doc.get("schema"),
+            _BASELINE_SCHEMA,
+        )
+        return None
+    entry = doc.get("modes", {}).get(mode)
     if entry is None:
         return None
     return pd.DataFrame(entry["cells"]), entry
@@ -311,16 +333,17 @@ def compare_to_benchmark(mode: str, lb: pd.DataFrame, baseline_path: Path, compa
         )
         return
     base, prov = loaded
-    merged = lb.merge(base, on=["profile", "campaign_months"], how="outer", suffixes=("", "_base"))
+    merge_keys = ["profile", "campaign_months", "condition", "condition_bin"]
+    merged = lb.merge(base, on=merge_keys, how="outer", suffixes=("", "_base"))
     for col in _METRIC_COLS:
         merged[f"d_{col}"] = merged[col] - merged[f"{col}_base"]
     merged["d_abs_bias"] = merged["bias"].abs() - merged["bias_base"].abs()
-    merged.sort_values(["profile", "campaign_months"]).to_csv(
+    merged.sort_values(["profile", "campaign_months", "condition", "condition_bin"]).to_csv(
         comparison_dir / f"benchmark_comparison_{mode}.csv", index=False
     )
 
     pp = 100.0
-    report = merged.groupby("campaign_months").mean(numeric_only=True)
+    report = merged.groupby(["campaign_months", "condition"]).mean(numeric_only=True)
     overall = merged.mean(numeric_only=True).to_frame().T
     overall.index = ["ALL"]
     table = pd.concat([report, overall])
@@ -387,6 +410,24 @@ def merge_and_plot(mode: str, fresh: pd.DataFrame, reference_mode_dir: Path, out
         mode,
         all_summary[["method", "profile", "campaign_months", "bias", "spread", "score"]].to_string(index=False),
     )
+
+    # Conditional comparison plots for power_model (v0/naive emit no conditional rows — expected).
+    pm_rows = merged[merged["method"] == "power_model"]
+    if not pm_rows.empty and "condition" in pm_rows.columns:
+        cond_lb = conditional_leaderboard(pm_rows)
+        if not cond_lb.empty:
+            for profile in profiles:
+                for condition in cond_lb["condition"].unique():
+                    subset = cond_lb[(cond_lb["profile"] == profile) & (cond_lb["condition"] == condition)]
+                    if subset.empty:
+                        continue
+                    plot_conditional_uplift(
+                        subset,
+                        condition=condition,
+                        save_path=comparison_dir / f"conditional_{profile}_{condition}.png",
+                        title=f"{mode} - {profile} power_model vs {condition}",
+                    )
+
     logger.info("Wrote %s comparison outputs to %s", mode, comparison_dir)
     return merged
 
diff --git a/benchmarking/harness/__init__.py b/benchmarking/harness/__init__.py
index acbe3fc..426b3ef 100644
--- a/benchmarking/harness/__init__.py
+++ b/benchmarking/harness/__init__.py
@@ -12,14 +12,25 @@
     treated_activity_mask,
     window_row_mask,
 )
-from benchmarking.harness.leaderboard import leaderboard
+from benchmarking.harness.conditions import (
+    CONDITION_BINS,
+    CONDITIONS,
+    TI_BINS,
+    WS_BINS,
+    energy_ratio_by_bin,
+)
+from benchmarking.harness.leaderboard import conditional_leaderboard, leaderboard
 from benchmarking.harness.method import Method, MethodInput, MethodOutput
 from benchmarking.harness.metrics import ErrorSummary, summarize_errors
-from benchmarking.harness.plots import plot_campaign_curves
+from benchmarking.harness.plots import plot_campaign_curves, plot_conditional_uplift
 from benchmarking.harness.replicates import Replicate, StudyConfig, build_replicates
 from benchmarking.harness.scoring import score_study
 
 __all__ = [
+    "CONDITIONS",
+    "CONDITION_BINS",
+    "TI_BINS",
+    "WS_BINS",
     "CampaignWindow",
     "ErrorSummary",
     "Method",
@@ -29,8 +40,11 @@
     "StudyConfig",
     "build_replicates",
     "campaign_windows",
+    "conditional_leaderboard",
+    "energy_ratio_by_bin",
     "leaderboard",
     "plot_campaign_curves",
+    "plot_conditional_uplift",
     "score_study",
     "summarize_errors",
     "treated_activity_mask",
diff --git a/benchmarking/harness/conditions.py b/benchmarking/harness/conditions.py
new file mode 100644
index 0000000..2200917
--- /dev/null
+++ b/benchmarking/harness/conditions.py
@@ -0,0 +1,49 @@
+"""Shared condition bins and the binned energy-ratio reducer.
+
+One source of truth for the wind-speed / TI bin edges, imported by both the method (to bin its
+counterfactual ledger) and the harness truth path, so the two bin identically. The reducer is the
+same energy ratio as the overall number, taken within each bin: ``Σactual / Σcounterfactual - 1``.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+
+WS_BINS: list[float] = [float(x) for x in np.arange(0.0, 28.0, 2.0)]  # 0,2,…,26
+TI_BINS: list[float] = [round(float(x), 2) for x in np.arange(0.0, 0.55, 0.05)]  # 0,0.05,…,0.50
+CONDITIONS: tuple[str, ...] = ("ws", "ti")
+CONDITION_BINS: dict[str, list[float]] = {"ws": WS_BINS, "ti": TI_BINS}
+
+
+def energy_ratio_by_bin(
+    condition_values: npt.ArrayLike,
+    actual: npt.ArrayLike,
+    counterfactual: npt.ArrayLike,
+    *,
+    bins: list[float],
+) -> pd.DataFrame:
+    """Energy-ratio uplift within bins of a condition signal (NaN-safe; every bin represented)."""
+    cond = np.asarray(condition_values, dtype=float)
+    act = np.asarray(actual, dtype=float)
+    cf = np.asarray(counterfactual, dtype=float)
+    finite = np.isfinite(cond) & np.isfinite(act) & np.isfinite(cf)
+    frame = pd.DataFrame(
+        {
+            "condition_bin": pd.cut(cond[finite], bins=bins).astype(str),
+            "actual": act[finite],
+            "counterfactual": cf[finite],
+        }
+    )
+    all_bins = pd.cut([], bins=bins).categories.astype(str)
+    grouped = frame.groupby("condition_bin", observed=False)
+    table = grouped[["actual", "counterfactual"]].sum()
+    table["n_records"] = grouped.size()
+    table = table.reindex(all_bins)
+    table["n_records"] = table["n_records"].fillna(0).astype(int)
+    denom = table["counterfactual"].to_numpy()
+    table["p50_uplift"] = (
+        np.divide(table["actual"].to_numpy(), denom, out=np.full(len(table), np.nan), where=denom != 0) - 1.0
+    )
+    return table.reset_index(names="condition_bin")[["condition_bin", "p50_uplift", "n_records"]]
diff --git a/benchmarking/harness/leaderboard.py b/benchmarking/harness/leaderboard.py
index 5d2f7fb..f76197a 100644
--- a/benchmarking/harness/leaderboard.py
+++ b/benchmarking/harness/leaderboard.py
@@ -12,6 +12,7 @@
 from benchmarking.harness.metrics import summarize_errors
 
 _GROUP_KEYS = ["method", "profile", "campaign_months"]
+_CONDITION_GROUP_KEYS = ["method", "profile", "campaign_months", "condition", "condition_bin"]
 
 
 def leaderboard(results_df: pd.DataFrame) -> pd.DataFrame:
@@ -56,3 +57,32 @@ def leaderboard(results_df: pd.DataFrame) -> pd.DataFrame:
         "wall_time_s_mean",
     ]
     return pd.DataFrame(records, columns=columns)
+
+
+def conditional_leaderboard(results_df: pd.DataFrame) -> pd.DataFrame:
+    """Per-(method, profile, campaign, condition, bin) bias/spread/score over the conditional rows.
+
+    Only per-condition rows (``condition`` in ["ws", "ti"]) are summarised; overall rows
+    (``condition == "overall"``) are excluded. Returns one row per group with ``bias``,
+    ``spread``, ``score``, the mean recovered and true uplift (``mean_estimate`` /
+    ``mean_truth``, when those columns are present in the input), and ``n_replicates``,
+    sorted by method, profile, campaign length, condition, then condition_bin.
+    """
+    cond = results_df[results_df["condition"].isin(["ws", "ti"])] if "condition" in results_df else results_df.iloc[:0]
+
+    records = []
+    for keys, group in cond.groupby(_CONDITION_GROUP_KEYS, sort=True):
+        summary = summarize_errors(group["signed_error"].to_numpy())
+        records.append(
+            {
+                **dict(zip(_CONDITION_GROUP_KEYS, keys, strict=True)),
+                "bias": summary.bias,
+                "spread": summary.spread,
+                "score": summary.score,
+                "mean_estimate": float(group["estimate"].mean()) if "estimate" in group else float("nan"),
+                "mean_truth": float(group["truth"].mean()) if "truth" in group else float("nan"),
+                "n_replicates": summary.n,
+            }
+        )
+    columns = [*_CONDITION_GROUP_KEYS, "bias", "spread", "score", "mean_estimate", "mean_truth", "n_replicates"]
+    return pd.DataFrame(records, columns=columns)
diff --git a/benchmarking/harness/method.py b/benchmarking/harness/method.py
index 483c08f..e2f26a9 100644
--- a/benchmarking/harness/method.py
+++ b/benchmarking/harness/method.py
@@ -44,8 +44,9 @@ class MethodOutput:
     """A method's P50 uplift estimate.
 
     :param p50_overall: overall P50 uplift (energy-ratio fraction)
-    :param p50_by_condition: optional per-condition estimates (columns ``condition_bin``,
-        ``p50_uplift``); ``None`` when the method produces only an overall number
+    :param p50_by_condition: optional per-condition estimates (columns ``condition``,
+        ``condition_bin``, ``p50_uplift``); ``condition`` ∈ {"ws","ti"}; ``None`` when the
+        method produces only an overall number
     """
 
     p50_overall: float
diff --git a/benchmarking/harness/plots.py b/benchmarking/harness/plots.py
index 28cc07f..3f6a012 100644
--- a/benchmarking/harness/plots.py
+++ b/benchmarking/harness/plots.py
@@ -105,3 +105,41 @@ def _set_score_ylim(ax: plt.Axes, scores_pp: np.ndarray) -> None:
     span = hi - lo
     margin = 0.05 * span if span > 0 else max(abs(hi), 1.0) * 0.05
     ax.set_ylim(bottom=min(0.0, lo) - margin)
+
+
+def plot_conditional_uplift(
+    summary_df: pd.DataFrame,
+    *,
+    condition: str,
+    save_path: str | Path | None = None,
+    title: str | None = None,
+) -> Figure:
+    """Plot mean recovered vs true uplift across bins of one condition, with a bias±spread band."""
+    df = summary_df[summary_df["condition"] == condition].copy()
+    df["_left"] = df["condition_bin"].str.extract(r"\(([-0-9.]+),").astype(float)
+    df = df.sort_values("_left")
+    order = df.drop_duplicates("condition_bin")["condition_bin"].tolist()
+    x = np.arange(len(order))
+
+    fig, ax = plt.subplots(figsize=(9, 5))
+    truth = df.drop_duplicates("condition_bin").set_index("condition_bin").reindex(order)["mean_truth"]
+    ax.plot(x, truth.to_numpy() * _FRACTION_TO_PP, "--", marker="s", color="k", label="true uplift")
+    for i, method in enumerate(sorted(df["method"].unique())):
+        m = df[df["method"] == method].set_index("condition_bin").reindex(order)
+        est = m["mean_estimate"].to_numpy() * _FRACTION_TO_PP
+        ax.plot(x, est, marker="o", color=f"C{i}", label=method)
+        if "spread" in m:
+            sp = m["spread"].to_numpy() * _FRACTION_TO_PP
+            ax.fill_between(x, est - sp, est + sp, color=f"C{i}", alpha=0.2)
+    ax.set_xticks(x)
+    ax.set_xticklabels(order, rotation=45, ha="right")
+    ax.set_xlabel(condition)
+    ax.set_ylabel("uplift [pp]")
+    ax.axhline(0.0, color="grey", lw=0.8)
+    ax.legend()
+    if title:
+        ax.set_title(title)
+    fig.tight_layout()
+    if save_path is not None:
+        fig.savefig(save_path, dpi=120)
+    return fig
diff --git a/benchmarking/harness/scoring.py b/benchmarking/harness/scoring.py
index fb61819..f381835 100644
--- a/benchmarking/harness/scoring.py
+++ b/benchmarking/harness/scoring.py
@@ -23,6 +23,7 @@
 import pandas as pd
 
 from benchmarking.harness.campaign import campaign_windows, treated_activity_mask, window_row_mask
+from benchmarking.harness.conditions import CONDITION_BINS
 from benchmarking.harness.method import MethodInput
 from benchmarking.harness.replicates import build_replicates
 from benchmarking.synthetic import HOT_COLUMNS
@@ -30,6 +31,9 @@
 if TYPE_CHECKING:
     from collections.abc import Callable
 
+    import numpy as np
+    import numpy.typing as npt
+
     from benchmarking.harness.campaign import CampaignWindow
     from benchmarking.harness.method import Method
     from benchmarking.harness.replicates import Replicate, StudyConfig
@@ -66,33 +70,40 @@ def score_study(
     instances = _materialise_instances(replicates, study, data_start=data_start, data_end=data_end)
     # Truth depends only on ``(replicate, window)``, so compute it once here rather than once
     # per method (it would otherwise carry an avoidable ``len(methods)`` multiplier).
-    truths = [_truth_overall(replicate, window) for replicate, window in instances]
+    truth_masks = [_truth_mask(r, w) for r, w in instances]
+    truths = [r.true_uplift(mask=m).overall for (r, _), m in zip(instances, truth_masks, strict=True)]
 
     rows = []
     for method in methods:
-        method_rows = []
-        for (replicate, window), truth in zip(instances, truths, strict=True):
+        method_rows: list[dict[str, object]] = []
+        for (replicate, window), truth, mask in zip(instances, truths, truth_masks, strict=True):
             method_input = _method_input(replicate, window)
             start = time.perf_counter()
             output = method.estimate(method_input)
             wall_time_s = time.perf_counter() - start
+            base_fields: dict[str, object] = {
+                "method": method.name,
+                "profile": profile_name,
+                "replicate": replicate.replicate_id,
+                "test_wtg": replicate.test_wtg,
+                "campaign_months": window.months,
+                "treatment_start": window.treatment_start,
+                "baseline_start": window.baseline_start,
+                "activity_end": window.activity_end,
+            }
             method_rows.append(
                 {
-                    "method": method.name,
-                    "profile": profile_name,
-                    "replicate": replicate.replicate_id,
-                    "test_wtg": replicate.test_wtg,
-                    "campaign_months": window.months,
-                    "treatment_start": window.treatment_start,
-                    "baseline_start": window.baseline_start,
-                    "activity_end": window.activity_end,
+                    **base_fields,
                     "condition": "overall",
+                    "condition_bin": "overall",
                     "estimate": output.p50_overall,
                     "truth": truth,
                     "signed_error": output.p50_overall - truth,
                     "wall_time_s": wall_time_s,
                 }
             )
+            if output.p50_by_condition is not None:
+                method_rows.extend(_conditional_rows(output.p50_by_condition, replicate, window, mask, base_fields))
         if on_method_complete is not None:
             on_method_complete(method.name, pd.DataFrame(method_rows))
         rows.extend(method_rows)
@@ -132,9 +143,42 @@ def _method_input(replicate: Replicate, window: CampaignWindow) -> MethodInput:
     )
 
 
-def _truth_overall(replicate: Replicate, window: CampaignWindow) -> float:
-    """Ground-truth uplift over the test turbine's treated rows within the activity window."""
+def _truth_mask(replicate: Replicate, window: CampaignWindow) -> npt.NDArray[np.bool_]:
+    """Return the treated-activity mask over the test turbine's rows for this window."""
     synthetic = replicate.synthetic_df
     test_index = synthetic.loc[synthetic[replicate.dataset.columns.turbine] == replicate.test_wtg].index
-    mask = treated_activity_mask(test_index, replicate.upgrade_timing, window=window)
-    return replicate.true_uplift(mask=mask).overall
+    return treated_activity_mask(test_index, replicate.upgrade_timing, window=window)
+
+
+def _conditional_rows(
+    by_condition: pd.DataFrame,
+    replicate: Replicate,
+    window: CampaignWindow,  # noqa: ARG001 (kept for symmetry / future use)
+    mask: npt.NDArray[np.bool_],
+    base_fields: dict[str, object],
+) -> list[dict[str, object]]:
+    """Join each method per-bin estimate to per-bin truth for every condition present."""
+    rows: list[dict[str, object]] = []
+    for condition, est in by_condition.groupby("condition"):
+        truth_df = replicate.true_uplift(mask=mask, by=condition, bins=CONDITION_BINS[str(condition)]).by_condition
+        if truth_df is None:  # always set when by= is passed; guard for type narrowing
+            continue  # pragma: no cover
+        truth_series = truth_df.assign(condition_bin=truth_df["condition_bin"].astype(str)).set_index("condition_bin")[
+            "true_uplift"
+        ]
+        for _, r in est.iterrows():
+            bin_label = str(r["condition_bin"])
+            t = float(truth_series.get(bin_label, float("nan")))
+            e = float(r["p50_uplift"])
+            rows.append(
+                {
+                    **base_fields,
+                    "condition": condition,
+                    "condition_bin": bin_label,
+                    "estimate": e,
+                    "truth": t,
+                    "signed_error": e - t,
+                    "wall_time_s": float("nan"),
+                }
+            )
+    return rows
diff --git a/tests/benchmarking/baselines/test_inspect_prepost_hard_case.py b/tests/benchmarking/baselines/test_inspect_prepost_hard_case.py
new file mode 100644
index 0000000..c5689a1
--- /dev/null
+++ b/tests/benchmarking/baselines/test_inspect_prepost_hard_case.py
@@ -0,0 +1,23 @@
+"""The hard-case inspector's conditional-uplift plotting helper."""
+
+from __future__ import annotations
+
+import pandas as pd
+
+from benchmarking.baselines.inspect_prepost_hard_case import conditional_truth_vs_estimate
+from benchmarking.harness.method import MethodOutput
+
+
+def test_conditional_truth_vs_estimate_merges_estimate_and_truth() -> None:
+    out = MethodOutput(
+        p50_overall=0.05,
+        p50_by_condition=pd.DataFrame(
+            {"condition": ["ws", "ws"], "condition_bin": ["(6.0, 8.0]", "(8.0, 10.0]"], "p50_uplift": [0.06, 0.04]}
+        ),
+    )
+    truth = pd.DataFrame({"condition_bin": ["(6.0, 8.0]", "(8.0, 10.0]"], "true_uplift": [0.05, 0.05]})
+    merged = conditional_truth_vs_estimate(out, {"ws": truth}, method_name="power_model")
+    row = merged[merged["condition_bin"] == "(6.0, 8.0]"].iloc[0]
+    assert row["mean_estimate"] == 0.06
+    assert row["mean_truth"] == 0.05
+    assert row["method"] == "power_model"
diff --git a/tests/benchmarking/baselines/test_power_model_method.py b/tests/benchmarking/baselines/test_power_model_method.py
index f1294ff..875c364 100644
--- a/tests/benchmarking/baselines/test_power_model_method.py
+++ b/tests/benchmarking/baselines/test_power_model_method.py
@@ -20,6 +20,7 @@
 _POWER = "wtc_ActPower_mean"
 _AVAIL = "wtc_ScReToOp_timeon"
 _WS = "wtc_AcWindSp_mean"
+_WS_SD = "wtc_AcWindSp_stddev"
 
 # Small/fast LightGBM so the toy data (a few thousand rows) is fit well.
 _FAST_PARAMS = {"n_estimators": 120, "learning_rate": 0.1, "num_leaves": 31, "min_child_samples": 20}
@@ -45,7 +46,9 @@ def _toy_scada(n: int, *, uplift: float, treated: np.ndarray, seed: int = 0) ->
         "R3": r3,
     }
     parts = [
-        pd.DataFrame({_TURBINE: name, _POWER: power, _AVAIL: 600.0, _WS: power / 100.0}, index=idx)
+        pd.DataFrame(
+            {_TURBINE: name, _POWER: power, _AVAIL: 600.0, _WS: power / 100.0, _WS_SD: power / 1000.0}, index=idx
+        )
         for name, power in frames.items()
     ]
     return pd.concat(parts)
@@ -131,3 +134,37 @@ def test_leak_bait_test_column_does_not_change_estimate(self) -> None:
         with_leak = method.estimate(mi_leak).p50_overall
         # the reference-only builder ignores test-turbine columns, so the estimate is unchanged
         assert with_leak == pytest.approx(baseline, abs=1e-9)
+
+
+class TestConditionalUplift:
+    def test_emits_conditional_uplift_by_ws_and_ti(self) -> None:
+        n = 4000
+        idx = pd.date_range("2019-01-01", periods=n, freq="10min", tz="UTC")
+        changeover = idx[n // 2]
+        treated = np.asarray(idx >= changeover)
+        scada = _toy_scada(n, uplift=0.05, treated=treated)  # now includes _WS_SD
+        method = PowerModelMethod(
+            active_power_col=_POWER,
+            availability_col=_AVAIL,
+            wind_speed_col=_WS,
+            wind_speed_sd_col=_WS_SD,
+            model_params=_FAST_PARAMS,
+        )
+        mi = MethodInput(scada_df=scada, test_wtg="T1", upgrade_timing=pd.Timestamp(changeover), turbine_col=_TURBINE)
+        out = method.estimate(mi)
+        bc = out.p50_by_condition
+        assert list(bc.columns) == ["condition", "condition_bin", "p50_uplift"]
+        assert set(bc["condition"]) == {"ws", "ti"}
+
+    def test_ws_only_when_no_sd_column_configured(self) -> None:
+        n = 4000
+        idx = pd.date_range("2019-01-01", periods=n, freq="10min", tz="UTC")
+        changeover = idx[n // 2]
+        treated = np.asarray(idx >= changeover)
+        scada = _toy_scada(n, uplift=0.05, treated=treated)
+        method = PowerModelMethod(
+            active_power_col=_POWER, availability_col=_AVAIL, wind_speed_col=_WS, model_params=_FAST_PARAMS
+        )
+        mi = MethodInput(scada_df=scada, test_wtg="T1", upgrade_timing=pd.Timestamp(changeover), turbine_col=_TURBINE)
+        out = method.estimate(mi)
+        assert set(out.p50_by_condition["condition"]) == {"ws"}
diff --git a/tests/benchmarking/baselines/test_study_power_model_compare.py b/tests/benchmarking/baselines/test_study_power_model_compare.py
new file mode 100644
index 0000000..3323833
--- /dev/null
+++ b/tests/benchmarking/baselines/test_study_power_model_compare.py
@@ -0,0 +1,193 @@
+"""Benchmark reshaping in study_power_model_compare."""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING
+
+import pandas as pd
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+from benchmarking.baselines.study_power_model_compare import (
+    _BASELINE_SCHEMA,
+    _load_baseline_cells,
+    power_model_leaderboard,
+    record_baseline,
+)
+from benchmarking.harness import StudyConfig
+
+
+def test_power_model_leaderboard_includes_overall_and_conditional_cells() -> None:
+    fresh = pd.DataFrame(
+        {
+            "method": "power_model",
+            "profile": "ws_dependent_cp",
+            "campaign_months": 6,
+            "replicate": [0, 1, 0, 1],
+            "condition": ["overall", "overall", "ws", "ws"],
+            "condition_bin": ["overall", "overall", "(6.0, 8.0]", "(6.0, 8.0]"],
+            "estimate": [0.05, 0.05, 0.08, 0.06],
+            "truth": [0.05, 0.05, 0.07, 0.07],
+            "signed_error": [0.0, 0.0, 0.01, -0.01],
+        }
+    )
+    lb = power_model_leaderboard(fresh)
+    keys = set(zip(lb["condition"], lb["condition_bin"], strict=True))
+    assert ("overall", "overall") in keys
+    assert ("ws", "(6.0, 8.0]") in keys
+    assert {"profile", "campaign_months", "condition", "condition_bin", "bias", "spread", "score"} <= set(lb.columns)
+
+
+def _minimal_study() -> StudyConfig:
+    return StudyConfig(
+        mode="prepost",
+        turbine_subset=["WT01"],
+        treatment_start_range=(pd.Timestamp("2020-01-01", tz="UTC"), pd.Timestamp("2020-06-01", tz="UTC")),
+        min_pre_months=6,
+        campaign_months=[6],
+        n_replicates=1,
+        seed=0,
+    )
+
+
+def _minimal_leaderboard() -> pd.DataFrame:
+    return pd.DataFrame(
+        {
+            "profile": ["test_profile"],
+            "campaign_months": [6],
+            "condition": ["overall"],
+            "condition_bin": ["overall"],
+            "bias": [0.01],
+            "spread": [0.02],
+            "score": [0.03],
+        }
+    )
+
+
+def test_load_baseline_cells_returns_none_for_wrong_schema(tmp_path: Path) -> None:
+    """_load_baseline_cells must return None (not crash) when the file has an old/mismatched schema."""
+    path = tmp_path / "baseline.json"
+    doc = {
+        "schema": "power_model_compare_baseline_v1",
+        "modes": {
+            "prepost": {
+                "recorded_utc": "2025-01-01T00:00:00Z",
+                "git_commit": "abc1234",
+                "n_replicates": 1,
+                "seed": 0,
+                "campaign_months": [6],
+                "profiles": ["test_profile"],
+                "cells": [{"profile": "test_profile", "campaign_months": 6, "bias": 0.01}],
+            }
+        },
+    }
+    path.write_text(json.dumps(doc))
+
+    result = _load_baseline_cells("prepost", path)
+
+    assert result is None
+
+
+def test_record_baseline_drops_stale_modes_on_schema_bump(tmp_path: Path) -> None:
+    """When on-disk schema != _BASELINE_SCHEMA, old-schema mode cells must be dropped (not inherited)."""
+    path = tmp_path / "baseline.json"
+    # Simulate a v1 file that has a toggle entry with v1-shaped cells (no condition/condition_bin).
+    old_doc = {
+        "schema": "power_model_compare_baseline_v1",
+        "modes": {
+            "toggle": {
+                "recorded_utc": "2025-01-01T00:00:00Z",
+                "git_commit": "abc1234",
+                "n_replicates": 1,
+                "seed": 0,
+                "campaign_months": [6],
+                "profiles": ["test_profile"],
+                "cells": [
+                    {"profile": "test_profile", "campaign_months": 6, "bias": 0.01, "spread": 0.02, "score": 0.03}
+                ],
+            }
+        },
+    }
+    path.write_text(json.dumps(old_doc))
+
+    lb = _minimal_leaderboard()
+    study = _minimal_study()
+    record_baseline({"prepost": lb}, {"prepost": study}, path)
+
+    written = json.loads(path.read_text())
+    assert written["schema"] == _BASELINE_SCHEMA
+    # The stale v1 toggle entry must be gone — a later toggle compare must not KeyError on missing columns.
+    assert "toggle" not in written.get("modes", {}), "stale v1 toggle cells must be dropped after schema bump"
+    # Equivalently, _load_baseline_cells for toggle returns None.
+    assert _load_baseline_cells("toggle", path) is None
+
+
+def test_record_baseline_preserves_sibling_modes_when_schemas_match(tmp_path: Path) -> None:
+    """When on-disk schema matches _BASELINE_SCHEMA, sibling modes must be preserved (incremental update)."""
+    path = tmp_path / "baseline.json"
+    # Write a v2 file that already has a toggle entry.
+    existing_toggle_cells = [
+        {
+            "profile": "test_profile",
+            "campaign_months": 6,
+            "condition": "overall",
+            "condition_bin": "overall",
+            "bias": 0.05,
+            "spread": 0.06,
+            "score": 0.07,
+        }
+    ]
+    existing_doc = {
+        "schema": _BASELINE_SCHEMA,
+        "modes": {
+            "toggle": {
+                "recorded_utc": "2025-01-01T00:00:00Z",
+                "git_commit": "abc1234",
+                "n_replicates": 1,
+                "seed": 0,
+                "campaign_months": [6],
+                "profiles": ["test_profile"],
+                "cells": existing_toggle_cells,
+            }
+        },
+    }
+    path.write_text(json.dumps(existing_doc))
+
+    lb = _minimal_leaderboard()
+    study = _minimal_study()
+    record_baseline({"prepost": lb}, {"prepost": study}, path)
+
+    written = json.loads(path.read_text())
+    assert written["schema"] == _BASELINE_SCHEMA
+    # The sibling toggle mode must still be present (incremental update must not drop it).
+    assert "toggle" in written.get("modes", {}), "matching-schema sibling mode must be preserved"
+    assert "prepost" in written.get("modes", {}), "newly recorded prepost mode must be present"
+    toggle_result = _load_baseline_cells("toggle", path)
+    assert toggle_result is not None
+
+
+def test_record_baseline_stamps_current_schema_over_old_file(tmp_path: Path) -> None:
+    """record_baseline must overwrite 'schema' with _BASELINE_SCHEMA even when an old-schema file exists."""
+    path = tmp_path / "baseline.json"
+    # Write a v1 stub to simulate the on-disk old file.
+    old_doc = {
+        "schema": "power_model_compare_baseline_v1",
+        "modes": {},
+    }
+    path.write_text(json.dumps(old_doc))
+
+    lb = _minimal_leaderboard()
+    study = _minimal_study()
+    record_baseline({"prepost": lb}, {"prepost": study}, path)
+
+    written = json.loads(path.read_text())
+    assert written["schema"] == _BASELINE_SCHEMA
+
+    # The round-trip via _load_baseline_cells must now succeed (not return None).
+    loaded = _load_baseline_cells("prepost", path)
+    assert loaded is not None
+    cells_df, prov = loaded
+    assert isinstance(cells_df, pd.DataFrame)
+    assert prov["n_replicates"] == 1
diff --git a/tests/benchmarking/harness/stubs.py b/tests/benchmarking/harness/stubs.py
index 6ab53f2..aac4b97 100644
--- a/tests/benchmarking/harness/stubs.py
+++ b/tests/benchmarking/harness/stubs.py
@@ -8,16 +8,13 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
-
 import numpy as np
+import pandas as pd
 
+from benchmarking.harness.conditions import CONDITION_BINS, energy_ratio_by_bin
 from benchmarking.harness.method import MethodInput, MethodOutput
 from benchmarking.synthetic import HOT_COLUMNS, treated_mask
 
-if TYPE_CHECKING:
-    import pandas as pd
-
 
 def oracle_overall_uplift(mi: MethodInput, original_df: pd.DataFrame) -> float:
     """Energy-ratio uplift over the treated test-turbine rows in ``mi``'s window."""
@@ -88,3 +85,37 @@ def estimate(self, mi: MethodInput) -> MethodOutput:
         )
         self.seen.append(captured)
         return MethodOutput(p50_overall=0.0)
+
+
+def conditional_oracle_by_condition(mi: MethodInput, original_df: pd.DataFrame) -> pd.DataFrame:
+    """Per-bin oracle uplift over treated rows, binned on the test turbine's measured ws/ti."""
+    syn = mi.scada_df
+    test_rows = syn[syn[mi.turbine_col] == mi.test_wtg]
+    treated = treated_mask(test_rows.index, mi.upgrade_timing)
+    treated_rows = test_rows[treated]
+    orig_test = original_df[original_df[mi.turbine_col] == mi.test_wtg]
+    actual = treated_rows[HOT_COLUMNS.active_power].to_numpy(dtype=float)
+    counterfactual = orig_test.loc[treated_rows.index, HOT_COLUMNS.active_power].to_numpy(dtype=float)
+    ws = treated_rows[HOT_COLUMNS.wind_speed].to_numpy(dtype=float)
+    sd = treated_rows[HOT_COLUMNS.wind_speed_sd].to_numpy(dtype=float)
+    ti = np.divide(sd, ws, out=np.full_like(sd, np.nan), where=ws != 0)
+    frames = []
+    for name, values in (("ws", ws), ("ti", ti)):
+        table = energy_ratio_by_bin(values, actual, counterfactual, bins=CONDITION_BINS[name])
+        table.insert(0, "condition", name)
+        frames.append(table[["condition", "condition_bin", "p50_uplift"]])
+    return pd.concat(frames, ignore_index=True)
+
+
+class ConditionalOracleMethod:
+    """Oracle that also emits per-bin oracle uplift; conditional signed error should be ~0."""
+
+    def __init__(self, original_df: pd.DataFrame, name: str = "cond_oracle") -> None:
+        self._original = original_df
+        self.name = name
+
+    def estimate(self, mi: MethodInput) -> MethodOutput:
+        return MethodOutput(
+            p50_overall=oracle_overall_uplift(mi, self._original),
+            p50_by_condition=conditional_oracle_by_condition(mi, self._original),
+        )
diff --git a/tests/benchmarking/harness/test_conditions.py b/tests/benchmarking/harness/test_conditions.py
new file mode 100644
index 0000000..511cec1
--- /dev/null
+++ b/tests/benchmarking/harness/test_conditions.py
@@ -0,0 +1,47 @@
+"""Tests for the shared condition bins and the binned energy-ratio reducer."""
+
+from __future__ import annotations
+
+import numpy as np
+
+from benchmarking.harness.conditions import (
+    CONDITION_BINS,
+    CONDITIONS,
+    TI_BINS,
+    WS_BINS,
+    energy_ratio_by_bin,
+)
+
+
+def test_bin_edges_have_expected_width() -> None:
+    assert WS_BINS[0] == 0.0
+    assert WS_BINS[-1] == 26.0
+    assert np.allclose(np.diff(WS_BINS), 2.0)
+    assert TI_BINS[0] == 0.0
+    assert TI_BINS[-1] == 0.5
+    assert np.allclose(np.diff(TI_BINS), 0.05)
+    assert CONDITIONS == ("ws", "ti")
+    assert CONDITION_BINS == {"ws": WS_BINS, "ti": TI_BINS}
+
+
+def test_energy_ratio_by_bin_computes_per_bin_ratio() -> None:
+    # two rows in (4,6], two in (6,8]; counterfactual constant so uplift = mean(actual)/cf - 1
+    cond = np.array([5.0, 5.0, 7.0, 7.0])
+    actual = np.array([110.0, 90.0, 150.0, 150.0])
+    counterfactual = np.array([100.0, 100.0, 100.0, 100.0])
+    out = energy_ratio_by_bin(cond, actual, counterfactual, bins=WS_BINS)
+    by = out.set_index("condition_bin")
+    assert by.loc["(4.0, 6.0]", "p50_uplift"] == 0.0  # (110+90)/200 - 1
+    assert by.loc["(6.0, 8.0]", "p50_uplift"] == 0.5  # 300/200 - 1
+    assert by.loc["(4.0, 6.0]", "n_records"] == 2
+
+
+def test_energy_ratio_by_bin_is_nan_safe_and_covers_all_bins() -> None:
+    cond = np.array([5.0, np.nan, 7.0])
+    actual = np.array([100.0, 50.0, np.nan])
+    counterfactual = np.array([100.0, 50.0, 100.0])
+    out = energy_ratio_by_bin(cond, actual, counterfactual, bins=WS_BINS)
+    # one row per bin edge interval, no warnings, empty bins -> NaN uplift
+    assert len(out) == len(WS_BINS) - 1
+    assert out["condition_bin"].is_unique
+    assert out.loc[out["condition_bin"] == "(6.0, 8.0]", "p50_uplift"].isna().all()
diff --git a/tests/benchmarking/harness/test_leaderboard.py b/tests/benchmarking/harness/test_leaderboard.py
index ddb83ab..3716cb7 100644
--- a/tests/benchmarking/harness/test_leaderboard.py
+++ b/tests/benchmarking/harness/test_leaderboard.py
@@ -7,7 +7,7 @@
 import pandas as pd
 import pytest
 
-from benchmarking.harness.leaderboard import leaderboard
+from benchmarking.harness.leaderboard import conditional_leaderboard, leaderboard
 
 
 def _results(rows: list[dict]) -> pd.DataFrame:
@@ -94,3 +94,48 @@ def test_methods_are_compared_side_by_side() -> None:
     by_method = summary.set_index("method")
     assert by_method.loc["a", "score"] == pytest.approx(0.0)
     assert by_method.loc["b", "score"] == pytest.approx(0.1)
+
+
+def test_conditional_leaderboard_groups_by_condition_bin() -> None:
+    df = pd.DataFrame(
+        {
+            "method": "m",
+            "profile": "p",
+            "campaign_months": 6,
+            "condition": ["ws", "ws", "ws", "ws"],
+            "condition_bin": ["(6.0, 8.0]", "(6.0, 8.0]", "(8.0, 10.0]", "(8.0, 10.0]"],
+            "estimate": [0.11, 0.09, 0.05, 0.05],
+            "truth": [0.10, 0.10, 0.05, 0.05],
+            "signed_error": [0.01, -0.01, 0.0, 0.0],
+        }
+    )
+    lb = conditional_leaderboard(df)
+    assert set(lb.columns) >= {
+        "method",
+        "profile",
+        "campaign_months",
+        "condition",
+        "condition_bin",
+        "bias",
+        "spread",
+        "score",
+    }
+    row = lb[lb["condition_bin"] == "(6.0, 8.0]"].iloc[0]
+    assert row["bias"] == 0.0
+    assert row["spread"] == pytest.approx(0.01)
+
+
+def test_conditional_leaderboard_ignores_overall_rows() -> None:
+    df = pd.DataFrame(
+        {
+            "method": "m",
+            "profile": "p",
+            "campaign_months": 6,
+            "condition": ["overall"],
+            "condition_bin": ["overall"],
+            "estimate": [0.1],
+            "truth": [0.1],
+            "signed_error": [0.0],
+        }
+    )
+    assert conditional_leaderboard(df).empty
diff --git a/tests/benchmarking/harness/test_plots.py b/tests/benchmarking/harness/test_plots.py
index 1d53e42..0104505 100644
--- a/tests/benchmarking/harness/test_plots.py
+++ b/tests/benchmarking/harness/test_plots.py
@@ -8,9 +8,10 @@
 
 mpl.use("Agg")  # headless: no display needed for tests
 
+import matplotlib.pyplot as plt
 import pandas as pd
 
-from benchmarking.harness.plots import plot_campaign_curves
+from benchmarking.harness.plots import plot_campaign_curves, plot_conditional_uplift
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -91,3 +92,21 @@ def test_saves_file(tmp_path: Path) -> None:
     plot_campaign_curves(_summary(), save_path=save_path)
     assert save_path.exists()
     assert save_path.stat().st_size > 0
+
+
+def test_plot_conditional_uplift_writes_png(tmp_path: Path) -> None:
+    summary = pd.DataFrame(
+        {
+            "method": ["power_model", "power_model"],
+            "condition": ["ws", "ws"],
+            "condition_bin": ["(4.0, 6.0]", "(6.0, 8.0]"],
+            "mean_estimate": [0.09, 0.05],
+            "mean_truth": [0.10, 0.05],
+            "bias": [-0.01, 0.0],
+            "spread": [0.01, 0.005],
+        }
+    )
+    out = tmp_path / "cond.png"
+    fig = plot_conditional_uplift(summary, condition="ws", save_path=out, title="ws_dependent_cp 6mo")
+    assert out.exists()
+    plt.close(fig)
diff --git a/tests/benchmarking/harness/test_scoring.py b/tests/benchmarking/harness/test_scoring.py
index 4e18344..8f24717 100644
--- a/tests/benchmarking/harness/test_scoring.py
+++ b/tests/benchmarking/harness/test_scoring.py
@@ -10,7 +10,7 @@
 from benchmarking.synthetic import HOT_COLUMNS, ConstantCpChange
 from wind_up.constants import TIMESTAMP_COL
 
-from .stubs import BiasedMethod, OracleMethod, RecordingMethod
+from .stubs import BiasedMethod, ConditionalOracleMethod, OracleMethod, RecordingMethod
 
 PROFILE = [ConstantCpChange(delta=0.05)]
 
@@ -180,3 +180,23 @@ def test_on_method_complete_is_optional() -> None:
     # anything else.
     drop = ["wall_time_s"]
     pd.testing.assert_frame_equal(with_cb.drop(columns=drop), without_cb.drop(columns=drop))
+
+
+def test_conditional_rows_are_emitted_with_truth_and_near_zero_error() -> None:
+    base = _base_scada()
+    results = score_study(base, profile=PROFILE, methods=[ConditionalOracleMethod(base)], study=_study(n_replicates=1))
+    assert "condition_bin" in results.columns
+    overall = results[results["condition"] == "overall"]
+    assert (overall["condition_bin"] == "overall").all()
+    cond = results[results["condition"].isin(["ws", "ti"])]
+    assert set(cond["condition"]) == {"ws", "ti"}
+    # populated bins: a per-bin oracle must match per-bin truth
+    populated = cond[cond["truth"].notna() & cond["estimate"].notna()]
+    assert len(populated) > 0
+    assert np.allclose(populated["signed_error"].to_numpy(), 0.0, atol=1e-9)
+
+
+def test_overall_only_method_emits_no_conditional_rows() -> None:
+    base = _base_scada()
+    results = score_study(base, profile=PROFILE, methods=[OracleMethod(base)], study=_study(n_replicates=1))
+    assert set(results["condition"]) == {"overall"}