From 7dbd73bb4817f383c120bc21cfd8ec39996c2649 Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:15:45 +0800
Subject: [PATCH 01/11] docs(speed-bench): add generated benchmark summary

---
 speed-bench/README.md         |  16 ++++
 speed-bench/update_summary.py | 136 ++++++++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+)
 create mode 100644 speed-bench/update_summary.py

diff --git a/speed-bench/README.md b/speed-bench/README.md
index 32075fe18..823dc9685 100644
--- a/speed-bench/README.md
+++ b/speed-bench/README.md
@@ -26,3 +26,19 @@ python3 speed-bench/plot_speed.py speed-bench/m3_max.csv --title "M3 Max t/s"
 
 The script uses only the Python standard library. By default it writes a file
 next to the CSV using the `_ts.svg` suffix, such as `speed-bench/m3_max_ts.svg`.
+
+<!-- BEGIN GENERATED BENCHMARK SUMMARY -->
+## Generated Benchmark Summary
+
+Generated from the CSV files in this directory by `python3 speed-bench/update_summary.py`.
+
+`@ 32k ctx` means the row where `ctx_tokens` is `32768`.
+
+| Benchmark | Best gen | Gen @ 32k ctx | Avg gen | Best prefill | Prefill @ 32k ctx | Avg prefill |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: |
+| M4 Max | 26.76 t/s | 24.52 t/s | 24.57 t/s | 343.76 t/s | 247.91 t/s | 250.39 t/s |
+| M2 Ultra | 23.22 t/s | 21.92 t/s | 21.85 t/s | 410.62 t/s | 325.77 t/s | 324.90 t/s |
+| GB10 | 14.23 t/s | 12.98 t/s | 13.13 t/s | 402.88 t/s | 346.36 t/s | 343.02 t/s |
+| PRO model M3 Ultra | 12.42 t/s | 9.56 t/s | 9.90 t/s | 183.06 t/s | 138.82 t/s | 149.28 t/s |
+
+<!-- END GENERATED BENCHMARK SUMMARY -->
diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
new file mode 100644
index 000000000..d2ab33fe7
--- /dev/null
+++ b/speed-bench/update_summary.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""Update the generated benchmark summary in speed-bench/README.md."""
+
+import csv
+from dataclasses import dataclass
+from pathlib import Path
+
+
+BEGIN_MARKER = "<!-- BEGIN GENERATED BENCHMARK SUMMARY -->"
+END_MARKER = "<!-- END GENERATED BENCHMARK SUMMARY -->"
+README = Path(__file__).with_name("README.md")
+BENCH_DIR = Path(__file__).resolve().parent
+REQUIRED_COLUMNS = {"ctx_tokens", "prefill_tps", "gen_tps"}
+TARGET_CTX = 32768
+
+
+@dataclass
+class BenchSummary:
+    name: str
+    best_gen: float
+    gen_at_target_ctx: float | None
+    avg_gen: float
+    best_prefill: float
+    prefill_at_target_ctx: float | None
+    avg_prefill: float
+
+
+def display_name(path: Path) -> str:
+    replacements = {
+        "gb10": "GB10",
+        "m2": "M2",
+        "m3": "M3",
+        "m4": "M4",
+        "m5": "M5",
+        "pro": "PRO",
+        "max": "Max",
+        "ultra": "Ultra",
+    }
+    words = path.stem.replace("-", "_").split("_")
+    return " ".join(replacements.get(word.lower(), word) for word in words)
+
+
+def fmt_tps(value: float | None) -> str:
+    if value is None:
+        return "n/a"
+    return f"{value:.2f} t/s"
+
+
+def read_summary(path: Path) -> BenchSummary:
+    rows = []
+    with path.open("r", encoding="utf-8-sig", newline="") as fp:
+        reader = csv.DictReader(fp)
+        missing = REQUIRED_COLUMNS.difference(reader.fieldnames or ())
+        if missing:
+            missing_list = ", ".join(sorted(missing))
+            raise SystemExit(f"{path}: missing CSV column(s): {missing_list}")
+
+        for row in reader:
+            rows.append(
+                {
+                    "ctx_tokens": int(row["ctx_tokens"]),
+                    "prefill_tps": float(row["prefill_tps"]),
+                    "gen_tps": float(row["gen_tps"]),
+                }
+            )
+
+    if not rows:
+        raise SystemExit(f"{path}: no benchmark rows")
+
+    target_row = next((row for row in rows if row["ctx_tokens"] == TARGET_CTX), None)
+    return BenchSummary(
+        name=display_name(path),
+        best_gen=max(row["gen_tps"] for row in rows),
+        gen_at_target_ctx=target_row["gen_tps"] if target_row else None,
+        avg_gen=sum(row["gen_tps"] for row in rows) / len(rows),
+        best_prefill=max(row["prefill_tps"] for row in rows),
+        prefill_at_target_ctx=target_row["prefill_tps"] if target_row else None,
+        avg_prefill=sum(row["prefill_tps"] for row in rows) / len(rows),
+    )
+
+
+def render_summary(summaries: list[BenchSummary]) -> str:
+    summaries = sorted(summaries, key=lambda item: item.best_gen, reverse=True)
+    lines = [
+        BEGIN_MARKER,
+        "## Generated Benchmark Summary",
+        "",
+        "Generated from the CSV files in this directory by `python3 speed-bench/update_summary.py`.",
+        "",
+        f"`@ 32k ctx` means the row where `ctx_tokens` is `{TARGET_CTX}`.",
+        "",
+        "| Benchmark | Best gen | Gen @ 32k ctx | Avg gen | Best prefill | Prefill @ 32k ctx | Avg prefill |",
+        "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
+    ]
+    for summary in summaries:
+        lines.append(
+            "| "
+            + " | ".join(
+                [
+                    summary.name,
+                    fmt_tps(summary.best_gen),
+                    fmt_tps(summary.gen_at_target_ctx),
+                    fmt_tps(summary.avg_gen),
+                    fmt_tps(summary.best_prefill),
+                    fmt_tps(summary.prefill_at_target_ctx),
+                    fmt_tps(summary.avg_prefill),
+                ]
+            )
+            + " |"
+        )
+    lines.extend(["", END_MARKER, ""])
+    return "\n".join(lines)
+
+
+def replace_generated_section(readme: str, generated: str) -> str:
+    begin = readme.find(BEGIN_MARKER)
+    end = readme.find(END_MARKER)
+    if begin == -1 and end == -1:
+        return readme.rstrip() + "\n\n" + generated
+    if begin == -1 or end == -1 or end < begin:
+        raise SystemExit("README.md has mismatched generated summary markers")
+    end += len(END_MARKER)
+    return readme[:begin].rstrip() + "\n\n" + generated.rstrip() + readme[end:].rstrip() + "\n"
+
+
+def main() -> None:
+    csv_paths = sorted(BENCH_DIR.glob("*.csv"))
+    if not csv_paths:
+        raise SystemExit(f"{BENCH_DIR}: no CSV files found")
+    summaries = [read_summary(path) for path in csv_paths]
+    generated = render_summary(summaries)
+    README.write_text(replace_generated_section(README.read_text(encoding="utf-8"), generated), encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()

From aa29a54a19fb3a87a5d3dcaf25c667794e20d0f4 Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:20:44 +0800
Subject: [PATCH 02/11] docs(speed-bench): simplify summary heading

---
 speed-bench/README.md         | 2 +-
 speed-bench/update_summary.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/speed-bench/README.md b/speed-bench/README.md
index 823dc9685..f18cd78af 100644
--- a/speed-bench/README.md
+++ b/speed-bench/README.md
@@ -28,7 +28,7 @@ The script uses only the Python standard library. By default it writes a file
 next to the CSV using the `_ts.svg` suffix, such as `speed-bench/m3_max_ts.svg`.
 
 <!-- BEGIN GENERATED BENCHMARK SUMMARY -->
-## Generated Benchmark Summary
+## Benchmark Summary
 
 Generated from the CSV files in this directory by `python3 speed-bench/update_summary.py`.
 
diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
index d2ab33fe7..e69cd8e26 100644
--- a/speed-bench/update_summary.py
+++ b/speed-bench/update_summary.py
@@ -83,7 +83,7 @@ def render_summary(summaries: list[BenchSummary]) -> str:
     summaries = sorted(summaries, key=lambda item: item.best_gen, reverse=True)
     lines = [
         BEGIN_MARKER,
-        "## Generated Benchmark Summary",
+        "## Benchmark Summary",
         "",
         "Generated from the CSV files in this directory by `python3 speed-bench/update_summary.py`.",
         "",

From 17ef23c1d0a69726581f1023c151fd3a3a370a17 Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:22:06 +0800
Subject: [PATCH 03/11] docs(speed-bench): clarify benchmark labels

---
 speed-bench/README.md         | 2 +-
 speed-bench/update_summary.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/speed-bench/README.md b/speed-bench/README.md
index f18cd78af..162e3718d 100644
--- a/speed-bench/README.md
+++ b/speed-bench/README.md
@@ -38,7 +38,7 @@ Generated from the CSV files in this directory by `python3 speed-bench/update_su
 | --- | ---: | ---: | ---: | ---: | ---: | ---: |
 | M4 Max | 26.76 t/s | 24.52 t/s | 24.57 t/s | 343.76 t/s | 247.91 t/s | 250.39 t/s |
 | M2 Ultra | 23.22 t/s | 21.92 t/s | 21.85 t/s | 410.62 t/s | 325.77 t/s | 324.90 t/s |
-| GB10 | 14.23 t/s | 12.98 t/s | 13.13 t/s | 402.88 t/s | 346.36 t/s | 343.02 t/s |
+| DGX Spark / GB10 | 14.23 t/s | 12.98 t/s | 13.13 t/s | 402.88 t/s | 346.36 t/s | 343.02 t/s |
 | PRO model M3 Ultra | 12.42 t/s | 9.56 t/s | 9.90 t/s | 183.06 t/s | 138.82 t/s | 149.28 t/s |
 
 <!-- END GENERATED BENCHMARK SUMMARY -->
diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
index e69cd8e26..c46456e52 100644
--- a/speed-bench/update_summary.py
+++ b/speed-bench/update_summary.py
@@ -26,6 +26,15 @@ class BenchSummary:
 
 
 def display_name(path: Path) -> str:
+    name_overrides = {
+        "gb10": "DGX Spark / GB10",
+        "m2_ultra": "M2 Ultra",
+        "m4_max": "M4 Max",
+        "pro_model_m3_ultra": "PRO model M3 Ultra",
+    }
+    if path.stem in name_overrides:
+        return name_overrides[path.stem]
+
     replacements = {
         "gb10": "GB10",
         "m2": "M2",

From 3339423194ab04f46be39e82300fa6a6985b177c Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:24:04 +0800
Subject: [PATCH 04/11] docs(speed-bench): spell out benchmark targets

---
 speed-bench/README.md         | 8 ++++----
 speed-bench/update_summary.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/speed-bench/README.md b/speed-bench/README.md
index 162e3718d..37ea1e8a5 100644
--- a/speed-bench/README.md
+++ b/speed-bench/README.md
@@ -36,9 +36,9 @@ Generated from the CSV files in this directory by `python3 speed-bench/update_su
 
 | Benchmark | Best gen | Gen @ 32k ctx | Avg gen | Best prefill | Prefill @ 32k ctx | Avg prefill |
 | --- | ---: | ---: | ---: | ---: | ---: | ---: |
-| M4 Max | 26.76 t/s | 24.52 t/s | 24.57 t/s | 343.76 t/s | 247.91 t/s | 250.39 t/s |
-| M2 Ultra | 23.22 t/s | 21.92 t/s | 21.85 t/s | 410.62 t/s | 325.77 t/s | 324.90 t/s |
-| DGX Spark / GB10 | 14.23 t/s | 12.98 t/s | 13.13 t/s | 402.88 t/s | 346.36 t/s | 343.02 t/s |
-| PRO model M3 Ultra | 12.42 t/s | 9.56 t/s | 9.90 t/s | 183.06 t/s | 138.82 t/s | 149.28 t/s |
+| Apple M4 Max (DeepSeek V4 Flash q2) | 26.76 t/s | 24.52 t/s | 24.57 t/s | 343.76 t/s | 247.91 t/s | 250.39 t/s |
+| Apple M2 Ultra (DeepSeek V4 Flash q2) | 23.22 t/s | 21.92 t/s | 21.85 t/s | 410.62 t/s | 325.77 t/s | 324.90 t/s |
+| NVIDIA DGX Spark / GB10 (DeepSeek V4 Flash q2) | 14.23 t/s | 12.98 t/s | 13.13 t/s | 402.88 t/s | 346.36 t/s | 343.02 t/s |
+| Apple M3 Ultra (DeepSeek V4 PRO q2) | 12.42 t/s | 9.56 t/s | 9.90 t/s | 183.06 t/s | 138.82 t/s | 149.28 t/s |
 
 <!-- END GENERATED BENCHMARK SUMMARY -->
diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
index c46456e52..1b08b6c99 100644
--- a/speed-bench/update_summary.py
+++ b/speed-bench/update_summary.py
@@ -27,10 +27,10 @@ class BenchSummary:
 
 def display_name(path: Path) -> str:
     name_overrides = {
-        "gb10": "DGX Spark / GB10",
-        "m2_ultra": "M2 Ultra",
-        "m4_max": "M4 Max",
-        "pro_model_m3_ultra": "PRO model M3 Ultra",
+        "gb10": "NVIDIA DGX Spark / GB10 (DeepSeek V4 Flash q2)",
+        "m2_ultra": "Apple M2 Ultra (DeepSeek V4 Flash q2)",
+        "m4_max": "Apple M4 Max (DeepSeek V4 Flash q2)",
+        "pro_model_m3_ultra": "Apple M3 Ultra (DeepSeek V4 PRO q2)",
     }
     if path.stem in name_overrides:
         return name_overrides[path.stem]

From e06cba522fe1a8869b7d5a34b073054001cf47f5 Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:25:04 +0800
Subject: [PATCH 05/11] docs(speed-bench): move units into table headers

---
 speed-bench/README.md         | 10 +++++-----
 speed-bench/update_summary.py |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/speed-bench/README.md b/speed-bench/README.md
index 37ea1e8a5..e22382f4b 100644
--- a/speed-bench/README.md
+++ b/speed-bench/README.md
@@ -34,11 +34,11 @@ Generated from the CSV files in this directory by `python3 speed-bench/update_su
 
 `@ 32k ctx` means the row where `ctx_tokens` is `32768`.
 
-| Benchmark | Best gen | Gen @ 32k ctx | Avg gen | Best prefill | Prefill @ 32k ctx | Avg prefill |
+| Benchmark | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) |
 | --- | ---: | ---: | ---: | ---: | ---: | ---: |
-| Apple M4 Max (DeepSeek V4 Flash q2) | 26.76 t/s | 24.52 t/s | 24.57 t/s | 343.76 t/s | 247.91 t/s | 250.39 t/s |
-| Apple M2 Ultra (DeepSeek V4 Flash q2) | 23.22 t/s | 21.92 t/s | 21.85 t/s | 410.62 t/s | 325.77 t/s | 324.90 t/s |
-| NVIDIA DGX Spark / GB10 (DeepSeek V4 Flash q2) | 14.23 t/s | 12.98 t/s | 13.13 t/s | 402.88 t/s | 346.36 t/s | 343.02 t/s |
-| Apple M3 Ultra (DeepSeek V4 PRO q2) | 12.42 t/s | 9.56 t/s | 9.90 t/s | 183.06 t/s | 138.82 t/s | 149.28 t/s |
+| Apple M4 Max (DeepSeek V4 Flash q2) | 26.76 | 24.52 | 24.57 | 343.76 | 247.91 | 250.39 |
+| Apple M2 Ultra (DeepSeek V4 Flash q2) | 23.22 | 21.92 | 21.85 | 410.62 | 325.77 | 324.90 |
+| NVIDIA DGX Spark / GB10 (DeepSeek V4 Flash q2) | 14.23 | 12.98 | 13.13 | 402.88 | 346.36 | 343.02 |
+| Apple M3 Ultra (DeepSeek V4 PRO q2) | 12.42 | 9.56 | 9.90 | 183.06 | 138.82 | 149.28 |
 
 <!-- END GENERATED BENCHMARK SUMMARY -->
diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
index 1b08b6c99..a9e4ae127 100644
--- a/speed-bench/update_summary.py
+++ b/speed-bench/update_summary.py
@@ -52,7 +52,7 @@ def display_name(path: Path) -> str:
 def fmt_tps(value: float | None) -> str:
     if value is None:
         return "n/a"
-    return f"{value:.2f} t/s"
+    return f"{value:.2f}"
 
 
 def read_summary(path: Path) -> BenchSummary:
@@ -98,7 +98,7 @@ def render_summary(summaries: list[BenchSummary]) -> str:
         "",
         f"`@ 32k ctx` means the row where `ctx_tokens` is `{TARGET_CTX}`.",
         "",
-        "| Benchmark | Best gen | Gen @ 32k ctx | Avg gen | Best prefill | Prefill @ 32k ctx | Avg prefill |",
+        "| Benchmark | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) |",
         "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
     ]
     for summary in summaries:

From a28fdefb7df9bedf17b44f6557b47a438fa16a35 Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:27:18 +0800
Subject: [PATCH 06/11] docs(speed-bench): group summary by model

---
 speed-bench/README.md         | 17 ++++++---
 speed-bench/update_summary.py | 72 ++++++++++++++++++++++-------------
 2 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/speed-bench/README.md b/speed-bench/README.md
index e22382f4b..92a8ce8fb 100644
--- a/speed-bench/README.md
+++ b/speed-bench/README.md
@@ -34,11 +34,18 @@ Generated from the CSV files in this directory by `python3 speed-bench/update_su
 
 `@ 32k ctx` means the row where `ctx_tokens` is `32768`.
 
-| Benchmark | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) |
+### DeepSeek V4 Flash q2
+
+| Hardware | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: |
+| Apple M4 Max | 26.76 | 24.52 | 24.57 | 343.76 | 247.91 | 250.39 |
+| Apple M2 Ultra | 23.22 | 21.92 | 21.85 | 410.62 | 325.77 | 324.90 |
+| NVIDIA DGX Spark / GB10 | 14.23 | 12.98 | 13.13 | 402.88 | 346.36 | 343.02 |
+
+### DeepSeek V4 PRO q2
+
+| Hardware | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) |
 | --- | ---: | ---: | ---: | ---: | ---: | ---: |
-| Apple M4 Max (DeepSeek V4 Flash q2) | 26.76 | 24.52 | 24.57 | 343.76 | 247.91 | 250.39 |
-| Apple M2 Ultra (DeepSeek V4 Flash q2) | 23.22 | 21.92 | 21.85 | 410.62 | 325.77 | 324.90 |
-| NVIDIA DGX Spark / GB10 (DeepSeek V4 Flash q2) | 14.23 | 12.98 | 13.13 | 402.88 | 346.36 | 343.02 |
-| Apple M3 Ultra (DeepSeek V4 PRO q2) | 12.42 | 9.56 | 9.90 | 183.06 | 138.82 | 149.28 |
+| Apple M3 Ultra | 12.42 | 9.56 | 9.90 | 183.06 | 138.82 | 149.28 |
 
 <!-- END GENERATED BENCHMARK SUMMARY -->
diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
index a9e4ae127..c56e25550 100644
--- a/speed-bench/update_summary.py
+++ b/speed-bench/update_summary.py
@@ -16,7 +16,8 @@
 
 @dataclass
 class BenchSummary:
-    name: str
+    hardware: str
+    model: str
     best_gen: float
     gen_at_target_ctx: float | None
     avg_gen: float
@@ -25,12 +26,12 @@ class BenchSummary:
     avg_prefill: float
 
 
-def display_name(path: Path) -> str:
+def benchmark_labels(path: Path) -> tuple[str, str]:
     name_overrides = {
-        "gb10": "NVIDIA DGX Spark / GB10 (DeepSeek V4 Flash q2)",
-        "m2_ultra": "Apple M2 Ultra (DeepSeek V4 Flash q2)",
-        "m4_max": "Apple M4 Max (DeepSeek V4 Flash q2)",
-        "pro_model_m3_ultra": "Apple M3 Ultra (DeepSeek V4 PRO q2)",
+        "gb10": ("NVIDIA DGX Spark / GB10", "DeepSeek V4 Flash q2"),
+        "m2_ultra": ("Apple M2 Ultra", "DeepSeek V4 Flash q2"),
+        "m4_max": ("Apple M4 Max", "DeepSeek V4 Flash q2"),
+        "pro_model_m3_ultra": ("Apple M3 Ultra", "DeepSeek V4 PRO q2"),
     }
     if path.stem in name_overrides:
         return name_overrides[path.stem]
@@ -46,7 +47,7 @@ def display_name(path: Path) -> str:
         "ultra": "Ultra",
     }
     words = path.stem.replace("-", "_").split("_")
-    return " ".join(replacements.get(word.lower(), word) for word in words)
+    return " ".join(replacements.get(word.lower(), word) for word in words), "Unspecified model"
 
 
 def fmt_tps(value: float | None) -> str:
@@ -77,8 +78,10 @@ def read_summary(path: Path) -> BenchSummary:
         raise SystemExit(f"{path}: no benchmark rows")
 
     target_row = next((row for row in rows if row["ctx_tokens"] == TARGET_CTX), None)
+    hardware, model = benchmark_labels(path)
     return BenchSummary(
-        name=display_name(path),
+        hardware=hardware,
+        model=model,
         best_gen=max(row["gen_tps"] for row in rows),
         gen_at_target_ctx=target_row["gen_tps"] if target_row else None,
         avg_gen=sum(row["gen_tps"] for row in rows) / len(rows),
@@ -89,7 +92,14 @@ def read_summary(path: Path) -> BenchSummary:
 
 
 def render_summary(summaries: list[BenchSummary]) -> str:
-    summaries = sorted(summaries, key=lambda item: item.best_gen, reverse=True)
+    by_model = {}
+    for summary in summaries:
+        by_model.setdefault(summary.model, []).append(summary)
+    model_groups = sorted(
+        by_model.items(),
+        key=lambda item: max(summary.best_gen for summary in item[1]),
+        reverse=True,
+    )
     lines = [
         BEGIN_MARKER,
         "## Benchmark Summary",
@@ -98,26 +108,34 @@ def render_summary(summaries: list[BenchSummary]) -> str:
         "",
         f"`@ 32k ctx` means the row where `ctx_tokens` is `{TARGET_CTX}`.",
         "",
-        "| Benchmark | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) |",
-        "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
     ]
-    for summary in summaries:
-        lines.append(
-            "| "
-            + " | ".join(
-                [
-                    summary.name,
-                    fmt_tps(summary.best_gen),
-                    fmt_tps(summary.gen_at_target_ctx),
-                    fmt_tps(summary.avg_gen),
-                    fmt_tps(summary.best_prefill),
-                    fmt_tps(summary.prefill_at_target_ctx),
-                    fmt_tps(summary.avg_prefill),
-                ]
-            )
-            + " |"
+    for model, model_summaries in model_groups:
+        lines.extend(
+            [
+                f"### {model}",
+                "",
+                "| Hardware | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) |",
+                "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
+            ]
         )
-    lines.extend(["", END_MARKER, ""])
+        for summary in sorted(model_summaries, key=lambda item: item.best_gen, reverse=True):
+            lines.append(
+                "| "
+                + " | ".join(
+                    [
+                        summary.hardware,
+                        fmt_tps(summary.best_gen),
+                        fmt_tps(summary.gen_at_target_ctx),
+                        fmt_tps(summary.avg_gen),
+                        fmt_tps(summary.best_prefill),
+                        fmt_tps(summary.prefill_at_target_ctx),
+                        fmt_tps(summary.avg_prefill),
+                    ]
+                )
+                + " |"
+            )
+        lines.append("")
+    lines.extend([END_MARKER, ""])
     return "\n".join(lines)
 
 

From 77fe03d81b0857e79c58dba01abc32e9fc15bfd9 Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:30:31 +0800
Subject: [PATCH 07/11] docs(speed-bench): round summary to three significant
 figures

---
 speed-bench/README.md         | 8 ++++----
 speed-bench/update_summary.py | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/speed-bench/README.md b/speed-bench/README.md
index 92a8ce8fb..a5c3b0bd3 100644
--- a/speed-bench/README.md
+++ b/speed-bench/README.md
@@ -38,14 +38,14 @@ Generated from the CSV files in this directory by `python3 speed-bench/update_su
 
 | Hardware | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) |
 | --- | ---: | ---: | ---: | ---: | ---: | ---: |
-| Apple M4 Max | 26.76 | 24.52 | 24.57 | 343.76 | 247.91 | 250.39 |
-| Apple M2 Ultra | 23.22 | 21.92 | 21.85 | 410.62 | 325.77 | 324.90 |
-| NVIDIA DGX Spark / GB10 | 14.23 | 12.98 | 13.13 | 402.88 | 346.36 | 343.02 |
+| Apple M4 Max | 26.8 | 24.5 | 24.6 | 344 | 248 | 250 |
+| Apple M2 Ultra | 23.2 | 21.9 | 21.9 | 411 | 326 | 325 |
+| NVIDIA DGX Spark / GB10 | 14.2 | 13.0 | 13.1 | 403 | 346 | 343 |
 
 ### DeepSeek V4 PRO q2
 
 | Hardware | Best gen (t/s) | Gen @ 32k ctx (t/s) | Avg gen (t/s) | Best prefill (t/s) | Prefill @ 32k ctx (t/s) | Avg prefill (t/s) |
 | --- | ---: | ---: | ---: | ---: | ---: | ---: |
-| Apple M3 Ultra | 12.42 | 9.56 | 9.90 | 183.06 | 138.82 | 149.28 |
+| Apple M3 Ultra | 12.4 | 9.56 | 9.90 | 183 | 139 | 149 |
 
 <!-- END GENERATED BENCHMARK SUMMARY -->
diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
index c56e25550..953dd580f 100644
--- a/speed-bench/update_summary.py
+++ b/speed-bench/update_summary.py
@@ -53,6 +53,10 @@ def benchmark_labels(path: Path) -> tuple[str, str]:
 def fmt_tps(value: float | None) -> str:
     if value is None:
         return "n/a"
+    if round(abs(value), 1) >= 100:  # test the rounded value so 99.96 prints "100", not "100.0"
+        return f"{value:.0f}"
+    if round(abs(value), 2) >= 10:  # test the rounded value so 9.996 prints "10.0", not "10.00"
+        return f"{value:.1f}"
     return f"{value:.2f}"
 
 

From 738a042055f00f1bf5a701a7e53fc1f458e1d4f6 Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:34:35 +0800
Subject: [PATCH 08/11] docs(speed-bench): add benchmark metadata manifest

---
 speed-bench/README.md         |  1 +
 speed-bench/benchmarks.json   | 57 ++++++++++++++++++++++++++++++
 speed-bench/update_summary.py | 66 +++++++++++++++++++++--------------
 3 files changed, 97 insertions(+), 27 deletions(-)
 create mode 100644 speed-bench/benchmarks.json

diff --git a/speed-bench/README.md b/speed-bench/README.md
index a5c3b0bd3..2d0844c32 100644
--- a/speed-bench/README.md
+++ b/speed-bench/README.md
@@ -17,6 +17,7 @@ Run `ds4-bench` as:
 Provide PR including your numbers if your hardware was not already tested.
 Call the benchmark csv file something like `m3_max.csv` or alike, so that
 it is clear what hardware was used for the benchmark.
+Record the machine, backend, model, and run parameters in `benchmarks.json`.
 
 To generate an SVG graph from a CSV file:
 
diff --git a/speed-bench/benchmarks.json b/speed-bench/benchmarks.json
new file mode 100644
index 000000000..5f93bc2a1
--- /dev/null
+++ b/speed-bench/benchmarks.json
@@ -0,0 +1,57 @@
+{
+  "schema_version": 1,
+  "benchmarks": [
+    {
+      "csv": "gb10.csv",
+      "hardware": "NVIDIA DGX Spark / GB10",
+      "backend": "CUDA",
+      "model": "DeepSeek V4 Flash",
+      "quant": "q2",
+      "model_label": "DeepSeek V4 Flash q2",
+      "prompt_file": "speed-bench/promessi_sposi.txt",
+      "ctx_start": 2048,
+      "ctx_max": 65536,
+      "step_incr": 2048,
+      "gen_tokens": 128
+    },
+    {
+      "csv": "m2_ultra.csv",
+      "hardware": "Apple M2 Ultra",
+      "backend": "Metal",
+      "model": "DeepSeek V4 Flash",
+      "quant": "q2",
+      "model_label": "DeepSeek V4 Flash q2",
+      "prompt_file": "speed-bench/promessi_sposi.txt",
+      "ctx_start": 2048,
+      "ctx_max": 65536,
+      "step_incr": 2048,
+      "gen_tokens": 128
+    },
+    {
+      "csv": "m4_max.csv",
+      "hardware": "Apple M4 Max",
+      "backend": "Metal",
+      "model": "DeepSeek V4 Flash",
+      "quant": "q2",
+      "model_label": "DeepSeek V4 Flash q2",
+      "prompt_file": "speed-bench/promessi_sposi.txt",
+      "ctx_start": 2048,
+      "ctx_max": 65536,
+      "step_incr": 2048,
+      "gen_tokens": 128
+    },
+    {
+      "csv": "pro_model_m3_ultra.csv",
+      "hardware": "Apple M3 Ultra",
+      "backend": "Metal",
+      "model": "DeepSeek V4 PRO",
+      "quant": "q2",
+      "model_label": "DeepSeek V4 PRO q2",
+      "prompt_file": "speed-bench/promessi_sposi.txt",
+      "ctx_start": 2048,
+      "ctx_max": 32768,
+      "step_incr": 2048,
+      "gen_tokens": 128
+    }
+  ]
+}
diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
index 953dd580f..2e68815cb 100644
--- a/speed-bench/update_summary.py
+++ b/speed-bench/update_summary.py
@@ -2,6 +2,7 @@
 """Update the generated benchmark summary in speed-bench/README.md."""
 
 import csv
+import json
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -10,6 +11,7 @@
 END_MARKER = "<!-- END GENERATED BENCHMARK SUMMARY -->"
 README = Path(__file__).with_name("README.md")
 BENCH_DIR = Path(__file__).resolve().parent
+METADATA = BENCH_DIR / "benchmarks.json"
 REQUIRED_COLUMNS = {"ctx_tokens", "prefill_tps", "gen_tps"}
 TARGET_CTX = 32768
 
@@ -26,28 +28,34 @@ class BenchSummary:
     avg_prefill: float
 
 
-def benchmark_labels(path: Path) -> tuple[str, str]:
-    name_overrides = {
-        "gb10": ("NVIDIA DGX Spark / GB10", "DeepSeek V4 Flash q2"),
-        "m2_ultra": ("Apple M2 Ultra", "DeepSeek V4 Flash q2"),
-        "m4_max": ("Apple M4 Max", "DeepSeek V4 Flash q2"),
-        "pro_model_m3_ultra": ("Apple M3 Ultra", "DeepSeek V4 PRO q2"),
-    }
-    if path.stem in name_overrides:
-        return name_overrides[path.stem]
-
-    replacements = {
-        "gb10": "GB10",
-        "m2": "M2",
-        "m3": "M3",
-        "m4": "M4",
-        "m5": "M5",
-        "pro": "PRO",
-        "max": "Max",
-        "ultra": "Ultra",
-    }
-    words = path.stem.replace("-", "_").split("_")
-    return " ".join(replacements.get(word.lower(), word) for word in words), "Unspecified model"
+def read_metadata() -> dict[str, dict[str, object]]:
+    try:
+        data = json.loads(METADATA.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        raise SystemExit(f"{METADATA}: metadata file is required") from None
+    except json.JSONDecodeError as exc:
+        raise SystemExit(f"{METADATA}: invalid JSON: {exc}") from None
+
+    benchmarks = data.get("benchmarks")
+    if not isinstance(benchmarks, list):
+        raise SystemExit(f"{METADATA}: expected a benchmarks list")
+
+    by_csv: dict[str, dict[str, object]] = {}
+    required = {"csv", "hardware", "model_label"}
+    for item in benchmarks:
+        if not isinstance(item, dict):
+            raise SystemExit(f"{METADATA}: benchmark entries must be objects")
+        missing = required.difference(item)
+        if missing:
+            missing_list = ", ".join(sorted(missing))
+            raise SystemExit(f"{METADATA}: benchmark entry missing {missing_list}")
+        csv_name = item["csv"]
+        if not isinstance(csv_name, str) or not csv_name:
+            raise SystemExit(f"{METADATA}: benchmark csv must be a non-empty string")
+        if csv_name in by_csv:
+            raise SystemExit(f"{METADATA}: duplicate benchmark metadata for {csv_name}")
+        by_csv[csv_name] = item
+    return by_csv
 
 
 def fmt_tps(value: float | None) -> str:
@@ -60,7 +68,7 @@ def fmt_tps(value: float | None) -> str:
     return f"{value:.2f}"
 
 
-def read_summary(path: Path) -> BenchSummary:
+def read_summary(path: Path, metadata: dict[str, object]) -> BenchSummary:
     rows = []
     with path.open("r", encoding="utf-8-sig", newline="") as fp:
         reader = csv.DictReader(fp)
@@ -82,10 +90,9 @@ def read_summary(path: Path) -> BenchSummary:
         raise SystemExit(f"{path}: no benchmark rows")
 
     target_row = next((row for row in rows if row["ctx_tokens"] == TARGET_CTX), None)
-    hardware, model = benchmark_labels(path)
     return BenchSummary(
-        hardware=hardware,
-        model=model,
+        hardware=str(metadata["hardware"]),
+        model=str(metadata["model_label"]),
         best_gen=max(row["gen_tps"] for row in rows),
         gen_at_target_ctx=target_row["gen_tps"] if target_row else None,
         avg_gen=sum(row["gen_tps"] for row in rows) / len(rows),
@@ -158,7 +165,12 @@ def main() -> None:
     csv_paths = sorted(BENCH_DIR.glob("*.csv"))
     if not csv_paths:
         raise SystemExit(f"{BENCH_DIR}: no CSV files found")
-    summaries = [read_summary(path) for path in csv_paths]
+    metadata = read_metadata()
+    missing = [path.name for path in csv_paths if path.name not in metadata]
+    if missing:
+        missing_list = ", ".join(missing)
+        raise SystemExit(f"{METADATA}: missing metadata for CSV file(s): {missing_list}")
+    summaries = [read_summary(path, metadata[path.name]) for path in csv_paths]
     generated = render_summary(summaries)
     README.write_text(replace_generated_section(README.read_text(encoding="utf-8"), generated), encoding="utf-8")
 

From 365101f3346f528da2f8b505e9bed1d8b3453dfe Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:38:11 +0800
Subject: [PATCH 09/11] docs(speed-bench): parse benchmark metadata as
 dataclass

---
 speed-bench/update_summary.py | 75 ++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 18 deletions(-)

diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
index 2e68815cb..d154dc4d5 100644
--- a/speed-bench/update_summary.py
+++ b/speed-bench/update_summary.py
@@ -16,6 +16,21 @@
 TARGET_CTX = 32768
 
 
+@dataclass(frozen=True)
+class BenchmarkMetadata:
+    csv: str
+    hardware: str
+    backend: str
+    model: str
+    quant: str
+    model_label: str
+    prompt_file: str
+    ctx_start: int
+    ctx_max: int
+    step_incr: int
+    gen_tokens: int
+
+
 @dataclass
 class BenchSummary:
     hardware: str
@@ -28,7 +43,40 @@ class BenchSummary:
     avg_prefill: float
 
 
-def read_metadata() -> dict[str, dict[str, object]]:
+def require_str(item: dict[str, object], field: str) -> str:
+    value = item.get(field)
+    if not isinstance(value, str) or not value:
+        raise SystemExit(f"{METADATA}: benchmark {field} must be a non-empty string")
+    return value
+
+
+def require_int(item: dict[str, object], field: str) -> int:
+    value = item.get(field)
+    if not isinstance(value, int):
+        raise SystemExit(f"{METADATA}: benchmark {field} must be an integer")
+    return value
+
+
+def parse_benchmark_metadata(item: object) -> BenchmarkMetadata:
+    if not isinstance(item, dict):
+        raise SystemExit(f"{METADATA}: benchmark entries must be objects")
+
+    return BenchmarkMetadata(
+        csv=require_str(item, "csv"),
+        hardware=require_str(item, "hardware"),
+        backend=require_str(item, "backend"),
+        model=require_str(item, "model"),
+        quant=require_str(item, "quant"),
+        model_label=require_str(item, "model_label"),
+        prompt_file=require_str(item, "prompt_file"),
+        ctx_start=require_int(item, "ctx_start"),
+        ctx_max=require_int(item, "ctx_max"),
+        step_incr=require_int(item, "step_incr"),
+        gen_tokens=require_int(item, "gen_tokens"),
+    )
+
+
+def read_metadata() -> dict[str, BenchmarkMetadata]:
     try:
         data = json.loads(METADATA.read_text(encoding="utf-8"))
     except FileNotFoundError:
@@ -40,21 +88,12 @@ def read_metadata() -> dict[str, dict[str, object]]:
     if not isinstance(benchmarks, list):
         raise SystemExit(f"{METADATA}: expected a benchmarks list")
 
-    by_csv: dict[str, dict[str, object]] = {}
-    required = {"csv", "hardware", "model_label"}
+    by_csv: dict[str, BenchmarkMetadata] = {}
     for item in benchmarks:
-        if not isinstance(item, dict):
-            raise SystemExit(f"{METADATA}: benchmark entries must be objects")
-        missing = required.difference(item)
-        if missing:
-            missing_list = ", ".join(sorted(missing))
-            raise SystemExit(f"{METADATA}: benchmark entry missing {missing_list}")
-        csv_name = item["csv"]
-        if not isinstance(csv_name, str) or not csv_name:
-            raise SystemExit(f"{METADATA}: benchmark csv must be a non-empty string")
-        if csv_name in by_csv:
-            raise SystemExit(f"{METADATA}: duplicate benchmark metadata for {csv_name}")
-        by_csv[csv_name] = item
+        metadata = parse_benchmark_metadata(item)
+        if metadata.csv in by_csv:
+            raise SystemExit(f"{METADATA}: duplicate benchmark metadata for {metadata.csv}")
+        by_csv[metadata.csv] = metadata
     return by_csv
 
 
@@ -68,7 +107,7 @@ def fmt_tps(value: float | None) -> str:
     return f"{value:.2f}"
 
 
-def read_summary(path: Path, metadata: dict[str, object]) -> BenchSummary:
+def read_summary(path: Path, metadata: BenchmarkMetadata) -> BenchSummary:
     rows = []
     with path.open("r", encoding="utf-8-sig", newline="") as fp:
         reader = csv.DictReader(fp)
@@ -91,8 +130,8 @@ def read_summary(path: Path, metadata: dict[str, object]) -> BenchSummary:
 
     target_row = next((row for row in rows if row["ctx_tokens"] == TARGET_CTX), None)
     return BenchSummary(
-        hardware=str(metadata["hardware"]),
-        model=str(metadata["model_label"]),
+        hardware=metadata.hardware,
+        model=metadata.model_label,
         best_gen=max(row["gen_tps"] for row in rows),
         gen_at_target_ctx=target_row["gen_tps"] if target_row else None,
         avg_gen=sum(row["gen_tps"] for row in rows) / len(rows),

From 66253da1c5676ebc9cd1de098807aa78248d7f71 Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:43:26 +0800
Subject: [PATCH 10/11] docs(speed-bench): use schema for benchmark metadata

---
 speed-bench/README.md              |  2 +
 speed-bench/benchmarks.json        |  1 +
 speed-bench/benchmarks.schema.json | 79 ++++++++++++++++++++++++++++++
 speed-bench/update_summary.py      | 51 +++++--------------
 4 files changed, 95 insertions(+), 38 deletions(-)
 create mode 100644 speed-bench/benchmarks.schema.json

diff --git a/speed-bench/README.md b/speed-bench/README.md
index 2d0844c32..03a6cad7a 100644
--- a/speed-bench/README.md
+++ b/speed-bench/README.md
@@ -18,6 +18,8 @@ Provide PR including your numbers if your hardware was not already tested.
 Call the benchmark csv file something like `m3_max.csv` or alike, so that
 it is clear what hardware was used for the benchmark.
 Record the machine, backend, model, and run parameters in `benchmarks.json`.
+The summary updater validates it with `benchmarks.schema.json` and requires the
+Python `jsonschema` package.
 
 To generate an SVG graph from a CSV file:
 
diff --git a/speed-bench/benchmarks.json b/speed-bench/benchmarks.json
index 5f93bc2a1..6ac690aca 100644
--- a/speed-bench/benchmarks.json
+++ b/speed-bench/benchmarks.json
@@ -1,4 +1,5 @@
 {
+  "$schema": "./benchmarks.schema.json",
   "schema_version": 1,
   "benchmarks": [
     {
diff --git a/speed-bench/benchmarks.schema.json b/speed-bench/benchmarks.schema.json
new file mode 100644
index 000000000..180597901
--- /dev/null
+++ b/speed-bench/benchmarks.schema.json
@@ -0,0 +1,79 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "required": [
+    "schema_version",
+    "benchmarks"
+  ],
+  "additionalProperties": false,
+  "properties": {
+    "$schema": {
+      "type": "string"
+    },
+    "schema_version": {
+      "const": 1
+    },
+    "benchmarks": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": [
+          "csv",
+          "hardware",
+          "backend",
+          "model",
+          "quant",
+          "model_label",
+          "prompt_file",
+          "ctx_start",
+          "ctx_max",
+          "step_incr",
+          "gen_tokens"
+        ],
+        "additionalProperties": false,
+        "properties": {
+          "csv": {
+            "type": "string",
+            "minLength": 1
+          },
+          "hardware": {
+            "type": "string",
+            "minLength": 1
+          },
+          "backend": {
+            "type": "string",
+            "minLength": 1
+          },
+          "model": {
+            "type": "string",
+            "minLength": 1
+          },
+          "quant": {
+            "type": "string",
+            "minLength": 1
+          },
+          "model_label": {
+            "type": "string",
+            "minLength": 1
+          },
+          "prompt_file": {
+            "type": "string",
+            "minLength": 1
+          },
+          "ctx_start": {
+            "type": "integer"
+          },
+          "ctx_max": {
+            "type": "integer"
+          },
+          "step_incr": {
+            "type": "integer"
+          },
+          "gen_tokens": {
+            "type": "integer"
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
index d154dc4d5..04b1f148e 100644
--- a/speed-bench/update_summary.py
+++ b/speed-bench/update_summary.py
@@ -6,12 +6,18 @@
 from dataclasses import dataclass
 from pathlib import Path
 
+try:
+    import jsonschema
+except ImportError as exc:
+    raise SystemExit("python3-jsonschema is required to validate benchmarks.json") from exc
+
 
 BEGIN_MARKER = "<!-- BEGIN GENERATED BENCHMARK SUMMARY -->"
 END_MARKER = "<!-- END GENERATED BENCHMARK SUMMARY -->"
 README = Path(__file__).with_name("README.md")
 BENCH_DIR = Path(__file__).resolve().parent
 METADATA = BENCH_DIR / "benchmarks.json"
+METADATA_SCHEMA = BENCH_DIR / "benchmarks.schema.json"
 REQUIRED_COLUMNS = {"ctx_tokens", "prefill_tps", "gen_tps"}
 TARGET_CTX = 32768
 
@@ -43,39 +49,6 @@ class BenchSummary:
     avg_prefill: float
 
 
-def require_str(item: dict[str, object], field: str) -> str:
-    value = item.get(field)
-    if not isinstance(value, str) or not value:
-        raise SystemExit(f"{METADATA}: benchmark {field} must be a non-empty string")
-    return value
-
-
-def require_int(item: dict[str, object], field: str) -> int:
-    value = item.get(field)
-    if not isinstance(value, int):
-        raise SystemExit(f"{METADATA}: benchmark {field} must be an integer")
-    return value
-
-
-def parse_benchmark_metadata(item: object) -> BenchmarkMetadata:
-    if not isinstance(item, dict):
-        raise SystemExit(f"{METADATA}: benchmark entries must be objects")
-
-    return BenchmarkMetadata(
-        csv=require_str(item, "csv"),
-        hardware=require_str(item, "hardware"),
-        backend=require_str(item, "backend"),
-        model=require_str(item, "model"),
-        quant=require_str(item, "quant"),
-        model_label=require_str(item, "model_label"),
-        prompt_file=require_str(item, "prompt_file"),
-        ctx_start=require_int(item, "ctx_start"),
-        ctx_max=require_int(item, "ctx_max"),
-        step_incr=require_int(item, "step_incr"),
-        gen_tokens=require_int(item, "gen_tokens"),
-    )
-
-
 def read_metadata() -> dict[str, BenchmarkMetadata]:
     try:
         data = json.loads(METADATA.read_text(encoding="utf-8"))
@@ -84,13 +57,15 @@ def read_metadata() -> dict[str, BenchmarkMetadata]:
     except json.JSONDecodeError as exc:
         raise SystemExit(f"{METADATA}: invalid JSON: {exc}") from None
 
-    benchmarks = data.get("benchmarks")
-    if not isinstance(benchmarks, list):
-        raise SystemExit(f"{METADATA}: expected a benchmarks list")
+    schema = json.loads(METADATA_SCHEMA.read_text(encoding="utf-8"))
+    try:
+        jsonschema.validate(data, schema)
+    except jsonschema.ValidationError as exc:
+        raise SystemExit(f"{METADATA}: invalid metadata: {exc.message}") from None
 
     by_csv: dict[str, BenchmarkMetadata] = {}
-    for item in benchmarks:
-        metadata = parse_benchmark_metadata(item)
+    for item in data["benchmarks"]:
+        metadata = BenchmarkMetadata(**item)
         if metadata.csv in by_csv:
             raise SystemExit(f"{METADATA}: duplicate benchmark metadata for {metadata.csv}")
         by_csv[metadata.csv] = metadata

From 3c74bf6a57b813d4077ddb39288f4abdecdd753f Mon Sep 17 00:00:00 2001
From: Bob <dutifulbob@gmail.com>
Date: Mon, 15 Jun 2026 14:46:39 +0800
Subject: [PATCH 11/11] docs(speed-bench): keep metadata parsing stdlib-only

---
 speed-bench/README.md              |  2 -
 speed-bench/benchmarks.json        |  1 -
 speed-bench/benchmarks.schema.json | 79 ------------------------------
 speed-bench/update_summary.py      | 22 ++++-----
 4 files changed, 10 insertions(+), 94 deletions(-)
 delete mode 100644 speed-bench/benchmarks.schema.json

diff --git a/speed-bench/README.md b/speed-bench/README.md
index 03a6cad7a..2d0844c32 100644
--- a/speed-bench/README.md
+++ b/speed-bench/README.md
@@ -18,8 +18,6 @@ Provide PR including your numbers if your hardware was not already tested.
 Call the benchmark csv file something like `m3_max.csv` or alike, so that
 it is clear what hardware was used for the benchmark.
 Record the machine, backend, model, and run parameters in `benchmarks.json`.
-The summary updater validates it with `benchmarks.schema.json` and requires the
-Python `jsonschema` package.
 
 To generate an SVG graph from a CSV file:
 
diff --git a/speed-bench/benchmarks.json b/speed-bench/benchmarks.json
index 6ac690aca..5f93bc2a1 100644
--- a/speed-bench/benchmarks.json
+++ b/speed-bench/benchmarks.json
@@ -1,5 +1,4 @@
 {
-  "$schema": "./benchmarks.schema.json",
   "schema_version": 1,
   "benchmarks": [
     {
diff --git a/speed-bench/benchmarks.schema.json b/speed-bench/benchmarks.schema.json
deleted file mode 100644
index 180597901..000000000
--- a/speed-bench/benchmarks.schema.json
+++ /dev/null
@@ -1,79 +0,0 @@
-{
-  "$schema": "https://json-schema.org/draft/2020-12/schema",
-  "type": "object",
-  "required": [
-    "schema_version",
-    "benchmarks"
-  ],
-  "additionalProperties": false,
-  "properties": {
-    "$schema": {
-      "type": "string"
-    },
-    "schema_version": {
-      "const": 1
-    },
-    "benchmarks": {
-      "type": "array",
-      "items": {
-        "type": "object",
-        "required": [
-          "csv",
-          "hardware",
-          "backend",
-          "model",
-          "quant",
-          "model_label",
-          "prompt_file",
-          "ctx_start",
-          "ctx_max",
-          "step_incr",
-          "gen_tokens"
-        ],
-        "additionalProperties": false,
-        "properties": {
-          "csv": {
-            "type": "string",
-            "minLength": 1
-          },
-          "hardware": {
-            "type": "string",
-            "minLength": 1
-          },
-          "backend": {
-            "type": "string",
-            "minLength": 1
-          },
-          "model": {
-            "type": "string",
-            "minLength": 1
-          },
-          "quant": {
-            "type": "string",
-            "minLength": 1
-          },
-          "model_label": {
-            "type": "string",
-            "minLength": 1
-          },
-          "prompt_file": {
-            "type": "string",
-            "minLength": 1
-          },
-          "ctx_start": {
-            "type": "integer"
-          },
-          "ctx_max": {
-            "type": "integer"
-          },
-          "step_incr": {
-            "type": "integer"
-          },
-          "gen_tokens": {
-            "type": "integer"
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/speed-bench/update_summary.py b/speed-bench/update_summary.py
index 04b1f148e..6639e5ab1 100644
--- a/speed-bench/update_summary.py
+++ b/speed-bench/update_summary.py
@@ -6,18 +6,12 @@
 from dataclasses import dataclass
 from pathlib import Path
 
-try:
-    import jsonschema
-except ImportError as exc:
-    raise SystemExit("python3-jsonschema is required to validate benchmarks.json") from exc
-
 
 BEGIN_MARKER = "<!-- BEGIN GENERATED BENCHMARK SUMMARY -->"
 END_MARKER = "<!-- END GENERATED BENCHMARK SUMMARY -->"
 README = Path(__file__).with_name("README.md")
 BENCH_DIR = Path(__file__).resolve().parent
 METADATA = BENCH_DIR / "benchmarks.json"
-METADATA_SCHEMA = BENCH_DIR / "benchmarks.schema.json"
 REQUIRED_COLUMNS = {"ctx_tokens", "prefill_tps", "gen_tps"}
 TARGET_CTX = 32768
 
@@ -57,15 +51,19 @@ def read_metadata() -> dict[str, BenchmarkMetadata]:
     except json.JSONDecodeError as exc:
         raise SystemExit(f"{METADATA}: invalid JSON: {exc}") from None
 
-    schema = json.loads(METADATA_SCHEMA.read_text(encoding="utf-8"))
     try:
-        jsonschema.validate(data, schema)
-    except jsonschema.ValidationError as exc:
-        raise SystemExit(f"{METADATA}: invalid metadata: {exc.message}") from None
+        benchmarks = data["benchmarks"]
+    except (KeyError, TypeError):
+        raise SystemExit(f"{METADATA}: expected a benchmarks list") from None
+    if not isinstance(benchmarks, list):
+        raise SystemExit(f"{METADATA}: expected a benchmarks list")
 
     by_csv: dict[str, BenchmarkMetadata] = {}
-    for item in data["benchmarks"]:
-        metadata = BenchmarkMetadata(**item)
+    for item in benchmarks:
+        try:
+            metadata = BenchmarkMetadata(**item)
+        except TypeError as exc:
+            raise SystemExit(f"{METADATA}: invalid benchmark metadata: {exc}") from None
         if metadata.csv in by_csv:
             raise SystemExit(f"{METADATA}: duplicate benchmark metadata for {metadata.csv}")
         by_csv[metadata.csv] = metadata