diff --git a/.codex/config.toml b/.codex/config.toml
index 3e0fa368..ffb51eba 100644
--- a/.codex/config.toml
+++ b/.codex/config.toml
@@ -6,124 +6,124 @@ codex_hooks = true
[agents.gsd-advisor-researcher]
description = "Researches a single gray area decision and returns a structured comparison table with rationale. Spawned by discuss-phase advisor mode."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-advisor-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-advisor-researcher.toml"
[agents.gsd-ai-researcher]
description = "Researches a chosen AI framework's official docs to produce implementation-ready guidance — best practices, syntax, core patterns, and pitfalls distilled for the specific use case. Writes the Framework Quick Reference and Implementation Guidance sections of AI-SPEC.md. Spawned by /gsd-ai-integration-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-ai-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-ai-researcher.toml"
[agents.gsd-assumptions-analyzer]
description = "Deeply analyzes codebase for a phase and returns structured assumptions with evidence. Spawned by discuss-phase assumptions mode."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-assumptions-analyzer.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-assumptions-analyzer.toml"
[agents.gsd-code-fixer]
description = "Applies fixes to code review findings from REVIEW.md. Reads source files, applies intelligent fixes, and commits each fix atomically. Spawned by /gsd-code-review-fix."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-code-fixer.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-code-fixer.toml"
[agents.gsd-code-reviewer]
description = "Reviews source files for bugs, security issues, and code quality problems. Produces structured REVIEW.md with severity-classified findings. Spawned by /gsd-code-review."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-code-reviewer.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-code-reviewer.toml"
[agents.gsd-codebase-mapper]
description = "Explores codebase and writes structured analysis documents. Spawned by map-codebase with a focus area (tech, arch, quality, concerns). Writes documents directly to reduce orchestrator context load."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-codebase-mapper.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-codebase-mapper.toml"
[agents.gsd-debug-session-manager]
description = "Manages multi-cycle /gsd-debug checkpoint and continuation loop in isolated context. Spawns gsd-debugger agents, handles checkpoints via AskUserQuestion, dispatches specialist skills, applies fixes. Returns compact summary to main context. Spawned by /gsd-debug command."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-debug-session-manager.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-debug-session-manager.toml"
[agents.gsd-debugger]
description = "Investigates bugs using scientific method, manages debug sessions, handles checkpoints. Spawned by /gsd-debug orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-debugger.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-debugger.toml"
[agents.gsd-doc-verifier]
description = "Verifies factual claims in generated docs against the live codebase. Returns structured JSON per doc."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-doc-verifier.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-doc-verifier.toml"
[agents.gsd-doc-writer]
description = "Writes and updates project documentation. Spawned with a doc_assignment block specifying doc type, mode (create/update/supplement), and project context."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-doc-writer.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-doc-writer.toml"
[agents.gsd-domain-researcher]
description = "Researches the business domain and real-world application context of the AI system being built. Surfaces domain expert evaluation criteria, industry-specific failure modes, regulatory context, and what \"good\" looks like for practitioners in this field — before the eval-planner turns it into measurable rubrics. Spawned by /gsd-ai-integration-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-domain-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-domain-researcher.toml"
[agents.gsd-eval-auditor]
description = "Retroactive audit of an implemented AI phase's evaluation coverage. Checks implementation against the AI-SPEC.md evaluation plan. Scores each eval dimension as COVERED/PARTIAL/MISSING. Produces a scored EVAL-REVIEW.md with findings, gaps, and remediation guidance. Spawned by /gsd-eval-review orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-eval-auditor.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-eval-auditor.toml"
[agents.gsd-eval-planner]
description = "Designs a structured evaluation strategy for an AI phase. Identifies critical failure modes, selects eval dimensions with rubrics, recommends tooling, and specifies the reference dataset. Writes the Evaluation Strategy, Guardrails, and Production Monitoring sections of AI-SPEC.md. Spawned by /gsd-ai-integration-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-eval-planner.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-eval-planner.toml"
[agents.gsd-executor]
description = "Executes GSD plans with atomic commits, deviation handling, checkpoint protocols, and state management. Spawned by execute-phase orchestrator or execute-plan command."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-executor.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-executor.toml"
[agents.gsd-framework-selector]
description = "Presents an interactive decision matrix to surface the right AI/LLM framework for the user's specific use case. Produces a scored recommendation with rationale. Spawned by /gsd-ai-integration-phase and /gsd-select-framework orchestrators."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-framework-selector.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-framework-selector.toml"
[agents.gsd-integration-checker]
description = "Verifies cross-phase integration and E2E flows. Checks that phases connect properly and user workflows complete end-to-end."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-integration-checker.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-integration-checker.toml"
[agents.gsd-intel-updater]
description = "Analyzes codebase and writes structured intel files to .planning/intel/."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-intel-updater.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-intel-updater.toml"
[agents.gsd-nyquist-auditor]
description = "Fills Nyquist validation gaps by generating tests and verifying coverage for phase requirements"
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-nyquist-auditor.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-nyquist-auditor.toml"
[agents.gsd-pattern-mapper]
description = "Analyzes codebase for existing patterns and produces PATTERNS.md mapping new files to closest analogs. Read-only codebase analysis spawned by /gsd-plan-phase orchestrator before planning."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-pattern-mapper.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-pattern-mapper.toml"
[agents.gsd-phase-researcher]
description = "Researches how to implement a phase before planning. Produces RESEARCH.md consumed by gsd-planner. Spawned by /gsd-plan-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-phase-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-phase-researcher.toml"
[agents.gsd-plan-checker]
description = "Verifies plans will achieve phase goal before execution. Goal-backward analysis of plan quality. Spawned by /gsd-plan-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-plan-checker.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-plan-checker.toml"
[agents.gsd-planner]
description = "Creates executable phase plans with task breakdown, dependency analysis, and goal-backward verification. Spawned by /gsd-plan-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-planner.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-planner.toml"
[agents.gsd-project-researcher]
description = "Researches domain ecosystem before roadmap creation. Produces files in .planning/research/ consumed during roadmap creation. Spawned by /gsd-new-project or /gsd-new-milestone orchestrators."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-project-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-project-researcher.toml"
[agents.gsd-research-synthesizer]
description = "Synthesizes research outputs from parallel researcher agents into SUMMARY.md. Spawned by /gsd-new-project after 4 researcher agents complete."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-research-synthesizer.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-research-synthesizer.toml"
[agents.gsd-roadmapper]
description = "Creates project roadmaps with phase breakdown, requirement mapping, success criteria derivation, and coverage validation. Spawned by /gsd-new-project orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-roadmapper.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-roadmapper.toml"
[agents.gsd-security-auditor]
description = "Verifies threat mitigations from PLAN.md threat model exist in implemented code. Produces SECURITY.md. Spawned by /gsd-secure-phase."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-security-auditor.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-security-auditor.toml"
[agents.gsd-ui-auditor]
description = "Retroactive 6-pillar visual audit of implemented frontend code. Produces scored UI-REVIEW.md. Spawned by /gsd-ui-review orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-ui-auditor.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-ui-auditor.toml"
[agents.gsd-ui-checker]
description = "Validates UI-SPEC.md design contracts against 6 quality dimensions. Produces BLOCK/FLAG/PASS verdicts. Spawned by /gsd-ui-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-ui-checker.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-ui-checker.toml"
[agents.gsd-ui-researcher]
description = "Produces UI-SPEC.md design contract for frontend phases. Reads upstream artifacts, detects design system state, asks only unanswered questions. Spawned by /gsd-ui-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-ui-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-ui-researcher.toml"
[agents.gsd-user-profiler]
description = "Analyzes extracted session messages across 8 behavioral dimensions to produce a scored developer profile with confidence levels and evidence. Spawned by profile orchestration workflows."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-user-profiler.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-user-profiler.toml"
[agents.gsd-verifier]
description = "Verifies phase goal achievement through goal-backward analysis. Checks codebase delivers what phase promised, not just that tasks completed. Creates VERIFICATION.md report."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-verifier.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-verifier.toml"
diff --git a/.planning/MILESTONES.md b/.planning/MILESTONES.md
index 8e62151b..82b359eb 100644
--- a/.planning/MILESTONES.md
+++ b/.planning/MILESTONES.md
@@ -1,5 +1,72 @@
# Project Milestones: EMEL
+## v1.27 Ryzen AVX2/FMA Kernel Support (Shipped: 2026-06-25)
+
+**Phases completed:** 6 phases, 6 plans, 0 tasks
+
+**Key accomplishments:**
+
+- Added a source-backed x86_64 host feature contract for AVX2, FMA, and F16C
+ while explicitly no-claiming AVX-512, AVX-VNNI, AMX, BF16, native FP16, GPU,
+ and broader x86 feature families.
+
+- Added EMEL-owned AVX2/FMA/F16C flash-attention execution for supported
+ x86_64 requests with deterministic shared fallback/no-claim behavior.
+
+- Added EMEL-owned AVX2/FMA q2_K, q3_K, and q6_K x q8_K hot-path kernels with
+ block-native operand flow, no whole-tensor f32 dequant substitution, and
+ allocation-free supported dispatch proof.
+
+- Integrated the optimized x86_64 path through the maintained generator ->
+ graph -> processor -> kernel chain and paritychecker attribution for `1`,
+ `10`, `100`, and `1000` token generation.
+
+- Published truthful `kernel_x86_64` benchmark evidence with counter-checked
+ optimized flash and q2/q3/q6 rows separated from shared/scalar/reference
+ lanes.
+
+**Audit:** Final source-backed audit passed with 13/13 requirements satisfied.
+The initial `XBN-01` benchmark attribution gap was closed before archive, and
+the x86_64 unary SML rule debt was removed from the milestone path.
+
+---
+
+## v1.26 I/O Staged Read Loading Strategy (Shipped: 2026-05-08)
+
+**Phases completed:** 12 phases, 12 plans, 0 tasks
+
+**Key accomplishments:**
+
+- Established the canonical `src/emel/io/staged_read` Stateforward.SML strategy
+ actor under the existing `emel/io` boundary.
+
+- Modeled source span, target window, stage sizing, platform/resource, and
+ validation behavior through explicit guards/transitions before accepting
+ staged copy work.
+
+- Implemented deterministic source-span staged copy semantics with monotonic
+ progress, explicit terminal success, and named deterministic error categories.
+
+- Integrated staged loading through public tensor-to-I/O dispatch while keeping
+ `model/tensor` as the sole tensor load/bind/evict/residency owner.
+
+- Kept maintained loader, benchmark, paritychecker, and embedded probe lanes on
+ public runtime contracts with no actor-internal reach-through.
+
+- Closed the source-backed audit by repairing direct tensor staged-load
+ nonzero-offset source-window behavior and reconciling closeout artifact plus
+ embedded-probe reporting truth.
+
+**Audit:** Final source-backed audit passed with all active requirements
+satisfied after Phase 237 and Phase 238 gap closure. `ESG-02B` remains
+deferred/future because real file open/seek/read and per-stage short-read
+taxonomy requires a separately approved file-backed staged-read source path.
+
+**Known deferred items at close:** `ESG-02B` plus the carried-forward deferred
+items listed in STATE.md.
+
+---
+
## v1.25 I/O Read Loading Strategy (Shipped: 2026-05-06)
**Phases completed:** 16 phases, 21 plans, 12 tasks
diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md
index bfda3a07..69dfda88 100644
--- a/.planning/PROJECT.md
+++ b/.planning/PROJECT.md
@@ -16,61 +16,83 @@ before widening API surface or model scope.
## Current State
-Current milestone: `v1.26 I/O Staged Read Loading Strategy` (GitHub issue #63)
-
-Latest shipped milestone: `v1.25 I/O Read Loading Strategy`
-
-Status: `v1.25` shipped on 2026-05-06 after Phase 225 review-fix cleanup and a
-refreshed source-backed milestone audit passed. The repo now ships the issue #62 read/copy
-strategy path beneath the existing `src/emel/io` boundary while preserving
-tensor-owned residency. `src/emel/io/read` is the canonical read/copy actor,
-`model/tensor` owns the target buffer and residency commit, and maintained
-loader/tool lanes select/report read/copy through public runtime surfaces.
-Maintained source-byte loading for benchmark, paritychecker, and embedded probe
-evidence now uses the public `emel::io::source::load_file_bytes` setup-time
-contract instead of actor-internal `io/read/detail.hpp`. Phase 224 also
-confirmed Phase 214 is historical, clarified the direct `request_read_load`
-coverage shape, and captured fresh passing `emel_tests_io` evidence before
-archive.
-
-Planning for `v1.26` (issue #63) adds a bounded `src/emel/io/staged_read`
-Stateforward.SML actor for constrained-memory chunked/staged tensor loads,
-integrated through the existing tensor-to-I/O boundary from issue #60, without
-moving tensor residency ownership out of `model/tensor` and without cooperative
-coroutine scheduling unless separately approved.
-
-## Current Milestone: v1.26 I/O Staged Read Loading Strategy
+Current milestone: none.
+
+Latest shipped milestone: `v1.27 Ryzen AVX2/FMA Kernel Support`
+
+Status: `v1.27` shipped on 2026-06-25 for this host CPU, an AMD Ryzen 9 5950X.
+The practical native feature contract is x86_64 AVX2 plus FMA, with F16C
+conversion support only. Phases 239-244 are verified for the host contract,
+optimized flash attention, q2_K/q3_K, q6_K, allocation-free quantized hot-path
+contract, runtime integration, maintained parity attribution, and truthful
+benchmark publication. Approved snapshot updates landed for the
+`kernel_x86_64` benchmark baseline and the maintained LFM2 generation
+publication baselines. The source-backed milestone audit passed after repairing
+the `XBN-01` optimized benchmark attribution gap and removing the x86_64 unary
+SML rule debt; the scoped quality gate passed after those updates.
+
+## Latest Shipped Milestone: v1.27 Ryzen AVX2/FMA Kernel Support
+
+**Goal:** Bring the maintained x86_64 runtime path on this Ryzen host up to the
+same standard as the earlier NEON/AArch64 support: EMEL-owned AVX2/FMA flash
+and quantized hot-path kernels, explicit runtime attribution, maintained parity
+proof, and truthful benchmark publication.
+
+**Source:** User request on 2026-06-25: "add support for this processor exactly
+how NEON was added"; local host inspection reports AMD Ryzen 9 5950X with
+AVX2, FMA, and F16C, and without AVX-512/VNNI/AMX/BF16/native FP16.
+
+**Target features:**
+- x86_64 host feature contract that detects and publishes AVX2, FMA, and F16C
+ availability while explicitly marking unsupported feature families as no-claim.
+- Host-tuned x86_64 build/config support analogous to the AArch64 host-feature
+ switch, without requiring unsupported AVX-512, AVX-VNNI, AMX, BF16, or native
+ FP16 instructions.
+- EMEL-owned AVX2/FMA flash-attention implementation for supported x86_64
+ requests, with deterministic fallback/no-claim behavior for unsupported
+ shapes and operand contracts.
+- EMEL-owned AVX2/FMA `q2_K/q3_K/q6_K x q8_K` hot-path kernels using the same
+ effective operand class as the reference path, with zero hot-path allocation
+ and no whole-tensor dequantize-to-f32 substitution.
+- Runtime integration through the shipped generator -> graph -> processor ->
+ kernel chain, with counters/attribution, maintained `1/10/100/1000` parity
+ proof, and source-backed tests for supported and fallback behavior.
+- Benchmark and documentation publication that truthfully distinguishes x86_64
+ Ryzen evidence from ARM-first claims and from reference-lane results.
+
+**Archive:** `.planning/milestones/v1.27-ROADMAP.md`
+**Requirements:** `.planning/milestones/v1.27-REQUIREMENTS.md`
+**Audit:** `.planning/milestones/v1.27-MILESTONE-AUDIT.md`
+
+## Previous Shipped Milestone: v1.26 I/O Staged Read Loading Strategy
**Goal:** Add a dedicated `io/staged_read` Stateforward.SML strategy actor under
`src/emel/io` so tensor-owned model loading can request bounded staging/chunked
-read residency into caller-owned target memory through the public I/O boundary,
-without folding mmap, full-span single-shot read/copy internals, async
-cooperative scheduling, or device strategy behavior into this issue.
+read residency into caller-owned target memory through the public I/O boundary.
**Source:** GitHub issue #63, "Add io/staged_read state machine for
-constrained-memory tensor loading" (depends on issue #60 boundary; follows
-v1.24 mmap and v1.25 read/copy strategy milestones).
+constrained-memory tensor loading"
-**Target features:**
-- Dedicated `src/emel/io/staged_read` machine: events, guards, actions, context,
- errors, and public aliases following `AGENTS.md` / `docs/rules/sml.rules.md`
- (destination-first transitions, no dispatch-local context handoff, explicit
- guard-modeled validation and chunk/stage policy).
-- Tensor-to-I/O integration that lets `model/tensor` request staged read loading
- through the public `emel/io` boundary while `model/tensor` remains the
- residency lifecycle owner; the staged strategy never takes residency
- ownership of the target tensor buffer.
-- Deterministic multi-stage progress: explicit success, chunk/short-read,
- validation, platform-unsupported, and file errors surfaced through states and
- events without hidden behavior selection in actions or `detail` helpers.
-- RTC-safe externalization of blocking filesystem work per project conventions;
- bounded transient resources per stage; no handle pool retained across dispatch
- boundaries beyond what prior I/O actors allow.
-- Maintained tests, docs, lint snapshots, benchmark snapshots, and model
- artifacts updated from maintained commands when required; public reporting
- reflects actual staged-read runtime usage.
-
-## Latest Shipped Milestone: v1.25 I/O Read Loading Strategy
+**Shipped:** 2026-05-08
+
+**Delivered:**
+- Added the canonical `src/emel/io/staged_read` Stateforward.SML actor with
+ guard-modeled source span, target window, stage sizing, and platform/resource
+ validation.
+- Implemented deterministic source-span staged copy semantics with monotonic
+ forward progress, explicit terminal success, and named deterministic errors.
+- Preserved tensor-owned residency: `model/tensor` dispatches through public
+ `emel/io` events and remains the load/bind/evict/residency owner.
+- Kept maintained loader, benchmark, paritychecker, and embedded probe lanes on
+ public runtime contracts, with no reach-through into actor internals.
+- Closed the source-backed audit by repairing direct tensor staged-load
+ nonzero-offset behavior and reconciling closeout artifact/reporting truth.
+
+**Audit:** Final source-backed audit passed with all active requirements
+satisfied. `ESG-02B` remains deferred/future for a separately approved
+file-backed staged-read source path.
+
+## Previous Shipped Milestone: v1.25 I/O Read Loading Strategy
**Goal:** Add a dedicated `io/read` Stateforward.SML strategy actor under `src/emel/io`
so tensor-owned model loading can request explicit read/copy residency into a
@@ -446,12 +468,23 @@ truth anchor and without broadening into generic Liquid-family support.
### Active
-- v1.26 defines scoped staged/chunked constrained-memory loading under
- `src/emel/io/staged_read` with tensor-owned residency (see
- `.planning/REQUIREMENTS.md` and `.planning/ROADMAP.md`).
+- v1.27 defines scoped x86_64 Ryzen AVX2/FMA kernel support that mirrors the
+ earlier NEON/AArch64 optimization path: host feature contract, flash
+ attention, quantized hot-path kernels, maintained runtime/parity proof, and
+ benchmark attribution (see `.planning/REQUIREMENTS.md` and
+ `.planning/ROADMAP.md`).
### Recently Validated
+- v1.26 added the dedicated `src/emel/io/staged_read` Stateforward.SML strategy
+ actor for bounded staged/chunked source-span reads under tensor-owned
+ residency.
+- v1.26 proved staged copy progress, explicit success/failure outcomes, public
+ tensor-to-I/O integration, maintained loader/tool publication truth, and
+ non-regression guardrails for shipped mmap and bulk read/copy strategies.
+- v1.26 intentionally deferred real file open/seek/read and per-stage short-read
+ taxonomy (`ESG-02B`) until a future file-backed staged-read source path is
+ approved.
- v1.25 added a dedicated `src/emel/io/read` Stateforward.SML strategy actor for read/copy
tensor loading.
- v1.25 integrated read-backed residency requests through the public tensor-to-I/O boundary
@@ -461,9 +494,6 @@ truth anchor and without broadening into generic Liquid-family support.
helpers.
- v1.25 kept mmap changes, staged/chunked read policy, device-specific loading, cooperative
async loading, new model families, and broad public API expansion out of scope.
-- v1.26 is the dedicated follow-on for staged/chunked constrained-memory reads; it keeps
- cooperative coroutine scheduling and device-specific strategies out of scope unless
- separately approved and must not regress shipped mmap or bulk `io/read` semantics.
### Validated
@@ -622,9 +652,11 @@ truth anchor and without broadening into generic Liquid-family support.
## Context
This remains a brownfield repository with an existing codebase map under `.planning/codebase/`.
-The repo stays governed by `AGENTS.md` and `docs/rules/sml.rules.md`. `v1.25` is
-the latest shipped I/O milestone; `v1.26` plans constrained-memory staged reads
-below the same boundary. Earlier shipped work includes quality gate optimization
+The repo stays governed by `AGENTS.md` and `docs/rules/sml.rules.md`. `v1.26` is
+the latest shipped I/O milestone; `v1.27` returns to native kernel performance
+work on this x86_64 Ryzen host by mirroring the earlier NEON/AArch64 progression
+for flash, quantized kernels, runtime proof, and benchmark attribution. Earlier
+shipped work includes quality gate optimization
(`v1.21`) with manifest-backed selective runners,
conservative fallback, and parallel lane reporting. The current maintained state includes repo-owned
EMEL generation, embedding, diarization, and Whisper ASR lanes plus pluggable parity and benchmark
@@ -636,7 +668,7 @@ mandatory validation or change benchmark/parity semantics. `v1.22` shipped from
shipped from issue #60 and added the missing `emel/io` orchestration boundary under tensor-owned
residency while deferring concrete strategy machines to follow-on milestones (mmap #61,
read/copy #62, staged read #63). `v1.24` shipped mmap; `v1.25` shipped bulk read/copy. `v1.26`
-owns constrained-memory staged reads under issue #63.
+shipped constrained-memory staged reads under issue #63.
## Constraints
@@ -681,12 +713,17 @@ owns constrained-memory staged reads under issue #63.
`src/emel/io/staged_read` beneath tensor-owned residency. It must not regress shipped mmap or
bulk read/copy strategy machines, introduce cooperative coroutine scheduling, add device-specific
strategies, or move tensor residency ownership out of `model/tensor`.
+- **x86_64 Ryzen kernel scope**: `v1.27` targets this host's AVX2/FMA feature set with F16C
+ conversion only. It must not claim AVX-512, AVX-VNNI, AMX, BF16, native FP16 arithmetic, or GPU
+ acceleration, and must not introduce broad public API widening or dequantize-to-f32 hot-path
+ substitution unless a future user-approved milestone explicitly changes that performance contract.
## Key Decisions
| Decision | Rationale | Outcome |
|----------|-----------|---------|
-| Start v1.26 from GitHub issue #63 as the `io/staged_read` constrained-memory milestone | v1.25 shipped bulk read/copy; constrained-memory staging is the next narrow strategy slice under tensor-owned residency and the issue #60 boundary | ⏳ Planned |
+| Start v1.27 as Ryzen AVX2/FMA kernel support | The user asked to add support for this processor exactly how NEON was added; this host is an AMD Ryzen 9 5950X with AVX2/FMA/F16C but no AVX-512/VNNI/AMX/BF16/native FP16, so the milestone mirrors the NEON flash/quantized/runtime/benchmark progression for x86_64 | ✓ Shipped (Phases 239-244 verified; source-backed milestone audit passed) |
+| Start v1.26 from GitHub issue #63 as the `io/staged_read` constrained-memory milestone | v1.25 shipped bulk read/copy; constrained-memory staging is the next narrow strategy slice under tensor-owned residency and the issue #60 boundary | ✓ Shipped |
| Start v1.25 from GitHub issue #62 as the `io/read` loading strategy milestone | v1.24 shipped the mmap strategy and left read/copy as the next narrow concrete strategy path beneath tensor-owned residency | ✓ Shipped |
| Start v1.24 from GitHub issue #61 as the `io/mmap` loading strategy milestone | v1.23 established the `emel/io` strategy boundary and explicitly deferred concrete mmap behavior; issue #61 is the next narrow strategy path to land beneath tensor-owned residency | ✓ Shipped |
| Start v1.23 from GitHub issue #60 as the `emel/io` boundary milestone | v1.22 moved tensor residency ownership into `model/tensor`; the next architecture step is the explicit I/O strategy seam beneath tensor-owned residency before concrete mmap or staged strategy work lands | Phase 203 closeout cleanup |
@@ -745,4 +782,4 @@ This document evolves at phase transitions and milestone boundaries.
4. Update Context with current state
---
-*Last updated: 2026-05-07 after starting v1.26 I/O staged read loading strategy milestone (issue #63)*
+*Last updated: 2026-06-25 after shipping v1.27 Ryzen AVX2/FMA kernel support milestone*
diff --git a/.planning/RETROSPECTIVE.md b/.planning/RETROSPECTIVE.md
index 657360cd..d9c6c587 100644
--- a/.planning/RETROSPECTIVE.md
+++ b/.planning/RETROSPECTIVE.md
@@ -2,6 +2,50 @@
*A living document updated after each milestone. Lessons feed forward into future planning.*
+## Milestone: v1.27 - Ryzen AVX2/FMA Kernel Support
+
+**Shipped:** 2026-06-25
+**Phases:** 6 | **Plans:** 6 | **Sessions:** autonomous execution, source-backed audit, and closeout
+
+### What Was Built
+
+- x86_64 host feature contract for AVX2, FMA, and F16C on the Ryzen 9 5950X.
+- EMEL-owned AVX2/FMA/F16C flash-attention path with explicit fallback/no-claim behavior.
+- EMEL-owned AVX2/FMA q2_K/q3_K/q6_K x q8_K hot-path kernels.
+- Maintained generator and paritychecker attribution proving optimized x86_64 dispatch.
+- `kernel_x86_64` benchmark publication with counter-checked optimized flash and q2/q3/q6 rows.
+
+### What Worked
+
+- The source-backed audit caught a real benchmark-publication gap after phase artifacts looked green.
+- Counter checks in benchmarks made optimized/shared attribution mechanically enforceable.
+- Keeping x86_64 routing in explicit SML guards/transitions made the unary rule-debt repair small.
+
+### What Was Inefficient
+
+- The first benchmark snapshot update only covered common x86_64 rows and had to be repaired before closeout.
+- The quality gate coverage shard was slow under coverage instrumentation, so closeout needed long-running observation.
+
+### Patterns Established
+
+- Benchmark parity claims need counter-backed maintained entries for each optimized lane, not only suite-level presence.
+- x86_64 support should mirror NEON by proving host contract, native kernels, runtime attribution, parity, and publication as one slice.
+- Pre-close audits should separate current milestone blockers from historical backlog artifacts before archiving.
+
+### Key Lessons
+
+1. Artifact agreement is not enough for benchmark truth; source entrypoints and counters must match the claim.
+2. Runtime behavior selection debt can survive in helper APIs even when production transitions are explicit.
+3. Snapshot updates need a second source-backed pass when they create a new benchmark suite.
+
+### Cost Observations
+
+- Model mix: not measured.
+- Sessions: one long autonomous closeout session with an integration-checker agent.
+- Notable: `commit_docs=false` left archive and planning changes local instead of committing them.
+
+---
+
## Milestone: v1.25 - I/O Read Loading Strategy
**Shipped:** 2026-05-06
diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index 8c2e8bfb..3ff6e628 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -2,773 +2,52 @@
## Milestones
-- ✅ **v1.0 EMEL Llama-68M Generation Slice** — shipped 2026-03-08
-- ✅ **v1.1 EMEL Llama-68M Generation Benchmark** — shipped 2026-03-11
-- ✅ **v1.2 Flash Attention** — shipped 2026-03-22
-- ✅ **v1.3 ARM Flash Optimizations** — shipped 2026-03-22
-- ✅ **v1.4 Full Vectorized Quantized Kernels** — shipped 2026-03-25
-- ✅ **v1.5 Full ARM Quantized Path** — shipped 2026-03-27
-- ✅ **v1.6 Qwen3-0.6B Parity And Benchmark** — shipped 2026-03-30
-- ✅ **v1.7 Generator Prefill Submachine Decomposition** — shipped 2026-03-30
-- ✅ **v1.8 Truthful Qwen3 E2E Embedded Size** — shipped 2026-04-02
-- ✅ **v1.9 Liquid LFM2.5-1.2B Thinking ARM Slice** — shipped 2026-04-02
-- ✅ **v1.11 TE-75M GGUF Trimodal Embedding Runtime** — shipped 2026-04-15
-- ✅ **v1.12 Pluggable Reference Parity Bench Architecture** — shipped 2026-04-18
-- ✅ **v1.13 Pluggable Generative Parity Bench** — shipped 2026-04-21
-- ✅ **v1.14 Benchmark Variant Organization** — shipped 2026-04-21
-- ✅ **v1.15 ARM Sortformer Diarization GGUF Slice** — shipped 2026-04-25
-- ✅ **v1.16 ARM Whisper GGUF Parity And Performance** — shipped 2026-04-28
-- ✅ **v1.17 Text Generator Domain Alignment** — shipped 2026-04-30
-- ✅ **v1.18 Parity Tool Boundary Refactor** — shipped 2026-05-01
-- ✅ **v1.19 Benchmark Tool Pluggable Runner Refactor** — shipped 2026-05-01
-- ✅ **v1.20 SML Dependency And Namespace Migration** — shipped 2026-05-02
-- ✅ **v1.21 Quality Gate Selective Runner Optimization** — shipped 2026-05-02
-- ✅ **v1.22 Weight Loading Ownership Cutover** — shipped 2026-05-03
-- ✅ **v1.23 I/O Loading Strategy Boundary** — shipped 2026-05-04
-- ✅ **v1.24 I/O Mmap Loading Strategy** — shipped 2026-05-04 (Phases 204-211)
-- ✅ **v1.25 I/O Read Loading Strategy** — shipped 2026-05-06 (Phases 212-226 + 214.1)
-- ✅ **v1.26 I/O Staged Read Loading Strategy** — completed 2026-05-08
- (12 / 12 phases complete; issue #63; `ESG-02B` deferred/future)
-
-## Phases
-
-### ✅ v1.26 I/O Staged Read Loading Strategy (Phases 227-238) — COMPLETE 2026-05-08
-
-Source: GitHub issue #63, "Add io/staged_read state machine for constrained-memory tensor loading".
-Adds `src/emel/io/staged_read` for bounded chunked/windowed reads under tensor-owned residency.
-Depends on the tensor-to-I/O boundary from issue #60. Cooperative coroutine scheduling is out of
-scope unless separately approved. Shipped mmap (`io/mmap`) and bulk read/copy (`io/read`) must not
-regress.
-
-Execution order: 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238.
-
-**Milestone progress (v1.26):** **12 / 12** phases recorded **Complete** in the table below.
-The source-backed milestone audit found a direct tensor staged-load offset-contract gap plus
-closeout artifact debt; Phases 237-238 closed those gaps. `ESG-02B` remains deferred/future
-because file-backed staged-read source ownership is out of scope.
-
-- [x] Phase 227: Staged Read Strategy Component Boundary (STG-01)
-- [x] Phase 228: Span, Target-Window, and Platform Gating (STG-02, STG-03, PLAT-02)
-- [x] Phase 229: Staged Copy Progress and Completion Semantics (STG-04, STG-05, STG-06)
-- [x] Phase 230: Context Cleanness and Per-Attempt Lifetime (STG-07, LIFE-02, SNR-01)
-- [x] Phase 231: Deterministic Error Taxonomy (ESG-01, ESG-02A, ESG-03, ESG-04; ESG-02B deferred)
-- [x] Phase 232: Tensor-Owned Integration Graph (TNX-01, TNX-02, TNX-03, TNX-04)
-- [x] Phase 233: Public Loader and Maintained Entrypoints (PUB-01, PUB-02, PUB-03, PUB-04, PUB-05)
-- [x] Phase 234: Public Dispatch Tests (TST-01, TST-02)
-- [x] Phase 235: Scope and Non-Regression Guardrails (GRD-01, GRD-02, GRD-03, GRD-04, GRD-05)
-- [x] Phase 236: Publication and Evidence Truthfulness (DOC-01, LNT-01, BNH-01, EVI-01)
-- [x] Phase 237: Direct Tensor Staged Offset Contract Repair (TNX-01, TNX-03, TNX-04, TST-01, TST-02)
-- [x] Phase 238: Audit Artifact and Probe Reporting Cleanup (cleanup-only)
-
-#### Phase 227: Staged Read Strategy Component Boundary
-
-**Goal:** Locate canonical `src/emel/io/staged_read` with standard I/O component layout.
-**Depends on:** Phase 226
-**Requirements:** STG-01
-
-**Success criteria:**
-
-1. `src/emel/io/staged_read` exists with canonical `emel::io::staged_read::sm` alias.
-2. Component scope excludes mmap, device transfer, or cooperative async runtime.
-3. Initial fail-closed or smoke dispatch proves actors are wired like sibling I/O strategies.
-
-#### Phase 228: Span, Target-Window, and Platform Gating
-
-**Goal:** All staged preconditions enforced in guards/transitions before any file work.
-**Depends on:** Phase 227
-**Requirements:** STG-02, STG-03, PLAT-02
-
-**Success criteria:**
-
-1. Invalid source staging contract rejected solely via guard-modeled transitions.
-2. Invalid target window/layout rejected solely via guard-modeled transitions.
-3. Unsupported hosts/resources fail closed with explicit unsupported terminal shape.
-
-#### Phase 229: Staged Copy Progress and Completion Semantics
-
-**Goal:** Prove per-stage deterministic copy plus full-span monotone completion.
-**Depends on:** Phase 228
-**Requirements:** STG-04, STG-05, STG-06
-
-**Success criteria:**
-
-1. Test vectors observe correct bytes per staged window.
-2. Completeness tests cover entire logical span order.
-3. Terminal success aligns with copied full span per contract.
-
-#### Phase 230: Context Cleanness and Per-Attempt Lifetime
-
-**Goal:** Bounded handles and residency clarity for the staged actor.
-**Depends on:** Phase 229
-**Requirements:** STG-07, LIFE-02, SNR-01
-
-**Success criteria:**
-
-1. Static or dynamic review shows zero forbidden dispatch-local context mirrors.
-2. Handle lifetime tests/tools show release-before-done semantics.
-3. Tests confirm strategy never asserts tensor residency commits.
-
-#### Phase 231: Deterministic Error Taxonomy
-
-**Goal:** Errors are categorical, observable, exception-free.
-**Depends on:** Phase 230
-**Requirements:** ESG-01, ESG-02A, ESG-03, ESG-04 (`ESG-02B` deferred)
-
-**Success criteria:**
-
-1. At least one doctest per taxonomy family (pre-I/O guard, source-contract read-surface, sequencing/contract) demonstrates deterministic categories through `process_event(...)`.
-2. Source-backed docs explicitly defer `ESG-02B` file open/seek/read + per-stage short-read categories until approved file-backed staged-read ownership exists.
-3. ABI boundary scans show noexcept expectations for surfaced API.
-
-#### Phase 232: Tensor-Owned Integration Graph
-
-**Goal:** Integrate staged loads through explicit tensor+I/O graphs.
-**Depends on:** Phase 231
-**Requirements:** TNX-01, TNX-02, TNX-03, TNX-04
-
-**Closeout ledger (verified):** Manager-scoped **`scripts/quality_gates.sh`** for Phase 232
-changed-file corpus exited **2** (red — **not** exit 0). **`232-VERIFICATION.md`** records **bench_snapshot**
-suite regressions unrelated to staged tensor-integration files and a **paritychecker** failure outside
-Phase 232 scope. Phase 232 completion is ledger-approved **without** claiming a passing full-repo gate run.
-
-**Success criteria:**
-
-1. Requests flow only via public tensors↔IO events.
-2. Residency proofs remain tensor-owned (`model/tensor` retains lifecycle ownership).
-3. Success/failure each have explicit observable terminal representations.
-
-#### Phase 233: Public Loader and Maintained Entrypoints
-
-**Goal:** Strategies observable without actor detail reach-through or duplicate POSIX loops in tools.
-**Depends on:** Phase 232
-**Requirements:** PUB-01, PUB-02, PUB-03, PUB-04, PUB-05
-
-**Closeout (2026-05-08):** **`PUB-01`–`PUB-05`** satisfied per **`233-VERIFICATION.md`** (manager validation +
-**phase233-navigator final review PASS**). Public **`staged_read`** access is through **`io::loader`** and maintained
-tool entrypoints with **`io_staged_read`** wiring; **`tests/model/loader/lifecycle_tests.cpp`** covers the
-storage-backed **`staged_read`** route and include guards.
-
-**Residual:** **`scripts/quality_gates.sh`** was **not** run on a Phase **233** changed-file corpus in
-this closeout slice — **no Phase 233 scoped gate pass is claimed** (full-repo gate truth unchanged from
-Phase **232** ledger where applicable).
-
-**Success criteria:**
-
-1–4. Each lane (loader/bench/parity/probe) has independent proof of public-contract-only access.
-5. Source scan enforcement or doctest proves no duplicated unconstrained staged read shim in tools.
-
-#### Phase 234: Public Dispatch Tests
-
-**Goal:** Core success/failure behavior demonstrated through `process_event`.
-**Depends on:** Phase 233
-**Requirements:** TST-01, TST-02
-
-**Success criteria:**
-
-1. Passing success-path doctest with `visit_current_states` or equivalent.
-2. Passing failure-path doctest for guard rejection.
-
-#### Phase 235: Scope and Non-Regression Guardrails
-
-**Goal:** Freeze architecture invariants relative to loaders, mmap, and read strategies.
-**Depends on:** Phase 234
-**Requirements:** GRD-01, GRD-02, GRD-03, GRD-04, GRD-05
-
-**Success criteria:** Each of GRD-01, GRD-02, GRD-03, GRD-04, and GRD-05 has either a deterministic script failure mode or a narrowed regression doctest proving the invariant holds.
-
-#### Phase 236: Publication and Evidence Truthfulness
-
-**Goal:** Align docs and frozen artifacts with real staged/runtime usage.
-**Depends on:** Phase 235
-**Requirements:** DOC-01, LNT-01, BNH-01, EVI-01
-
-**Success criteria:**
-
-1. Doc diff review verifies accurate staged-read wording.
-2. Lint snapshot regeneration path documented/passing.
-3. Benchmark snapshot regeneration obeys policy.
-4. Parity/compare metadata never mislabels unstaged workloads as staged.
-
-**Closeout (2026-05-08):** **`DOC-01`–`EVI-01`** satisfied per
-**`236-VERIFICATION.md`**. Serial full quality gate passed:
-`EMEL_QUALITY_GATES_SCOPE=full EMEL_QUALITY_GATES_PARALLEL=0 scripts/quality_gates.sh`
-(exit **0**, ended `2026-05-08T21:21:42.028Z`). Benchmark defaults now use bounded routine
-settings (`100` iterations, `3` runs, `10` warmup iterations) with bounded generation and
-diarization defaults.
-
-#### Phase 237: Direct Tensor Staged Offset Contract Repair
-
-**Goal:** Repair direct `model/tensor` staged-load nonzero-offset source-window behavior and prove it through public dispatch.
-**Depends on:** Phase 236
-**Requirements:** TNX-01, TNX-03, TNX-04, TST-01, TST-02
-**Gap Closure:** Closes `.planning/v1.26-MILESTONE-AUDIT.md` findings
-`direct-tensor-staged-offset-contract` and `direct-tensor-staged-nonzero-offset`.
-
-**Success Criteria:**
-
-1. A public `model/tensor::event::request_staged_load` doctest fails before repair and passes after
- repair for a nonzero `file_offset` against a whole-file source buffer.
-2. Direct tensor staged-load source-span construction is aligned with `io/loader` or the
- pre-windowed-source contract is explicitly documented and enforced by validation/tests.
-3. Direct tensor staged-load success and failure outcomes remain explicit `_done` / `_error`
- publications through public `process_event(...)` dispatch and SML state inspection.
-4. Changed-file quality gates for `model/tensor`, `io/staged_read`, and affected tests pass without
- benchmark-regression override.
-5. If implementation changes maintained model or snapshot artifacts, those artifacts are refreshed
- only through maintained workflows; model artifact updates are approved for this gap-closure work.
-
-**Closeout (2026-05-08):** Phase 237 completed with a failing-first public
-`request_staged_load` nonzero-offset doctest, repaired source-window dispatch in
-`model/tensor`, and passing scoped validation:
-`./build/emel_tests_bin --test-case="model_tensor_request_staged_load_*"`,
-`ctest --test-dir build -R '^emel_tests_model_and_batch$' --output-on-failure`,
-and changed-file `scripts/quality_gates.sh` (exit `0`). Reopened requirements
-`TNX-01`, `TNX-03`, `TNX-04`, `TST-01`, and `TST-02` are satisfied by
-`237-VERIFICATION.md`.
-
-#### Phase 238: Audit Artifact and Probe Reporting Cleanup
-
-**Goal:** Reconcile audit artifacts and probe reporting truth after the Phase 237 source repair.
-**Depends on:** Phase 237
-**Requirements:** none — cleanup-only; all reopened requirement closure belongs to Phase 237
-**Gap Closure:** Closes `.planning/v1.26-MILESTONE-AUDIT.md` tech-debt items for missing
-`requirements-completed` SUMMARY frontmatter and embedded-size probe reporting clarity.
-
-**Success Criteria:**
-
-1. Phase summaries for 232–236 expose accurate `requirements-completed` frontmatter or an explicit
- cleanup rationale so the three-source audit matrix no longer needs manual reconciliation.
-2. Embedded-size probe evidence either prints the executed load strategy when appropriate or the
- maintained docs/audit explain why captured `used_io_strategy` is the authoritative evidence
- surface.
-3. REQUIREMENTS, ROADMAP, STATE, and the milestone audit are refreshed from source-backed evidence
- after Phase 237.
-4. Focused lint/docs/audit commands pass; no maintained benchmark, model, or snapshot artifact is
- updated unless the implementation actually requires it.
-
-**Closeout (2026-05-08):** Phase 238 completed summary frontmatter reconciliation,
-embedded probe reporting truth documentation, and refreshed `v1.26-MILESTONE-AUDIT.md`
-to `status: passed`. Changed-file `scripts/quality_gates.sh` passed with no benchmark,
-coverage, parity, fuzz, or docsgen-affecting lanes required.
-
----
-### ✅ v1.25 I/O Read Loading Strategy (Phases 212-226 + 214.1) — SHIPPED 2026-05-06
-
-Source: GitHub issue #62, "Add io/read state machine for copy-based tensor loading".
-Adds a dedicated `src/emel/io/read` Stateforward.SML actor for explicit read/copy tensor
-loading beneath tensor-owned residency. Mmap, staged/chunked constrained-memory, async,
-and device strategies remain out of scope.
-
-- [x] Phase 212: Read Strategy Component Boundary (1/1 plans) — completed 2026-05-05
-- [x] Phase 213: Read Validation and Platform Gating (1/1 plans) — completed 2026-05-05
-- [x] Phase 214: Read Execution, Errors, and Lifetime (1/1 plans) — completed 2026-05-05; audit found RTC compliance gap
-- [x] Phase 214.1: RTC-Safe Read Execution Boundary Repair (1/1 plans) — gap closure
-- [x] Phase 215: Tensor-Owned Read Integration (1/1 plans) — completed 2026-05-05
-- [x] Phase 216: Public Runtime and Evidence Surfaces (1/1 plans) — completed 2026-05-05
-- [x] Phase 217: Behavior Tests and Scope Guardrails (1/1 plans) — completed 2026-05-05
-- [x] Phase 218: Publication and Maintained Artifact Updates (1/1 plans) — completed 2026-05-05
-- [x] Phase 219: Maintained Read Source Provenance (1/1 plans) — completed
- 2026-05-05; source-backed benchmark/parity/probe read_copy provenance
-- [x] Phase 220: Explicit Tensor Read Outcome Graph (1/1 plans) — completed
- 2026-05-05; tensor read outcomes selected by explicit same-RTC result graph
-- [x] Phase 221: Read Closeout Truth Reconciliation — superseded planning stub
- closed 2026-05-06; Phase 223 owns final closeout
-- [x] Phase 222: Public Read Source Contract Repair (1/1 plans) — completed
- 2026-05-06; actor-detail reach-through removed from maintained lanes
-- [x] Phase 223: Read Closeout Truth And Validation Reconciliation (1/1 plans) —
- completed 2026-05-06; final closeout truth and validation reconciled
-- [x] Phase 224: Read Closeout Tech Debt Cleanup — completed 2026-05-06;
- refreshed audit ambiguity closed with fresh passing `emel_tests_io` evidence
-- [x] Phase 225: Read Closeout Runtime Validation And SML Repair — completed
- 2026-05-06; refreshed source-backed audit gaps closed with dyld fallback evidence
-- [x] Phase 226: Read Batch Cap And Closeout Evidence Refresh — completed
- 2026-05-06; refreshed audit tech debt closed
-
-Archived closeout artifacts:
-- `.planning/milestones/v1.25-ROADMAP.md`
-- `.planning/milestones/v1.25-REQUIREMENTS.md`
-- `.planning/milestones/v1.25-MILESTONE-AUDIT.md`
-- `.planning/milestones/v1.25-phases/`
-
-**Execution Order:** Phases execute in numeric order:
-212 -> 213 -> 214 -> 214.1 -> 215 -> 216 -> 217 -> 218 -> 219 -> 220 -> 222 -> 223 -> 224 -> 225 -> 226.
-Phase 221 is a completed superseded closeout planning stub and Phase 223 owns final
-source-backed closeout truth. Phase 224 is cleanup-only; Phase 225 owns the refreshed
-2026-05-06 audit gaps before archive. Phase 226 closes the post-audit nonblocking
-tech-debt items before final closeout.
-
-#### Phase 212: Read Strategy Component Boundary
-**Goal**: Maintainers can identify `io/read` as the canonical read/copy strategy actor under
-`src/emel/io`.
-**Depends on**: Phase 211
-**Requirements**: READ-01
-**Success Criteria** (what must be TRUE):
- 1. Maintainer can inspect `src/emel/io/read` and find component-local `context`, `events`,
- `guards`, `actions`, `errors`, and `sm` ownership.
- 2. Maintainer can use canonical `emel::io::read::sm` ownership and public aliases without
- reaching into actor internals.
- 3. Maintainer can confirm the component is read/copy-only and contains no mmap, staged or
- chunked constrained-memory, cooperative async, device-specific, loader-owned byte access,
- model-family widening, or tool-only read scaffold behavior.
-**Plans**: 01 — Validated 2026-05-05; established canonical `io/read` boundary actor
-and lifecycle tests.
-
-#### Phase 213: Read Validation and Platform Gating
-**Goal**: The read actor accepts read attempts only after explicit request, platform, file,
-offset, length, layout, and target-buffer preconditions pass.
-**Depends on**: Phase 212
-**Requirements**: READ-02, PLAT-01
-**Success Criteria** (what must be TRUE):
- 1. Caller sees invalid request, file, offset, length, layout, or target-buffer preconditions
- rejected before any open or read attempt is accepted.
- 2. Caller sees unsupported platforms and unsupported file/resource shapes fail closed
- deterministically through the I/O abstraction boundary.
- 3. Maintainer can inspect SML guards and transitions and see validation outcomes modeled
- before the open/read attempt.
- 4. Supported requests reach a read-attempt state only after all read preconditions are true.
-**Plans**: 01 — Validated 2026-05-05; added explicit read validation and platform
-gating before the read-attempt placeholder.
-
-#### Phase 214: Read Execution, Errors, and Lifetime
-**Goal**: Successful read requests deliver deterministic copied bytes into the caller-owned
-target buffer with deterministic transient-resource lifetime and deterministic failure
-outcomes, without taking tensor residency ownership.
-**Depends on**: Phase 213
-**Requirements**: READ-03, LIFE-01, ERR-01
-**Success Criteria** (what must be TRUE):
- 1. Caller receives a deterministic copied-bytes outcome on success with the requested byte
- span written into the caller-provided owned target buffer; the read strategy never claims
- residency ownership.
- 2. Read failures surface deterministic error categories (invalid request, unsupported
- resource, unsupported platform, file open failed, file seek failed, file read failed,
- short read, internal error) instead of thrown exceptions or ambiguous status mirroring.
- 3. Transient OS resources (file descriptor / handle) are released through the actor-owned
- attempt before `_done` is published; no kernel handle is held across publication.
- 4. Maintainer can verify dispatch-local request data is not stored in `read::context` and
- tensor residency semantics remain owned by `model/tensor`.
-**Plans**: 01 — Validated 2026-05-05; added concrete read execution, copied-byte
-success, deterministic read errors, and close-before-done lifetime behavior.
-2026-05-05 milestone audit found this phase superseded by unverified Phase 214.1 repair
-work; Phase 214.1 owns source-backed RTC validation and artifact reconciliation.
-
-#### Phase 214.1: RTC-Safe Read Execution Boundary Repair
-**Goal**: The read actor preserves copied-byte success, deterministic errors, and
-close-before-done lifetime evidence without performing blocking or input-dependent
-filesystem work inside SML dispatch.
-**Depends on**: Phase 214
-**Requirements**: READ-03, PLAT-01, LIFE-01, ERR-01
-**Gap Closure**: Closes v1.25 audit gaps for missing Phase 214.1 artifacts, stale Phase
-214 planning truth, and source-backed Nyquist validity after the read actor moved to
-caller-provided source spans.
-**Success Criteria** (what must be TRUE):
- 1. `src/emel/io/read` no longer calls platform open, seek, read, close, or equivalent
- filesystem APIs from guards, actions, transition helpers, or functions called by them.
- 2. The read actor still accepts only validated read/copy attempts and publishes copied-byte
- `_done` plus deterministic `_error` outcomes through explicit states/events.
- 3. The caller-owned target buffer remains caller-owned, dispatch-local request data is not
- stored in `read::context`, and no transient OS handle is retained or hidden in context.
- 4. Tests prove the repaired behavior through public `process_event(...)` dispatch and SML
- state inspection, including validation failure, unsupported/resource failure, read
- failure, short read, and copied-byte success.
- 5. Phase 214.1 SUMMARY.md, VERIFICATION.md, and VALIDATION.md reconcile ROADMAP.md,
- STATE.md, REQUIREMENTS.md, and generated architecture docs with the source-buffer based
- implementation and do not claim maintained benchmark/parity evidence.
-**Plans**: 01 — Validated 2026-05-05; reconciled read actor evidence with the
-source-buffer based implementation, confirmed no dispatch-time filesystem calls, and
-updated requirement/state artifacts for the Phase 214.1 gap closure.
-
-#### Phase 215: Tensor-Owned Read Integration
-**Goal**: `model/tensor` can request and consume read-backed I/O through the public `emel/io`
-boundary while retaining load, bind, evict, and residency orchestration.
-**Depends on**: Phase 214.1
-**Requirements**: TIO-01, TIO-02
-**Gap Closure**: Closes v1.25 audit gaps for partial tensor-owned read integration and
-callback/status-mediated read outcomes.
-**Success Criteria** (what must be TRUE):
- 1. Tensor load flow can request read-based (copy) loading through public `emel/io` events
- without direct low-level read calls.
- 2. Tensor bind, residency, and evict transitions remain in `model/tensor` and consume read
- success outcomes that reference the caller-owned target buffer.
- 3. Read success, unsupported, validation failure, file open failure, and file read failure
- are visible as explicit `_done` and `_error` events or states.
- 4. Maintainer can verify no callback-selected outcomes, mirrored status fields, or context
- phase flags decide tensor-to-I/O outcomes for read-backed loading.
- 5. Existing source/test progress through `model/loader -> model/tensor -> io/loader ->
- io/read -> tensor apply` is preserved or replaced by a stricter explicit outcome path
- with equivalent public-dispatch tests.
-**Plans**: 01 — Validated 2026-05-05; added tensor-owned
-`request_read_load` public events, explicit read outcome states, and tests for read
-success, unsupported I/O actor, validation failure, file open failure, and file read
-failure.
-
-#### Phase 216: Public Runtime and Evidence Surfaces
-**Goal**: Runtime entrypoints and maintained tool lanes can select or report read-backed
-loading only through public surfaces, and evidence reflects the actual EMEL runtime path.
-**Depends on**: Phase 215
-**Requirements**: TIO-03, VAL-04
-**Gap Closure**: Closes v1.25 audit gaps for maintained benchmark, paritychecker, and
-embedded probe lanes bypassing the read-backed runtime path and for runtime reporting that
-currently exposes only mmap usage.
-**Success Criteria** (what must be TRUE):
- 1. `model/loader`, maintained benchmark lanes, paritychecker lanes, and embedded probes can
- select or report read-backed loading only through public tensor and I/O runtime contracts.
- 2. Maintained benchmark, paritychecker, and embedded probe lanes avoid actor-internal
- reach-through and contain no low-level read logic.
- 3. Benchmark and parity output reports read-strategy usage only when the EMEL lane executed
- the read-backed runtime path.
- 4. Unsupported or fallback behavior is reported as unsupported or non-read-strategy, not as
- read-strategy parity or performance evidence.
- 5. Runtime done/error evidence distinguishes mmap, read/copy, unsupported, and non-I/O
- loading paths without relying on tool-only scaffolds.
-**Plans**: 01 — Validated 2026-05-05; added public model-loader load-strategy
-evidence, maintained tool strategy binding, load-strategy output notes, and
-source-backed tests proving benchmark/parity/embedded lanes avoid callback-time
-actor reach-through.
-
-#### Phase 217: Behavior Tests and Scope Guardrails
-**Goal**: Tests and guardrails prove read behavior through public dispatch and prevent scope
-or ownership leaks.
-**Depends on**: Phase 216
-**Requirements**: VAL-01, VAL-02
-**Gap Closure**: Closes v1.25 audit gaps for missing full-scope read behavior tests,
-domain/source guardrails, and former ambiguous read-strategy naming relative to the
-out-of-scope staged/chunked policy.
-**Success Criteria** (what must be TRUE):
- 1. Doctests drive supported read behavior through `process_event(...)` and inspect SML states
- via `visit_current_states` and/or `is(...)`.
- 2. Doctests cover representative unsupported, validation failure, file open failure, and file
- read failure outcomes through public events.
- 3. Guardrails fail if read implementation leaks into `model/loader` or tensor residency
- ownership moves out of `model/tensor`.
- 4. Guardrails fail if mmap, staged or chunked constrained-memory, cooperative async,
- device-specific, model-family widening, loader-owned byte access, or tool-only read
- scaffold behavior enters this milestone.
- 5. Source guardrails clarify or eliminate any public naming that could present the v1.25
- read/copy path as staged/chunked constrained-memory support.
-**Plans**: 01 — Validated 2026-05-05; renamed the copy strategy to
-`read_copy`, added public-dispatch behavior guardrails, tensor-residency ownership
-guardrails, and maintained tool/model-loader no-reach-through source checks.
-
-#### Phase 218: Publication and Maintained Artifact Updates
-**Goal**: Maintained docs, snapshots, benchmark outputs, model artifacts, and planning truth
-describe read-strategy support exactly as implemented.
-**Depends on**: Phase 217
-**Requirements**: VAL-03
-**Gap Closure**: Closes v1.25 audit gaps for stale planning truth, stale generated docs,
-and missing maintained artifact updates. User approved updating snapshots, benchmarks, and
-models as needed during this gap closure command.
-**Success Criteria** (what must be TRUE):
- 1. Public docs and generated architecture docs describe the read/copy strategy path,
- ownership boundaries, and deferred strategies (mmap shipped in v1.24; staged/async/device
- remain deferred) truthfully.
- 2. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
- from maintained commands when the implementation changes them.
- 3. Planning artifacts record final requirement coverage, validation evidence, and any
- approved artifact updates for v1.25.
- 4. Closeout artifacts do not claim read-strategy support beyond source-backed maintained
- runtime behavior.
- 5. Any snapshot, benchmark, or model artifact changes are produced by maintained commands
- and explicitly tied to source-backed read/copy runtime behavior.
-**Plans**: 01 — Validated 2026-05-05; updated public docs, README template,
-generated architecture docs, benchmark snapshots, planning truth, and final closeout
-audit from maintained commands. The closing full quality gate passed with
-`EMEL_QUALITY_GATES_SCOPE=full EMEL_QUALITY_GATES_PARALLEL=never
-scripts/quality_gates.sh`.
-
-#### Phase 219: Maintained Read Source Provenance
-**Goal**: Maintained benchmark, paritychecker, and embedded probe lanes prove read/copy
-strategy usage from a maintained `src`-owned source contract instead of tool-local full-file
-read scaffolds.
-**Depends on**: Phase 218
-**Requirements**: PLAT-01, TIO-03, VAL-04
-**Gap Closure**: Closes v1.25 audit gaps where generation, Sortformer diarization,
-embedded probe, and paritychecker lanes report `read_copy` after tool-local
-`read_file_bytes` helpers create the source span.
-**Success Criteria** (what must be TRUE):
- 1. Maintained benchmark, paritychecker, and embedded probe lanes no longer own low-level
- file slurp helpers as the source of `read_copy` evidence.
- 2. A maintained `src`-owned loading/source contract feeds `model/loader -> model/tensor ->
- io/loader -> io/read` for read/copy tool evidence.
- 3. `read_copy` benchmark/parity/probe output is emitted only when the EMEL lane actually
- consumed the maintained source contract and executed the public runtime path.
- 4. Unsupported or fallback source behavior is reported as unsupported or non-read-strategy,
- never as read-strategy parity or performance evidence.
- 5. Tests and source guardrails fail on tool-local substitutes for the maintained read/copy
- source path.
-
-#### Phase 220: Explicit Tensor Read Outcome Graph
-**Goal**: Tensor-owned read/copy integration exposes success and failure outcomes through
-explicit state/event routing without callback/status-mediated behavior selection.
-**Depends on**: Phase 219
-**Requirements**: TIO-02
-**Gap Closure**: Closes v1.25 audit gap where `model/tensor` represents final outcomes
-with explicit states/events but still uses callback-mutated runtime status inspected by
-guards to select the read outcome path.
-**Success Criteria** (what must be TRUE):
- 1. `model/tensor` read success, unsupported, validation failure, file open failure, and
- file read failure outcomes are selected by explicit guards/transitions over typed
- same-RTC events, not by callback-mutated status fields.
- 2. Any same-RTC callbacks used for immediate replies do not decide which tensor outcome
- path runs.
- 3. No mirrored status fields, context phase flags, or callback-selected outcomes remain in
- the read-backed tensor outcome path.
- 4. Public doctests prove all representative read success and error outcomes through
- `process_event(...)` and SML state inspection.
-
-#### Phase 221: Read Closeout Truth Reconciliation
-**Goal**: Maintained docs, generated architecture docs, planning artifacts, snapshots,
-benchmark outputs, model artifacts, and the milestone audit describe read/copy support
-exactly as implemented after gap closure.
-**Depends on**: Phase 220
-**Requirements**: superseded by Phase 223
-**Gap Closure**: Closes v1.25 audit gap where closeout artifacts overstated maintained
-read/copy path truth while tool-local source spans still fed the reported lane. User
-approved updating model artifacts, snapshots, and benchmarks as needed during this gap
-closure command.
-**Success Criteria** (what must be TRUE):
- 1. Public docs, generated architecture docs, ROADMAP, REQUIREMENTS, STATE, PROJECT,
- MILESTONES, and the milestone audit describe the maintained read/copy path truthfully.
- 2. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
- from maintained commands when implementation changes require it.
- 3. Phase 214 historical artifacts are reconciled or explicitly marked superseded so they no
- longer conflict with the Phase 214.1 source-buffer truth.
- 4. A source-backed milestone audit passes without relying on tool-only source scaffolds.
- 5. The closing quality gate is run with the appropriate full or changed-file scope and no
- benchmark-regression override unless explicitly documented as transitional.
-**Plans**: 01 — Ready only. 2026-05-06 audit found an additional source-contract
-blocker in Phase 219/216 maintained lanes, so Phase 221 is superseded by the
-Phase 222 source-contract repair and Phase 223 closeout truth plan.
-**Summary**: Superseded 2026-05-06 with no source or requirement claims.
-
-#### Phase 222: Public Read Source Contract Repair
-**Goal**: Maintained benchmark, paritychecker, and embedded probe lanes obtain read/copy
-source bytes through an allowed public or non-actor-internal EMEL-owned contract instead of
-including `emel/io/read/detail.hpp`.
-**Depends on**: Phase 220
-**Requirements**: PLAT-01, TIO-03, VAL-02, VAL-04
-**Gap Closure**: Closes v1.25 audit gaps where maintained lanes replaced tool-local
-`read_file_bytes` helpers with direct actor-detail reach-through, causing paritychecker
-guardrails and maintained read/copy evidence to fail.
-**Success Criteria** (what must be TRUE):
- 1. Maintained generation, Sortformer diarization, embedded probe, and paritychecker lanes
- no longer include or call `emel/io/read/detail.hpp` or any actor `detail.hpp` helper for
- benchmark/parity source loading.
- 2. Source-byte loading for maintained read/copy evidence is exposed through an allowed
- EMEL-owned public/runtime/setup contract that does not violate the actor model,
- benchmark/parity harness rules, or `detail.hpp` ownership rules.
- 3. Maintained lanes still report `read_copy` only when the EMEL lane executes the public
- `model/loader -> model/tensor -> io/loader -> io/read` runtime path.
- 4. Guardrails fail on actor-internal reach-through, tool-local read substitutes, and any
- unsupported fallback reported as read/copy evidence.
- 5. Focused paritychecker and maintained generation evidence passes without benchmark
- regression override.
-**Plans**: 01 — Validated 2026-05-06; moved maintained source-byte loading to
-`emel::io::source::load_file_bytes`, removed `io/read/detail.hpp` reach-through
-from maintained lanes, and restored paritychecker/generation guardrail evidence.
-
-#### Phase 223: Read Closeout Truth And Validation Reconciliation
-**Goal**: Final v1.25 closeout truth, generated artifacts, snapshots, benchmark outputs,
-model artifacts, requirements, roadmap state, and milestone audit reflect the post-Phase 222
-maintained read/copy runtime path.
-**Depends on**: Phase 222
-**Requirements**: TIO-02, VAL-01, VAL-03
-**Gap Closure**: Closes v1.25 audit gaps for stale Phase 220 roadmap state, unvalidated
-Phase 221/VAL-03 closeout truth, dyld-blocked test rerun evidence, and final source-backed
-milestone audit truth.
-**Success Criteria** (what must be TRUE):
- 1. ROADMAP, REQUIREMENTS, STATE, PROJECT, MILESTONES, public docs, generated architecture
- docs, and the milestone audit no longer claim stale Phase 218/221 closeout truth.
- 2. Phase 220 progress-table state is reconciled with its completed SUMMARY,
- VERIFICATION, and VALIDATION artifacts.
- 3. Public behavior doctests and maintained guardrails are rerun or the dyld/libSystem launch
- blocker is explicitly captured with source-backed substitute evidence approved for the
- phase.
- 4. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
- only through maintained commands when the repaired implementation changes them.
- 5. A source-backed milestone audit reports every active v1.25 requirement satisfied, with
- no actor-detail reach-through or tool-only maintained-path evidence.
-**Plans**: 01 — Validated 2026-05-06; reconciled final planning truth, generated
-docs checks, lint snapshot checks, public-dispatch doctests, paritychecker
-guardrails, maintained generation compare evidence, repaired batch planner
-benchmark evidence, the full closeout quality gate, and the source-backed
-milestone audit.
-
-#### Phase 224: Read Closeout Tech Debt Cleanup
-**Goal**: Close the nonblocking tech-debt items from the refreshed v1.25 milestone audit
-before archive.
-**Depends on**: Phase 223
-**Requirements**: none — all v1.25 requirements remain satisfied; this phase is cleanup only
-**Gap Closure**: Addresses audit tech debt without resetting any validated requirement:
-historical Phase 214 supersession noise, public tensor read event maintained-lane coverage shape,
-and fresh `emel_tests_io` evidence after the local dyld/libSystem launch blocker is resolved.
-**Success Criteria** (what must be TRUE):
- 1. Phase 214 historical artifacts are either further reconciled or explicitly confirmed as
- intentionally superseded by Phase 214.1 without creating closeout ambiguity.
- 2. Maintainers can tell whether `model::tensor::event::request_read_load` should gain a
- maintained direct-lane coverage path or remain a public tested route while maintained
- model-loader lanes use `model/tensor` plan/apply plus `io/loader -> io/read`.
- 3. Fresh `emel_tests_io` evidence is captured from a healthy local environment, or the
- dyld/libSystem launch blocker is captured with an explicit archive-time decision.
- 4. The milestone audit is rerun and either passes or reports only explicitly accepted
- nonblocking debt.
-**Plans**: 01 — Validated 2026-05-06; Phase 214 supersession clarity,
-`request_read_load` maintained-lane decision evidence, fresh passing
-`emel_tests_io` evidence, and final milestone audit refresh.
-
-#### Phase 225: Read Closeout Runtime Validation And SML Repair
-**Goal**: Close refreshed v1.25 audit gaps by restoring executable model/batch validation,
-moving maintained read/copy per-tensor I/O orchestration out of model-loader action loops,
-and reconciling closeout artifact paths.
-**Depends on**: Phase 224
-**Requirements**: VAL-01, TIO-03, VAL-04, VAL-03
-**Gap Closure**: Closes `.planning/v1.25-MILESTONE-AUDIT.md` findings: current
-`emel_tests_model_and_batch` dyld launch failure, model-loader action-loop
-`io_loader->process_event(...)` SML readiness risk, and stale archived closeout path
-references.
-**Success Criteria** (what must be TRUE):
- 1. `ctest --test-dir build/zig --output-on-failure -R emel_tests_model_and_batch`
- runs to completion or the dyld/libSystem launch blocker is eliminated with a
- source-backed maintained substitute explicitly recorded in validation.
- 2. Maintained read/copy `model/loader -> io/loader` orchestration no longer relies on an
- action loop calling `io_loader->process_event(...)`; runtime choice and per-phase
- orchestration are represented with explicit SML guards/states/transitions.
- 3. The maintained read/copy path still reports `used_io_strategy` only after public
- runtime execution through `model/loader -> model/tensor -> io/loader -> io/read`.
- 4. Closeout artifact paths in active and archived roadmap/requirements/audit docs point
- at files that exist after the v1.25 archive layout.
- 5. Focused model-loader, model/tensor, io/loader, io/read, domain-boundary, consistency,
- and changed-file quality gates pass without benchmark-regression override.
-**Plans**: 6 plans — completed 2026-05-06
-Plans:
-- [x] `225-01-PLAN.md` — Add the owning `io/read` batch copy surface and public-dispatch tests.
-- [x] `225-02-PLAN.md` — Route one `io/loader` read_copy batch to `io/read` with same-RTC result callbacks.
-- [x] `225-03-PLAN.md` — Replace model-loader per-tensor I/O dispatch with one public batch dispatch.
-- [x] `225-04-PLAN.md` — Wire maintained callers and guardrails to request-owned `io_load_spans`.
-- [x] `225-05-PLAN.md` — Reconcile active and archived closeout path and plan traceability.
-- [x] `225-06-PLAN.md` — Publish validation, summary, and active/archived audit evidence.
-
-#### Phase 226: Read Batch Cap And Closeout Evidence Refresh
-**Goal**: Close the nonblocking tech-debt items from `.planning/v1.25-MILESTONE-AUDIT.md`
-by bounding the public read/copy batch API independently and refreshing closeout evidence
-to match current executable validation.
-**Depends on**: Phase 225
-**Requirements**: none — all v1.25 requirements remain satisfied; this phase is cleanup only
-**Gap Closure**: Closes audit tech debt for the uncapped public `io/read`
-`read_tensor_batch` span and stale dyld-fallback closeout wording after current focused
-CTest passed.
-**Success Criteria** (what must be TRUE):
- 1. Public `io/read::event::read_tensor_batch` dispatch rejects over-large spans before
- iterating or copying, with the cap owned by a public/read-side contract rather than
- relying only on maintained model-loader callers.
- 2. Doctests prove accepted boundary-size batches and rejected over-large batches through
- public `process_event(...)` dispatch and SML state inspection.
- 3. Active and archived closeout evidence distinguishes historical dyld fallback evidence
- from current direct `build/zig` focused CTest evidence.
- 4. If the repaired implementation changes maintained snapshots, benchmark outputs,
- benchmark snapshots, or model artifacts, those artifacts are updated only through
- maintained commands. User permission for those updates was granted with this phase.
- 5. Changed-file quality gates pass without benchmark-regression override, and the
- refreshed milestone audit reports no blockers.
-**Plans**: 01 — Validated 2026-05-06; public `io/read` batch cap added,
-exact-cap and over-cap doctests passed, closeout evidence refreshed, and
-changed-file quality gate passed.
-
-#### Coverage
-
-| Requirement | Phase |
-|-------------|-------|
-| READ-01 | Phase 212 |
-| READ-02 | Phase 213 |
-| PLAT-01 | Phase 222 |
-| READ-03 | Phase 214.1 |
-| LIFE-01 | Phase 214.1 |
-| ERR-01 | Phase 214.1 |
-| TIO-01 | Phase 215 |
-| TIO-02 | Phase 223 |
-| TIO-03 | Phase 225 |
-| VAL-04 | Phase 225 |
-| VAL-01 | Phase 225 |
-| VAL-02 | Phase 222 |
-| VAL-03 | Phase 225 |
-
-Mapped: 13/13 v1 requirements; validated 13, pending 0. Phases 224 and 226 are
-cleanup-only; Phase 225 closed refreshed closeout gaps for VAL-01, TIO-03, VAL-04,
-and VAL-03.
-
-
-✅ v1.24 I/O Mmap Loading Strategy (Phases 204-211) — SHIPPED 2026-05-04
-
-- [x] Phase 204: Mmap Strategy Component Boundary (1/1 plans) — completed 2026-05-04
-- [x] Phase 205: Mmap Validation and Platform Gating (1/1 plans) — completed 2026-05-04
-- [x] Phase 206: Mapped Descriptor, Errors, and Lifetime (1/1 plans) — completed 2026-05-04
-- [x] Phase 207: Tensor-Owned Mmap Integration (1/1 plans) — completed 2026-05-04
-- [x] Phase 208: Public Runtime and Evidence Surfaces (1/1 plans) — completed 2026-05-04
-- [x] Phase 209: Behavior Tests and Scope Guardrails (1/1 plans) — completed 2026-05-04
-- [x] Phase 210: Publication and Maintained Artifact Updates (1/1 plans) — completed 2026-05-04
-- [x] Phase 211: Phase Verification Artifact Backfill (1/1 plans) — completed 2026-05-04 (gap closure)
-
-Archive:
-- `.planning/milestones/v1.24-ROADMAP.md`
-- `.planning/milestones/v1.24-REQUIREMENTS.md`
-- `.planning/milestones/v1.24-MILESTONE-AUDIT.md`
-- `.planning/milestones/v1.24-phases/{204..210}-*` (Phase 211 backfill artifacts live alongside their parent phase dirs)
-
-
-
-
-✅ v1.23 I/O Loading Strategy Boundary (Phases 197-203) — SHIPPED 2026-05-04
-
-Archive:
-- `.planning/milestones/v1.23-ROADMAP.md`
-- `.planning/milestones/v1.23-REQUIREMENTS.md`
-- `.planning/milestones/v1.23-MILESTONE-AUDIT.md`
-- `.planning/milestones/v1.23-phases/`
-
-
-
-### 📋 Milestone backlog
-
-Older “next milestone” staging notes are superseded by **v1.26** (issue #63) in active planning
-artifacts (`REQUIREMENTS.md`, `STATE.md`). Future milestones after v1.26 continue via
-`$gsd-new-milestone`.
-
-## Progress
-
-| Phase | Milestone | Plans Complete | Status | Completed |
-|-------|-----------|----------------|--------|-----------|
-| 227. Staged Read Strategy Component Boundary | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 228. Span, Target-Window, and Platform Gating | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 229. Staged Copy Progress and Completion Semantics | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 230. Context Cleanness and Per-Attempt Lifetime | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 231. Deterministic Error Taxonomy | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 232. Tensor-Owned Integration Graph | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 233. Public Loader and Maintained Entrypoints | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 234. Public Dispatch Tests | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 235. Scope and Non-Regression Guardrails | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 236. Publication and Evidence Truthfulness | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 237. Direct Tensor Staged Offset Contract Repair | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 238. Audit Artifact and Probe Reporting Cleanup | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 212. Read Strategy Component Boundary | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 213. Read Validation and Platform Gating | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 214. Read Execution, Errors, and Lifetime | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 214.1. RTC-Safe Read Execution Boundary Repair | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 215. Tensor-Owned Read Integration | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 216. Public Runtime and Evidence Surfaces | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 217. Behavior Tests and Scope Guardrails | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 218. Publication and Maintained Artifact Updates | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 219. Maintained Read Source Provenance | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 220. Explicit Tensor Read Outcome Graph | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 221. Read Closeout Truth Reconciliation | v1.25 | 1/1 | Superseded | 2026-05-06 |
-| 222. Public Read Source Contract Repair | v1.25 | 1/1 | Validated | 2026-05-06 |
-| 223. Read Closeout Truth And Validation Reconciliation | v1.25 | 1/1 | Validated | 2026-05-06 |
-| 224. Read Closeout Tech Debt Cleanup | v1.25 | 1/1 | Complete | 2026-05-06 |
-| 225. Read Closeout Runtime Validation And SML Repair | v1.25 | 6/6 | Complete | 2026-05-06 |
-| 226. Read Batch Cap And Closeout Evidence Refresh | v1.25 | 1/1 | Validated | 2026-05-06 |
-| 204. Mmap Strategy Component Boundary | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 205. Mmap Validation and Platform Gating | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 206. Mapped Descriptor, Errors, and Lifetime | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 207. Tensor-Owned Mmap Integration | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 208. Public Runtime and Evidence Surfaces | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 209. Behavior Tests and Scope Guardrails | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 210. Publication and Maintained Artifact Updates | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 211. Phase Verification Artifact Backfill | v1.24 | 1/1 | Complete | 2026-05-04 |
+- [x] **v1.0 EMEL Llama-68M Generation Slice** - shipped 2026-03-08
+- [x] **v1.1 EMEL Llama-68M Generation Benchmark** - shipped 2026-03-11
+- [x] **v1.2 Flash Attention** - shipped 2026-03-22
+- [x] **v1.3 ARM Flash Optimizations** - shipped 2026-03-22
+- [x] **v1.4 Full Vectorized Quantized Kernels** - shipped 2026-03-25
+- [x] **v1.5 Full ARM Quantized Path** - shipped 2026-03-27
+- [x] **v1.6 Qwen3-0.6B Parity And Benchmark** - shipped 2026-03-30
+- [x] **v1.7 Generator Prefill Submachine Decomposition** - shipped 2026-03-30
+- [x] **v1.8 Truthful Qwen3 E2E Embedded Size** - shipped 2026-04-02
+- [x] **v1.9 Liquid LFM2.5-1.2B Thinking ARM Slice** - shipped 2026-04-02
+- [x] **v1.11 TE-75M GGUF Trimodal Embedding Runtime** - shipped 2026-04-15
+- [x] **v1.12 Pluggable Reference Parity Bench Architecture** - shipped 2026-04-18
+- [x] **v1.13 Pluggable Generative Parity Bench** - shipped 2026-04-21
+- [x] **v1.14 Benchmark Variant Organization** - shipped 2026-04-21
+- [x] **v1.15 ARM Sortformer Diarization GGUF Slice** - shipped 2026-04-25
+- [x] **v1.16 ARM Whisper GGUF Parity And Performance** - shipped 2026-04-28
+- [x] **v1.17 Text Generator Domain Alignment** - shipped 2026-04-30
+- [x] **v1.18 Parity Tool Boundary Refactor** - shipped 2026-05-01
+- [x] **v1.19 Benchmark Tool Pluggable Runner Refactor** - shipped 2026-05-01
+- [x] **v1.20 SML Dependency And Namespace Migration** - shipped 2026-05-02
+- [x] **v1.21 Quality Gate Selective Runner Optimization** - shipped 2026-05-02
+- [x] **v1.22 Weight Loading Ownership Cutover** - shipped 2026-05-03
+- [x] **v1.23 I/O Loading Strategy Boundary** - shipped 2026-05-04
+- [x] **v1.24 I/O Mmap Loading Strategy** - shipped 2026-05-04
+- [x] **v1.25 I/O Read Loading Strategy** - shipped 2026-05-06
+- [x] **v1.26 I/O Staged Read Loading Strategy** - completed 2026-05-08
+- [x] **v1.27 Ryzen AVX2/FMA Kernel Support** - shipped 2026-06-25
+
+## Current Milestone
+
+No active milestone is open.
+
+## Recently Shipped
+
+### v1.27 Ryzen AVX2/FMA Kernel Support
+
+**Shipped:** 2026-06-25
+**Archive:** `.planning/milestones/v1.27-ROADMAP.md`
+**Requirements:** `.planning/milestones/v1.27-REQUIREMENTS.md`
+**Audit:** `.planning/milestones/v1.27-MILESTONE-AUDIT.md`
+
+Delivered native x86_64 AVX2/FMA support for the AMD Ryzen 9 5950X maintained
+runtime slice: host feature contract, optimized flash attention, q2_K/q3_K/q6_K
+x q8_K kernels, maintained generator parity attribution, and truthful
+`kernel_x86_64` benchmark publication. The source-backed audit passed after
+repairing the optimized benchmark attribution gap and removing the x86_64 unary
+SML rule debt.
+
+Next step: run `$gsd-new-milestone` to define the next milestone.
diff --git a/.planning/STATE.md b/.planning/STATE.md
index 65dfd835..d8a860a4 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -1,17 +1,16 @@
---
gsd_state_version: 1.0
-milestone: v1.26
-milestone_name: I/O Staged Read Loading Strategy
-status: complete
-stopped_at: "v1.26 phases complete; milestone audit passed"
-last_updated: "2026-05-08T22:50:00.000Z"
-last_activity: 2026-05-08
+milestone: v1.27
+milestone_name: Ryzen AVX2/FMA Kernel Support
+status: ready_for_next_milestone
+stopped_at: "v1.27 shipped and archived; next step is new milestone definition"
+last_updated: "2026-06-25T14:35:28.048Z"
+last_activity: 2026-06-25
progress:
- # Matches v1.26 section in .planning/ROADMAP.md (Phases 227-238 inclusive).
- total_phases: 12
- completed_phases: 12
- total_plans: 12
- completed_plans: 12
+ total_phases: 6
+ completed_phases: 6
+ total_plans: 6
+ completed_plans: 6
percent: 100
---
@@ -19,101 +18,109 @@ progress:
## Project Reference
-See: .planning/PROJECT.md (updated 2026-05-07)
+See: .planning/PROJECT.md (updated 2026-06-25)
**Core value:** Prove real end-to-end behavior with explicit SML orchestration and
parity-oriented verification before widening API surface or model scope.
-**Current focus:** Define and execute milestone `v1.26 I/O Staged Read Loading
-Strategy` (GitHub issue #63): bounded `src/emel/io/staged_read` actor for
-chunked constrained-memory loads through the tensor-to-I/O boundary (#60),
-tensor-owned residency unchanged, cooperative coroutine scheduling out of scope
-unless separately approved.
+**Current focus:** Define the next milestone after shipping native x86_64
+AVX2/FMA support for this AMD Ryzen 9 5950X host.
## Current Position
-Milestone: v1.26 I/O Staged Read Loading Strategy — complete (**Phases 227–238** complete).
-Status: **`v1.26-MILESTONE-AUDIT.md`** reports `status: passed`; **`ESG-02B`** remains
-deferred/future by design.
-Phase: none pending.
-Last activity: 2026-05-08 — Phase 238 reconciled summary frontmatter, embedded probe reporting
-truth, and the final source-backed milestone audit after the Phase 237 direct tensor staged-load
-offset repair.
+Milestone: v1.27 Ryzen AVX2/FMA Kernel Support
+Status: v1.27 shipped and archived. Phases 239-244 verified and the
+source-backed milestone audit passed. The repaired `kernel_x86_64` benchmark
+suite includes counter-checked optimized flash and q2/q3/q6 rows.
+Phase: ready for next milestone definition.
+Last activity: 2026-06-25 — `snapshots/bench/benchmarks.txt` plus the
+maintained LFM2 `10`, `100`, and `1000` token generation baselines were
+updated, the source-backed `XBN-01` benchmark gap was repaired, the x86_64
+unary SML rule debt was removed, focused validation and the changed-file scoped
+quality gate passed, the milestone audit passed, and the milestone was archived.
-Progress: [||||||||||] 100%
+Progress: [##########] 100%
-**Evidence (Phase 238 cleanup PASS — 2026-05-08):**
+**Host feature scope:**
-- Summary frontmatter source scan — pass.
-- Embedded probe `used_io_strategy` evidence scan — pass.
-- Phase 238 changed-file `scripts/quality_gates.sh` — exit **0**.
-- Final `.planning/v1.26-MILESTONE-AUDIT.md` — `status: passed`.
+- CPU: AMD Ryzen 9 5950X 16-Core Processor.
+- Supported target features: x86_64 AVX2, FMA, and F16C conversion.
+- Explicit no-claim features: AVX-512, AVX-VNNI, AMX, BF16, native FP16, and GPU
+ acceleration.
-Phase 237 source repair evidence remains in `237-VERIFICATION.md`.
+**Next implementation step:** define the next milestone with `$gsd-new-milestone`.
-**Residual / preserved gate truth:**
+**Closeout gate:** complete.
-- **Phase 235:** quality gate **not attempted** in final milestone-only pass; Phase 235 makes **no** separate `scripts/quality_gates.sh` pass claim.
-- **Phase 233/234:** earlier scoped gate truth remains as recorded in their phase artifacts.
-- **Phase 232:** scoped gate **exit 2** residual — **unchanged** in **`232-VERIFICATION.md`**.
-- **Phase 236:** serial full-repo quality gate **passed** with exit **0**; this is the maintained
- milestone-level gate evidence for closeout.
+## Accumulated Context
-## Performance Metrics
+### Decisions
-**Prior audited milestone:** `v1.25 I/O Read Loading Strategy` remains the latest shipped I/O milestone
-(13/13 requirements satisfied after Phase 226). v1.26 planning continues phase numbering after Phase 226.
+Decisions are logged in PROJECT.md Key Decisions table.
-## Accumulated Context
+Carry-forward architectural constraints:
-### Decisions
+- Runtime behavior selection remains explicit guards and transitions
+ (`AGENTS.md` / `docs/rules/sml.rules.md`).
-Decisions are logged in PROJECT.md Key Decisions table. v1.26 source: GitHub issue #63.
+- Kernel arithmetic, lowering, packing, quant/dequant, and backend-specific
+ numeric work stays in the owning kernel layer.
-Carry-forward architectural constraints from shipped I/O milestones:
+- The EMEL lane stays repo-owned and separate from llama.cpp/ggml reference
+ runtime state; reference linkage is comparison-only in tools.
-- `model/tensor` remains the canonical residency owner; I/O strategies do not claim tensor buffer ownership.
-- `model/loader` stays orchestration-only with no low-level byte strategy in loader code paths.
-- Runtime behavior selection remains explicit guards and transitions (AGENTS.md / `docs/rules/sml.rules.md`).
+- Quantized kernel parity requires the same effective operand class as the
+ reference path; whole-tensor dequantize-to-f32 hot-path substitution requires
+ explicit user approval and is not part of v1.27.
-### Pending Todos
+- Benchmark and parity claims must be source-backed by the maintained runtime
+ path, not only planning artifacts or tool-local scaffolds.
+
+### Carry-Forward Backlog
- 2026-04-02 - Move eager quant prepack into generator initializer
+ (`.planning/todos/backlog/2026-04-02-move-eager-quant-prepack-into-generator-initializer.md`)
+
- 2026-04-02 - Reuse q8 RHS across LFM2.5 prefill matmuls
+ (`.planning/todos/backlog/2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md`)
+
- 2026-04-02 - Optimize LFM2.5 q4 prefill kernel
-- 2026-04-02 - Optimize LFM2.5 q6 prefill kernel
+ (`.planning/todos/backlog/2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md`)
-### Blockers/Concerns
+- 2026-04-02 - Optimize LFM2.5 q6 prefill kernel
+ (`.planning/todos/backlog/2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md`)
-- No active v1.26 blockers remain.
-- ESG-02B remains deferred by design until approved file-backed staged source ownership introduces real open/seek/read lifecycle semantics.
+These pre-existing LFM2.5 performance backlog items are outside the v1.27 Ryzen
+AVX2/FMA support contract and are not milestone close blockers.
-### Prior milestone notes
+### Blockers/Concerns
-The following summarized v1.25 execution context and remains historical reference:
+- `ESG-02B` from v1.26 remains outside v1.27 processor scope until a file-backed
+ staged-read source path is separately approved.
-
-v1.25 phase trail (collapsed)
+- v1.27 must not present AVX-512/VNNI/AMX/BF16/native-FP16 claims for this host.
-- Phase 225/226 refined read batch APIs, audit evidence, and SML hygiene for shipped `io/read`.
-- Public `read_tensor_batch` has an independent span cap; benchmarks and parity lanes use `emel::io::source::load_file_bytes` for setup-time bytes.
+### Prior milestone notes
-
+`v1.26 I/O Staged Read Loading Strategy` completed on 2026-05-08. Its final
+audit passed after Phase 237 repaired direct tensor staged-load nonzero-offset
+behavior and Phase 238 reconciled artifact/reporting truth. Formerly active v1.26
+evidence is archived under `.planning/milestones/v1.26-*`.
-## Deferred Items
+## Historical Carry-Forward Items
-Items acknowledged and deferred at v1.25 milestone close on 2026-05-06 (unchanged):
+Items acknowledged at v1.25 milestone close on 2026-05-06 (item list unchanged; statuses refreshed):
| Category | Item | Status |
|----------|------|--------|
-| quick_task | 260401-ejm-add-non-blocking-benchmark-binary-size-c | missing |
-| todo | 2026-04-02-move-eager-quant-prepack-into-generator-initializer.md | pending |
-| todo | 2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md | pending |
-| todo | 2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md | pending |
-| todo | 2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md | pending |
+| quick_task | 260401-ejm-add-non-blocking-benchmark-binary-size-c | complete |
+| todo | 2026-04-02-move-eager-quant-prepack-into-generator-initializer.md | backlog |
+| todo | 2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md | backlog |
+| todo | 2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md | backlog |
+| todo | 2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md | backlog |
## Session Continuity
-Last session: 2026-05-08 (v1.26 milestone audit and gap-closure phase creation)
-Stopped at: Phase **237** ready for discuss/plan/execute.
+Last session: 2026-06-25 (v1.27 closeout)
+Stopped at: v1.27 shipped and archived; ready for the next milestone.
Resume file: None
diff --git a/.planning/architecture/kernel_x86_64.md b/.planning/architecture/kernel_x86_64.md
index f931dd16..da259c29 100644
--- a/.planning/architecture/kernel_x86_64.md
+++ b/.planning/architecture/kernel_x86_64.md
@@ -72,6 +72,9 @@ stateDiagram-v2
ready --> ready : dispatch_op_group_norm [dispatch_op_group_norm__] / dispatch_op_group_norm__
ready --> ready : dispatch_op_l2_norm [dispatch_op_l2_norm__] / dispatch_op_l2_norm__
ready --> ready : dispatch_op_l2_norm [dispatch_op_l2_norm__] / dispatch_op_l2_norm__
+ ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q2_k_q8_k_] / effect_exec_simd_q2_k_q8_k_op_mul_mat_
+ ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q3_k_q8_k_] / effect_exec_simd_q3_k_q8_k_op_mul_mat_
+ ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q6_k_q8_k_] / effect_exec_simd_q6_k_q8_k_op_mul_mat_
ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
@@ -163,7 +166,8 @@ stateDiagram-v2
ready --> ready : dispatch_op_tri [dispatch_op_tri__] / dispatch_op_tri__
ready --> ready : dispatch_op_fill [dispatch_op_fill__] / dispatch_op_fill__
ready --> ready : dispatch_op_fill [dispatch_op_fill__] / dispatch_op_fill__
- ready --> ready : dispatch_op_flash_attn_ext [dispatch_op_flash_attn_ext__] / dispatch_op_flash_attn_ext__
+ ready --> ready : dispatch_op_flash_attn_ext [simd_op_flash_attn_ext_f16kv_one_chunk_] / exec_simd_flash_attn_ext_f16kv_one_chunk_
+ ready --> ready : dispatch_op_flash_attn_ext [valid_op_flash_attn_ext_shared_] / dispatch_op_flash_attn_ext__
ready --> ready : dispatch_op_flash_attn_ext [dispatch_op_flash_attn_ext__] / dispatch_op_flash_attn_ext__
ready --> ready : dispatch_op_flash_attn_back [dispatch_op_flash_attn_back__] / dispatch_op_flash_attn_back__
ready --> ready : dispatch_op_flash_attn_back [dispatch_op_flash_attn_back__] / dispatch_op_flash_attn_back__
@@ -284,6 +288,9 @@ stateDiagram-v2
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_group_norm`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_group_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_group_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
+| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`guard_simd_op_mul_mat_q2_k_q8_k>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`effect_exec_simd_q2_k_q8_k_op_mul_mat>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
+| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`guard_simd_op_mul_mat_q3_k_q8_k>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`effect_exec_simd_q3_k_q8_k_op_mul_mat>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
+| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`guard_simd_op_mul_mat_q6_k_q8_k>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`effect_exec_simd_q6_k_q8_k_op_mul_mat>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
@@ -375,7 +382,8 @@ stateDiagram-v2
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_tri`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_tri>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_tri>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
-| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
+| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`simd_op_flash_attn_ext_f16kv_one_chunk>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`exec_simd_flash_attn_ext_f16kv_one_chunk>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
+| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`valid_op_flash_attn_ext_shared>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
diff --git a/.planning/architecture/mermaid/kernel_x86_64.mmd b/.planning/architecture/mermaid/kernel_x86_64.mmd
index 81d9cd57..76200241 100644
--- a/.planning/architecture/mermaid/kernel_x86_64.mmd
+++ b/.planning/architecture/mermaid/kernel_x86_64.mmd
@@ -65,6 +65,9 @@ stateDiagram-v2
ready --> ready : dispatch_op_group_norm [dispatch_op_group_norm__] / dispatch_op_group_norm__
ready --> ready : dispatch_op_l2_norm [dispatch_op_l2_norm__] / dispatch_op_l2_norm__
ready --> ready : dispatch_op_l2_norm [dispatch_op_l2_norm__] / dispatch_op_l2_norm__
+ ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q2_k_q8_k_] / effect_exec_simd_q2_k_q8_k_op_mul_mat_
+ ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q3_k_q8_k_] / effect_exec_simd_q3_k_q8_k_op_mul_mat_
+ ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q6_k_q8_k_] / effect_exec_simd_q6_k_q8_k_op_mul_mat_
ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
@@ -156,7 +159,8 @@ stateDiagram-v2
ready --> ready : dispatch_op_tri [dispatch_op_tri__] / dispatch_op_tri__
ready --> ready : dispatch_op_fill [dispatch_op_fill__] / dispatch_op_fill__
ready --> ready : dispatch_op_fill [dispatch_op_fill__] / dispatch_op_fill__
- ready --> ready : dispatch_op_flash_attn_ext [dispatch_op_flash_attn_ext__] / dispatch_op_flash_attn_ext__
+ ready --> ready : dispatch_op_flash_attn_ext [simd_op_flash_attn_ext_f16kv_one_chunk_] / exec_simd_flash_attn_ext_f16kv_one_chunk_
+ ready --> ready : dispatch_op_flash_attn_ext [valid_op_flash_attn_ext_shared_] / dispatch_op_flash_attn_ext__
ready --> ready : dispatch_op_flash_attn_ext [dispatch_op_flash_attn_ext__] / dispatch_op_flash_attn_ext__
ready --> ready : dispatch_op_flash_attn_back [dispatch_op_flash_attn_back__] / dispatch_op_flash_attn_back__
ready --> ready : dispatch_op_flash_attn_back [dispatch_op_flash_attn_back__] / dispatch_op_flash_attn_back__
diff --git a/.planning/milestones/v1.26-MILESTONE-AUDIT.md b/.planning/milestones/v1.26-MILESTONE-AUDIT.md
new file mode 100644
index 00000000..becd336c
--- /dev/null
+++ b/.planning/milestones/v1.26-MILESTONE-AUDIT.md
@@ -0,0 +1,211 @@
+---
+milestone: v1.26
+audited: 2026-05-08T22:43:43.000Z
+status: passed
+scores:
+ requirements: 34/34 active
+ phases: 12/12
+ integration: 9/9
+ flows: 9/9
+gaps:
+ requirements: []
+ integration: []
+ flows: []
+closed_gaps:
+ - id: "direct-tensor-staged-offset-contract"
+ phase: "237"
+ requirements: ["TNX-01", "TNX-03", "TNX-04", "TST-01", "TST-02"]
+ evidence: "Phase 237 added failing-first public direct tensor staged-load coverage for nonzero file_offset, repaired model/tensor source-window dispatch, and passed scoped quality gates."
+ - id: "direct-tensor-staged-nonzero-offset"
+ phase: "237"
+ evidence: "tests/model/tensor/lifecycle_tests.cpp covers request_staged_load offset 2 against a whole-file source buffer and verifies copied bytes plus resident state."
+ - id: "cross-phase-summary-frontmatter"
+ phase: "238"
+ evidence: "Phases 232-236 now expose requirements-completed frontmatter or explicit partial/finalized-by rationale."
+ - id: "embedded-probe-reporting-truth"
+ phase: "238"
+ evidence: "Probe path captures used_io_strategy from model_loader::load_done; scripts/embedded_size.sh intentionally suppresses probe stdout for stable size measurement, so strategy evidence is the captured public outcome rather than ad hoc runtime printing."
+tech_debt: []
+deferred:
+ - requirement: ESG-02B
+ status: Deferred/Future
+ reason: "Real file open/seek/read and per-stage short-read taxonomy requires an approved file-backed staged-read source path; v1.26 intentionally uses source-span staging."
+nyquist:
+ overall: compliant
+ compliant_phases:
+ - 227-staged-read-strategy-component-boundary
+ - 228-span-target-window-platform-gating
+ - 229-staged-copy-progress-and-completion-semantics
+ - 230-context-cleanness-and-per-attempt-lifetime
+ - 231-deterministic-error-taxonomy
+ - 232-tensor-owned-integration-graph
+ - 233-public-loader-and-maintained-entrypoints
+ - 234-public-dispatch-tests
+ - 235-scope-and-non-regression-guardrails
+ - 236-publication-and-evidence-truthfulness
+ - 237-direct-tensor-staged-offset-contract-repair
+ - 238-audit-artifact-and-probe-reporting-cleanup
+ partial_phases: []
+ missing_phases: []
+---
+
+# v1.26 Milestone Audit - I/O Staged Read Loading Strategy
+
+## Result
+
+**Status: passed with one explicitly deferred requirement.** All active v1.26
+requirements are satisfied from source-backed evidence after Phase 237 and Phase
+238 gap closure. `ESG-02B` remains deferred/future by design because real file
+open/seek/read and per-stage short-read taxonomy require a separately approved
+file-backed staged-read source path.
+
+Phase 237 closed the blocking direct tensor staged-load nonzero-offset source
+window gap. Phase 238 closed nonblocking audit artifact debt for summary
+frontmatter and embedded probe reporting truth.
+
+## Requirement Coverage
+
+Three-source cross-reference passed: the active requirements in
+`REQUIREMENTS.md` traceability are present in phase `VERIFICATION.md` evidence
+and in `SUMMARY.md` frontmatter, including the Phase 237 finalization entries
+for requirements reopened by the source-backed audit. No active orphaned
+requirements were found.
+
+| Requirement group | Requirements | Status | Evidence |
+|-------------------|--------------|--------|----------|
+| Staged component and guards | STG-01, STG-02, STG-03, PLAT-02 | Satisfied | Phases 227-228 verification and validation records; staged-read actor, guards, and platform-gated transitions exist. |
+| Staged execution semantics | STG-04, STG-05, STG-06, STG-07 | Satisfied | Phases 229-230 verification and validation records; lifecycle doctests cover deterministic copy, monotone completion, terminal success, and empty context. |
+| Lifetime and ownership | LIFE-02, SNR-01 | Satisfied | Phase 230 verification; no staged-read-owned OS handle is retained and tensor residency remains outside the staged actor. |
+| Error taxonomy | ESG-01, ESG-02A, ESG-03, ESG-04 | Satisfied | Phase 231 verification; named deterministic errors and exception-free actor boundary are covered. |
+| Tensor integration | TNX-01, TNX-02, TNX-03, TNX-04 | Satisfied | Phase 232 integration plus Phase 237 direct tensor nonzero-offset repair prove public dispatch, tensor-owned residency, explicit success, and explicit failure. |
+| Maintained surfaces | PUB-01, PUB-02, PUB-03, PUB-04, PUB-05 | Satisfied | Phase 233 verification; loader, benchmark, paritychecker, and probe paths use public contracts. |
+| Public dispatch tests | TST-01, TST-02 | Satisfied | Phase 234 maintained-route tests plus Phase 237 direct tensor offset success/failure doctests cover public `process_event(...)` dispatch. |
+| Guardrails | GRD-01, GRD-02, GRD-03, GRD-04, GRD-05 | Satisfied | Phase 235 verification; loader/tensor ownership scans and mmap/read non-regression doctests pass. |
+| Publication truth | DOC-01, LNT-01, BNH-01, EVI-01 | Satisfied | Phase 236 verification; docs, lint, benchmark workflow, and evidence-label truth all recorded. |
+
+`ESG-02B` remains explicitly deferred/future and is not counted as an in-scope
+v1.26 blocker.
+
+### Three-Source Requirement Matrix
+
+| Requirement | Traceability | Verification | Summary frontmatter | Final status |
+|-------------|--------------|--------------|---------------------|--------------|
+| STG-01 | Phase 227 satisfied | `227-VERIFICATION.md` satisfied | `227-01-SUMMARY.md` lists complete | Satisfied |
+| STG-02 | Phase 228 satisfied | `228-VERIFICATION.md` satisfied | `228-01-SUMMARY.md` lists complete | Satisfied |
+| STG-03 | Phase 228 satisfied | `228-VERIFICATION.md` satisfied | `228-01-SUMMARY.md` lists complete | Satisfied |
+| PLAT-02 | Phase 228 satisfied | `228-VERIFICATION.md` satisfied | `228-01-SUMMARY.md` lists complete | Satisfied |
+| STG-04 | Phase 229 satisfied | `229-VERIFICATION.md` satisfied | `229-01-SUMMARY.md` lists complete | Satisfied |
+| STG-05 | Phase 229 satisfied | `229-VERIFICATION.md` satisfied | `229-01-SUMMARY.md` lists complete | Satisfied |
+| STG-06 | Phase 229 satisfied | `229-VERIFICATION.md` satisfied | `229-01-SUMMARY.md` lists complete | Satisfied |
+| STG-07 | Phase 230 satisfied | `230-VERIFICATION.md` satisfied | `230-01-SUMMARY.md` lists complete | Satisfied |
+| LIFE-02 | Phase 230 satisfied | `230-VERIFICATION.md` satisfied | `230-01-SUMMARY.md` lists complete | Satisfied |
+| SNR-01 | Phase 230 satisfied | `230-VERIFICATION.md` satisfied | `230-01-SUMMARY.md` lists complete | Satisfied |
+| ESG-01 | Phase 231 satisfied | `231-VERIFICATION.md` satisfied | `231-01-SUMMARY.md` lists complete | Satisfied |
+| ESG-02A | Phase 231 satisfied | `231-VERIFICATION.md` satisfied | `231-01-SUMMARY.md` lists complete | Satisfied |
+| ESG-03 | Phase 231 satisfied | `231-VERIFICATION.md` satisfied | `231-01-SUMMARY.md` lists complete | Satisfied |
+| ESG-04 | Phase 231 satisfied | `231-VERIFICATION.md` satisfied | `231-01-SUMMARY.md` lists complete | Satisfied |
+| TNX-01 | Phase 237 satisfied | `237-VERIFICATION.md` satisfied | `237-01-SUMMARY.md` lists complete; Phase 232 marked finalized by 237 | Satisfied |
+| TNX-02 | Phase 232 satisfied | `232-VERIFICATION.md` satisfied | `232-01-SUMMARY.md` lists complete | Satisfied |
+| TNX-03 | Phase 237 satisfied | `237-VERIFICATION.md` satisfied | `237-01-SUMMARY.md` lists complete; Phase 232 marked finalized by 237 | Satisfied |
+| TNX-04 | Phase 237 satisfied | `237-VERIFICATION.md` satisfied | `237-01-SUMMARY.md` lists complete; Phase 232 marked finalized by 237 | Satisfied |
+| PUB-01 | Phase 233 satisfied | `233-VERIFICATION.md` satisfied | `233-01-SUMMARY.md` lists complete | Satisfied |
+| PUB-02 | Phase 233 satisfied | `233-VERIFICATION.md` satisfied | `233-01-SUMMARY.md` lists complete | Satisfied |
+| PUB-03 | Phase 233 satisfied | `233-VERIFICATION.md` satisfied | `233-01-SUMMARY.md` lists complete | Satisfied |
+| PUB-04 | Phase 233 satisfied | `233-VERIFICATION.md` satisfied | `233-01-SUMMARY.md` lists complete | Satisfied |
+| PUB-05 | Phase 233 satisfied | `233-VERIFICATION.md` satisfied | `233-01-SUMMARY.md` lists complete | Satisfied |
+| TST-01 | Phase 237 satisfied | `237-VERIFICATION.md` satisfied | `237-01-SUMMARY.md` lists complete; Phase 234 marked finalized by 237 | Satisfied |
+| TST-02 | Phase 237 satisfied | `237-VERIFICATION.md` satisfied | `237-01-SUMMARY.md` lists complete; Phase 234 marked finalized by 237 | Satisfied |
+| GRD-01 | Phase 235 satisfied | `235-VERIFICATION.md` satisfied | `235-01-SUMMARY.md` lists complete | Satisfied |
+| GRD-02 | Phase 235 satisfied | `235-VERIFICATION.md` satisfied | `235-01-SUMMARY.md` lists complete | Satisfied |
+| GRD-03 | Phase 235 satisfied | `235-VERIFICATION.md` satisfied | `235-01-SUMMARY.md` lists complete | Satisfied |
+| GRD-04 | Phase 235 satisfied | `235-VERIFICATION.md` satisfied | `235-01-SUMMARY.md` lists complete | Satisfied |
+| GRD-05 | Phase 235 satisfied | `235-VERIFICATION.md` satisfied | `235-01-SUMMARY.md` lists complete | Satisfied |
+| DOC-01 | Phase 236 satisfied | `236-VERIFICATION.md` satisfied | `236-01-SUMMARY.md` lists complete | Satisfied |
+| LNT-01 | Phase 236 satisfied | `236-VERIFICATION.md` satisfied | `236-01-SUMMARY.md` lists complete | Satisfied |
+| BNH-01 | Phase 236 satisfied | `236-VERIFICATION.md` satisfied | `236-01-SUMMARY.md` lists complete | Satisfied |
+| EVI-01 | Phase 236 satisfied | `236-VERIFICATION.md` satisfied | `236-01-SUMMARY.md` lists complete | Satisfied |
+
+## Phase Coverage
+
+| Phase | Verification | Validation | Audit status |
+|-------|--------------|------------|--------------|
+| 227 | `227-VERIFICATION.md` | `227-VALIDATION.md` | Satisfied |
+| 228 | `228-VERIFICATION.md` | `228-VALIDATION.md` | Satisfied |
+| 229 | `229-VERIFICATION.md` | `229-VALIDATION.md` | Satisfied |
+| 230 | `230-VERIFICATION.md` | `230-VALIDATION.md` | Satisfied |
+| 231 | `231-VERIFICATION.md` | `231-VALIDATION.md` | Satisfied; `ESG-02B` deferred |
+| 232 | `232-VERIFICATION.md` | `232-VALIDATION.md` | Satisfied with Phase 237 finalizing reopened direct-offset aspects |
+| 233 | `233-VERIFICATION.md` | `233-VALIDATION.md` | Satisfied; summary frontmatter added in Phase 238 |
+| 234 | `234-VERIFICATION.md` | `234-VALIDATION.md` | Satisfied with Phase 237 finalizing reopened direct-offset public dispatch aspects |
+| 235 | `235-VERIFICATION.md` | `235-VALIDATION.md` | Satisfied; summary frontmatter added in Phase 238 |
+| 236 | `236-VERIFICATION.md` | `236-VALIDATION.md` | Satisfied; full gate passed; summary frontmatter added in Phase 238 |
+| 237 | `237-VERIFICATION.md` | `237-VALIDATION.md` | Satisfied; direct tensor nonzero-offset source repair |
+| 238 | `238-VERIFICATION.md` | `238-VALIDATION.md` | Satisfied; audit artifact and probe reporting cleanup |
+
+## Integration Check
+
+Cross-phase source audit passes after the Phase 237 repair. The integration
+checker found **34/34 active requirements** and **9/9 requested E2E flows**
+passing, with no blocker or tech-debt findings:
+
+- `src/emel/io/staged_read` owns the staged actor and canonical machine alias.
+- `src/emel/io/loader` routes staged single and batch requests through the injected
+ `io_staged_read` actor.
+- `src/emel/model/tensor` remains the residency owner for staged-load results.
+- Direct `model::tensor::event::request_staged_load` now validates
+ `source_buffer_bytes` against `file_offset + byte_size`, dispatches
+ `source_buffer + file_offset` as the staged `source_span`, and passes the
+ logical window length as `source_span_bytes`.
+- Maintained benchmark, parity, and embedded-size probe lanes bind staged-read
+ strategy through public model-load strategy contracts.
+- Evidence labels use `used_io_strategy` after modeled execution, not requested
+ strategy alone.
+- Public dispatch and guardrail tests cover the maintained staged route, direct
+ tensor route, loader ownership boundary, tensor residency boundary, and shipped
+ mmap/read non-regression paths.
+- No coroutine/device/mmap staged-read scope creep was found in staged-read source
+ or tests.
+
+## Embedded Probe Reporting
+
+The embedded-size probe captures the executed load strategy from the public
+`model::loader::events::load_done::used_io_strategy` field in
+`tools/embedded_size/emel_probe/main.cpp`. `scripts/embedded_size.sh` intentionally
+suppresses probe stdout/stderr during the smoke run so size-measurement output and
+snapshot generation remain stable. Therefore, the authoritative evidence surface
+for staged-read execution is the captured `used_io_strategy` outcome and the
+published verification/audit record, not a separate ad hoc probe print line.
+
+## Summary Frontmatter
+
+Phases 232-236 now include machine-readable summary frontmatter:
+
+- Phase 232 records `TNX-02` complete and marks `TNX-01`, `TNX-03`, `TNX-04`
+ as finalized by Phase 237.
+- Phase 233 records `PUB-01` through `PUB-05`.
+- Phase 234 records `TST-01` and `TST-02` as finalized by Phase 237.
+- Phase 235 records `GRD-01` through `GRD-05`.
+- Phase 236 records `DOC-01`, `LNT-01`, `BNH-01`, and `EVI-01`.
+
+## Nyquist Validation
+
+All twelve v1.26 phase directories have `*-VALIDATION.md` records with
+`nyquist_compliant: true` and `wave_0_complete: true`. No missing validation
+files remain.
+
+## Closeout Readiness
+
+The milestone is ready for completion and cleanup. All active requirements have
+source-backed evidence; all phases are complete; the direct tensor nonzero-offset
+blocker is closed; and the remaining staged-read file-backed error taxonomy is
+explicitly deferred as `ESG-02B`.
+
+Audit refresh validation:
+
+```bash
+EMEL_QUALITY_GATES_CHANGED_FILES=".planning/v1.26-MILESTONE-AUDIT.md" scripts/quality_gates.sh
+```
+
+Result: **PASS** (exit `0`). Benchmark, coverage, paritychecker, fuzz, and docsgen
+lanes were skipped where the changed-file scope was irrelevant; the legacy SML
+surface scan and configuration completed successfully.
diff --git a/.planning/REQUIREMENTS.md b/.planning/milestones/v1.26-REQUIREMENTS.md
similarity index 100%
rename from .planning/REQUIREMENTS.md
rename to .planning/milestones/v1.26-REQUIREMENTS.md
diff --git a/.planning/milestones/v1.26-ROADMAP.md b/.planning/milestones/v1.26-ROADMAP.md
new file mode 100644
index 00000000..8c2e8bfb
--- /dev/null
+++ b/.planning/milestones/v1.26-ROADMAP.md
@@ -0,0 +1,774 @@
+# Roadmap: EMEL
+
+## Milestones
+
+- ✅ **v1.0 EMEL Llama-68M Generation Slice** — shipped 2026-03-08
+- ✅ **v1.1 EMEL Llama-68M Generation Benchmark** — shipped 2026-03-11
+- ✅ **v1.2 Flash Attention** — shipped 2026-03-22
+- ✅ **v1.3 ARM Flash Optimizations** — shipped 2026-03-22
+- ✅ **v1.4 Full Vectorized Quantized Kernels** — shipped 2026-03-25
+- ✅ **v1.5 Full ARM Quantized Path** — shipped 2026-03-27
+- ✅ **v1.6 Qwen3-0.6B Parity And Benchmark** — shipped 2026-03-30
+- ✅ **v1.7 Generator Prefill Submachine Decomposition** — shipped 2026-03-30
+- ✅ **v1.8 Truthful Qwen3 E2E Embedded Size** — shipped 2026-04-02
+- ✅ **v1.9 Liquid LFM2.5-1.2B Thinking ARM Slice** — shipped 2026-04-02
+- ✅ **v1.11 TE-75M GGUF Trimodal Embedding Runtime** — shipped 2026-04-15
+- ✅ **v1.12 Pluggable Reference Parity Bench Architecture** — shipped 2026-04-18
+- ✅ **v1.13 Pluggable Generative Parity Bench** — shipped 2026-04-21
+- ✅ **v1.14 Benchmark Variant Organization** — shipped 2026-04-21
+- ✅ **v1.15 ARM Sortformer Diarization GGUF Slice** — shipped 2026-04-25
+- ✅ **v1.16 ARM Whisper GGUF Parity And Performance** — shipped 2026-04-28
+- ✅ **v1.17 Text Generator Domain Alignment** — shipped 2026-04-30
+- ✅ **v1.18 Parity Tool Boundary Refactor** — shipped 2026-05-01
+- ✅ **v1.19 Benchmark Tool Pluggable Runner Refactor** — shipped 2026-05-01
+- ✅ **v1.20 SML Dependency And Namespace Migration** — shipped 2026-05-02
+- ✅ **v1.21 Quality Gate Selective Runner Optimization** — shipped 2026-05-02
+- ✅ **v1.22 Weight Loading Ownership Cutover** — shipped 2026-05-03
+- ✅ **v1.23 I/O Loading Strategy Boundary** — shipped 2026-05-04
+- ✅ **v1.24 I/O Mmap Loading Strategy** — shipped 2026-05-04 (Phases 204-211)
+- ✅ **v1.25 I/O Read Loading Strategy** — shipped 2026-05-06 (Phases 212-226 + 214.1)
+- ✅ **v1.26 I/O Staged Read Loading Strategy** — completed 2026-05-08
+ (12 / 12 phases complete; issue #63; `ESG-02B` deferred/future)
+
+## Phases
+
+### ✅ v1.26 I/O Staged Read Loading Strategy (Phases 227-238) — COMPLETE 2026-05-08
+
+Source: GitHub issue #63, "Add io/staged_read state machine for constrained-memory tensor loading".
+Adds `src/emel/io/staged_read` for bounded chunked/windowed reads under tensor-owned residency.
+Depends on the tensor-to-I/O boundary from issue #60. Cooperative coroutine scheduling is out of
+scope unless separately approved. Shipped mmap (`io/mmap`) and bulk read/copy (`io/read`) must not
+regress.
+
+Execution order: 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238.
+
+**Milestone progress (v1.26):** **12 / 12** phases recorded **Complete** in the table below.
+The source-backed milestone audit found a direct tensor staged-load offset-contract gap plus
+closeout artifact debt; Phases 237-238 closed those gaps. `ESG-02B` remains deferred/future
+because file-backed staged-read source ownership is out of scope.
+
+- [x] Phase 227: Staged Read Strategy Component Boundary (STG-01)
+- [x] Phase 228: Span, Target-Window, and Platform Gating (STG-02, STG-03, PLAT-02)
+- [x] Phase 229: Staged Copy Progress and Completion Semantics (STG-04, STG-05, STG-06)
+- [x] Phase 230: Context Cleanness and Per-Attempt Lifetime (STG-07, LIFE-02, SNR-01)
+- [x] Phase 231: Deterministic Error Taxonomy (ESG-01, ESG-02A, ESG-03, ESG-04; ESG-02B deferred)
+- [x] Phase 232: Tensor-Owned Integration Graph (TNX-01, TNX-02, TNX-03, TNX-04)
+- [x] Phase 233: Public Loader and Maintained Entrypoints (PUB-01, PUB-02, PUB-03, PUB-04, PUB-05)
+- [x] Phase 234: Public Dispatch Tests (TST-01, TST-02)
+- [x] Phase 235: Scope and Non-Regression Guardrails (GRD-01, GRD-02, GRD-03, GRD-04, GRD-05)
+- [x] Phase 236: Publication and Evidence Truthfulness (DOC-01, LNT-01, BNH-01, EVI-01)
+- [x] Phase 237: Direct Tensor Staged Offset Contract Repair (TNX-01, TNX-03, TNX-04, TST-01, TST-02)
+- [x] Phase 238: Audit Artifact and Probe Reporting Cleanup (cleanup-only)
+
+#### Phase 227: Staged Read Strategy Component Boundary
+
+**Goal:** Locate canonical `src/emel/io/staged_read` with standard I/O component layout.
+**Depends on:** Phase 226
+**Requirements:** STG-01
+
+**Success criteria:**
+
+1. `src/emel/io/staged_read` exists with canonical `emel::io::staged_read::sm` alias.
+2. Component scope excludes mmap, device transfer, or cooperative async runtime.
+3. Initial fail-closed or smoke dispatch proves actors are wired like sibling I/O strategies.
+
+#### Phase 228: Span, Target-Window, and Platform Gating
+
+**Goal:** All staged preconditions enforced in guards/transitions before any file work.
+**Depends on:** Phase 227
+**Requirements:** STG-02, STG-03, PLAT-02
+
+**Success criteria:**
+
+1. Invalid source staging contract rejected solely via guard-modeled transitions.
+2. Invalid target window/layout rejected solely via guard-modeled transitions.
+3. Unsupported hosts/resources fail closed with explicit unsupported terminal shape.
+
+#### Phase 229: Staged Copy Progress and Completion Semantics
+
+**Goal:** Prove per-stage deterministic copy plus full-span monotone completion.
+**Depends on:** Phase 228
+**Requirements:** STG-04, STG-05, STG-06
+
+**Success criteria:**
+
+1. Test vectors observe correct bytes per staged window.
+2. Completeness tests cover the entire logical span in order.
+3. Terminal success aligns with copied full span per contract.
+
+#### Phase 230: Context Cleanness and Per-Attempt Lifetime
+
+**Goal:** Bounded handles and residency clarity for the staged actor.
+**Depends on:** Phase 229
+**Requirements:** STG-07, LIFE-02, SNR-01
+
+**Success criteria:**
+
+1. Static or dynamic review shows zero forbidden dispatch-local context mirrors.
+2. Handle lifetime tests/tools show release-before-done semantics.
+3. Tests confirm strategy never asserts tensor residency commits.
+
+#### Phase 231: Deterministic Error Taxonomy
+
+**Goal:** Errors are categorical, observable, exception-free.
+**Depends on:** Phase 230
+**Requirements:** ESG-01, ESG-02A, ESG-03, ESG-04 (`ESG-02B` deferred)
+
+**Success criteria:**
+
+1. At least one doctest per taxonomy family (pre-I/O guard, source-contract read-surface, sequencing/contract) demonstrates deterministic categories through `process_event(...)`.
+2. Source-backed docs explicitly defer `ESG-02B` file open/seek/read + per-stage short-read categories until approved file-backed staged-read ownership exists.
+3. ABI boundary scans show noexcept expectations for surfaced API.
+
+#### Phase 232: Tensor-Owned Integration Graph
+
+**Goal:** Integrate staged loads through explicit tensor+I/O graphs.
+**Depends on:** Phase 231
+**Requirements:** TNX-01, TNX-02, TNX-03, TNX-04
+
+**Closeout ledger (verified):** Manager-scoped **`scripts/quality_gates.sh`** for Phase 232
+changed-file corpus exited **2** (red — **not** exit 0). **`232-VERIFICATION.md`** records **bench_snapshot**
+suite regressions unrelated to staged tensor-integration files and a **paritychecker** failure outside
+Phase 232 scope. Phase 232 completion is ledger-approved **without** claiming a passing full-repo gate run.
+
+**Success criteria:**
+
+1. Requests flow only via public tensor↔I/O events.
+2. Residency proofs remain tensor-owned (`model/tensor` retains lifecycle ownership).
+3. Success/failure each have explicit observable terminal representations.
+
+#### Phase 233: Public Loader and Maintained Entrypoints
+
+**Goal:** Strategies observable without actor detail reach-through or duplicate POSIX loops in tools.
+**Depends on:** Phase 232
+**Requirements:** PUB-01, PUB-02, PUB-03, PUB-04, PUB-05
+
+**Closeout (2026-05-08):** **`PUB-01`–`PUB-05`** satisfied per **`233-VERIFICATION.md`** (manager validation +
+**phase233-navigator final review PASS**). Public **`staged_read`** access is through **`io::loader`** and maintained
+tool entrypoints with **`io_staged_read`** wiring; **`tests/model/loader/lifecycle_tests.cpp`** covers the
+storage-backed **`staged_read`** route and include guards.
+
+**Residual:** **`scripts/quality_gates.sh`** was **not** run on a Phase **233** changed-file corpus in
+this closeout slice — **no Phase 233 scoped gate pass is claimed** (full-repo gate truth unchanged from
+Phase **232** ledger where applicable).
+
+**Success criteria:**
+
+1–4. Each lane (loader/bench/parity/probe) has independent proof of public-contract-only access.
+5. Source scan enforcement or doctest proves no duplicated unconstrained staged read shim in tools.
+
+#### Phase 234: Public Dispatch Tests
+
+**Goal:** Core success/failure behavior demonstrated through `process_event`.
+**Depends on:** Phase 233
+**Requirements:** TST-01, TST-02
+
+**Success criteria:**
+
+1. Passing success-path doctest with `visit_current_states` or equivalent.
+2. Passing failure-path doctest for guard rejection.
+
+#### Phase 235: Scope and Non-Regression Guardrails
+
+**Goal:** Freeze architecture invariants relative to loaders, mmap, and read strategies.
+**Depends on:** Phase 234
+**Requirements:** GRD-01, GRD-02, GRD-03, GRD-04, GRD-05
+
+**Success criteria:** Each of GRD-01, GRD-02, GRD-03, GRD-04, and GRD-05 has either a deterministic script failure mode or a narrowed regression doctest proving the invariant holds.
+
+#### Phase 236: Publication and Evidence Truthfulness
+
+**Goal:** Align docs and frozen artifacts with real staged/runtime usage.
+**Depends on:** Phase 235
+**Requirements:** DOC-01, LNT-01, BNH-01, EVI-01
+
+**Success criteria:**
+
+1. Doc diff review verifies accurate staged-read wording.
+2. Lint snapshot regeneration path documented/passing.
+3. Benchmark snapshot regeneration obeys policy.
+4. Parity/compare metadata never mislabels unstaged workloads as staged.
+
+**Closeout (2026-05-08):** **`DOC-01`, `LNT-01`, `BNH-01`, and `EVI-01`** satisfied per
+**`236-VERIFICATION.md`**. Serial full quality gate passed:
+`EMEL_QUALITY_GATES_SCOPE=full EMEL_QUALITY_GATES_PARALLEL=0 scripts/quality_gates.sh`
+(exit **0**, ended `2026-05-08T21:21:42.028Z`). Benchmark defaults now use bounded routine
+settings (`100` iterations, `3` runs, `10` warmup iterations) with bounded generation and
+diarization defaults.
+
+#### Phase 237: Direct Tensor Staged Offset Contract Repair
+
+**Goal:** Repair direct `model/tensor` staged-load nonzero-offset source-window behavior and prove it through public dispatch.
+**Depends on:** Phase 236
+**Requirements:** TNX-01, TNX-03, TNX-04, TST-01, TST-02
+**Gap Closure:** Closes `.planning/milestones/v1.26-MILESTONE-AUDIT.md` findings
+`direct-tensor-staged-offset-contract` and `direct-tensor-staged-nonzero-offset`.
+
+**Success Criteria:**
+
+1. A public `model/tensor::event::request_staged_load` doctest fails before repair and passes after
+ repair for a nonzero `file_offset` against a whole-file source buffer.
+2. Direct tensor staged-load source-span construction is aligned with `io/loader` or the
+ pre-windowed-source contract is explicitly documented and enforced by validation/tests.
+3. Direct tensor staged-load success and failure outcomes remain explicit `_done` / `_error`
+ publications through public `process_event(...)` dispatch and SML state inspection.
+4. Changed-file quality gates for `model/tensor`, `io/staged_read`, and affected tests pass without
+ benchmark-regression override.
+5. If implementation changes maintained model or snapshot artifacts, those artifacts are refreshed
+ only through maintained workflows; model artifact updates are approved for this gap-closure work.
+
+**Closeout (2026-05-08):** Phase 237 completed with a failing-first public
+`request_staged_load` nonzero-offset doctest, repaired source-window dispatch in
+`model/tensor`, and passing scoped validation:
+`./build/emel_tests_bin --test-case="model_tensor_request_staged_load_*"`,
+`ctest --test-dir build -R '^emel_tests_model_and_batch$' --output-on-failure`,
+and changed-file `scripts/quality_gates.sh` (exit `0`). Reopened requirements
+`TNX-01`, `TNX-03`, `TNX-04`, `TST-01`, and `TST-02` are satisfied by
+`237-VERIFICATION.md`.
+
+#### Phase 238: Audit Artifact and Probe Reporting Cleanup
+
+**Goal:** Reconcile audit artifacts and probe reporting truth after the Phase 237 source repair.
+**Depends on:** Phase 237
+**Requirements:** none — cleanup-only; all reopened requirement closure belongs to Phase 237
+**Gap Closure:** Closes `.planning/v1.26-MILESTONE-AUDIT.md` tech-debt items for missing
+`requirements-completed` SUMMARY frontmatter and embedded-size probe reporting clarity.
+
+**Success Criteria:**
+
+1. Phase summaries for 232–236 expose accurate `requirements-completed` frontmatter or an explicit
+ cleanup rationale so the three-source audit matrix no longer needs manual reconciliation.
+2. Embedded-size probe evidence either prints the executed load strategy when appropriate or the
+ maintained docs/audit explain why captured `used_io_strategy` is the authoritative evidence
+ surface.
+3. REQUIREMENTS, ROADMAP, STATE, and the milestone audit are refreshed from source-backed evidence
+ after Phase 237.
+4. Focused lint/docs/audit commands pass; no maintained benchmark, model, or snapshot artifact is
+ updated unless the implementation actually requires it.
+
+**Closeout (2026-05-08):** Phase 238 completed summary frontmatter reconciliation,
+embedded probe reporting truth documentation, and refreshed `v1.26-MILESTONE-AUDIT.md`
+to `status: passed`. Changed-file `scripts/quality_gates.sh` passed with no benchmark,
+coverage, parity, fuzz, or docsgen-affecting lanes required.
+
+---
+### ✅ v1.25 I/O Read Loading Strategy (Phases 212-226 + 214.1) — SHIPPED 2026-05-06
+
+Source: GitHub issue #62, "Add io/read state machine for copy-based tensor loading".
+Adds a dedicated `src/emel/io/read` Stateforward.SML actor for explicit read/copy tensor
+loading beneath tensor-owned residency. Mmap, staged/chunked constrained-memory, async,
+and device strategies remain out of scope.
+
+- [x] Phase 212: Read Strategy Component Boundary (1/1 plans) — completed 2026-05-05
+- [x] Phase 213: Read Validation and Platform Gating (1/1 plans) — completed 2026-05-05
+- [x] Phase 214: Read Execution, Errors, and Lifetime (1/1 plans) — completed 2026-05-05; audit found RTC compliance gap
+- [x] Phase 214.1: RTC-Safe Read Execution Boundary Repair (1/1 plans) — gap closure
+- [x] Phase 215: Tensor-Owned Read Integration (1/1 plans) — completed 2026-05-05
+- [x] Phase 216: Public Runtime and Evidence Surfaces (1/1 plans) — completed 2026-05-05
+- [x] Phase 217: Behavior Tests and Scope Guardrails (1/1 plans) — completed 2026-05-05
+- [x] Phase 218: Publication and Maintained Artifact Updates (1/1 plans) — completed 2026-05-05
+- [x] Phase 219: Maintained Read Source Provenance (1/1 plans) — completed
+ 2026-05-05; source-backed benchmark/parity/probe read_copy provenance
+- [x] Phase 220: Explicit Tensor Read Outcome Graph (1/1 plans) — completed
+ 2026-05-05; tensor read outcomes selected by explicit same-RTC result graph
+- [x] Phase 221: Read Closeout Truth Reconciliation — superseded planning stub
+ closed 2026-05-06; Phase 223 owns final closeout
+- [x] Phase 222: Public Read Source Contract Repair (1/1 plans) — completed
+ 2026-05-06; actor-detail reach-through removed from maintained lanes
+- [x] Phase 223: Read Closeout Truth And Validation Reconciliation (1/1 plans) —
+ completed 2026-05-06; final closeout truth and validation reconciled
+- [x] Phase 224: Read Closeout Tech Debt Cleanup — completed 2026-05-06;
+ refreshed audit ambiguity closed with fresh passing `emel_tests_io` evidence
+- [x] Phase 225: Read Closeout Runtime Validation And SML Repair — completed
+ 2026-05-06; refreshed source-backed audit gaps closed with dyld fallback evidence
+- [x] Phase 226: Read Batch Cap And Closeout Evidence Refresh — completed
+ 2026-05-06; refreshed audit tech debt closed
+
+Archived closeout artifacts:
+- `.planning/milestones/v1.25-ROADMAP.md`
+- `.planning/milestones/v1.25-REQUIREMENTS.md`
+- `.planning/milestones/v1.25-MILESTONE-AUDIT.md`
+- `.planning/milestones/v1.25-phases/`
+
+**Execution Order:** Phases execute in numeric order:
+212 -> 213 -> 214 -> 214.1 -> 215 -> 216 -> 217 -> 218 -> 219 -> 220 -> 222 -> 223 -> 224 -> 225 -> 226.
+Phase 221 is a completed superseded closeout planning stub and Phase 223 owns final
+source-backed closeout truth. Phase 224 is cleanup-only; Phase 225 owns the refreshed
+2026-05-06 audit gaps before archive. Phase 226 closes the post-audit nonblocking
+tech-debt items before final closeout.
+
+#### Phase 212: Read Strategy Component Boundary
+**Goal**: Maintainers can identify `io/read` as the canonical read/copy strategy actor under
+`src/emel/io`.
+**Depends on**: Phase 211
+**Requirements**: READ-01
+**Success Criteria** (what must be TRUE):
+ 1. Maintainer can inspect `src/emel/io/read` and find component-local `context`, `events`,
+ `guards`, `actions`, `errors`, and `sm` ownership.
+ 2. Maintainer can use canonical `emel::io::read::sm` ownership and public aliases without
+ reaching into actor internals.
+ 3. Maintainer can confirm the component is read/copy-only and contains no mmap, staged or
+ chunked constrained-memory, cooperative async, device-specific, loader-owned byte access,
+ model-family widening, or tool-only read scaffold behavior.
+**Plans**: 01 — Validated 2026-05-05; established canonical `io/read` boundary actor
+and lifecycle tests.
+
+#### Phase 213: Read Validation and Platform Gating
+**Goal**: The read actor accepts read attempts only after explicit request, platform, file,
+offset, length, layout, and target-buffer preconditions pass.
+**Depends on**: Phase 212
+**Requirements**: READ-02, PLAT-01
+**Success Criteria** (what must be TRUE):
+ 1. Caller sees invalid request, file, offset, length, layout, or target-buffer preconditions
+ rejected before any open or read attempt is accepted.
+ 2. Caller sees unsupported platforms and unsupported file/resource shapes fail closed
+ deterministically through the I/O abstraction boundary.
+ 3. Maintainer can inspect SML guards and transitions and see validation outcomes modeled
+ before the open/read attempt.
+ 4. Supported requests reach a read-attempt state only after all read preconditions are true.
+**Plans**: 01 — Validated 2026-05-05; added explicit read validation and platform
+gating before the read-attempt placeholder.
+
+#### Phase 214: Read Execution, Errors, and Lifetime
+**Goal**: Successful read requests deliver deterministic copied bytes into the caller-owned
+target buffer with deterministic transient-resource lifetime and deterministic failure
+outcomes, without taking tensor residency ownership.
+**Depends on**: Phase 213
+**Requirements**: READ-03, LIFE-01, ERR-01
+**Success Criteria** (what must be TRUE):
+ 1. Caller receives a deterministic copied-bytes outcome on success with the requested byte
+ span written into the caller-provided owned target buffer; the read strategy never claims
+ residency ownership.
+ 2. Read failures surface deterministic error categories (invalid request, unsupported
+ resource, unsupported platform, file open failed, file seek failed, file read failed,
+ short read, internal error) instead of thrown exceptions or ambiguous status mirroring.
+ 3. Transient OS resources (file descriptor / handle) are released through the actor-owned
+ attempt before `_done` is published; no kernel handle is held across publication.
+ 4. Maintainer can verify dispatch-local request data is not stored in `read::context` and
+ tensor residency semantics remain owned by `model/tensor`.
+**Plans**: 01 — Validated 2026-05-05; added concrete read execution, copied-byte
+success, deterministic read errors, and close-before-done lifetime behavior.
+2026-05-05 milestone audit found this phase superseded by unverified Phase 214.1 repair
+work; Phase 214.1 owns source-backed RTC validation and artifact reconciliation.
+
+#### Phase 214.1: RTC-Safe Read Execution Boundary Repair
+**Goal**: The read actor preserves copied-byte success, deterministic errors, and
+close-before-done lifetime evidence without performing blocking or input-dependent
+filesystem work inside SML dispatch.
+**Depends on**: Phase 214
+**Requirements**: READ-03, PLAT-01, LIFE-01, ERR-01
+**Gap Closure**: Closes v1.25 audit gaps for missing Phase 214.1 artifacts, stale Phase
+214 planning truth, and source-backed Nyquist validity after the read actor moved to
+caller-provided source spans.
+**Success Criteria** (what must be TRUE):
+ 1. `src/emel/io/read` no longer calls platform open, seek, read, close, or equivalent
+ filesystem APIs from guards, actions, transition helpers, or functions called by them.
+ 2. The read actor still accepts only validated read/copy attempts and publishes copied-byte
+ `_done` plus deterministic `_error` outcomes through explicit states/events.
+ 3. The caller-owned target buffer remains caller-owned, dispatch-local request data is not
+ stored in `read::context`, and no transient OS handle is retained or hidden in context.
+ 4. Tests prove the repaired behavior through public `process_event(...)` dispatch and SML
+ state inspection, including validation failure, unsupported/resource failure, read
+ failure, short read, and copied-byte success.
+ 5. Phase 214.1 SUMMARY.md, VERIFICATION.md, and VALIDATION.md reconcile ROADMAP.md,
+ STATE.md, REQUIREMENTS.md, and generated architecture docs with the source-buffer based
+ implementation and do not claim maintained benchmark/parity evidence.
+**Plans**: 01 — Validated 2026-05-05; reconciled read actor evidence with the
+source-buffer based implementation, confirmed no dispatch-time filesystem calls, and
+updated requirement/state artifacts for the Phase 214.1 gap closure.
+
+#### Phase 215: Tensor-Owned Read Integration
+**Goal**: `model/tensor` can request and consume read-backed I/O through the public `emel/io`
+boundary while retaining load, bind, evict, and residency orchestration.
+**Depends on**: Phase 214.1
+**Requirements**: TIO-01, TIO-02
+**Gap Closure**: Closes v1.25 audit gaps for partial tensor-owned read integration and
+callback/status-mediated read outcomes.
+**Success Criteria** (what must be TRUE):
+ 1. Tensor load flow can request read-based (copy) loading through public `emel/io` events
+ without direct low-level read calls.
+ 2. Tensor bind, residency, and evict transitions remain in `model/tensor` and consume read
+ success outcomes that reference the caller-owned target buffer.
+ 3. Read success, unsupported, validation failure, file open failure, and file read failure
+ are visible as explicit `_done` and `_error` events or states.
+ 4. Maintainer can verify no callback-selected outcomes, mirrored status fields, or context
+ phase flags decide tensor-to-I/O outcomes for read-backed loading.
+ 5. Existing source/test progress through `model/loader -> model/tensor -> io/loader ->
+ io/read -> tensor apply` is preserved or replaced by a stricter explicit outcome path
+ with equivalent public-dispatch tests.
+**Plans**: 01 — Validated 2026-05-05; added tensor-owned
+`request_read_load` public events, explicit read outcome states, and tests for read
+success, unsupported I/O actor, validation failure, file open failure, and file read
+failure.
+
+#### Phase 216: Public Runtime and Evidence Surfaces
+**Goal**: Runtime entrypoints and maintained tool lanes can select or report read-backed
+loading only through public surfaces, and evidence reflects the actual EMEL runtime path.
+**Depends on**: Phase 215
+**Requirements**: TIO-03, VAL-04
+**Gap Closure**: Closes v1.25 audit gaps for maintained benchmark, paritychecker, and
+embedded probe lanes bypassing the read-backed runtime path and for runtime reporting that
+currently exposes only mmap usage.
+**Success Criteria** (what must be TRUE):
+ 1. `model/loader`, maintained benchmark lanes, paritychecker lanes, and embedded probes can
+ select or report read-backed loading only through public tensor and I/O runtime contracts.
+ 2. Maintained benchmark, paritychecker, and embedded probe lanes avoid actor-internal
+ reach-through and contain no low-level read logic.
+ 3. Benchmark and parity output reports read-strategy usage only when the EMEL lane executed
+ the read-backed runtime path.
+ 4. Unsupported or fallback behavior is reported as unsupported or non-read-strategy, not as
+ read-strategy parity or performance evidence.
+ 5. Runtime done/error evidence distinguishes mmap, read/copy, unsupported, and non-I/O
+ loading paths without relying on tool-only scaffolds.
+**Plans**: 01 — Validated 2026-05-05; added public model-loader load-strategy
+evidence, maintained tool strategy binding, load-strategy output notes, and
+source-backed tests proving benchmark/parity/embedded lanes avoid callback-time
+actor reach-through.
+
+#### Phase 217: Behavior Tests and Scope Guardrails
+**Goal**: Tests and guardrails prove read behavior through public dispatch and prevent scope
+or ownership leaks.
+**Depends on**: Phase 216
+**Requirements**: VAL-01, VAL-02
+**Gap Closure**: Closes v1.25 audit gaps for missing full-scope read behavior tests,
+domain/source guardrails, and formerly ambiguous read-strategy naming relative to the
+out-of-scope staged/chunked policy.
+**Success Criteria** (what must be TRUE):
+ 1. Doctests drive supported read behavior through `process_event(...)` and inspect SML states
+ via `visit_current_states` and/or `is(...)`.
+ 2. Doctests cover representative unsupported, validation failure, file open failure, and file
+ read failure outcomes through public events.
+ 3. Guardrails fail if read implementation leaks into `model/loader` or tensor residency
+ ownership moves out of `model/tensor`.
+ 4. Guardrails fail if mmap, staged or chunked constrained-memory, cooperative async,
+ device-specific, model-family widening, loader-owned byte access, or tool-only read
+ scaffold behavior enters this milestone.
+ 5. Source guardrails clarify or eliminate any public naming that could present the v1.25
+ read/copy path as staged/chunked constrained-memory support.
+**Plans**: 01 — Validated 2026-05-05; renamed the copy strategy to
+`read_copy`, added public-dispatch behavior guardrails, tensor-residency ownership
+guardrails, and maintained tool/model-loader no-reach-through source checks.
+
+#### Phase 218: Publication and Maintained Artifact Updates
+**Goal**: Maintained docs, snapshots, benchmark outputs, model artifacts, and planning truth
+describe read-strategy support exactly as implemented.
+**Depends on**: Phase 217
+**Requirements**: VAL-03
+**Gap Closure**: Closes v1.25 audit gaps for stale planning truth, stale generated docs,
+and missing maintained artifact updates. User approved updating snapshots, benchmarks, and
+models as needed during this gap closure command.
+**Success Criteria** (what must be TRUE):
+ 1. Public docs and generated architecture docs describe the read/copy strategy path,
+ ownership boundaries, and deferred strategies (mmap shipped in v1.24; staged/async/device
+ remain deferred) truthfully.
+ 2. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
+ from maintained commands when the implementation changes them.
+ 3. Planning artifacts record final requirement coverage, validation evidence, and any
+ approved artifact updates for v1.25.
+ 4. Closeout artifacts do not claim read-strategy support beyond source-backed maintained
+ runtime behavior.
+ 5. Any snapshot, benchmark, or model artifact changes are produced by maintained commands
+ and explicitly tied to source-backed read/copy runtime behavior.
+**Plans**: 01 — Validated 2026-05-05; updated public docs, README template,
+generated architecture docs, benchmark snapshots, planning truth, and final closeout
+audit from maintained commands. The closing full quality gate passed with
+`EMEL_QUALITY_GATES_SCOPE=full EMEL_QUALITY_GATES_PARALLEL=never
+scripts/quality_gates.sh`.
+
+#### Phase 219: Maintained Read Source Provenance
+**Goal**: Maintained benchmark, paritychecker, and embedded probe lanes prove read/copy
+strategy usage from a maintained `src`-owned source contract instead of tool-local full-file
+read scaffolds.
+**Depends on**: Phase 218
+**Requirements**: PLAT-01, TIO-03, VAL-04
+**Gap Closure**: Closes v1.25 audit gaps where generation, Sortformer diarization,
+embedded probe, and paritychecker lanes report `read_copy` after tool-local
+`read_file_bytes` helpers create the source span.
+**Success Criteria** (what must be TRUE):
+ 1. Maintained benchmark, paritychecker, and embedded probe lanes no longer own low-level
+ file slurp helpers as the source of `read_copy` evidence.
+ 2. A maintained `src`-owned loading/source contract feeds `model/loader -> model/tensor ->
+ io/loader -> io/read` for read/copy tool evidence.
+ 3. `read_copy` benchmark/parity/probe output is emitted only when the EMEL lane actually
+ consumed the maintained source contract and executed the public runtime path.
+ 4. Unsupported or fallback source behavior is reported as unsupported or non-read-strategy,
+ never as read-strategy parity or performance evidence.
+ 5. Tests and source guardrails fail on tool-local substitutes for the maintained read/copy
+ source path.
+
+#### Phase 220: Explicit Tensor Read Outcome Graph
+**Goal**: Tensor-owned read/copy integration exposes success and failure outcomes through
+explicit state/event routing without callback/status-mediated behavior selection.
+**Depends on**: Phase 219
+**Requirements**: TIO-02
+**Gap Closure**: Closes v1.25 audit gap where `model/tensor` represents final outcomes
+with explicit states/events but still uses callback-mutated runtime status inspected by
+guards to select the read outcome path.
+**Success Criteria** (what must be TRUE):
+ 1. `model/tensor` read success, unsupported, validation failure, file open failure, and
+ file read failure outcomes are selected by explicit guards/transitions over typed
+ same-RTC events, not by callback-mutated status fields.
+ 2. Any same-RTC callbacks used for immediate replies do not decide which tensor outcome
+ path runs.
+ 3. No mirrored status fields, context phase flags, or callback-selected outcomes remain in
+ the read-backed tensor outcome path.
+ 4. Public doctests prove all representative read success and error outcomes through
+ `process_event(...)` and SML state inspection.
+
+#### Phase 221: Read Closeout Truth Reconciliation
+**Goal**: Maintained docs, generated architecture docs, planning artifacts, snapshots,
+benchmark outputs, model artifacts, and the milestone audit describe read/copy support
+exactly as implemented after gap closure.
+**Depends on**: Phase 220
+**Requirements**: superseded by Phase 223
+**Gap Closure**: Closes v1.25 audit gap where closeout artifacts overstated maintained
+read/copy path truth while tool-local source spans still fed the reported lane. User
+approved updating model artifacts, snapshots, and benchmarks as needed during this gap
+closure command.
+**Success Criteria** (what must be TRUE):
+ 1. Public docs, generated architecture docs, ROADMAP, REQUIREMENTS, STATE, PROJECT,
+ MILESTONES, and the milestone audit describe the maintained read/copy path truthfully.
+ 2. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
+ from maintained commands when implementation changes require it.
+ 3. Phase 214 historical artifacts are reconciled or explicitly marked superseded so they no
+ longer conflict with the Phase 214.1 source-buffer truth.
+ 4. A source-backed milestone audit passes without relying on tool-only source scaffolds.
+ 5. The closing quality gate is run with the appropriate full or changed-file scope and no
+ benchmark-regression override unless explicitly documented as transitional.
+**Plans**: 01 — Ready only. 2026-05-06 audit found an additional source-contract
+blocker in Phase 219/216 maintained lanes, so Phase 221 is superseded by the
+Phase 222 source-contract repair and Phase 223 closeout truth plan.
+**Summary**: Superseded 2026-05-06 with no source or requirement claims.
+
+#### Phase 222: Public Read Source Contract Repair
+**Goal**: Maintained benchmark, paritychecker, and embedded probe lanes obtain read/copy
+source bytes through an allowed public or non-actor-internal EMEL-owned contract instead of
+including `emel/io/read/detail.hpp`.
+**Depends on**: Phase 220
+**Requirements**: PLAT-01, TIO-03, VAL-02, VAL-04
+**Gap Closure**: Closes v1.25 audit gaps where maintained lanes replaced tool-local
+`read_file_bytes` helpers with direct actor-detail reach-through, causing paritychecker
+guardrails and maintained read/copy evidence to fail.
+**Success Criteria** (what must be TRUE):
+ 1. Maintained generation, Sortformer diarization, embedded probe, and paritychecker lanes
+ no longer include or call `emel/io/read/detail.hpp` or any actor `detail.hpp` helper for
+ benchmark/parity source loading.
+ 2. Source-byte loading for maintained read/copy evidence is exposed through an allowed
+ EMEL-owned public/runtime/setup contract that does not violate the actor model,
+ benchmark/parity harness rules, or `detail.hpp` ownership rules.
+ 3. Maintained lanes still report `read_copy` only when the EMEL lane executes the public
+ `model/loader -> model/tensor -> io/loader -> io/read` runtime path.
+ 4. Guardrails fail on actor-internal reach-through, tool-local read substitutes, and any
+ unsupported fallback reported as read/copy evidence.
+ 5. Focused paritychecker and maintained generation evidence passes without benchmark
+ regression override.
+**Plans**: 01 — Validated 2026-05-06; moved maintained source-byte loading to
+`emel::io::source::load_file_bytes`, removed `io/read/detail.hpp` reach-through
+from maintained lanes, and restored paritychecker/generation guardrail evidence.
+
+#### Phase 223: Read Closeout Truth And Validation Reconciliation
+**Goal**: Final v1.25 closeout truth, generated artifacts, snapshots, benchmark outputs,
+model artifacts, requirements, roadmap state, and milestone audit reflect the post-Phase 222
+maintained read/copy runtime path.
+**Depends on**: Phase 222
+**Requirements**: TIO-02, VAL-01, VAL-03
+**Gap Closure**: Closes v1.25 audit gaps for stale Phase 220 roadmap state, unvalidated
+Phase 221/VAL-03 closeout truth, dyld-blocked test rerun evidence, and final source-backed
+milestone audit truth.
+**Success Criteria** (what must be TRUE):
+ 1. ROADMAP, REQUIREMENTS, STATE, PROJECT, MILESTONES, public docs, generated architecture
+ docs, and the milestone audit no longer claim stale Phase 218/221 closeout truth.
+ 2. Phase 220 progress-table state is reconciled with its completed SUMMARY,
+ VERIFICATION, and VALIDATION artifacts.
+ 3. Public behavior doctests and maintained guardrails are rerun or the dyld/libSystem launch
+ blocker is explicitly captured with source-backed substitute evidence approved for the
+ phase.
+ 4. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
+ only through maintained commands when the repaired implementation changes them.
+ 5. A source-backed milestone audit reports every active v1.25 requirement satisfied, with
+ no actor-detail reach-through or tool-only maintained-path evidence.
+**Plans**: 01 — Validated 2026-05-06; reconciled final planning truth, generated
+docs checks, lint snapshot checks, public-dispatch doctests, paritychecker
+guardrails, maintained generation compare evidence, repaired batch planner
+benchmark evidence, the full closeout quality gate, and the source-backed
+milestone audit.
+
+#### Phase 224: Read Closeout Tech Debt Cleanup
+**Goal**: Close the nonblocking tech-debt items from the refreshed v1.25 milestone audit
+before archive.
+**Depends on**: Phase 223
+**Requirements**: none — all v1.25 requirements remain satisfied; this phase is cleanup only
+**Gap Closure**: Addresses audit tech debt without resetting any validated requirement:
+historical Phase 214 supersession noise, the maintained-lane coverage shape of the public tensor read event,
+and fresh `emel_tests_io` evidence after the local dyld/libSystem launch blocker is resolved.
+**Success Criteria** (what must be TRUE):
+ 1. Phase 214 historical artifacts are either further reconciled or explicitly confirmed as
+ intentionally superseded by Phase 214.1 without creating closeout ambiguity.
+ 2. Maintainers can tell whether `model::tensor::event::request_read_load` should gain a
+ maintained direct-lane coverage path or remain a public tested route while maintained
+ model-loader lanes use `model/tensor` plan/apply plus `io/loader -> io/read`.
+ 3. Fresh `emel_tests_io` evidence is captured from a healthy local environment, or the
+ dyld/libSystem launch blocker is captured with an explicit archive-time decision.
+ 4. The milestone audit is rerun and either passes or reports only explicitly accepted
+ nonblocking debt.
+**Plans**: 01 — Validated 2026-05-06; Phase 214 supersession clarity,
+`request_read_load` maintained-lane decision evidence, fresh passing
+`emel_tests_io` evidence, and final milestone audit refresh.
+
+#### Phase 225: Read Closeout Runtime Validation And SML Repair
+**Goal**: Close refreshed v1.25 audit gaps by restoring executable model/batch validation,
+moving maintained read/copy per-tensor I/O orchestration out of model-loader action loops,
+and reconciling closeout artifact paths.
+**Depends on**: Phase 224
+**Requirements**: VAL-01, TIO-03, VAL-04, VAL-03
+**Gap Closure**: Closes `.planning/v1.25-MILESTONE-AUDIT.md` findings: current
+`emel_tests_model_and_batch` dyld launch failure, model-loader action-loop
+`io_loader->process_event(...)` SML readiness risk, and stale archived closeout path
+references.
+**Success Criteria** (what must be TRUE):
+ 1. `ctest --test-dir build/zig --output-on-failure -R emel_tests_model_and_batch`
+ runs to completion or the dyld/libSystem launch blocker is eliminated with a
+ source-backed maintained substitute explicitly recorded in validation.
+ 2. Maintained read/copy `model/loader -> io/loader` orchestration no longer relies on an
+ action loop calling `io_loader->process_event(...)`; runtime choice and per-phase
+ orchestration are represented with explicit SML guards/states/transitions.
+ 3. The maintained read/copy path still reports `used_io_strategy` only after public
+ runtime execution through `model/loader -> model/tensor -> io/loader -> io/read`.
+ 4. Closeout artifact paths in active and archived roadmap/requirements/audit docs point
+ at files that exist after the v1.25 archive layout.
+ 5. Focused model-loader, model/tensor, io/loader, io/read, domain-boundary, consistency,
+ and changed-file quality gates pass without benchmark-regression override.
+**Plans**: 6 plans — completed 2026-05-06
+Plans:
+- [x] `225-01-PLAN.md` — Add the owning `io/read` batch copy surface and public-dispatch tests.
+- [x] `225-02-PLAN.md` — Route one `io/loader` read_copy batch to `io/read` with same-RTC result callbacks.
+- [x] `225-03-PLAN.md` — Replace model-loader per-tensor I/O dispatch with one public batch dispatch.
+- [x] `225-04-PLAN.md` — Wire maintained callers and guardrails to request-owned `io_load_spans`.
+- [x] `225-05-PLAN.md` — Reconcile active and archived closeout path and plan traceability.
+- [x] `225-06-PLAN.md` — Publish validation, summary, and active/archived audit evidence.
+
+#### Phase 226: Read Batch Cap And Closeout Evidence Refresh
+**Goal**: Close the nonblocking tech-debt items from `.planning/v1.25-MILESTONE-AUDIT.md`
+by bounding the public read/copy batch API independently and refreshing closeout evidence
+to match current executable validation.
+**Depends on**: Phase 225
+**Requirements**: none — all v1.25 requirements remain satisfied; this phase is cleanup only
+**Gap Closure**: Closes audit tech debt for the uncapped public `io/read`
+`read_tensor_batch` span and stale dyld-fallback closeout wording after current focused
+CTest passed.
+**Success Criteria** (what must be TRUE):
+ 1. Public `io/read::event::read_tensor_batch` dispatch rejects over-large spans before
+ iterating or copying, with the cap owned by a public/read-side contract rather than
+ relying only on maintained model-loader callers.
+ 2. Doctests prove accepted boundary-size batches and rejected over-large batches through
+ public `process_event(...)` dispatch and SML state inspection.
+ 3. Active and archived closeout evidence distinguishes historical dyld fallback evidence
+ from current direct `build/zig` focused CTest evidence.
+ 4. If the repaired implementation changes maintained snapshots, benchmark outputs,
+ benchmark snapshots, or model artifacts, those artifacts are updated only through
+ maintained commands. User permission for those updates was granted with this phase.
+ 5. Changed-file quality gates pass without benchmark-regression override, and the
+ refreshed milestone audit reports no blockers.
+**Plans**: 01 — Validated 2026-05-06; public `io/read` batch cap added,
+exact-cap and over-cap doctests passed, closeout evidence refreshed, and
+changed-file quality gate passed.
+
+#### Coverage
+
+| Requirement | Phase |
+|-------------|-------|
+| READ-01 | Phase 212 |
+| READ-02 | Phase 213 |
+| PLAT-01 | Phase 222 |
+| READ-03 | Phase 214.1 |
+| LIFE-01 | Phase 214.1 |
+| ERR-01 | Phase 214.1 |
+| TIO-01 | Phase 215 |
+| TIO-02 | Phase 223 |
+| TIO-03 | Phase 225 |
+| VAL-04 | Phase 225 |
+| VAL-01 | Phase 225 |
+| VAL-02 | Phase 222 |
+| VAL-03 | Phase 225 |
+
+Mapped: 13/13 v1 requirements; validated 13, pending 0. Phases 224 and 226 are
+cleanup-only; Phase 225 closed refreshed closeout gaps for VAL-01, TIO-03, VAL-04,
+and VAL-03.
+
+
+### ✅ v1.24 I/O Mmap Loading Strategy (Phases 204-211) — SHIPPED 2026-05-04
+
+- [x] Phase 204: Mmap Strategy Component Boundary (1/1 plans) — completed 2026-05-04
+- [x] Phase 205: Mmap Validation and Platform Gating (1/1 plans) — completed 2026-05-04
+- [x] Phase 206: Mapped Descriptor, Errors, and Lifetime (1/1 plans) — completed 2026-05-04
+- [x] Phase 207: Tensor-Owned Mmap Integration (1/1 plans) — completed 2026-05-04
+- [x] Phase 208: Public Runtime and Evidence Surfaces (1/1 plans) — completed 2026-05-04
+- [x] Phase 209: Behavior Tests and Scope Guardrails (1/1 plans) — completed 2026-05-04
+- [x] Phase 210: Publication and Maintained Artifact Updates (1/1 plans) — completed 2026-05-04
+- [x] Phase 211: Phase Verification Artifact Backfill (1/1 plans) — completed 2026-05-04 (gap closure)
+
+Archive:
+- `.planning/milestones/v1.24-ROADMAP.md`
+- `.planning/milestones/v1.24-REQUIREMENTS.md`
+- `.planning/milestones/v1.24-MILESTONE-AUDIT.md`
+- `.planning/milestones/v1.24-phases/{204..210}-*` (Phase 211 backfill artifacts live alongside their parent phase dirs)
+
+
+
+
+✅ v1.23 I/O Loading Strategy Boundary (Phases 197-203) — SHIPPED 2026-05-04
+
+Archive:
+- `.planning/milestones/v1.23-ROADMAP.md`
+- `.planning/milestones/v1.23-REQUIREMENTS.md`
+- `.planning/milestones/v1.23-MILESTONE-AUDIT.md`
+- `.planning/milestones/v1.23-phases/`
+
+
+
+### 📋 Milestone backlog
+
+Older “next milestone” staging notes are superseded by **v1.26** (issue #63) in active planning
+artifacts (`REQUIREMENTS.md`, `STATE.md`). Future milestones after v1.26 continue via
+`$gsd-new-milestone`.
+
+## Progress
+
+| Phase | Milestone | Plans Complete | Status | Completed |
+|-------|-----------|----------------|--------|-----------|
+| 227. Staged Read Strategy Component Boundary | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 228. Span, Target-Window, and Platform Gating | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 229. Staged Copy Progress and Completion Semantics | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 230. Context Cleanness and Per-Attempt Lifetime | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 231. Deterministic Error Taxonomy | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 232. Tensor-Owned Integration Graph | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 233. Public Loader and Maintained Entrypoints | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 234. Public Dispatch Tests | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 235. Scope and Non-Regression Guardrails | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 236. Publication and Evidence Truthfulness | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 237. Direct Tensor Staged Offset Contract Repair | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 238. Audit Artifact and Probe Reporting Cleanup | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 212. Read Strategy Component Boundary | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 213. Read Validation and Platform Gating | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 214. Read Execution, Errors, and Lifetime | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 214.1. RTC-Safe Read Execution Boundary Repair | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 215. Tensor-Owned Read Integration | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 216. Public Runtime and Evidence Surfaces | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 217. Behavior Tests and Scope Guardrails | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 218. Publication and Maintained Artifact Updates | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 219. Maintained Read Source Provenance | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 220. Explicit Tensor Read Outcome Graph | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 221. Read Closeout Truth Reconciliation | v1.25 | 1/1 | Superseded | 2026-05-06 |
+| 222. Public Read Source Contract Repair | v1.25 | 1/1 | Validated | 2026-05-06 |
+| 223. Read Closeout Truth And Validation Reconciliation | v1.25 | 1/1 | Validated | 2026-05-06 |
+| 224. Read Closeout Tech Debt Cleanup | v1.25 | 1/1 | Complete | 2026-05-06 |
+| 225. Read Closeout Runtime Validation And SML Repair | v1.25 | 6/6 | Complete | 2026-05-06 |
+| 226. Read Batch Cap And Closeout Evidence Refresh | v1.25 | 1/1 | Validated | 2026-05-06 |
+| 204. Mmap Strategy Component Boundary | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 205. Mmap Validation and Platform Gating | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 206. Mapped Descriptor, Errors, and Lifetime | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 207. Tensor-Owned Mmap Integration | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 208. Public Runtime and Evidence Surfaces | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 209. Behavior Tests and Scope Guardrails | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 210. Publication and Maintained Artifact Updates | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 211. Phase Verification Artifact Backfill | v1.24 | 1/1 | Complete | 2026-05-04 |
diff --git a/.planning/phases/227-staged-read-strategy-component-boundary/227-01-PLAN.md b/.planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-01-PLAN.md
similarity index 100%
rename from .planning/phases/227-staged-read-strategy-component-boundary/227-01-PLAN.md
rename to .planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-01-PLAN.md
diff --git a/.planning/phases/227-staged-read-strategy-component-boundary/227-01-SUMMARY.md b/.planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/227-staged-read-strategy-component-boundary/227-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-01-SUMMARY.md
diff --git a/.planning/phases/227-staged-read-strategy-component-boundary/227-CONTEXT.md b/.planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-CONTEXT.md
similarity index 100%
rename from .planning/phases/227-staged-read-strategy-component-boundary/227-CONTEXT.md
rename to .planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-CONTEXT.md
diff --git a/.planning/phases/227-staged-read-strategy-component-boundary/227-VALIDATION.md b/.planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-VALIDATION.md
similarity index 100%
rename from .planning/phases/227-staged-read-strategy-component-boundary/227-VALIDATION.md
rename to .planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-VALIDATION.md
diff --git a/.planning/phases/227-staged-read-strategy-component-boundary/227-VERIFICATION.md b/.planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-VERIFICATION.md
similarity index 100%
rename from .planning/phases/227-staged-read-strategy-component-boundary/227-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-VERIFICATION.md
diff --git a/.planning/phases/228-span-target-window-platform-gating/228-01-PLAN.md b/.planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-01-PLAN.md
similarity index 100%
rename from .planning/phases/228-span-target-window-platform-gating/228-01-PLAN.md
rename to .planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-01-PLAN.md
diff --git a/.planning/phases/228-span-target-window-platform-gating/228-01-SUMMARY.md b/.planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/228-span-target-window-platform-gating/228-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-01-SUMMARY.md
diff --git a/.planning/phases/228-span-target-window-platform-gating/228-CONTEXT.md b/.planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-CONTEXT.md
similarity index 100%
rename from .planning/phases/228-span-target-window-platform-gating/228-CONTEXT.md
rename to .planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-CONTEXT.md
diff --git a/.planning/phases/228-span-target-window-platform-gating/228-VALIDATION.md b/.planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-VALIDATION.md
similarity index 100%
rename from .planning/phases/228-span-target-window-platform-gating/228-VALIDATION.md
rename to .planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-VALIDATION.md
diff --git a/.planning/phases/228-span-target-window-platform-gating/228-VERIFICATION.md b/.planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-VERIFICATION.md
similarity index 100%
rename from .planning/phases/228-span-target-window-platform-gating/228-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-VERIFICATION.md
diff --git a/.planning/phases/229-staged-copy-progress-and-completion-semantics/229-01-PLAN.md b/.planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-01-PLAN.md
similarity index 100%
rename from .planning/phases/229-staged-copy-progress-and-completion-semantics/229-01-PLAN.md
rename to .planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-01-PLAN.md
diff --git a/.planning/phases/229-staged-copy-progress-and-completion-semantics/229-01-SUMMARY.md b/.planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/229-staged-copy-progress-and-completion-semantics/229-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-01-SUMMARY.md
diff --git a/.planning/phases/229-staged-copy-progress-and-completion-semantics/229-CONTEXT.md b/.planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-CONTEXT.md
similarity index 100%
rename from .planning/phases/229-staged-copy-progress-and-completion-semantics/229-CONTEXT.md
rename to .planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-CONTEXT.md
diff --git a/.planning/phases/229-staged-copy-progress-and-completion-semantics/229-VALIDATION.md b/.planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-VALIDATION.md
similarity index 100%
rename from .planning/phases/229-staged-copy-progress-and-completion-semantics/229-VALIDATION.md
rename to .planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-VALIDATION.md
diff --git a/.planning/phases/229-staged-copy-progress-and-completion-semantics/229-VERIFICATION.md b/.planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-VERIFICATION.md
similarity index 100%
rename from .planning/phases/229-staged-copy-progress-and-completion-semantics/229-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-VERIFICATION.md
diff --git a/.planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-01-PLAN.md b/.planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-01-PLAN.md
similarity index 100%
rename from .planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-01-PLAN.md
rename to .planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-01-PLAN.md
diff --git a/.planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-01-SUMMARY.md b/.planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-01-SUMMARY.md
diff --git a/.planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-CONTEXT.md b/.planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-CONTEXT.md
similarity index 100%
rename from .planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-CONTEXT.md
rename to .planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-CONTEXT.md
diff --git a/.planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-VALIDATION.md b/.planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-VALIDATION.md
similarity index 100%
rename from .planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-VALIDATION.md
rename to .planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-VALIDATION.md
diff --git a/.planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-VERIFICATION.md b/.planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-VERIFICATION.md
similarity index 100%
rename from .planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-VERIFICATION.md
diff --git a/.planning/phases/231-deterministic-error-taxonomy/231-01-PLAN.md b/.planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-01-PLAN.md
similarity index 100%
rename from .planning/phases/231-deterministic-error-taxonomy/231-01-PLAN.md
rename to .planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-01-PLAN.md
diff --git a/.planning/phases/231-deterministic-error-taxonomy/231-01-SUMMARY.md b/.planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/231-deterministic-error-taxonomy/231-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-01-SUMMARY.md
diff --git a/.planning/phases/231-deterministic-error-taxonomy/231-CONTEXT.md b/.planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-CONTEXT.md
similarity index 100%
rename from .planning/phases/231-deterministic-error-taxonomy/231-CONTEXT.md
rename to .planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-CONTEXT.md
diff --git a/.planning/phases/231-deterministic-error-taxonomy/231-VALIDATION.md b/.planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-VALIDATION.md
similarity index 100%
rename from .planning/phases/231-deterministic-error-taxonomy/231-VALIDATION.md
rename to .planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-VALIDATION.md
diff --git a/.planning/phases/231-deterministic-error-taxonomy/231-VERIFICATION.md b/.planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-VERIFICATION.md
similarity index 100%
rename from .planning/phases/231-deterministic-error-taxonomy/231-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-VERIFICATION.md
diff --git a/.planning/phases/232-tensor-owned-integration-graph/232-01-PLAN.md b/.planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-01-PLAN.md
similarity index 100%
rename from .planning/phases/232-tensor-owned-integration-graph/232-01-PLAN.md
rename to .planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-01-PLAN.md
diff --git a/.planning/phases/232-tensor-owned-integration-graph/232-01-SUMMARY.md b/.planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/232-tensor-owned-integration-graph/232-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-01-SUMMARY.md
diff --git a/.planning/phases/232-tensor-owned-integration-graph/232-CONTEXT.md b/.planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-CONTEXT.md
similarity index 100%
rename from .planning/phases/232-tensor-owned-integration-graph/232-CONTEXT.md
rename to .planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-CONTEXT.md
diff --git a/.planning/phases/232-tensor-owned-integration-graph/232-VALIDATION.md b/.planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-VALIDATION.md
similarity index 100%
rename from .planning/phases/232-tensor-owned-integration-graph/232-VALIDATION.md
rename to .planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-VALIDATION.md
diff --git a/.planning/phases/232-tensor-owned-integration-graph/232-VERIFICATION.md b/.planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-VERIFICATION.md
similarity index 100%
rename from .planning/phases/232-tensor-owned-integration-graph/232-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-VERIFICATION.md
diff --git a/.planning/phases/233-public-loader-and-maintained-entrypoints/233-01-PLAN.md b/.planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-01-PLAN.md
similarity index 100%
rename from .planning/phases/233-public-loader-and-maintained-entrypoints/233-01-PLAN.md
rename to .planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-01-PLAN.md
diff --git a/.planning/phases/233-public-loader-and-maintained-entrypoints/233-01-SUMMARY.md b/.planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/233-public-loader-and-maintained-entrypoints/233-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-01-SUMMARY.md
diff --git a/.planning/phases/233-public-loader-and-maintained-entrypoints/233-CONTEXT.md b/.planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-CONTEXT.md
similarity index 100%
rename from .planning/phases/233-public-loader-and-maintained-entrypoints/233-CONTEXT.md
rename to .planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-CONTEXT.md
diff --git a/.planning/phases/233-public-loader-and-maintained-entrypoints/233-VALIDATION.md b/.planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-VALIDATION.md
similarity index 100%
rename from .planning/phases/233-public-loader-and-maintained-entrypoints/233-VALIDATION.md
rename to .planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-VALIDATION.md
diff --git a/.planning/phases/233-public-loader-and-maintained-entrypoints/233-VERIFICATION.md b/.planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-VERIFICATION.md
similarity index 100%
rename from .planning/phases/233-public-loader-and-maintained-entrypoints/233-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-VERIFICATION.md
diff --git a/.planning/phases/234-public-dispatch-tests/234-01-PLAN.md b/.planning/milestones/v1.26-phases/234-public-dispatch-tests/234-01-PLAN.md
similarity index 100%
rename from .planning/phases/234-public-dispatch-tests/234-01-PLAN.md
rename to .planning/milestones/v1.26-phases/234-public-dispatch-tests/234-01-PLAN.md
diff --git a/.planning/phases/234-public-dispatch-tests/234-01-SUMMARY.md b/.planning/milestones/v1.26-phases/234-public-dispatch-tests/234-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/234-public-dispatch-tests/234-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/234-public-dispatch-tests/234-01-SUMMARY.md
diff --git a/.planning/phases/234-public-dispatch-tests/234-CONTEXT.md b/.planning/milestones/v1.26-phases/234-public-dispatch-tests/234-CONTEXT.md
similarity index 100%
rename from .planning/phases/234-public-dispatch-tests/234-CONTEXT.md
rename to .planning/milestones/v1.26-phases/234-public-dispatch-tests/234-CONTEXT.md
diff --git a/.planning/phases/234-public-dispatch-tests/234-VALIDATION.md b/.planning/milestones/v1.26-phases/234-public-dispatch-tests/234-VALIDATION.md
similarity index 100%
rename from .planning/phases/234-public-dispatch-tests/234-VALIDATION.md
rename to .planning/milestones/v1.26-phases/234-public-dispatch-tests/234-VALIDATION.md
diff --git a/.planning/phases/234-public-dispatch-tests/234-VERIFICATION.md b/.planning/milestones/v1.26-phases/234-public-dispatch-tests/234-VERIFICATION.md
similarity index 100%
rename from .planning/phases/234-public-dispatch-tests/234-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/234-public-dispatch-tests/234-VERIFICATION.md
diff --git a/.planning/phases/235-scope-and-non-regression-guardrails/235-01-PLAN.md b/.planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-01-PLAN.md
similarity index 100%
rename from .planning/phases/235-scope-and-non-regression-guardrails/235-01-PLAN.md
rename to .planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-01-PLAN.md
diff --git a/.planning/phases/235-scope-and-non-regression-guardrails/235-01-SUMMARY.md b/.planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/235-scope-and-non-regression-guardrails/235-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-01-SUMMARY.md
diff --git a/.planning/phases/235-scope-and-non-regression-guardrails/235-CONTEXT.md b/.planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-CONTEXT.md
similarity index 100%
rename from .planning/phases/235-scope-and-non-regression-guardrails/235-CONTEXT.md
rename to .planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-CONTEXT.md
diff --git a/.planning/phases/235-scope-and-non-regression-guardrails/235-VALIDATION.md b/.planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-VALIDATION.md
similarity index 100%
rename from .planning/phases/235-scope-and-non-regression-guardrails/235-VALIDATION.md
rename to .planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-VALIDATION.md
diff --git a/.planning/phases/235-scope-and-non-regression-guardrails/235-VERIFICATION.md b/.planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-VERIFICATION.md
similarity index 100%
rename from .planning/phases/235-scope-and-non-regression-guardrails/235-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-VERIFICATION.md
diff --git a/.planning/phases/236-publication-and-evidence-truthfulness/236-01-PLAN.md b/.planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-01-PLAN.md
similarity index 100%
rename from .planning/phases/236-publication-and-evidence-truthfulness/236-01-PLAN.md
rename to .planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-01-PLAN.md
diff --git a/.planning/phases/236-publication-and-evidence-truthfulness/236-01-SUMMARY.md b/.planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/236-publication-and-evidence-truthfulness/236-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-01-SUMMARY.md
diff --git a/.planning/phases/236-publication-and-evidence-truthfulness/236-CONTEXT.md b/.planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-CONTEXT.md
similarity index 100%
rename from .planning/phases/236-publication-and-evidence-truthfulness/236-CONTEXT.md
rename to .planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-CONTEXT.md
diff --git a/.planning/phases/236-publication-and-evidence-truthfulness/236-VALIDATION.md b/.planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-VALIDATION.md
similarity index 100%
rename from .planning/phases/236-publication-and-evidence-truthfulness/236-VALIDATION.md
rename to .planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-VALIDATION.md
diff --git a/.planning/phases/236-publication-and-evidence-truthfulness/236-VERIFICATION.md b/.planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-VERIFICATION.md
similarity index 100%
rename from .planning/phases/236-publication-and-evidence-truthfulness/236-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-VERIFICATION.md
diff --git a/.planning/phases/237-direct-tensor-staged-offset-contract-repair/237-01-PLAN.md b/.planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-01-PLAN.md
similarity index 100%
rename from .planning/phases/237-direct-tensor-staged-offset-contract-repair/237-01-PLAN.md
rename to .planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-01-PLAN.md
diff --git a/.planning/phases/237-direct-tensor-staged-offset-contract-repair/237-01-SUMMARY.md b/.planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/237-direct-tensor-staged-offset-contract-repair/237-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-01-SUMMARY.md
diff --git a/.planning/phases/237-direct-tensor-staged-offset-contract-repair/237-CONTEXT.md b/.planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-CONTEXT.md
similarity index 100%
rename from .planning/phases/237-direct-tensor-staged-offset-contract-repair/237-CONTEXT.md
rename to .planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-CONTEXT.md
diff --git a/.planning/phases/237-direct-tensor-staged-offset-contract-repair/237-VALIDATION.md b/.planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-VALIDATION.md
similarity index 100%
rename from .planning/phases/237-direct-tensor-staged-offset-contract-repair/237-VALIDATION.md
rename to .planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-VALIDATION.md
diff --git a/.planning/phases/237-direct-tensor-staged-offset-contract-repair/237-VERIFICATION.md b/.planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-VERIFICATION.md
similarity index 100%
rename from .planning/phases/237-direct-tensor-staged-offset-contract-repair/237-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-VERIFICATION.md
diff --git a/.planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-PLAN.md b/.planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-PLAN.md
similarity index 100%
rename from .planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-PLAN.md
rename to .planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-PLAN.md
diff --git a/.planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-SUMMARY.md b/.planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-SUMMARY.md
diff --git a/.planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-CONTEXT.md b/.planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-CONTEXT.md
similarity index 100%
rename from .planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-CONTEXT.md
rename to .planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-CONTEXT.md
diff --git a/.planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-VALIDATION.md b/.planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-VALIDATION.md
similarity index 100%
rename from .planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-VALIDATION.md
rename to .planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-VALIDATION.md
diff --git a/.planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-VERIFICATION.md b/.planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-VERIFICATION.md
similarity index 100%
rename from .planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-VERIFICATION.md
diff --git a/.planning/milestones/v1.27-MILESTONE-AUDIT.md b/.planning/milestones/v1.27-MILESTONE-AUDIT.md
new file mode 100644
index 00000000..7f07c1dd
--- /dev/null
+++ b/.planning/milestones/v1.27-MILESTONE-AUDIT.md
@@ -0,0 +1,232 @@
+---
+milestone: v1.27
+audited: 2026-06-25T14:31:39Z
+status: passed
+scores:
+ requirements: 13/13
+ phases: 6/6
+ integration: 9/9
+ flows: 9/9
+gaps:
+ requirements: []
+ integration: []
+ flows: []
+ phase_artifacts: []
+closed_gaps:
+ - id: "XBN-01-optimized-benchmark-attribution"
+ phase: "244"
+ requirements: ["XBN-01", "XBN-02"]
+ evidence: "Initial audit found the kernel_x86_64 suite only published common f32/unary/matmul rows. Phase 244 was repaired so the maintained suite now publishes counter-checked optimized flash and q2/q3/q6 entries."
+tech_debt: []
+nyquist:
+ overall: compliant
+ compliant_phases:
+ - 239-x86-64-avx2-fma-host-contract-and-baseline-audit
+ - 240-x86-64-flash-attention-avx2-fma-kernel
+ - 241-x86-64-vectorized-q2-k-q3-k-kernels
+ - 242-x86-64-vectorized-q6-k-and-hot-path-contract
+ - 243-runtime-integration-and-parity-proof
+ - 244-benchmark-attribution-and-publication-truth
+ partial_phases: []
+ invalid_phases: []
+ missing_phases: []
+---
+
+# v1.27 Milestone Audit - Ryzen AVX2/FMA Kernel Support
+
+## Result
+
+Status: passed.
+
+All 13 v1.27 requirements are source-backed, all 6 phases are complete, and
+the cross-phase integration check passed with no blockers or tech debt. The
+milestone remains intentionally scoped to this AMD Ryzen 9 5950X class: x86_64
+AVX2 plus FMA, with F16C conversion support. It makes no AVX-512, AVX-VNNI,
+AMX, BF16, native FP16, GPU, or broader model-family claim.
+
+## Requirement Coverage
+
+Three-source cross-reference passed:
+
+- `.planning/REQUIREMENTS.md` marks 13/13 requirements verified.
+- Phase `*-VERIFICATION.md` files for 239-244 mark their assigned requirements
+ passed.
+- Phase `*-SUMMARY.md` frontmatter lists the completed requirements.
+
+| Requirement | Phase | Traceability | Verification | Summary | Final status |
+|-------------|-------|--------------|--------------|---------|--------------|
+| X86-01 | 239 | Verified | Passed | Listed | Satisfied |
+| X86-02 | 239 | Verified | Passed | Listed | Satisfied |
+| XFL-01 | 240 | Verified | Passed | Listed | Satisfied |
+| XFL-02 | 240 | Verified | Passed | Listed | Satisfied |
+| XQK-01 | 241 | Verified | Passed | Listed | Satisfied |
+| XQK-02 | 241 | Verified | Passed | Listed | Satisfied |
+| XQK-03 | 242 | Verified | Passed | Listed | Satisfied |
+| XQK-04 | 242 | Verified | Passed | Listed | Satisfied |
+| XRT-01 | 243 | Verified | Passed | Listed | Satisfied |
+| XRT-02 | 243 | Verified | Passed | Listed | Satisfied |
+| XRT-03 | 243 | Verified | Passed | Listed | Satisfied |
+| XBN-01 | 244 | Verified | Passed | Listed | Satisfied after benchmark repair |
+| XBN-02 | 244 | Verified | Passed | Listed | Satisfied |
+
+## Phase Coverage
+
+| Phase | Verification | Validation | Audit status |
+|-------|--------------|------------|--------------|
+| 239 | `239-VERIFICATION.md` | `239-VALIDATION.md` | Satisfied |
+| 240 | `240-VERIFICATION.md` | `240-VALIDATION.md` | Satisfied |
+| 241 | `241-VERIFICATION.md` | `241-VALIDATION.md` | Satisfied |
+| 242 | `242-VERIFICATION.md` | `242-VALIDATION.md` | Satisfied |
+| 243 | `243-VERIFICATION.md` | `243-VALIDATION.md` | Satisfied |
+| 244 | `244-VERIFICATION.md` | `244-VALIDATION.md` | Satisfied after `XBN-01` source-backed repair |
+
+All validation files carry `status: passed`, `nyquist_compliant: true`, and
+`wave_0_complete: true`, with executable command evidence and rule-compliance
+sign-off.
+
+## Source-Backed Checks
+
+### Host Contract
+
+`src/emel/kernel/x86_64/context.hpp` and `src/emel/kernel/x86_64/sm.hpp`
+publish AVX2, FMA, and F16C feature state and explicit false no-claim fields
+for AVX-512, AVX-VNNI, AMX, BF16, and native FP16. `CMakeLists.txt` wires the
+host-tuned x86_64 build flags for AVX2/FMA/F16C without adding unsupported
+feature flags.
+
+### Optimized Kernel Routing
+
+`src/emel/kernel/x86_64/sm.hpp` routes supported flash, q2_K, q3_K, and q6_K
+requests through explicit guards and transitions before fallback paths.
+`src/emel/kernel/x86_64/actions.hpp` owns the AVX2/FMA/F16C flash kernel and
+the q2_K/q3_K/q6_K x q8_K row kernels. Supported quantized hot-path tests
+assert optimized counters advance, shared counters stay zero, and no dispatch
+allocation occurs.
+
+### Unary Rule Debt
+
+The audit specifically rechecked the earlier rule debt around unary SIMD
+selection. The old generic `execute_avx2_unary(op_unary)` runtime-indexed
+function-pointer table is gone from `src/emel/kernel/x86_64/actions.hpp`.
+Unary SIMD selection is modeled in `sm.hpp` with explicit
+`simd_op_unary_abs`, `simd_op_unary_neg`, and `simd_op_unary_relu` guards and
+compile-time subop action helpers.
+
+### Runtime Path
+
+The maintained generator path drives public events through the shipped
+generator -> graph -> processor -> kernel chain. `src/emel/kernel/any.hpp`
+includes `emel::kernel::x86_64::sm`, and generator diagnostics expose the
+optimized/shared dispatch counters used by tests and parity tooling.
+
+### Parity Path
+
+`tools/paritychecker/parity_engines.cpp` captures public generator diagnostics,
+prints `quantized_dispatch:` attribution, and fails when x86_64 native q2/q3/q6
+tensors either do not produce optimized counters or do produce shared counters. The
+maintained parity proof covers generation at `1`, `10`, `100`, and `1000`
+tokens, with approved publication baseline updates for the maintained LFM2
+`10`, `100`, and `1000` token runs.
+
+### Benchmark Path
+
+The initial milestone audit found a real `XBN-01` blocker: the
+`kernel_x86_64` benchmark suite published the common f32/unary/matmul rows but
+did not prove the optimized flash and q2/q3/q6 lanes.
+
+That gap is closed. `tools/bench/kernel/x86_64_bench.cpp` now publishes:
+
+- `kernel/x86_64/op_flash_attn_ext_decode_like`
+- `kernel/x86_64/op_mul_mat_q2_k_q8_k`
+- `kernel/x86_64/op_mul_mat_q3_k_q8_k`
+- `kernel/x86_64/op_mul_mat_q6_k_q8_k`
+
+The EMEL benchmark callbacks drive `emel::kernel::x86_64::sm::process_event`
+and abort unless the matching optimized counter increments while the matching
+shared counter does not. Reference-lane comparison remains separate. The
+approved `snapshots/bench/benchmarks.txt` baseline now contains 19
+`kernel/x86_64/*` entries, including the four optimized rows.
+
+## Integration Check
+
+The integration checker returned `status: passed`:
+
+- requirements: 13/13
+- phases: 6/6
+- integration: 9/9
+- blockers: none
+- tech debt: none
+
+It independently traced:
+
+- `kernel_x86_64` benchmark registration, optimized row publication, and
+ snapshot presence.
+- generator -> graph -> processor -> kernel runtime wiring through
+ `process_event(...)`.
+- paritychecker attribution and failure behavior for missing optimized counters.
+- explicit x86_64 SML routing for flash, q2/q3/q6, and unary subops.
+- AVX2/FMA/F16C host feature publication with no unsupported x86 feature claims.
+
+## Validation Evidence
+
+Focused validation after the benchmark and unary-rule repairs:
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS.
+
+```bash
+ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'
+```
+
+Result: PASS, `100% tests passed`.
+
+```bash
+cmake --build build/bench_tools_ninja_kernel_x86_64 --target bench_runner -j2
+```
+
+Result: PASS.
+
+```bash
+scripts/bench.sh --snapshot --compare --suite=kernel_x86_64
+```
+
+Result: PASS after the approved benchmark snapshot update and optimized
+benchmark repair.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="" \
+scripts/quality_gates.sh
+```
+
+Result: PASS. Selected lanes included legacy SML surface scan, Zig build,
+`kernel_x86_64` benchmark snapshot, changed-file scoped coverage, paritychecker,
+lint snapshot, docs generation, and fuzz routing. Changed-line coverage was
+`707/735` lines (`96.2%`) and `171/240` branches (`71.2%`).
+
+Additional source checks:
+
+```bash
+rg -n "execute_avx2_unary\\(|kernel_index|unary_kernel_t|kernels\\[" \
+ src/emel/kernel/x86_64/actions.hpp tests/kernel/x86_64_tests.cpp
+```
+
+Result: PASS, no matches.
+
+```bash
+rg -n "op_flash_attn_ext_decode_like|op_mul_mat_q2_k_q8_k|op_mul_mat_q3_k_q8_k|op_mul_mat_q6_k_q8_k" \
+ tools/bench/kernel/x86_64_bench.cpp snapshots/bench/benchmarks.txt
+```
+
+Result: PASS, source and snapshot entries present.
+
+## Closeout Readiness
+
+v1.27 is ready for milestone completion and cleanup. The earlier benchmark
+publication blocker is closed with maintained source and snapshot evidence; the
+x86_64 unary SML rule debt is removed from the milestone path; all active
+requirements are satisfied; and no deferred v1.27 work remains.
diff --git a/.planning/milestones/v1.27-REQUIREMENTS.md b/.planning/milestones/v1.27-REQUIREMENTS.md
new file mode 100644
index 00000000..b3320c50
--- /dev/null
+++ b/.planning/milestones/v1.27-REQUIREMENTS.md
@@ -0,0 +1,148 @@
+# Requirements Archive: v1.27 Ryzen AVX2/FMA Kernel Support
+
+**Archived:** 2026-06-25
+**Status:** SHIPPED
+
+For current requirements, see `.planning/REQUIREMENTS.md`.
+
+---
+
+# Requirements: EMEL v1.27 Ryzen AVX2/FMA Kernel Support
+
+**Defined:** 2026-06-25
+**Status:** Verified
+**Core Value:** Prove real end-to-end behavior with explicit SML orchestration and
+parity-oriented verification before widening API surface or model scope.
+**Source:** User request: "add support for this processor exactly how NEON was added"
+
+## v1.27 Requirements
+
+Each requirement is one independently testable obligation and maps to exactly
+one roadmap phase. This milestone targets the current host CPU: AMD Ryzen 9
+5950X with x86_64 AVX2, FMA, and F16C conversion support. It does not claim
+AVX-512, AVX-VNNI, AMX, BF16, or native FP16 arithmetic.
+
+### Host feature contract and build support
+
+- [x] **X86-01**: Maintainer can identify an x86_64 host feature contract that
+ detects and publishes AVX2, FMA, and F16C availability for this Ryzen class
+ without implying unsupported AVX-512, AVX-VNNI, AMX, BF16, or native FP16
+ support.
+
+- [x] **X86-02**: The build/config surface supports host-tuned x86_64 AVX2/FMA
+ code paths analogously to the existing AArch64 host-feature switch, while
+ preserving portable builds and fail-closed behavior when those features are
+ unavailable.
+
+### x86_64 flash attention
+
+- [x] **XFL-01**: Supported x86_64 flash-attention requests execute through an
+ EMEL-owned AVX2/FMA implementation rather than the shared scalar workspace
+ path.
+
+- [x] **XFL-02**: Unsupported or out-of-contract x86_64 flash-attention requests
+ publish deterministic fallback/no-claim behavior instead of silently claiming
+ optimized execution.
+
+### x86_64 quantized kernels
+
+- [x] **XQK-01**: The maintained x86_64 `q2_K x q8_K` hot path uses an
+ EMEL-owned AVX2/FMA kernel on supported requests.
+
+- [x] **XQK-02**: The maintained x86_64 `q3_K x q8_K` hot path uses an
+ EMEL-owned AVX2/FMA kernel on supported requests.
+
+- [x] **XQK-03**: The maintained x86_64 `q6_K x q8_K` hot path uses an
+ EMEL-owned AVX2/FMA kernel on supported requests.
+
+- [x] **XQK-04**: Supported x86_64 quantized hot-path requests remain
+ allocation-free during dispatch and consume the same effective operand class
+ as the reference path, with no whole-tensor dequantize-to-f32 substitution.
+
+### Runtime integration and parity proof
+
+- [x] **XRT-01**: The shipped generator -> graph -> processor -> kernel chain
+ selects the x86_64 optimized path on this host without actor rewrites,
+ queue-based orchestration, or public API widening.
+
+- [x] **XRT-02**: Maintained paritychecker proof covers `1`, `10`, `100`, and
+ `1000` token generation on the maintained path and publishes x86_64
+ attribution sufficient to prove the optimized path actually executed.
+
+- [x] **XRT-03**: Tests prove supported optimized behavior and deterministic
+ fallback/no-claim behavior through public machine dispatch and SML state
+ inspection where applicable.
+
+### Benchmark attribution and publication truth
+
+- [x] **XBN-01**: Maintained benchmark entrypoints exercise the x86_64 optimized
+ flash and quantized paths with attribution distinct from scalar/shared paths
+ and reference-lane execution.
+
+- [x] **XBN-02**: Published benchmark/docs artifacts truthfully distinguish
+ x86_64 Ryzen evidence from ARM-first claims and do not label unsupported
+ requests as optimized.
+
+## Out-of-Contract Tracking
+
+These items are tracked as explicit non-claims and separate-contract work
+outside the current Ryzen AVX2/FMA host obligation.
+
+### Wider x86 feature families
+
+- **X86-SEPARATE-01**: Add AVX-512, AVX-VNNI, AMX, BF16, or native FP16 paths
+ only on hardware that actually supports them and only after a separate feature
+ contract is approved.
+
+- **X86-SEPARATE-02**: Add additional x86_64 quantized formats beyond the
+ maintained `q2_K/q3_K/q6_K x q8_K` path after the first Ryzen AVX2/FMA slice
+ is source-backed.
+
+### Broader runtime scope
+
+- **XRT-SEPARATE-01**: Widen x86_64 performance publication to additional model
+ families only after the maintained current slice has truthful attribution and
+ parity proof.
+
+## Out of Scope
+
+Explicit exclusions for milestone v1.27:
+
+| Feature | Reason |
+|---------|--------|
+| AVX-512, AVX-VNNI, AMX, BF16, or native FP16 execution claims | The current Ryzen 9 5950X host does not provide those instruction families |
+| GPU acceleration | User asked for this processor; this milestone is CPU-owned x86_64 work |
+| Broad public C ABI or CLI expansion | The NEON precedent optimized the maintained runtime path before widening APIs |
+| New model-family support | Kernel milestone; maintained runtime slice remains the acceptance surface |
+| Tool-only compute fallbacks | EMEL support must live in `src/`, not only in benchmarks or parity tools |
+| Whole-tensor dequantize-to-f32 substitution in the hot path | Would violate the quantized inference performance contract unless explicitly approved |
+| Linking EMEL runtime against llama.cpp/ggml | Reference code remains comparison-only in tools per project policy |
+
+## Traceability
+
+Each row appears exactly once. Phases continue after v1.26's Phase 238.
+
+| Requirement | Phase | Status |
+|-------------|-------|--------|
+| X86-01 | Phase 239 | Verified |
+| X86-02 | Phase 239 | Verified |
+| XFL-01 | Phase 240 | Verified |
+| XFL-02 | Phase 240 | Verified |
+| XQK-01 | Phase 241 | Verified |
+| XQK-02 | Phase 241 | Verified |
+| XQK-03 | Phase 242 | Verified |
+| XQK-04 | Phase 242 | Verified |
+| XRT-01 | Phase 243 | Verified |
+| XRT-02 | Phase 243 | Verified |
+| XRT-03 | Phase 243 | Verified |
+| XBN-01 | Phase 244 | Verified |
+| XBN-02 | Phase 244 | Verified |
+
+**Coverage:**
+- v1.27 requirements: 13 total
+- Mapped to phases: 13
+- Unmapped: 0
+
+---
+*Requirements defined: 2026-06-25*
+*Last updated: 2026-06-25 after Phase 244 verification and approved snapshot publication*
diff --git a/.planning/milestones/v1.27-ROADMAP.md b/.planning/milestones/v1.27-ROADMAP.md
new file mode 100644
index 00000000..ec5e5140
--- /dev/null
+++ b/.planning/milestones/v1.27-ROADMAP.md
@@ -0,0 +1,209 @@
+# Roadmap: EMEL
+
+## Milestones
+
+- [x] **v1.0 EMEL Llama-68M Generation Slice** - shipped 2026-03-08
+- [x] **v1.1 EMEL Llama-68M Generation Benchmark** - shipped 2026-03-11
+- [x] **v1.2 Flash Attention** - shipped 2026-03-22
+- [x] **v1.3 ARM Flash Optimizations** - shipped 2026-03-22
+- [x] **v1.4 Full Vectorized Quantized Kernels** - shipped 2026-03-25
+- [x] **v1.5 Full ARM Quantized Path** - shipped 2026-03-27
+- [x] **v1.6 Qwen3-0.6B Parity And Benchmark** - shipped 2026-03-30
+- [x] **v1.7 Generator Prefill Submachine Decomposition** - shipped 2026-03-30
+- [x] **v1.8 Truthful Qwen3 E2E Embedded Size** - shipped 2026-04-02
+- [x] **v1.9 Liquid LFM2.5-1.2B Thinking ARM Slice** - shipped 2026-04-02
+- [x] **v1.11 TE-75M GGUF Trimodal Embedding Runtime** - shipped 2026-04-15
+- [x] **v1.12 Pluggable Reference Parity Bench Architecture** - shipped 2026-04-18
+- [x] **v1.13 Pluggable Generative Parity Bench** - shipped 2026-04-21
+- [x] **v1.14 Benchmark Variant Organization** - shipped 2026-04-21
+- [x] **v1.15 ARM Sortformer Diarization GGUF Slice** - shipped 2026-04-25
+- [x] **v1.16 ARM Whisper GGUF Parity And Performance** - shipped 2026-04-28
+- [x] **v1.17 Text Generator Domain Alignment** - shipped 2026-04-30
+- [x] **v1.18 Parity Tool Boundary Refactor** - shipped 2026-05-01
+- [x] **v1.19 Benchmark Tool Pluggable Runner Refactor** - shipped 2026-05-01
+- [x] **v1.20 SML Dependency And Namespace Migration** - shipped 2026-05-02
+- [x] **v1.21 Quality Gate Selective Runner Optimization** - shipped 2026-05-02
+- [x] **v1.22 Weight Loading Ownership Cutover** - shipped 2026-05-03
+- [x] **v1.23 I/O Loading Strategy Boundary** - shipped 2026-05-04
+- [x] **v1.24 I/O Mmap Loading Strategy** - shipped 2026-05-04
+- [x] **v1.25 I/O Read Loading Strategy** - shipped 2026-05-06
+- [x] **v1.26 I/O Staged Read Loading Strategy** - completed 2026-05-08
+- [ ] **v1.27 Ryzen AVX2/FMA Kernel Support** - planned 2026-06-25
+
+## Current Milestone
+
+### v1.27 Ryzen AVX2/FMA Kernel Support
+
+**Milestone Goal:** Bring the maintained x86_64 runtime path on this AMD Ryzen 9
+5950X host to the same support standard as the earlier NEON/AArch64 path:
+source-backed host feature contract, EMEL-owned AVX2/FMA flash and quantized
+hot-path kernels, maintained runtime/parity proof, and truthful benchmark
+attribution.
+
+**Host contract:** This milestone targets x86_64 AVX2 + FMA, with F16C
+conversion support only. It makes no AVX-512, AVX-VNNI, AMX, BF16, native FP16,
+or GPU execution claim.
+
+**Scope Guardrails:**
+- Keep the milestone narrow to this x86_64 CPU feature class and the maintained
+ runtime/parity/benchmark surfaces.
+- Preserve the generator -> graph -> processor -> kernel chain and current
+ Stateforward.SML orchestration structure.
+- Keep runtime behavior choice in guards/transitions, not action/detail helper
+ routing.
+- Do not accept tool-only compute fallbacks or whole-tensor dequantize-to-f32
+ substitution for quantized hot paths without explicit user approval.
+- Keep llama.cpp/ggml reference linkage confined to comparison-only tool lanes.
+
+Execution order: 239, 240, 241, 242, 243, 244.
+
+**Milestone progress (v1.27):** **6 / 6** phases fully verified.
+Source/test complete: **6 / 6** phases.
+
+- [x] Phase 239: x86_64 AVX2/FMA Host Contract and Baseline Audit (X86-01, X86-02)
+- [x] Phase 240: x86_64 Flash Attention AVX2/FMA Kernel (XFL-01, XFL-02)
+- [x] Phase 241: x86_64 Vectorized q2_K/q3_K Kernels (XQK-01, XQK-02)
+- [x] Phase 242: x86_64 Vectorized q6_K and Hot-Path Contract (XQK-03, XQK-04)
+- [x] Phase 243: Runtime Integration and Parity Proof (XRT-01, XRT-02, XRT-03)
+- [x] Phase 244: Benchmark Attribution and Publication Truth (XBN-01, XBN-02)
+
+## Phase Details
+
+### Phase 239: x86_64 AVX2/FMA Host Contract and Baseline Audit
+
+**Goal:** Define the Ryzen AVX2/FMA host contract, add/build the host-feature
+surface, and inventory the current x86_64 kernel path against the NEON/AArch64
+precedent before porting kernels.
+
+**Depends on:** Phase 238
+**Requirements:** X86-01, X86-02
+
+**Success Criteria:**
+
+1. Runtime/build evidence identifies AVX2, FMA, and F16C support on this host
+ and explicitly no-claims AVX-512, AVX-VNNI, AMX, BF16, and native FP16.
+2. CMake or equivalent config exposes host-tuned x86_64 AVX2/FMA compilation
+ without breaking portable or cross-compiled builds.
+3. A source-backed audit maps current `src/emel/kernel/x86_64` behavior against
+ the NEON/AArch64 flash and quantized support pattern.
+4. Tests prove supported and unsupported feature-contract behavior through the
+ public x86_64 kernel actor surface where applicable.
+
+### Phase 240: x86_64 Flash Attention AVX2/FMA Kernel
+
+**Goal:** Port the maintained flash-attention optimization pattern to an
+EMEL-owned x86_64 AVX2/FMA implementation with deterministic fallback/no-claim
+behavior for unsupported requests.
+
+**Depends on:** Phase 239
+**Requirements:** XFL-01, XFL-02
+
+**Success Criteria:**
+
+1. Supported x86_64 flash-attention requests execute through an AVX2/FMA path
+ rather than the shared scalar workspace helper.
+2. The optimized path preserves reusable workspace semantics and introduces no
+ dispatch-time allocation.
+3. Unsupported shape, dtype, feature, or operand contracts publish explicit
+ fallback/no-claim behavior.
+4. Focused kernel tests compare optimized output against the maintained scalar
+ or reference oracle within the accepted numeric tolerance.
+
+### Phase 241: x86_64 Vectorized q2_K/q3_K Kernels
+
+**Goal:** Land EMEL-owned AVX2/FMA `q2_K x q8_K` and `q3_K x q8_K` kernels for
+the maintained x86_64 quantized hot path.
+
+**Depends on:** Phase 240
+**Requirements:** XQK-01, XQK-02
+
+**Success Criteria:**
+
+1. Maintained `q2_K x q8_K` requests can execute through an AVX2/FMA kernel on
+ supported x86_64 hosts.
+2. Maintained `q3_K x q8_K` requests can execute through an AVX2/FMA kernel on
+ supported x86_64 hosts.
+3. Kernel-seam proof distinguishes optimized execution from scalar/shared row
+ helpers for both formats.
+4. Correctness tests cover representative block groups, tails, and accumulation
+ behavior against the maintained scalar/reference oracle.
+
+### Phase 242: x86_64 Vectorized q6_K and Hot-Path Contract
+
+**Goal:** Add the x86_64 AVX2/FMA `q6_K x q8_K` kernel and lock the maintained
+quantized hot path to operand fidelity and zero dispatch-time allocation.
+
+**Depends on:** Phase 241
+**Requirements:** XQK-03, XQK-04
+
+**Success Criteria:**
+
+1. Maintained `q6_K x q8_K` requests can execute through an AVX2/FMA kernel on
+ supported x86_64 hosts.
+2. Supported `q2_K/q3_K/q6_K x q8_K` requests consume the same effective operand
+ class as the reference path.
+3. Tests and/or allocation instrumentation prove no hot-path allocation and no
+ whole-tensor dequantize-to-f32 substitution for supported optimized requests.
+4. Unsupported quantized cases remain explicit fallback/no-claim paths rather
+ than silent performance claims.
+
+### Phase 243: Runtime Integration and Parity Proof
+
+**Goal:** Adopt the optimized x86_64 kernel set in the shipped runtime chain and
+prove supported plus fallback behavior through maintained parity and tests.
+
+**Depends on:** Phase 242
+**Requirements:** XRT-01, XRT-02, XRT-03
+
+**Success Criteria:**
+
+1. The shipped generator -> graph -> processor -> kernel chain selects the
+ optimized x86_64 path on this host without actor rewrites or public API
+ widening.
+2. Maintained paritychecker output covers `1`, `10`, `100`, and `1000` token
+ generation and proves the x86_64 optimized path actually executed.
+3. Regression tests cover supported optimized execution and deterministic
+ fallback/no-claim behavior through public dispatch and SML state inspection.
+4. Runtime counters/attribution remain bounded, deterministic, and
+ allocation-free in hot dispatch paths.
+
+### Phase 244: Benchmark Attribution and Publication Truth
+
+**Goal:** Publish maintained benchmark and documentation evidence that measures
+the Ryzen AVX2/FMA path truthfully and distinguishes it from ARM-first and
+reference-lane claims.
+
+**Depends on:** Phase 243
+**Requirements:** XBN-01, XBN-02
+
+**Success Criteria:**
+
+1. `tools/bench` runs maintained x86_64 flash and quantized workloads through
+ optimized paths and reports attribution distinct from scalar/shared paths.
+2. Benchmark docs and stored artifacts clearly identify host CPU, feature
+ contract, optimized-path counters, and reference-lane separation.
+3. Published results do not label unsupported requests as optimized and do not
+ dilute existing ARM-first benchmark claims.
+4. Required benchmark, docs, lint, parity, and quality-gate evidence is captured
+ from maintained commands without snapshot updates unless explicitly approved.
+
+## Progress
+
+**Execution Order:** 239 -> 240 -> 241 -> 242 -> 243 -> 244
+
+| Phase | Plans Complete | Status | Completed |
+|-------|----------------|--------|-----------|
+| 239. x86_64 AVX2/FMA Host Contract and Baseline Audit | 1/1 | Complete | 2026-06-25 |
+| 240. x86_64 Flash Attention AVX2/FMA Kernel | 1/1 | Complete | 2026-06-25 |
+| 241. x86_64 Vectorized q2_K/q3_K Kernels | 1/1 | Complete | 2026-06-25 |
+| 242. x86_64 Vectorized q6_K and Hot-Path Contract | 1/1 | Complete | 2026-06-25 |
+| 243. Runtime Integration and Parity Proof | 1/1 | Complete | 2026-06-25 |
+| 244. Benchmark Attribution and Publication Truth | 1/1 | Complete | 2026-06-25 |
+
+---
+
+Next implementation step: complete and archive the v1.27 milestone.
+
+Closeout gate: milestone completion and cleanup after the passed source-backed
+audit, approved snapshot updates, optimized benchmark repair, and scoped quality
+gate.
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-PLAN.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-PLAN.md
new file mode 100644
index 00000000..88c5aea0
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-PLAN.md
@@ -0,0 +1,136 @@
+---
+phase: 239
+plan: 01
+title: x86_64 AVX2/FMA Host Contract and Baseline Audit
+wave: 1
+depends_on: []
+autonomous: true
+files_modified:
+ - CMakeLists.txt
+ - src/emel/kernel/x86_64/context.hpp
+ - src/emel/kernel/x86_64/actions.hpp
+ - src/emel/kernel/x86_64/sm.hpp
+ - src/emel/kernel/aarch64/actions.hpp
+ - src/emel/diarization/sortformer/detail.cpp
+ - src/emel/embeddings/generator/detail.hpp
+ - src/emel/text/generator/detail.hpp
+ - tests/kernel/x86_64_tests.cpp
+ - tests/kernel/aarch64_tests.cpp
+ - tests/text/generator/detail_tests.cpp
+ - tools/paritychecker/CMakeLists.txt
+ - .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-X86-BASELINE-AUDIT.md
+ - .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VERIFICATION.md
+ - .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VALIDATION.md
+ - .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-SUMMARY.md
+rule_constraints:
+ - Keep runtime behavior choice in x86_64 guards/transitions.
+ - Keep action/detail helpers limited to already-selected numeric work.
+ - Do not add AVX-512, AVX-VNNI, AMX, BF16, native FP16, or GPU claims.
+ - Do not implement Phase 240-242 kernels in this phase.
+must_haves:
+ - AVX2, FMA, and F16C are explicitly detected/published for x86_64.
+ - Unsupported feature families are explicitly published as no-claim.
+ - x86_64 host-tuned build flags exist and exclude unsupported feature families.
+ - A source-backed audit maps current x86_64 state against the NEON precedent.
+ - Focused tests prove supported and unsupported contract behavior.
+---
+
+# Phase 239 Plan: x86_64 AVX2/FMA Host Contract and Baseline Audit
+
+## Goal
+
+Define the Ryzen AVX2/FMA host contract, add/build the host-feature surface, and
+inventory the current x86_64 kernel path against the NEON/AArch64 precedent
+before porting kernels.
+
+## Tasks
+
+
+
+
+ Add focused tests in `tests/kernel/x86_64_tests.cpp` that require the
+ x86_64 feature contract to publish AVX2, FMA, F16C, and explicit no-claim
+ unsupported feature families through the public x86_64 actor wrapper.
+ Run the targeted test binary or build and record the expected pre-fix
+ failure.
+
+
+ The new tests fail before implementation because the public feature
+ contract/accessors do not yet exist.
+
+
+
+
+
+ Extend `src/emel/kernel/x86_64/context.hpp` and
+ `src/emel/kernel/x86_64/sm.hpp` with a small host feature contract:
+ runtime AVX2/FMA/F16C detection and explicit no-claim booleans for
+ AVX-512, AVX-VNNI, AMX, BF16, and native FP16. Preserve existing context
+ construction used by tests.
+
+
+ Focused x86_64 tests compile and pass on both default and forced context
+ construction.
+
+
+
+
+
+ Add an `EMEL_ENABLE_X86_64_HOST_FEATURES` CMake option analogous to the
+ AArch64 option. Apply compiler-checked `-mavx2`, `-mfma`, and `-mf16c`
+ flags only for non-cross x86_64 builds. Repair any compile-only
+ portability gaps exposed by building the maintained x86 host target. Do
+ not add unsupported feature flags.
+
+
+ CMake configure reports x86_64 host flags on this host when supported, and
+ source scan confirms no AVX-512/VNNI/AMX/BF16/native-FP16 flags were added.
+
+
+
+
+
+ Create `239-X86-BASELINE-AUDIT.md`, `239-VERIFICATION.md`,
+ `239-VALIDATION.md`, and `239-01-SUMMARY.md`. The audit must describe the
+ current x86_64 f32 AVX2 state and the missing flash/quantized/runtime
+ parity work assigned to concrete follow-on v1.27 phases.
+
+
+ Artifacts cite source files and command evidence, not planning intent
+ alone. `X86-01` and `X86-02` are marked complete only if source/test/build
+ evidence supports them.
+
+
+
+
+## Verification
+
+1. Focused x86_64 doctests for the feature contract.
+2. CMake configure with default settings on this host.
+3. Source scan for forbidden x86 feature claims/flags.
+4. `git diff --check`.
+5. Changed-file scoped quality gate.
+
+
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md` for actor structure.
+- Do not move runtime behavior selection from guards/transitions into
+ `actions.hpp` or `detail.hpp`.
+- Keep `detail` helpers non-routing and non-orchestrating.
+- Do not add queue/mailbox/deferred-dispatch behavior.
+- Do not add or claim AVX-512, AVX-VNNI, AMX, BF16, native FP16, GPU, or
+ llama.cpp/ggml runtime linkage.
+- Keep quantized hot-path fallback policy unchanged; do not introduce
+ dequantize-to-f32 substitutions.
+
+
+
+## Completion Criteria
+
+- `X86-01` has source-backed evidence via x86_64 feature-contract code and
+ public actor tests.
+- `X86-02` has source-backed evidence via CMake host-feature support and source
+ scans excluding unsupported flags.
+- The baseline audit clearly separates existing x86_64 f32 AVX2 support from
+ subsequent flash/quantized/runtime/benchmark phases.
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-SUMMARY.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-SUMMARY.md
new file mode 100644
index 00000000..70839ab0
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-SUMMARY.md
@@ -0,0 +1,42 @@
+---
+phase: 239
+status: passed
+requirements-completed:
+ - X86-01
+ - X86-02
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 239 Summary
+
+## What Changed
+
+- Added an x86_64 host feature contract for AVX2, FMA, and F16C conversion.
+- Published explicit no-claim fields for AVX-512, AVX-VNNI, AMX, BF16, and
+ native FP16 through the x86_64 actor surface.
+- Added `EMEL_ENABLE_X86_64_HOST_FEATURES` and compiler-checked
+ `-mavx2/-mfma/-mf16c` host flags.
+- Replaced `__builtin_cpu_supports` with local CPUID/XGETBV detection so Zig
+ toolchain links succeed.
+- Repaired x86 host build portability gaps exposed by the new host build:
+ AArch64 NEON helper visibility, non-ARM warning-as-error issues, doctest skip
+ markers, and paritychecker reference vendor includes.
+- Added focused x86_64 tests for supported, fail-closed, and detected host
+ feature contracts.
+- Wrote a source-backed baseline audit separating current x86_64 f32 AVX2
+ support from active follow-on flash/quantized/runtime/benchmark phases.
+
+## Validation
+
+- CMake configure with Zig: pass.
+- `emel_tests_bin` build: pass.
+- `emel_tests_kernel_and_graph` CTest shard: pass.
+- `scripts/paritychecker.sh --runner=kernel`: pass.
+- `git diff --check`: pass.
+- Scoped `scripts/quality_gates.sh`: coverage, paritychecker, benchmark
+ snapshot, lint, docs, and fuzz routing pass after approved snapshot updates.
+
+## Closeout Status
+
+The Phase 239 implementation satisfies and verifies `X86-01` and `X86-02`.
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-CONTEXT.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-CONTEXT.md
new file mode 100644
index 00000000..b07fc488
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-CONTEXT.md
@@ -0,0 +1,132 @@
+# Phase 239: x86_64 AVX2/FMA Host Contract and Baseline Audit - Context
+
+**Gathered:** 2026-06-25
+**Status:** Source-complete; benchmark snapshot update awaiting explicit approval
+**Mode:** Auto-generated (autonomous infrastructure phase)
+
+
+## Phase Boundary
+
+Define and prove the x86_64 Ryzen host feature contract before adding new flash
+or quantized kernels. This phase may add feature detection, public actor
+accessors, host-tuned build flags, tests, and a source-backed audit artifact. It
+must not implement the AVX2/FMA flash kernel or the q2_K/q3_K/q6_K hot-path
+kernels; those are Phase 240-242 work.
+
+
+
+
+## Implementation Decisions
+
+### Host Feature Contract
+- **D-01:** Treat this processor as x86_64 AVX2 + FMA with F16C conversion
+ support.
+- **D-02:** Publish AVX2, FMA, and F16C as explicit supported feature booleans
+ on the x86_64 kernel actor context/surface.
+- **D-03:** Publish unsupported feature families as explicit no-claim booleans:
+ AVX-512, AVX-VNNI, AMX, BF16, and native FP16.
+- **D-04:** Keep no-claim feature families disabled even if a future host can
+ report them; adding those paths requires a separate milestone contract.
+
+### Build Contract
+- **D-05:** Add an x86_64 host-feature build option analogous to the existing
+ AArch64 host-feature option.
+- **D-06:** Use compiler-checked AVX2/FMA/F16C flags only; do not add AVX-512,
+ VNNI, AMX, BF16, native FP16, or GPU flags.
+- **D-07:** Preserve portable builds by keeping the new option configurable and
+ by applying flags only for non-cross x86_64 builds when supported.
+
+### Audit Scope
+- **D-08:** Capture the current x86_64 kernel state against the NEON precedent:
+ existing f32 AVX2 paths are present, but flash/quantized parity is not yet at
+ the AArch64 standard.
+- **D-09:** The audit is evidence for Phase 239 only; it must not claim Phase
+ 240-244 runtime or benchmark completion.
+
+### Agent's Discretion
+- Use the smallest code shape that keeps the feature contract inspectable from
+ tests and follow-on phases.
+- Prefer focused x86_64 kernel tests and CMake/source scans over broad quality
+ gates until implementation files are touched.
+
+
+
+
+## Canonical References
+
+### Current milestone
+- `.planning/REQUIREMENTS.md` - `X86-01` and `X86-02` requirements.
+- `.planning/ROADMAP.md` - Phase 239 goal and success criteria.
+- `.planning/STATE.md` - v1.27 host feature scope and no-claim constraints.
+
+### Existing x86_64 surface
+- `src/emel/kernel/x86_64/context.hpp` - current AVX2 detection and context.
+- `src/emel/kernel/x86_64/actions.hpp` - current f32 AVX2 execution helpers.
+- `src/emel/kernel/x86_64/guards.hpp` - current x86_64 SIMD route guards.
+- `src/emel/kernel/x86_64/sm.hpp` - public x86_64 actor wrapper surface.
+- `tests/kernel/x86_64_tests.cpp` - existing x86_64 actor/kernel coverage.
+
+### NEON precedent
+- `src/emel/kernel/aarch64/context.hpp` - feature/counter precedent.
+- `src/emel/kernel/aarch64/sm.hpp` - public counter/accessor precedent.
+- `.planning/milestones/v1.3-ROADMAP.md` - ARM flash optimization pattern.
+- `.planning/milestones/v1.4-ROADMAP.md` - vectorized quantized kernel pattern.
+- `.planning/milestones/v1.5-ROADMAP.md` - full ARM quantized path proof.
+
+
+
+
+## Existing Code Insights
+
+### Reusable Assets
+- `CMakeLists.txt` already has `EMEL_ENABLE_AARCH64_HOST_FEATURES` and
+ compiler-checked host flags.
+- `src/emel/kernel/x86_64/context.hpp` already detects AVX2 with
+ `__builtin_cpu_supports` on GCC/Clang x86_64.
+- `src/emel/kernel/x86_64/actions.hpp` already has AVX2 target attributes and
+ f32 SIMD helpers for dup/add/sub/mul/div/sqr/sqrt/mul_mat/unary.
+- `tests/kernel/x86_64_tests.cpp` already has forced-SIMD and fallback tests.
+
+### Established Patterns
+- x86_64 behavior selection belongs in guards/transitions; action/detail code
+ only executes an already selected path.
+- AArch64 publishes runtime counters/accessors from `sm.hpp`; x86_64 can use the
+ same public actor-wrapper style for feature contract inspection.
+- Planning artifacts must not be used as proof of runtime support; tests and
+ source scans must back every closeout claim.
+
+### Integration Points
+- Add feature contract state in `src/emel/kernel/x86_64/context.hpp`.
+- Add public wrapper accessors in `src/emel/kernel/x86_64/sm.hpp`.
+- Add host-feature compile flag support in `CMakeLists.txt`.
+- Add focused x86_64 tests in `tests/kernel/x86_64_tests.cpp`.
+- Add Phase 239 audit artifact under this phase directory.
+
+
+
+
+## Specific Ideas
+
+- The current host inspection reports AMD Ryzen 9 5950X with AVX2, FMA, and
+ F16C flags.
+- The milestone wording must stay honest: this phase starts x86_64 support work;
+ it does not complete flash, quantized kernels, runtime parity, or benchmark
+ publication. Those items remain active v1.27 scope with concrete follow-on
+ phases and phase-owned acceptance criteria.
+
+
+
+
+## Active Follow-On Scope
+
+- AVX2/FMA flash-attention implementation - Phase 240.
+- AVX2/FMA q2_K/q3_K kernels - Phase 241.
+- AVX2/FMA q6_K and hot-path operand-fidelity proof - Phase 242.
+- Runtime parity and benchmark publication - Phases 243-244.
+
+
+
+---
+
+*Phase: 239-x86-64-avx2-fma-host-contract-and-baseline-audit*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VALIDATION.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VALIDATION.md
new file mode 100644
index 00000000..ab4bbe07
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VALIDATION.md
@@ -0,0 +1,130 @@
+---
+phase: 239
+slug: x86-64-avx2-fma-host-contract-and-baseline-audit
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 239 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | doctest, CTest, CMake configure/build, source scans, quality gate |
+| Config file | `CMakeLists.txt`; `scripts/quality_gates.sh` |
+| Quick run command | `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` |
+| Gate command | `EMEL_QUALITY_GATES_CHANGED_FILES="..." scripts/quality_gates.sh` |
+| Current gate status | passed after approved x86_64 benchmark baseline update |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 239-01-01 | X86-01 | failing-first compile proof | `ninja -C build/phase239 CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o` before implementation | red captured |
+| 239-01-02 | X86-01 | focused compile/test | `ninja -C build/phase239 CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`; `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` | green |
+| 239-01-03 | X86-02 | configure/build/source scan | `CC='/shared/zig/zig cc' CXX='/shared/zig/zig c++' cmake -S . -B build/phase239 -G Ninja -DEMEL_ENABLE_TESTS=ON`; `cmake --build build/phase239 --target emel_tests_bin -j2`; unsupported-flag scan | green |
+| 239-01-04 | X86-01, X86-02 | artifact/source audit | `239-X86-BASELINE-AUDIT.md`; `git diff --check` | green |
+| 239-01-05 | quality gate | scoped quality gate | `EMEL_QUALITY_GATES_CHANGED_FILES="..." scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+CC='/shared/zig/zig cc' CXX='/shared/zig/zig c++' cmake -S . -B build/phase239 -G Ninja -DEMEL_ENABLE_TESTS=ON
+```
+
+Result: PASS. Configure reported
+`EMEL enabling x86_64 host compile flags: -mavx2;-mfma;-mf16c`.
+
+```bash
+ninja -C build/phase239 CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o
+```
+
+Result: PASS after implementation. The pre-fix run failed on missing x86_64
+host feature contract/accessors and FMA/F16C detection.
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS after repairing x86 host compile portability gaps.
+
+```bash
+ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'
+```
+
+Result: PASS, `100% tests passed`.
+
+```bash
+rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit
+```
+
+Result: PASS, no unsupported x86 compile flags found.
+
+```bash
+scripts/paritychecker.sh --runner=kernel
+```
+
+Result: PASS. The paritychecker builds and the kernel parity runner passes.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_TIMEOUT="3600s" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="CMakeLists.txt:src/emel/kernel/x86_64/actions.hpp:src/emel/kernel/x86_64/context.hpp:src/emel/kernel/x86_64/sm.hpp:tests/kernel/x86_64_tests.cpp:src/emel/kernel/aarch64/actions.hpp:src/emel/diarization/sortformer/detail.cpp:src/emel/text/generator/detail.hpp:src/emel/text/generator/context.hpp:src/emel/embeddings/generator/detail.hpp:tests/kernel/aarch64_tests.cpp:tests/kernel/test_helpers.hpp:tests/text/generator/detail_tests.cpp:tests/text/generator/lifecycle_tests.cpp:tests/embeddings/vision_embedding_lane_tests.cpp:tests/embeddings/text_embedding_lane_tests.cpp:tools/paritychecker/CMakeLists.txt:tools/paritychecker/paritychecker_tests.cpp:tools/paritychecker/parity_engines.cpp:tools/bench/CMakeLists.txt:tools/bench/quality_gates_tests.cpp:scripts/test_with_coverage.sh:tests/diarization/request/lifecycle_tests.cpp:tests/diarization/sortformer/encoder/lifecycle_tests.cpp:tests/diarization/sortformer/modules/lifecycle_tests.cpp:tests/diarization/sortformer/output/lifecycle_tests.cpp:tests/diarization/sortformer/transformer/lifecycle_tests.cpp:tests/graph/assembler/assembler_tests.cpp:tests/graph/graph_tests.cpp:tests/model/loader/lifecycle_tests.cpp:tests/embeddings/te_fixture_data.hpp" \
+scripts/quality_gates.sh
+```
+
+Initial result before snapshot approval: the scoped gate passed all
+non-benchmark lanes:
+
+- `test_with_coverage`: PASS. CTest shards
+ `generator_and_runtime`, `diarization`, and `kernel_and_graph` pass.
+ Changed-line coverage is `73/78` lines (`93.6%`) and `18/34` branches
+ (`52.9%`).
+- `paritychecker`: PASS. Full paritychecker tests pass.
+- `fuzz_smoke`: skipped because no fuzz-affecting files changed.
+- `lint_snapshot`: PASS after using the existing `tests/kernel/test_helpers.hpp`
+ instead of adding a new helper file.
+- `generate_docs`: PASS.
+
+The only failing lane was `bench_snapshot`: the new `kernel_x86_64` suite emits
+15 `kernel/x86_64/*` entries, and `snapshots/bench/benchmarks.txt` had no
+matching baselines. User approval was granted and the snapshot baseline was
+updated.
+
+```bash
+EMEL_BENCH_SUITE=kernel_x86_64 \
+EMEL_BENCH_ITERS=100 \
+EMEL_BENCH_RUNS=3 \
+EMEL_BENCH_WARMUP_ITERS=10 \
+EMEL_BENCH_WARMUP_RUNS=1 \
+build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=emel
+```
+
+Result: PASS as a non-mutating measurement. It emits x86_64 benchmark entries
+for `op_add`, `op_cos`, `op_div`, `op_dup`, `op_log`, `op_mul`, `op_mul_mat`,
+`op_sin`, `op_soft_max`, `op_sqr`, `op_sqrt`, `op_sub`, `op_unary_exp`,
+`op_unary_neg`, and `op_unary_relu`.
+
+## Manual-Only Verifications
+
+- Approve the benchmark snapshot baseline update.
+- Run `scripts/bench.sh --snapshot --update --suite=kernel_x86_64`.
+- Re-run the scoped quality gate above.
+
+## Validation Sign-Off
+
+- [x] Host feature contract has automated/source-backed validation.
+- [x] x86_64 host-tuned build config has configure/build/source-scan evidence.
+- [x] Focused x86_64 and kernel/graph tests pass.
+- [x] Snapshot updates were explicitly approved and applied.
+- [x] Scoped quality gate passes after approved benchmark baseline update.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+ frontmatter.
+- [x] Rule-compliance evidence is recorded through Zig configure/build,
+ unsupported x86 feature scans, and source-backed host-contract validation.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VERIFICATION.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VERIFICATION.md
new file mode 100644
index 00000000..eb9fa9a4
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VERIFICATION.md
@@ -0,0 +1,43 @@
+# Phase 239 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| AVX2, FMA, and F16C detected/published | `src/emel/kernel/x86_64/context.hpp`; `tests/kernel/x86_64_tests.cpp`; focused object build | PASS |
+| Unsupported feature families explicitly no-claimed | `host_feature_contract` false no-claim fields; actor accessors; doctests | PASS |
+| Host-tuned x86_64 build flags | CMake configure reports `-mavx2;-mfma;-mf16c` | PASS |
+| No unsupported x86 feature flags | unsupported-flag `rg` scan returns no matches | PASS |
+| Source-backed baseline audit | `239-X86-BASELINE-AUDIT.md` | PASS |
+| Required quality gate | approved `kernel_x86_64` benchmark/parity snapshots landed; scoped quality gate passed all selected lanes | PASS |
+
+## Evidence Summary
+
+- `cmake --build build/phase239 --target emel_tests_bin -j2` passes.
+- `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` passes.
+- `scripts/paritychecker.sh --runner=kernel` passes.
+- `git diff --check` passes.
+- `scripts/quality_gates.sh` scoped to Phase 239 files passes:
+ coverage, paritychecker, fuzz skip, lint snapshot, and docs generation.
+- Coverage evidence from the scoped gate:
+ `changed-line coverage: lines 73/78 (93.6%), branches 18/34 (52.9%)`.
+- Approved benchmark snapshots now include the `kernel/x86_64/*` benchmark
+ suite entries.
+- Direct non-mutating benchmark evidence:
+ `EMEL_BENCH_SUITE=kernel_x86_64 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=emel`
+ emits 15 x86_64 entries: add, cos, div, dup, log, mul, mul_mat, sin,
+ soft_max, sqr, sqrt, sub, unary_exp, unary_neg, and unary_relu.
+
+## Final Verification
+
+The user approved snapshot updates. `scripts/bench.sh --snapshot --update
+--suite=kernel_x86_64` updated the benchmark baseline, the maintained
+generation publication baselines were updated, and the changed-file scoped
+quality gate passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 239 is fully verified for `X86-01` and `X86-02`.
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-X86-BASELINE-AUDIT.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-X86-BASELINE-AUDIT.md
new file mode 100644
index 00000000..7e9c373f
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-X86-BASELINE-AUDIT.md
@@ -0,0 +1,74 @@
+# Phase 239 x86_64 Baseline Audit
+
+**Date:** 2026-06-25
+**Scope:** Ryzen 9 5950X host contract and pre-kernel x86_64 baseline.
+
+## Host Contract
+
+Phase 239 establishes an EMEL-owned x86_64 host feature contract for the current
+CPU feature class:
+
+- Supported: AVX2, FMA, and F16C conversion.
+- Explicit no-claim: AVX-512, AVX-VNNI, AMX, BF16 execution, native FP16
+ arithmetic, and GPU execution.
+- Runtime detection is local CPUID/XGETBV logic in
+ `src/emel/kernel/x86_64/context.hpp`, avoiding Zig toolchain link dependency
+ on compiler CPU-model symbols.
+- Public actor inspection is exposed through `src/emel/kernel/x86_64/sm.hpp`.
+
+## Build Contract
+
+`CMakeLists.txt` now has `EMEL_ENABLE_X86_64_HOST_FEATURES`, matching the
+existing AArch64 host-feature switch pattern. On non-cross non-MSVC x86_64
+builds, CMake checks and applies only:
+
+- `-mavx2`
+- `-mfma`
+- `-mf16c`
+
+No AVX-512, AVX-VNNI, AMX, BF16, native-FP16, or GPU flags were added.
+
+The x86 host build also exposed pre-existing non-ARM compile gaps. Phase 239
+repaired those without changing runtime contracts:
+
+- AArch64 NEON helper signatures are hidden from non-ARM compilers in
+ `src/emel/kernel/aarch64/actions.hpp`.
+- ARM-only doctest skip markers now use supported doctest assertions.
+- Non-ARM warning-as-error issues in shared generator/diarization/embedding
+ helpers are acknowledged with no-op casts.
+- `tools/paritychecker/CMakeLists.txt` includes the fetched reference
+ implementation's `vendor` directory so reference-side vendor includes
+ resolve.
+
+## Current x86_64 Kernel Baseline
+
+Current `src/emel/kernel/x86_64` support before Phase 240:
+
+- Existing f32 AVX2 execution helpers cover dup/add/sub/mul/div/sqr/sqrt,
+ mul_mat, and unary abs/neg/relu where dtype/layout and host support allow it.
+- Runtime SIMD choice still flows through x86_64 guards and SML transitions for
+ public dispatch; unsupported requests use the existing scalar/shared behavior.
+- Flash attention still routes through the shared workspace helper rather than
+ an x86_64 AVX2/FMA optimized flash kernel.
+- q2_K/q3_K/q6_K AVX2/FMA hot-path kernels do not exist yet.
+
+## NEON/AArch64 Comparison
+
+The AArch64 precedent has a broader maintained optimization surface:
+
+- AArch64 context publishes optimized/shared dispatch counters.
+- AArch64 SML transitions distinguish optimized flash attention from shared
+ flash behavior.
+- AArch64 quantized paths have route counters and packed/vector coverage for
+ multiple quantized formats.
+
+The x86_64 path now has the equivalent host/build contract foundation, but not
+yet flash, quantized-kernel, runtime, or benchmark-attribution parity.
+
+## Assigned To Active Follow-On Phases
+
+- Phase 240: AVX2/FMA flash attention kernel and fallback/no-claim behavior.
+- Phase 241: q2_K/q3_K x q8_K AVX2/FMA kernels.
+- Phase 242: q6_K x q8_K AVX2/FMA kernel and hot-path allocation/operand proof.
+- Phase 243: maintained runtime integration and parity proof.
+- Phase 244: benchmark attribution and publication truth.
diff --git a/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-PLAN.md b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-PLAN.md
new file mode 100644
index 00000000..9a433d2a
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-PLAN.md
@@ -0,0 +1,108 @@
+# Phase 240 Plan: x86_64 Flash Attention AVX2/FMA Kernel
+
+## Goal
+
+Route supported x86_64 `op_flash_attn_ext` requests through an EMEL-owned
+AVX2/FMA/F16C flash-attention kernel while preserving shared fallback,
+persistent workspace reuse, and explicit no-claim behavior for unsupported
+requests.
+
+## Tasks
+
+
+
+
+ Extend `tests/kernel/x86_64_tests.cpp` with focused tests that initially
+ fail on the current shared-only x86 path:
+ optimized dispatch counter increments for supported AVX2/FMA/F16C
+ requests, shared counter remains zero on optimized dispatch, shared
+ counter increments when the optimized feature contract is disabled,
+ persistent workspace storage is reused, and optimized output matches the
+ maintained flash reference/shared oracle within existing flash tolerance.
+
+
+ Compile or run the focused x86_64 test shard and capture the expected red
+ failure before implementation.
+
+
+
+
+
+ Add x86_64 detail helpers in `src/emel/kernel/x86_64/actions.hpp` for the
+ one-chunk f16 K/V flash path. Use AVX2/FMA for f32 vector arithmetic and
+ F16C conversion intrinsics for f16 workspace and K/V conversion. Keep the
+ effective operand class aligned with the AArch64 optimized path and the
+ shared flash workspace. Do not add allocation or native-FP16 claims.
+
+
+ Focused tests compare optimized x86 output to the shared/reference helper
+ for fixture, long-span, and masked-total-token cases.
+
+
+
+
+
+ Add x86_64 optimized flash predicate(s) in `guards.hpp`, add optimized and
+ shared route counters in context/accessors, and insert optimized/shared
+ `op_flash_attn_ext` transition rows in `sm.hpp` before invalid handling.
+ Runtime behavior choice must remain in guards/transitions, not actions or
+ detail helpers.
+
+
+ Tests prove supported optimized route and feature-disabled shared route
+ through public actor dispatch and counter accessors.
+
+
+
+
+
+ Write `240-VERIFICATION.md`, `240-VALIDATION.md`, and `240-01-SUMMARY.md`
+ from source/test evidence. Keep Phase 239 benchmark snapshot approval
+ visible as a separate gate and do not mutate benchmark snapshots without
+ explicit approval.
+
+
+ `git diff --check`, focused x86_64/kernel tests, and changed-file scoped
+ quality gate evidence are recorded. If a required snapshot update has not
+ been approved yet, keep the phase verification separate from publication
+ approval until the gate can pass.
+
+
+
+
+## Verification
+
+1. Failing-first compile/test evidence for new x86_64 flash tests.
+2. `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`
+ or equivalent focused kernel shard after implementation.
+3. Direct x86_64 object/build target for `tests/kernel/x86_64_tests.cpp`.
+4. Source scan for unsupported x86 feature claims/flags:
+ `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker .planning/phases/240-x86-64-flash-attention-avx2-fma-kernel`
+5. `git diff --check`.
+6. Changed-file scoped `scripts/quality_gates.sh` with
+ `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64` when feasible. Do not update
+ snapshots without explicit approval.
+
+
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md`.
+- Keep runtime behavior selection in `guards.hpp` and `sm.hpp` transitions.
+- Keep action/detail helpers as already-chosen numeric work only.
+- Do not add queue/mailbox/deferred-dispatch behavior.
+- Do not add AVX-512, AVX-VNNI, AMX, BF16, native-FP16, GPU, or llama.cpp/ggml
+ runtime linkage.
+- Do not add dispatch-time allocation or per-invocation context fields.
+- Do not update snapshots without explicit user approval.
+
+
+
+## Completion Criteria
+
+- `XFL-01` has source-backed evidence that supported x86_64 flash requests run
+ through the optimized AVX2/FMA/F16C path rather than the shared workspace path.
+- `XFL-02` has source-backed evidence that unsupported or disabled optimized
+ contracts take explicit shared or invalid paths.
+- Focused tests prove numeric correctness, route counters, and workspace reuse.
+- Verification artifacts distinguish source completion from any repository-level
+ snapshot publication gate until the approved snapshots and quality gate pass.
diff --git a/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-SUMMARY.md b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-SUMMARY.md
new file mode 100644
index 00000000..4e844fdf
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-SUMMARY.md
@@ -0,0 +1,40 @@
+---
+phase: 240
+status: passed
+requirements-completed:
+ - XFL-01
+ - XFL-02
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 240 Summary
+
+## What Changed
+
+- Added an EMEL-owned x86_64 one-chunk flash-attention kernel using AVX2/FMA
+ f32 vector arithmetic and F16C f16 conversions.
+- Kept the flash operand contract aligned with the AArch64 optimized path: f32
+ Q rounded to f16, f16 K/V operands, f16 workspace accumulation, and f32 output.
+- Added explicit optimized/shared x86_64 flash dispatch counters and actor
+ accessors for attribution.
+- Routed `op_flash_attn_ext` through optimized x86_64 guards/transitions before
+ shared fallback and invalid handling.
+- Added x86_64 tests proving optimized route, shared fallback when the feature
+ contract is disabled, workspace reuse, and numeric agreement with maintained
+ flash reference helpers.
+
+## Validation
+
+- Failing-first x86_64 test object compile: red captured before implementation.
+- x86_64 test object compile: pass after implementation.
+- `emel_tests_bin` build: pass.
+- `emel_tests_kernel_and_graph` CTest shard: pass.
+- Unsupported x86 feature flag source scan: pass.
+- `scripts/lint_snapshot.sh`: pass without snapshot updates.
+- Scoped `scripts/quality_gates.sh`: coverage, paritychecker, benchmark
+ snapshot, lint, docs, and fuzz routing pass after approved snapshot updates.
+
+## Closeout Status
+
+The Phase 240 implementation satisfies and verifies `XFL-01` and `XFL-02`.
diff --git a/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-CONTEXT.md b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-CONTEXT.md
new file mode 100644
index 00000000..148ab75d
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-CONTEXT.md
@@ -0,0 +1,118 @@
+# Phase 240: x86_64 Flash Attention AVX2/FMA Kernel - Context
+
+**Gathered:** 2026-06-25
+**Status:** Ready for planning
+**Mode:** Auto-generated (autonomous processor-support phase)
+
+
+## Phase Boundary
+
+Port the maintained flash-attention optimization pattern to an EMEL-owned
+x86_64 AVX2/FMA implementation for this Ryzen AVX2/FMA/F16C host. This phase
+must route supported flash-attention requests through an optimized x86_64
+kernel, preserve persistent workspace reuse, and keep unsupported requests on
+explicit shared or invalid paths. It must not implement quantized q2_K/q3_K/q6_K
+matmul kernels, runtime parity publication, or benchmark attribution; those are
+active Phase 241-244 obligations.
+
+
+
+
+## Implementation Decisions
+
+### Kernel Contract
+- Implement a native x86_64 flash-attention helper in `src/emel/kernel/x86_64`
+ using AVX2/FMA numeric work and F16C conversions for f16 K/V operand handling.
+- Match the AArch64 one-chunk f16 K/V operand class: f32 Q is rounded into f16,
+ K/V are consumed as f16, accumulation uses the existing f16 workspace buffer,
+ and output is converted back to f32.
+- Do not claim native FP16 arithmetic. F16C is a conversion capability; all x86
+ vector arithmetic remains f32 AVX2/FMA.
+- Keep all optimized kernel code in `src/`; parity and benchmark tools may only
+ observe through public dispatch surfaces.
+
+### Routing Contract
+- Put supported optimized-path selection in `x86_64/guards.hpp` and
+ `x86_64/sm.hpp`, analogous to the AArch64 flash route.
+- Add explicit x86_64 optimized/shared flash counters so tests and active parity
+ attribution can distinguish optimized execution from shared fallback.
+- Keep unsupported shapes, feature contracts, and workspace constraints on the
+ existing shared or invalid paths; do not silently label them optimized.
+
+### Verification Contract
+- Add failing-first x86_64 tests before source changes: optimized dispatch
+ counter, shared fallback counter, persistent workspace reuse, and numeric
+ comparison against the maintained shared/reference helper.
+- Verify through public actor dispatch or route-owned detail functions, not by
+ tool-only scaffolds.
+- Do not update `snapshots/bench/benchmarks.txt` in this phase unless explicit
+ snapshot approval is provided.
+
+### Agent's Discretion
+- Prefer the smallest x86_64 implementation that proves AVX2/FMA/F16C flash
+ support and keeps active follow-on runtime/benchmark attribution
+ straightforward.
+- Reuse the existing `flash_attn_workspace` instead of adding per-dispatch
+ allocation or new transient context fields.
+
+
+
+
+## Existing Code Insights
+
+### Reusable Assets
+- `src/emel/kernel/aarch64/actions.hpp` contains the optimized NEON flash
+ precedent: `run_flash_attn_ext_f16kv_one_chunk_neon_unchecked`,
+ `can_run_neon_flash_attn_ext_f16kv_one_chunk_request`, and
+ `exec_simd_flash_attn_ext_f16kv_one_chunk`.
+- `src/emel/kernel/aarch64/guards.hpp` and `sm.hpp` route optimized flash before
+ the shared flash path, then invalid fallback.
+- `src/emel/kernel/detail.hpp` owns the shared flash workspace, f16 conversion
+ helpers, active-token handling, and scalar workspace fallback.
+- `tests/kernel/test_helpers.hpp` provides flash fixtures and reference helpers.
+
+### Established Patterns
+- x86_64 SIMD helpers live in `src/emel/kernel/x86_64/actions.hpp`, with feature
+ predicates in guards and transition rows in `sm.hpp`.
+- The current x86_64 flash route accepts `op_flash_attn_ext` through
+ `guard::valid_op_flash_attn_ext` and `action::exec_op_flash_attn_ext`, which
+ calls the shared workspace helper.
+- Phase 239 added x86_64 AVX2/FMA/F16C feature-contract fields and actor
+ accessors that this phase can use for optimized flash eligibility.
+
+### Integration Points
+- Add x86 flash helpers and action aliases in `src/emel/kernel/x86_64/actions.hpp`.
+- Add optimized/shared flash guards in `src/emel/kernel/x86_64/guards.hpp`.
+- Add optimized/shared flash route counters in
+ `src/emel/kernel/x86_64/context.hpp` and public accessors in `sm.hpp`.
+- Add optimized flash transition rows before the shared flash row in
+ `src/emel/kernel/x86_64/sm.hpp`.
+- Add focused tests in `tests/kernel/x86_64_tests.cpp`.
+
+
+
+
+## Specific Ideas
+
+- The current host contract is Ryzen 9 5950X with AVX2, FMA, and F16C available.
+- The optimized flash route should require all three: AVX2 for vector lanes, FMA
+ for fused f32 accumulation, and F16C for f16 K/V conversion.
+- The test surface should prove optimized and shared counters separately so
+ Phase 243/244 attribution can build on source-backed evidence.
+
+
+
+
+## Active Follow-On Scope
+
+- Phase 241: AVX2/FMA q2_K/q3_K kernels.
+- Phase 242: AVX2/FMA q6_K and hot-path operand-fidelity proof.
+- Phase 243: maintained runtime integration and parity proof.
+- Phase 244: benchmark attribution and publication truth.
+
+
+
+---
+
+*Phase: 240-x86-64-flash-attention-avx2-fma-kernel*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VALIDATION.md b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VALIDATION.md
new file mode 100644
index 00000000..c45721ae
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VALIDATION.md
@@ -0,0 +1,101 @@
+---
+phase: 240
+slug: x86-64-flash-attention-avx2-fma-kernel
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 240 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | doctest, CTest, source scans, lint snapshot, quality gate |
+| Config file | `CMakeLists.txt`; `scripts/quality_gates.sh` |
+| Quick run command | `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` |
+| Gate command | `EMEL_QUALITY_GATES_CHANGED_FILES="" scripts/quality_gates.sh` |
+| Current gate status | passed after approved x86_64 benchmark baseline update |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 240-01-01 | XFL-01, XFL-02 | failing-first compile proof | `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o` before implementation | red captured |
+| 240-01-02 | XFL-01 | focused compile/test | x86_64 test object build; `emel_tests_kernel_and_graph` | green |
+| 240-01-03 | XFL-01, XFL-02 | public actor route proof | optimized/shared counter tests in `tests/kernel/x86_64_tests.cpp` | green |
+| 240-01-04 | quality gate | scoped quality gate | `EMEL_QUALITY_GATES_CHANGED_FILES="..." scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o
+```
+
+Result: PASS after implementation. The pre-fix run failed on missing x86_64
+optimized/shared flash route counters and accessors.
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS.
+
+```bash
+ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'
+```
+
+Result: PASS, `100% tests passed`.
+
+```bash
+rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker
+```
+
+Result: PASS, no unsupported x86 compile flags found.
+
+```bash
+scripts/lint_snapshot.sh
+```
+
+Result: PASS. No lint snapshot update was made.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_TIMEOUT="3600s" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="src/emel/kernel/x86_64/actions.hpp:src/emel/kernel/x86_64/context.hpp:src/emel/kernel/x86_64/guards.hpp:src/emel/kernel/x86_64/sm.hpp:tests/kernel/x86_64_tests.cpp" \
+scripts/quality_gates.sh
+```
+
+Initial result before snapshot approval: the scoped gate passed all
+non-benchmark lanes:
+
+- `test_with_coverage`: PASS. CTest shard `kernel_and_graph` passes.
+ Changed-line coverage is `381/406` lines (`93.8%`) and `86/124` branches
+ (`69.4%`).
+- `paritychecker`: PASS. Kernel parity runner passes.
+- `fuzz_smoke`: skipped because no fuzz-affecting files changed.
+- `lint_snapshot`: PASS without snapshot update.
+- `generate_docs`: PASS.
+
+The only failing lane was `bench_snapshot`: at that point the `kernel_x86_64`
+suite emitted 15 `kernel/x86_64/*` entries without baselines in
+`snapshots/bench/benchmarks.txt`. User approval was then granted and the
+snapshot baseline was updated, after which the scoped gate passed.
+
+## Validation Sign-Off
+
+- [x] x86_64 optimized flash route has automated/source-backed validation.
+- [x] x86_64 shared fallback/no-claim path has automated validation.
+- [x] Focused x86_64 and kernel/graph tests pass.
+- [x] Coverage, parity, lint, fuzz routing, and docs lanes pass in scoped gate.
+- [x] Snapshot updates were explicitly approved and applied.
+- [x] Scoped quality gate passes after approved benchmark baseline update.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+ frontmatter.
+- [x] Rule-compliance evidence is recorded through explicit x86_64
+ guards/transitions, unsupported feature scans, focused actor tests, and lint.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VERIFICATION.md b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VERIFICATION.md
new file mode 100644
index 00000000..ecc168fb
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VERIFICATION.md
@@ -0,0 +1,49 @@
+# Phase 240 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| Supported x86_64 flash route is optimized | `src/emel/kernel/x86_64/guards.hpp`, `sm.hpp`, `actions.hpp`; `tests/kernel/x86_64_tests.cpp` route-counter tests | PASS |
+| Optimized helper is EMEL-owned AVX2/FMA/F16C source code | `run_flash_attn_ext_f16kv_one_chunk_avx2_fma_f16c_unchecked` and conversion/dot/axpy helpers in `src/emel/kernel/x86_64/actions.hpp` | PASS |
+| Shared fallback/no-claim behavior remains explicit | feature-disabled x86_64 flash test increments shared counter and not optimized counter | PASS |
+| Persistent workspace reuse is preserved | x86_64 flash workspace reuse test observes prepared-token and reuse counters through actor accessors | PASS |
+| Numeric behavior matches maintained oracle | x86_64 flash tests compare fixture and masked-token output to flash reference helpers | PASS |
+| Required quality gate | approved `kernel_x86_64` benchmark/parity snapshots landed; scoped quality gate passed all selected lanes | PASS |
+
+## Evidence Summary
+
+- Failing-first compile proof captured after adding tests:
+ `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+ failed before implementation on missing x86_64 flash counters/accessors and
+ route support.
+- `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+ passes after implementation.
+- `cmake --build build/phase239 --target emel_tests_bin -j2` passes.
+- `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`
+ passes.
+- Source-only unsupported x86 flag scan passes:
+ `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+ returns no matches.
+- `scripts/lint_snapshot.sh` passes without updating
+ `snapshots/lint/clang_format.txt`.
+- Scoped `scripts/quality_gates.sh` passes all non-benchmark lanes:
+ coverage, paritychecker, fuzz skip, lint snapshot, and docs generation.
+- Coverage evidence from the scoped gate:
+ `changed-line coverage: lines 381/406 (93.8%), branches 86/124 (69.4%)`.
+- Approved benchmark snapshots now include the `kernel/x86_64/*` benchmark
+ suite entries.
+
+## Final Verification
+
+The user approved snapshot updates. `scripts/bench.sh --snapshot --update
+--suite=kernel_x86_64` updated the benchmark baseline, the maintained generation
+publication baselines were also updated, and the changed-file scoped quality
+gate then passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 240 is fully verified for `XFL-01` and `XFL-02`.
diff --git a/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-PLAN.md b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-PLAN.md
new file mode 100644
index 00000000..a772a788
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-PLAN.md
@@ -0,0 +1,101 @@
+# Phase 241 Plan: x86_64 Vectorized q2_K/q3_K Kernels
+
+## Goal
+
+Add native x86_64 AVX2/FMA `q2_K x q8_K` and `q3_K x q8_K` kernels for the
+maintained quantized `op_mul_mat` path, with explicit optimized/shared route
+attribution through the x86_64 kernel actor.
+
+## Tasks
+
+
+
+
+ Extend `tests/kernel/x86_64_tests.cpp` with q2_K and q3_K tests that
+ initially fail on the current shared-only x86 path. Cover row-kernel
+ correctness, actor optimized counters, actor shared counters with disabled
+ feature contract, representative block data, and multi-block accumulation.
+
+
+ Compile or run the focused x86_64/kernel shard and capture the expected
+ red failure before implementation.
+
+
+
+
+
+ Add x86_64 detail row helpers for `q2_K x q8_K` and `q3_K x q8_K` using
+ the same effective operand format as the scalar/AArch64 paths. Use AVX2/FMA
+ where appropriate for accumulation, keep any scalar branches local to
+ already-selected numeric work, and do not dequantize whole tensors to f32.
+
+
+ Row-level tests match scalar/reference output within the established
+ quantized tolerance for q2_K and q3_K across multiple block counts.
+
+
+
+
+
+ Add x86_64 q2/q3 optimized route predicates in `guards.hpp`, selected-route
+ actions and counters in `actions.hpp`/`context.hpp`, actor accessors in
+ `sm.hpp`, and transition rows in `sm.hpp` before generic f32 SIMD and
+ shared scalar `op_mul_mat` routes.
+
+
+ Actor tests prove optimized q2/q3 counters increment for supported
+ requests and shared q2/q3 counters increment when the feature contract is
+ disabled.
+
+
+
+
+
+ Write `241-VERIFICATION.md`, `241-VALIDATION.md`, and `241-01-SUMMARY.md`
+ from source/test evidence. Keep benchmark snapshot approval visible as a
+ shared closeout gate and do not mutate snapshots without explicit approval.
+
+
+ Focused tests, source scan, lint snapshot, and changed-file scoped quality
+ gate evidence are recorded.
+
+
+
+
+## Verification
+
+1. Failing-first x86_64 q2/q3 tests.
+2. `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`.
+3. `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`.
+4. Source scan for forbidden x86 feature claims/flags:
+ `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+5. `scripts/lint_snapshot.sh` without snapshot updates.
+6. Changed-file scoped `scripts/quality_gates.sh` with
+ `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64` when feasible. Do not update
+ snapshots without explicit approval.
+
+
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md`.
+- Keep runtime behavior selection in `guards.hpp` and `sm.hpp` transitions.
+- Keep action/detail helpers as already-chosen numeric work only.
+- Do not add queue/mailbox/deferred-dispatch behavior.
+- Do not implement q6_K in this phase.
+- Do not add AVX-512, AVX-VNNI, AMX, BF16, native-FP16, GPU, or llama.cpp/ggml
+ runtime linkage.
+- Do not add whole-tensor dequantize-to-f32 hot-path substitution.
+- Do not update snapshots without explicit user approval.
+
+
+
+## Completion Criteria
+
+- `XQK-01` has source-backed evidence that supported x86_64 `q2_K x q8_K`
+ requests run through a native optimized x86 kernel.
+- `XQK-02` has source-backed evidence that supported x86_64 `q3_K x q8_K`
+ requests run through a native optimized x86 kernel.
+- Tests prove optimized/shared attribution, row correctness, and actor route
+ behavior for both formats.
+- Verification artifacts distinguish source completion from the still-pending
+ repository-level benchmark snapshot approval gate.
diff --git a/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-SUMMARY.md b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-SUMMARY.md
new file mode 100644
index 00000000..8e720ae7
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-SUMMARY.md
@@ -0,0 +1,42 @@
+---
+phase: 241
+status: passed
+requirements-completed:
+ - XQK-01
+ - XQK-02
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 241 Summary
+
+## What Changed
+
+- Added EMEL-owned x86_64 AVX2/FMA row kernels for `q2_K x q8_K` and
+ `q3_K x q8_K` in `src/emel/kernel/x86_64/actions.hpp`.
+- Kept the quantized operand path block-native: q2_K/q3_K LHS blocks and q8_K
+ RHS blocks generated by the maintained quantizer. No whole-tensor
+ dequantize-to-f32 hot-path substitution was added.
+- Added explicit q2/q3 optimized and shared route counters to the x86_64 actor
+ context and exposed actor accessors for attribution.
+- Routed supported q2_K/q3_K `op_mul_mat` requests through explicit guards and
+ SML transitions before generic f32 SIMD and shared scalar routes.
+- Added x86_64 tests proving row correctness against scalar q2/q3 oracles,
+ optimized route attribution on supported contracts, shared route attribution
+ when the feature contract is disabled, and multi-block accumulation behavior.
+
+## Validation
+
+- Failing-first x86_64 test object compile: red captured before implementation.
+- x86_64 test object compile: pass after implementation.
+- `emel_tests_bin` build: pass.
+- `emel_tests_kernel_and_graph` CTest shard: pass.
+- Unsupported x86 feature flag source scan: pass.
+- `scripts/lint_snapshot.sh`: pass without snapshot updates.
+- `git diff --check`: pass for Phase 241 changed source/test/planning files.
+- Scoped `scripts/quality_gates.sh`: coverage, paritychecker, benchmark
+ snapshot, lint, docs, and fuzz routing pass after approved snapshot updates.
+
+## Closeout Status
+
+The Phase 241 implementation satisfies and verifies `XQK-01` and `XQK-02`.
diff --git a/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-CONTEXT.md b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-CONTEXT.md
new file mode 100644
index 00000000..881626fe
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-CONTEXT.md
@@ -0,0 +1,112 @@
+# Phase 241: x86_64 Vectorized q2_K/q3_K Kernels - Context
+
+**Gathered:** 2026-06-25
+**Status:** Ready for planning
+**Mode:** Auto-generated (autonomous processor-support phase)
+
+
+## Phase Boundary
+
+Land EMEL-owned x86_64 AVX2/FMA `q2_K x q8_K` and `q3_K x q8_K` kernels for
+the maintained quantized `op_mul_mat` hot path. This phase must prove optimized
+execution and shared fallback/no-claim behavior for q2_K and q3_K through the
+kernel actor route. It must not implement q6_K, runtime generator integration,
+or benchmark publication; those remain active Phase 242-244 obligations.
+
+
+
+
+## Implementation Decisions
+
+### Kernel Contract
+- Implement native x86_64 row kernels in `src/emel/kernel/x86_64/actions.hpp`
+ for `q2_K x q8_K` and `q3_K x q8_K`.
+- Preserve the same effective operand class as the maintained scalar/AArch64
+ paths: block-q2_K or block-q3_K LHS and block-q8_K RHS, no whole-tensor
+ dequantize-to-f32 substitution in the hot path.
+- Use AVX2/FMA for vectorized accumulation where it improves the native row
+ path, with scalar tail handling only inside the already-selected kernel.
+- Keep Phase 241 limited to q2_K and q3_K; q6_K remains Phase 242.
+
+### Routing Contract
+- Add x86_64 guards/transitions for supported q2_K/q3_K `op_mul_mat` before the
+ generic f32 AVX2 and shared scalar routes.
+- Add optimized/shared q2/q3 route counters and actor accessors analogous to the
+ AArch64 route counters.
+- Keep runtime behavior choice in `guards.hpp` and `sm.hpp`; action/detail code
+ must execute only an already-selected q2 or q3 path.
+
+### Verification Contract
+- Add failing-first tests proving q2_K and q3_K optimized counters are missing or
+ shared-only before implementation.
+- Add correctness tests comparing x86 optimized row/mul_mat output to the
+ maintained scalar/reference oracle for representative blocks, multiple block
+ groups, tails, and accumulation behavior.
+- Prove disabled feature contracts take the shared q2/q3 route without claiming
+ optimized execution.
+
+### Agent's Discretion
+- Start with row-level kernels plus actor-route tests if that is the smallest
+ source-backed route to `XQK-01`/`XQK-02`.
+- Reuse test helpers from AArch64 quantized tests when possible instead of
+ inventing parallel fixtures.
+
+
+
+
+## Existing Code Insights
+
+### Reusable Assets
+- `src/emel/kernel/detail.hpp` owns scalar q2_K/q3_K/q8_K block structures and
+ scalar dot helpers.
+- `src/emel/kernel/aarch64/actions.hpp` contains NEON q2/q3 row kernels and
+ `op_mul_mat` route/counter precedent.
+- `tests/kernel/aarch64_tests.cpp` has row-level q2/q3 correctness fixtures and
+ actor-route tests for optimized/shared q2/q3 dispatch.
+- `tests/kernel/test_helpers.hpp` has q8_K vector source helpers and quantized
+ tensor-view construction helpers.
+
+### Established Patterns
+- Quantized optimized routes increment optimized q-format counters; shared
+ scalar routes increment shared q-format counters.
+- The actor surface exposes route counters through `sm.hpp` and `kernel/any.hpp`
+ consumes them when available.
+- Fallback/no-claim behavior is tested by disabling host SIMD support in the
+ machine context and proving shared counters increment.
+
+### Integration Points
+- `src/emel/kernel/x86_64/actions.hpp`: row kernels, selected-route actions, and
+ shared/optimized counter increments.
+- `src/emel/kernel/x86_64/guards.hpp`: q2/q3 optimized route predicates and
+ generic q2/q3 exclusion from shared route predicates.
+- `src/emel/kernel/x86_64/context.hpp`: q2/q3 route counters.
+- `src/emel/kernel/x86_64/sm.hpp`: q2/q3 transition rows and actor accessors.
+- `tests/kernel/x86_64_tests.cpp`: focused row and actor-route tests.
+
+
+
+
+## Specific Ideas
+
+- The Phase 241 request is exactly to bring this Ryzen AVX2/FMA processor toward
+ the same support standard as the NEON path, so q2_K/q3_K must be native EMEL
+ kernels, not benchmark-only or whole-tensor f32 fallback code.
+- The x86 feature contract from Phase 239 provides AVX2/FMA/F16C support booleans
+ that can gate q2/q3 optimized routes. q2/q3 integer unpacking itself should
+ not claim AVX-512, VNNI, AMX, BF16, or native FP16 support.
+
+
+
+
+## Active Follow-On Scope
+
+- Phase 242: AVX2/FMA q6_K and hot-path operand-fidelity proof.
+- Phase 243: maintained runtime integration and parity proof.
+- Phase 244: benchmark attribution and publication truth.
+
+
+
+---
+
+*Phase: 241-x86-64-vectorized-q2-k-q3-k-kernels*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VALIDATION.md b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VALIDATION.md
new file mode 100644
index 00000000..a5f8b74a
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VALIDATION.md
@@ -0,0 +1,108 @@
+---
+phase: 241
+slug: x86-64-vectorized-q2-k-q3-k-kernels
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 241 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | doctest, CTest, source scans, lint snapshot, quality gate |
+| Config file | `CMakeLists.txt`; `scripts/quality_gates.sh` |
+| Quick run command | `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` |
+| Gate command | `EMEL_QUALITY_GATES_CHANGED_FILES="" EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` |
+| Current gate status | passed after approved x86_64 benchmark baseline update |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 241-01-01 | XQK-01, XQK-02 | failing-first compile proof | `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o` before implementation | red captured |
+| 241-01-02 | XQK-01, XQK-02 | row-kernel correctness | x86_64 q2/q3 row tests against scalar q2_K/q3_K x q8_K helpers | green |
+| 241-01-03 | XQK-01, XQK-02 | public actor route proof | optimized/shared counter tests in `tests/kernel/x86_64_tests.cpp` | green |
+| 241-01-04 | quality gate | scoped quality gate | `EMEL_QUALITY_GATES_CHANGED_FILES="..." EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o
+```
+
+Result: PASS after implementation. The pre-fix run failed on missing x86_64
+q2/q3 AVX2/FMA row helper symbols and actor counter accessors.
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS.
+
+```bash
+ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'
+```
+
+Result: PASS, `100% tests passed`.
+
+```bash
+rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker
+```
+
+Result: PASS, no unsupported x86 compile flags found.
+
+```bash
+scripts/lint_snapshot.sh
+```
+
+Result: PASS. No lint snapshot update was made.
+
+```bash
+git diff --check -- src/emel/kernel/x86_64/actions.hpp src/emel/kernel/x86_64/context.hpp src/emel/kernel/x86_64/guards.hpp src/emel/kernel/x86_64/sm.hpp tests/kernel/x86_64_tests.cpp .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-PLAN.md .planning/phases/240-x86-64-flash-attention-avx2-fma-kernel/240-CONTEXT.md
+```
+
+Result: PASS.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="src/emel/kernel/x86_64/actions.hpp,src/emel/kernel/x86_64/context.hpp,src/emel/kernel/x86_64/guards.hpp,src/emel/kernel/x86_64/sm.hpp,tests/kernel/x86_64_tests.cpp" \
+scripts/quality_gates.sh
+```
+
+Initial result before snapshot approval: the scoped gate passed all
+non-benchmark lanes:
+
+- `test_with_coverage`: PASS. CTest shard `kernel_and_graph` passes.
+ Changed-line coverage is `576/601` lines (`95.8%`) and `144/204` branches
+ (`70.6%`).
+- `paritychecker`: PASS. Kernel parity runner passes.
+- `fuzz_smoke`: skipped because no fuzz-affecting files changed.
+- `lint_snapshot`: PASS without snapshot update.
+- `generate_docs`: PASS.
+
+The only failing lane was `bench_snapshot`: at that point the `kernel_x86_64`
+suite emitted 15 `kernel/x86_64/*` entries without baselines in
+`snapshots/bench/benchmarks.txt`. User approval was then granted and the
+snapshot baseline was updated, after which the scoped gate passed.
+
+## Validation Sign-Off
+
+- [x] x86_64 optimized q2_K route has automated/source-backed validation.
+- [x] x86_64 optimized q3_K route has automated/source-backed validation.
+- [x] x86_64 shared fallback/no-claim paths have automated validation.
+- [x] Focused x86_64 and kernel/graph tests pass.
+- [x] Coverage, parity, lint, fuzz routing, and docs lanes pass in scoped gate.
+- [x] Snapshot updates were explicitly approved and applied.
+- [x] Scoped quality gate passes after approved benchmark baseline update.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+ frontmatter.
+- [x] Rule-compliance evidence is recorded through explicit q2/q3
+ guards/transitions, block-native operand tests, unsupported feature scans, and
+ lint.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VERIFICATION.md b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VERIFICATION.md
new file mode 100644
index 00000000..084ce424
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VERIFICATION.md
@@ -0,0 +1,51 @@
+# Phase 241 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| Supported x86_64 q2_K route is optimized | `src/emel/kernel/x86_64/guards.hpp`, `sm.hpp`, `actions.hpp`; `tests/kernel/x86_64_tests.cpp` q2 route-counter test | PASS |
+| Supported x86_64 q3_K route is optimized | `src/emel/kernel/x86_64/guards.hpp`, `sm.hpp`, `actions.hpp`; `tests/kernel/x86_64_tests.cpp` q3 route-counter test | PASS |
+| Native EMEL-owned row kernels exist | `dot_q2_k_q8_k_row_avx2_fma` and `dot_q3_k_q8_k_row_avx2_fma` in `src/emel/kernel/x86_64/actions.hpp` | PASS |
+| Same effective operand class is preserved | tests use block-q2_K/block-q3_K LHS plus block-q8_K RHS generated by the maintained q8_K quantizer; no whole-tensor f32 substitution was added | PASS |
+| Shared fallback/no-claim behavior remains explicit | feature-disabled x86_64 q2/q3 tests increment shared counters and not optimized counters | PASS |
+| Required quality gate | approved `kernel_x86_64` benchmark/parity snapshots landed; scoped quality gate passed all selected lanes | PASS |
+
+## Evidence Summary
+
+- Failing-first compile proof captured after adding tests:
+ `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+ failed before implementation on missing `dot_q2_k_q8_k_row_avx2_fma`,
+ `dot_q3_k_q8_k_row_avx2_fma`, and q2/q3 actor counter accessors.
+- `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+ passes after implementation.
+- `cmake --build build/phase239 --target emel_tests_bin -j2` passes.
+- `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`
+ passes.
+- Source-only unsupported x86 flag scan passes:
+ `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+ returns no matches.
+- `scripts/lint_snapshot.sh` passes without updating
+ `snapshots/lint/clang_format.txt`.
+- `git diff --check` passes for the Phase 241 changed source/test files and
+ the touched planning files.
+- Scoped `scripts/quality_gates.sh` passes all non-benchmark lanes:
+ coverage, paritychecker, fuzz routing, lint snapshot, and docs generation.
+- Coverage evidence from the scoped gate:
+ `changed-line coverage: lines 576/601 (95.8%), branches 144/204 (70.6%)`.
+- Approved benchmark snapshots now include the `kernel/x86_64/*` benchmark
+ suite entries.
+
+## Final Verification
+
+The user approved snapshot updates. `scripts/bench.sh --snapshot --update
+--suite=kernel_x86_64` updated the benchmark baseline, the maintained generation
+publication baselines were also updated, and the changed-file scoped quality
+gate then passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 241 is fully verified for `XQK-01` and `XQK-02`.
diff --git a/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-PLAN.md b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-PLAN.md
new file mode 100644
index 00000000..9e260d20
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-PLAN.md
@@ -0,0 +1,113 @@
+# Phase 242 Plan: x86_64 Vectorized q6_K and Hot-Path Contract
+
+## Goal
+
+Add the x86_64 AVX2/FMA `q6_K x q8_K` kernel and prove the maintained x86_64
+quantized hot path stays block-native and allocation-free for supported
+q2_K/q3_K/q6_K optimized requests.
+
+## Tasks
+
+
+
+
+ Extend `tests/kernel/x86_64_tests.cpp` with q6_K tests that fail before
+ implementation. Cover row-kernel correctness, optimized q6 counters,
+ shared q6 counters when the feature contract is disabled, multi-block
+ accumulation, and source-backed hot-path contract assertions for q2/q3/q6.
+
+
+ Compile or run the focused x86_64/kernel shard and capture the expected
+ red failure before implementation.
+
+
+
+
+
+ Add an x86_64 `q6_K x q8_K` row helper using AVX2/FMA numeric work and
+ the same block-q6_K/block-q8_K operand format as scalar/AArch64. Keep any
+ branch/tail handling local to the already-selected q6 numeric kernel.
+
+
+ Row-level tests match scalar/reference output within the established
+ quantized tolerance across multiple block counts.
+
+
+
+
+
+ Add q6 optimized route predicates in `guards.hpp`, selected-route actions
+ and counters in `actions.hpp`/`context.hpp`, actor accessors in `sm.hpp`,
+ and transition rows in `sm.hpp` before generic f32 SIMD and shared scalar
+ `op_mul_mat` routes.
+
+
+ Actor tests prove optimized q6 counters increment for supported requests
+ and shared q6 counters increment when the feature contract is disabled.
+
+
+
+
+
+ Add focused proof that supported q2_K/q3_K/q6_K optimized requests stay on
+ q*_K x q8_K block operands and do not allocate during dispatch. Prefer a
+ maintained allocation guard or source-backed route assertions over broad
+ instrumentation.
+
+
+ Tests or source-backed validation show no whole-tensor dequantize-to-f32
+ substitution and no dispatch-time heap allocation for supported optimized
+ q2/q3/q6 requests.
+
+
+
+
+
+ Write `242-VERIFICATION.md`, `242-VALIDATION.md`, and `242-01-SUMMARY.md`
+ from source/test evidence. Keep benchmark snapshot approval visible as a
+ shared closeout gate and do not mutate snapshots without explicit
+ approval.
+
+
+ Focused tests, source scan, lint snapshot, and changed-file scoped quality
+ gate evidence are recorded.
+
+
+
+
+## Verification
+
+1. Failing-first x86_64 q6/hot-path tests.
+2. `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`.
+3. `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`.
+4. Source scan for forbidden x86 feature claims/flags:
+ `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+5. `scripts/lint_snapshot.sh` without snapshot updates.
+6. Changed-file scoped `scripts/quality_gates.sh` with
+ `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`. Do not update snapshots
+ without explicit approval.
+
+
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md`.
+- Keep runtime behavior selection in `guards.hpp` and `sm.hpp` transitions.
+- Keep action/detail helpers as already-chosen numeric work only.
+- Do not add queue/mailbox/deferred-dispatch behavior.
+- Do not add AVX-512, AVX-VNNI, AMX, BF16, native-FP16, GPU, or llama.cpp/ggml
+ runtime linkage.
+- Do not add whole-tensor dequantize-to-f32 hot-path substitution.
+- Do not update snapshots without explicit user approval.
+
+
+
+## Completion Criteria
+
+- `XQK-03` has source-backed evidence that supported x86_64 `q6_K x q8_K`
+ requests run through a native optimized x86 kernel.
+- `XQK-04` has source-backed evidence that supported q2_K/q3_K/q6_K optimized
+ requests stay block-native and allocation-free during dispatch.
+- Tests prove optimized/shared q6 attribution, row correctness, and hot-path
+ contract behavior.
+- Verification artifacts distinguish source completion from the still-pending
+ repository-level benchmark snapshot approval gate.
diff --git a/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-SUMMARY.md b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-SUMMARY.md
new file mode 100644
index 00000000..371d457b
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-SUMMARY.md
@@ -0,0 +1,43 @@
+---
+phase: 242
+status: passed
+requirements-completed:
+ - XQK-03
+ - XQK-04
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 242 Summary
+
+## What Changed
+
+- Added an EMEL-owned x86_64 AVX2/FMA row kernel for `q6_K x q8_K` in
+ `src/emel/kernel/x86_64/actions.hpp`.
+- Kept the quantized operand path block-native: q6_K LHS blocks and q8_K RHS
+ blocks generated by the maintained quantizer. No whole-tensor
+ dequantize-to-f32 hot-path substitution was added.
+- Added explicit q6 optimized and shared route counters to the x86_64 actor
+ context and exposed actor accessors for attribution.
+- Routed supported q6_K `op_mul_mat` requests through explicit guards and SML
+ transitions before generic f32 SIMD and shared scalar routes.
+- Added x86_64 tests proving q6 row correctness against the scalar q6 oracle,
+ optimized route attribution on supported contracts, shared route attribution
+ when the feature contract is disabled, and allocation-free q2/q3/q6 optimized
+ hot-path dispatch.
+
+## Validation
+
+- Failing-first x86_64 test object compile: red captured before implementation.
+- x86_64 test object compile: pass after implementation.
+- `emel_tests_bin` build: pass.
+- `emel_tests_kernel_and_graph` CTest shard: pass.
+- Unsupported x86 feature flag source scan: pass.
+- `scripts/lint_snapshot.sh`: pass without snapshot updates.
+- `git diff --check`: pass.
+- Scoped `scripts/quality_gates.sh`: coverage, paritychecker, benchmark
+ snapshot, lint, docs, and fuzz routing pass after approved snapshot updates.
+
+## Closeout Status
+
+The Phase 242 implementation satisfies `XQK-03` and `XQK-04`, and both are verified.
diff --git a/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-CONTEXT.md b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-CONTEXT.md
new file mode 100644
index 00000000..2e0a9166
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-CONTEXT.md
@@ -0,0 +1,112 @@
+# Phase 242: x86_64 Vectorized q6_K and Hot-Path Contract - Context
+
+**Gathered:** 2026-06-25
+**Status:** Ready for planning
+**Mode:** Auto-generated (autonomous processor-support phase)
+
+
+## Phase Boundary
+
+Add the EMEL-owned x86_64 AVX2/FMA `q6_K x q8_K` kernel for the maintained
+quantized `op_mul_mat` hot path and prove the q2_K/q3_K/q6_K optimized x86_64
+routes preserve the hot-path contract: same effective operand class, no
+whole-tensor dequantize-to-f32 substitution, and no dispatch-time allocation.
+This phase does not own runtime generator integration, parity publication, or
+benchmark publication; those remain active Phase 243-244 obligations.
+
+
+
+
+## Implementation Decisions
+
+### Kernel Contract
+- Implement a native x86_64 `q6_K x q8_K` row kernel in
+ `src/emel/kernel/x86_64/actions.hpp`.
+- Preserve the same operand path as scalar/AArch64 q6: block-q6_K LHS,
+ block-q8_K RHS produced by the maintained q8_K quantizer, and f32 output.
+- Do not add whole-tensor dequantize-to-f32 hot-path substitution.
+- Keep q6_K scope to the maintained unpacked block-q6_K route; packed/prepared
+ q6 vector variants are not required for this phase unless already needed by
+ the maintained x86 route under test.
+
+### Routing Contract
+- Add x86_64 q6 optimized route predicates in `guards.hpp`, selected-route
+ actions in `actions.hpp`, counters in `context.hpp`, actor accessors in
+ `sm.hpp`, and transition rows before generic f32 SIMD/shared scalar
+ `op_mul_mat` routes.
+- Keep runtime behavior choice in guards and SML transitions; q6 actions must
+ execute only the already-selected q6 path.
+- Add shared q6 counter attribution for the scalar fallback/no-claim path.
+
+### Hot-Path Contract Proof
+- Prove q2_K/q3_K/q6_K optimized routes consume q*_K blocks and q8_K RHS blocks,
+ not whole-tensor dequantized f32 intermediates.
+- Prove supported optimized dispatch performs no heap allocation by exercising
+ q2/q3/q6 route calls under the repo's maintained allocation/checking pattern
+ or a focused deterministic allocation guard.
+- Keep validation source-backed: tests must drive public actor dispatch, not
+ private action helpers directly.
+
+### Agent's Discretion
+- Start with row helper correctness plus actor route tests for q6_K, then add
+ the narrowest allocation/operand-fidelity assertions that are source-backed
+ and maintainable.
+- Reuse Phase 241 x86 fixtures and AArch64 q6 test patterns where practical.
+
+
+
+
+## Existing Code Insights
+
+### Reusable Assets
+- `src/emel/kernel/detail.hpp` owns scalar `dot_q6_k_q8_k_block_scalar` and
+ `dot_q6_k_q8_k_row_scalar`.
+- `src/emel/kernel/aarch64/actions.hpp` contains NEON q6 row and `op_mul_mat`
+ route/counter precedent.
+- `tests/kernel/aarch64_tests.cpp` has q6 row correctness and actor route
+ examples.
+- Phase 241 added x86 q2/q3 helpers, route counters, guards/transitions, and
+ focused x86 tests that can be extended for q6.
+
+### Established Patterns
+- Quantized optimized routes increment optimized q-format counters; shared
+ scalar routes increment shared q-format counters.
+- Feature-disabled machine contexts prove fallback/no-claim behavior.
+- Focused x86 tests compare optimized row/mul_mat output against scalar q*_K x
+ q8_K oracles and then validate actor route attribution.
+
+### Integration Points
+- `src/emel/kernel/x86_64/actions.hpp`: q6 row kernel, selected-route action,
+ shared/optimized counter increments, and hot-path helper code.
+- `src/emel/kernel/x86_64/guards.hpp`: q6 optimized route predicate and generic
+ q6 exclusion from shared route predicates.
+- `src/emel/kernel/x86_64/context.hpp`: q6 route counters.
+- `src/emel/kernel/x86_64/sm.hpp`: q6 transition row and actor accessors.
+- `tests/kernel/x86_64_tests.cpp`: q6 row/route tests and hot-path contract
+ tests.
+
+
+
+
+## Specific Ideas
+
+- Phase 242 closes the quantized-kernel set named in v1.27: q2_K, q3_K, and
+ q6_K on this AVX2/FMA host.
+- The hot-path contract is an implementation obligation in this phase, not a
+ publication-only claim: supported optimized requests must stay block-native
+ and allocation-free during dispatch.
+
+
+
+
+## Active Follow-On Scope
+
+- Phase 243: maintained runtime integration and parity proof.
+- Phase 244: benchmark attribution and publication truth.
+
+
+
+---
+
+*Phase: 242-x86-64-vectorized-q6-k-and-hot-path-contract*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VALIDATION.md b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VALIDATION.md
new file mode 100644
index 00000000..921e7196
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VALIDATION.md
@@ -0,0 +1,109 @@
+---
+phase: 242
+slug: x86-64-vectorized-q6-k-and-hot-path-contract
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 242 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | doctest, CTest, source scans, lint snapshot, quality gate |
+| Config file | `CMakeLists.txt`; `scripts/quality_gates.sh` |
+| Quick run command | `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` |
+| Gate command | `EMEL_QUALITY_GATES_CHANGED_FILES="" EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` |
+| Current gate status | passed after approved x86_64 benchmark baseline update |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 242-01-01 | XQK-03, XQK-04 | failing-first compile proof | `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o` before implementation | red captured |
+| 242-01-02 | XQK-03 | row-kernel correctness | x86_64 q6 row test against scalar q6_K x q8_K helper | green |
+| 242-01-03 | XQK-03 | public actor route proof | optimized/shared q6 counter tests in `tests/kernel/x86_64_tests.cpp` | green |
+| 242-01-04 | XQK-04 | hot-path contract proof | allocation-guarded q2/q3/q6 optimized dispatch test plus source review of q*_K x q8_K operand path | green |
+| 242-01-05 | quality gate | scoped quality gate | `EMEL_QUALITY_GATES_CHANGED_FILES="..." EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o
+```
+
+Result: PASS after implementation. The pre-fix run failed on missing x86_64 q6
+AVX2/FMA row helper symbols and actor q6 counter accessors.
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS.
+
+```bash
+ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'
+```
+
+Result: PASS, `100% tests passed`.
+
+```bash
+rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker
+```
+
+Result: PASS, no unsupported x86 compile flags found.
+
+```bash
+scripts/lint_snapshot.sh
+```
+
+Result: PASS. No lint snapshot update was made.
+
+```bash
+git diff --check
+```
+
+Result: PASS.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="src/emel/kernel/x86_64/actions.hpp,src/emel/kernel/x86_64/context.hpp,src/emel/kernel/x86_64/guards.hpp,src/emel/kernel/x86_64/sm.hpp,tests/kernel/x86_64_tests.cpp" \
+scripts/quality_gates.sh
+```
+
+Before snapshot approval, the initial scoped gate run passed all
+non-benchmark lanes:
+
+- `test_with_coverage`: PASS. CTest shard `kernel_and_graph` passes.
+ Changed-line coverage is `676/701` lines (`96.4%`) and `169/238` branches
+ (`71.0%`).
+- `paritychecker`: PASS. Kernel parity runner passes.
+- `fuzz_smoke`: skipped because no fuzz-affecting files changed.
+- `lint_snapshot`: PASS without snapshot update.
+- `generate_docs`: PASS.
+
+The only failing lane was `bench_snapshot`: the `kernel_x86_64` suite emitted
+15 `kernel/x86_64/*` entries that had no baselines in
+`snapshots/bench/benchmarks.txt`. User approval was then granted, the snapshot
+baseline was updated, and the scoped gate passed on rerun.
+
+## Validation Sign-Off
+
+- [x] x86_64 optimized q6_K route has automated/source-backed validation.
+- [x] x86_64 shared q6 fallback/no-claim path has automated validation.
+- [x] q2_K/q3_K/q6_K optimized dispatch has allocation-free hot-path validation.
+- [x] Focused x86_64 and kernel/graph tests pass.
+- [x] Coverage, parity, lint, fuzz routing, and docs lanes pass in scoped gate.
+- [x] Snapshot updates were explicitly approved and applied.
+- [x] Scoped quality gate passes after approved benchmark baseline update.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+ frontmatter.
+- [x] Rule-compliance evidence is recorded through explicit q6
+ guards/transitions, allocation-guarded optimized dispatch tests, block-native
+ operand review, unsupported feature scans, and lint.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VERIFICATION.md b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VERIFICATION.md
new file mode 100644
index 00000000..b357797d
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VERIFICATION.md
@@ -0,0 +1,51 @@
+# Phase 242 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| Supported x86_64 q6_K route is optimized | `src/emel/kernel/x86_64/guards.hpp`, `sm.hpp`, `actions.hpp`; `tests/kernel/x86_64_tests.cpp` q6 route-counter test | PASS |
+| Native EMEL-owned q6 row kernel exists | `dot_q6_k_q8_k_row_avx2_fma` and `dot_q6_k_q8_k_block_avx2_fma` in `src/emel/kernel/x86_64/actions.hpp` | PASS |
+| Same effective operand class is preserved | q6 tests use block-q6_K LHS plus block-q8_K RHS generated by the maintained q8_K quantizer; q2/q3/q6 hot-path test routes through block-native kernels | PASS |
+| Supported q2/q3/q6 dispatch is allocation-free | `kernel_x86_64_quantized_hot_path_dispatches_without_allocation` wraps q2/q3/q6 optimized dispatch in `allocation_scope` and observes zero allocations | PASS |
+| No whole-tensor dequantize-to-f32 hot-path substitution was added | optimized q2/q3/q6 actions quantize RHS columns to q8_K blocks and call q*_K x q8_K row helpers; source review found no dequantize-to-f32 substitution in the x86_64 optimized path | PASS |
+| Shared fallback/no-claim behavior remains explicit | feature-disabled q6 actor test increments shared q6 counter and not optimized q6 counter | PASS |
+| Required quality gate | approved `kernel_x86_64` benchmark/parity snapshots landed; scoped quality gate passed all selected lanes | PASS |
+
+## Evidence Summary
+
+- Failing-first compile proof captured after adding q6/hot-path tests:
+ `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+ failed before implementation on missing `dot_q6_k_q8_k_row_avx2_fma` and q6
+ actor counter accessors.
+- `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+ passes after implementation.
+- `cmake --build build/phase239 --target emel_tests_bin -j2` passes.
+- `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`
+ passes.
+- Source-only unsupported x86 flag scan passes:
+ `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+ returns no matches.
+- `scripts/lint_snapshot.sh` passes without updating
+ `snapshots/lint/clang_format.txt`.
+- `git diff --check` passes.
+- Scoped `scripts/quality_gates.sh` passes all non-benchmark lanes:
+ coverage, paritychecker, fuzz routing, lint snapshot, and docs generation.
+- Coverage evidence from the scoped gate:
+ `changed-line coverage: lines 676/701 (96.4%), branches 169/238 (71.0%)`.
+- Approved benchmark snapshots now include the `kernel/x86_64/*` benchmark
+ suite entries.
+
+## Final Verification
+
+User approved snapshot updates. `scripts/bench.sh --snapshot --update
+--suite=kernel_x86_64` updated the benchmark baseline, maintained generation
+publication baselines were updated, and the changed-file scoped quality gate
+passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 242 is fully verified for `XQK-03` and `XQK-04`.
diff --git a/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-PLAN.md b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-PLAN.md
new file mode 100644
index 00000000..e643fad5
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-PLAN.md
@@ -0,0 +1,144 @@
+# Phase 243 Plan: Runtime Integration and Parity Proof
+
+## Goal
+
+Prove the maintained generator -> graph -> processor -> kernel chain selects
+the x86_64 AVX2/FMA optimized quantized path on this host and publish that
+proof through maintained generator tests and paritychecker generation
+attribution.
+
+## Tasks
+
+
+
+
+### Task 1
+
+Read first:
+
+ - `tests/text/generator/lifecycle_tests.cpp`
+ - `src/emel/text/generator/events.hpp`
+ - `src/emel/text/generator/actions.hpp`
+ - `src/emel/kernel/any.hpp`
+
+Files to change:
+
+ - `tests/text/generator/lifecycle_tests.cpp`
+
+Action:
+
+ Strengthen `generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback`
+ so the x86_64 branch requires:
+ `diagnostics.optimized_q2_dispatch_calls > 0u`,
+ `diagnostics.optimized_q3_dispatch_calls > 0u`,
+ `diagnostics.optimized_q6_dispatch_calls > 0u`,
+ `diagnostics.shared_q2_dispatch_calls == 0u`,
+ `diagnostics.shared_q3_dispatch_calls == 0u`, and
+ `diagnostics.shared_q6_dispatch_calls == 0u`.
+ Keep the existing AArch64 q6-vector assertions and non-optimized vector
+ assertions for other hosts intact.
+
+Acceptance criteria:
+
+ - `tests/text/generator/lifecycle_tests.cpp` contains `diagnostics.optimized_q2_dispatch_calls > 0u`.
+ - `tests/text/generator/lifecycle_tests.cpp` contains `diagnostics.optimized_q3_dispatch_calls > 0u`.
+ - `tests/text/generator/lifecycle_tests.cpp` contains `diagnostics.optimized_q6_dispatch_calls > 0u`.
+ - `build/phase239/emel_tests_bin --test-case='generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback' --no-skipped-summary` exits 0.
+
+
+
+
+
+### Task 2
+
+Read first:
+
+ - `tools/paritychecker/parity_engines.cpp`
+ - `tools/paritychecker/paritychecker_tests.cpp`
+ - `tools/paritychecker/parity_runner.cpp`
+
+Files to change:
+
+ - `tools/paritychecker/parity_engines.cpp`
+ - `tools/paritychecker/paritychecker_tests.cpp`
+
+Action:
+
+ Update the generation parity attribution check in
+ `tools/paritychecker/parity_engines.cpp` so `kernel_kind::x86_64`
+ requires optimized q2/q3/q6 dispatch counters to be positive and shared
+ q2/q3/q6 counters to be zero. Keep AArch64 and non-x86 behavior explicit.
+ Extend `check_generation_quantized_attribution` in
+ `tools/paritychecker/paritychecker_tests.cpp` so x86_64 output requires
+ positive `optimized_q2_dispatch_calls`, `optimized_q3_dispatch_calls`,
+ and `optimized_q6_dispatch_calls`, with zero shared q2/q3/q6 counters.
+
+Acceptance criteria:
+
+ - `tools/paritychecker/parity_engines.cpp` contains `kernel_kind::x86_64` near the generation quantized dispatch proof.
+ - `tools/paritychecker/paritychecker_tests.cpp` contains x86 assertions for `optimized_q2_dispatch_calls`, `optimized_q3_dispatch_calls`, and `optimized_q6_dispatch_calls`.
+ - `cmake --build build/paritychecker_zig --target paritychecker paritychecker_tests -j2` exits 0.
+ - `build/paritychecker_zig/paritychecker_tests --test-case='paritychecker matches current maintained generation publication across max-token counts' --no-skipped-summary` exits 0 when maintained fixtures are present.
+
+
+
+
+
+### Task 3
+
+Read first:
+
+ - `.planning/REQUIREMENTS.md`
+ - `.planning/ROADMAP.md`
+ - `.planning/STATE.md`
+ - `.planning/phases/243-runtime-integration-and-parity-proof/243-01-PLAN.md`
+
+Files to change:
+
+ - `.planning/phases/243-runtime-integration-and-parity-proof/243-VERIFICATION.md`
+ - `.planning/phases/243-runtime-integration-and-parity-proof/243-VALIDATION.md`
+ - `.planning/phases/243-runtime-integration-and-parity-proof/243-01-SUMMARY.md`
+ - `.planning/REQUIREMENTS.md`
+ - `.planning/ROADMAP.md`
+ - `.planning/STATE.md`
+ - `.planning/PROJECT.md`
+
+Action:
+
+ Record source/test evidence for `XRT-01`, `XRT-02`, and `XRT-03`.
+ Keep publication and quality-gate state source-backed; do not update
+ snapshots unless approval has been granted.
+
+Acceptance criteria:
+
+ - `.planning/phases/243-runtime-integration-and-parity-proof/243-VERIFICATION.md` exists.
+ - `.planning/phases/243-runtime-integration-and-parity-proof/243-VALIDATION.md` exists.
+ - `.planning/phases/243-runtime-integration-and-parity-proof/243-01-SUMMARY.md` exists.
+ - `.planning/REQUIREMENTS.md` traceability rows for `XRT-01`, `XRT-02`, and `XRT-03` reflect the current verified state.
+
+
+
+
+## Verification
+
+1. Focused generator doctest:
+ `build/phase239/emel_tests_bin --test-case='generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback' --no-skipped-summary`.
+2. Paritychecker build:
+ `cmake --build build/paritychecker_zig --target paritychecker paritychecker_tests -j2`.
+3. Paritychecker maintained generation publication test:
+ `build/paritychecker_zig/paritychecker_tests --test-case='paritychecker matches current maintained generation publication across max-token counts' --no-skipped-summary`.
+4. Source scan for forbidden x86 feature claims/flags:
+ `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`.
+5. `scripts/lint_snapshot.sh` without snapshot updates.
+6. Changed-file scoped `scripts/quality_gates.sh` with
+ `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`. Do not update snapshots
+ without explicit approval.
+
+
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md`.
+- Keep runtime behavior selection in guards and SML transitions; Phase 243
+ should not move routing choice into actions/detail.
+- Drive proof through public events and maintained state machines. Do not reach
+ into actor `actions.hpp` or private helpers from paritychecker/tests.
+- Keep EMEL and llama.cpp/ggml lanes separated. Paritychecker may use
+ llama.cpp/ggml only on the reference comparison lane.
+- Do not add queue/mailbox/deferred-dispatch behavior.
+- Do not add AVX-512, AVX-VNNI, AMX, BF16, native-FP16, GPU, or llama.cpp/ggml
+ runtime linkage.
+- Do not add whole-tensor dequantize-to-f32 hot-path substitution.
+- Do not update snapshots without explicit user approval.
+
+
+
+## Completion Criteria
+
+- `XRT-01` has source-backed evidence that generator diagnostics report the
+ x86_64 kernel kind and optimized q2/q3/q6 dispatch on the maintained
+ quantized-contract generation path.
+- `XRT-02` has paritychecker generation evidence for maintained fixture token
+ counts `1`, `10`, `100`, and `1000`.
+- `XRT-03` has tests proving supported optimized behavior and deterministic
+ fallback/no-claim behavior through public generator dispatch and diagnostics.
+- Verification artifacts distinguish source completion from the still-pending
+ repository-level benchmark snapshot approval gate.
diff --git a/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-SUMMARY.md b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-SUMMARY.md
new file mode 100644
index 00000000..650c835c
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-SUMMARY.md
@@ -0,0 +1,50 @@
+---
+phase: 243
+status: passed
+requirements-completed:
+ - XRT-01
+ - XRT-02
+ - XRT-03
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 243 Summary
+
+## What Changed
+
+- Strengthened the maintained quantized-contract generator lifecycle test so
+ x86_64 hosts must report optimized q2/q3/q6 dispatch counters and zero shared
+ q2/q3/q6 counters through public generator diagnostics.
+- Updated paritychecker generation attribution so x86_64 maintained generation
+ proof requires native q2/q3/q6 optimized counters when those native tensor
+ types are present.
+- Extended paritychecker tests to parse and assert the x86_64
+ `quantized_dispatch:` counters emitted by the maintained generation path.
+- Fixed reference context sizing so live generation parity works for larger
+ `--max-tokens` runs after prompt tokenization.
+- Bound model-specific RoPE pairing metadata for Qwen3, Gemma4, and LFM2 so
+ maintained generation parity uses the correct NeoX/normal RoPE layout without
+ adding runtime hot-path layout routing.
+- Removed temporary generation/parity diagnostics probes that were not part of
+ the maintained runtime proof surface.
+
+## Validation
+
+- `emel_tests_bin` build: pass.
+- `paritychecker` and `paritychecker_tests` build: pass.
+- Focused generator, model-binding, and generator-detail doctests: pass.
+- `paritychecker_tests`: pass.
+- Maintained generation publication test against live reference: pass.
+- Live EMEL/reference generation parity for `1`, `10`, `100`, and `1000`
+ tokens: match. Checked-in generation baselines for `10`, `100`, and `1000`
+ tokens were updated after explicit approval.
+- Domain-boundary guard and unsupported x86 feature scan: pass.
+- `scripts/lint_snapshot.sh`: pass without snapshot updates.
+- `git diff --check`: pass.
+- Scoped `scripts/quality_gates.sh`: build, coverage, paritychecker, benchmark
+ snapshot, lint, docs, and fuzz routing pass after approved snapshot updates.
+
+## Closeout Status
+
+Phase 243 satisfies and verifies `XRT-01`, `XRT-02`, and `XRT-03`.
diff --git a/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-CONTEXT.md b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-CONTEXT.md
new file mode 100644
index 00000000..474377f4
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-CONTEXT.md
@@ -0,0 +1,112 @@
+# Phase 243: Runtime Integration and Parity Proof - Context
+
+**Gathered:** 2026-06-25
+**Status:** Ready for planning
+**Mode:** Auto-generated (autonomous processor-support phase)
+
+
+## Phase Boundary
+
+Adopt the x86_64 AVX2/FMA kernel work from Phases 239-242 in the maintained
+generator -> graph -> processor -> kernel proof surfaces. This phase does not
+add new numeric kernels; it proves the shipped runtime chain selects the new
+x86_64 optimized routes where the maintained generation fixture actually uses
+q2_K/q3_K/q6_K tensors, and that paritychecker publishes the corresponding
+attribution for `1`, `10`, `100`, and `1000` token generation runs.
+
+
+
+
+## Implementation Decisions
+
+### Runtime Diagnostics Contract
+- Use the existing generator `capture_diagnostics` event as the runtime proof
+ surface. It already exposes `kernel_kind`, total kernel dispatch count,
+ flash attribution, q2/q3/q6 optimized and shared counters, and quantized
+ contract stage counts.
+- Do not add public API or C ABI surface. Phase 243 proof stays inside
+ maintained tests and paritychecker attribution.
+- Do not reach into actor actions or private helpers from tests or tools; drive
+ generator proof through `process_event(...)` and public generator events.
+
+### Maintained Generation Fixture
+- Use `generator_fixture::model_variant::quantized_contract` in
+ `tests/text/generator/lifecycle_tests.cpp` for source-backed generator-chain
+ assertions. Its tensor setup assigns q2_K/q3_K/q6_K to maintained model
+ stages.
+- On x86_64 hosts, require optimized q2/q3/q6 dispatch counters to be positive
+ and shared q2/q3/q6 counters to stay zero for the quantized-contract generate
+ path.
+- On non-x86 hosts, keep existing platform-specific expectations intact.
+
+### Paritychecker Proof
+- Update `tools/paritychecker/parity_engines.cpp` so generation parity accepts
+ and requires x86_64 optimized q2/q3/q6 attribution when the maintained
+ generation fixture runs on the x86_64 kernel kind.
+- Keep non-x86 and AArch64 expectations explicit; do not claim x86 optimized
+ attribution on other kernel kinds.
+- Extend existing `tools/paritychecker/paritychecker_tests.cpp` attribution
+ checks so emitted `quantized_dispatch:` output proves the x86 counters are
+ positive and shared counters are zero.
+
+### Validation
+- Run focused generator quantized-contract doctest cases directly when the
+ broad `emel_tests_generator_and_runtime` shard is blocked by unrelated dirty
+ embedding fixture failures.
+- Run paritychecker generation proof for the maintained fixture and token
+ counts `1`, `10`, `100`, and `1000` when fixture assets are present.
+- Keep benchmark snapshot approval as the shared milestone closeout gate; do
+ not update snapshots without explicit approval.
+
+
+
+
+## Existing Code Insights
+
+### Reusable Assets
+- `src/emel/text/generator/actions.hpp` fills generator diagnostics from
+ `ctx.compute.backend.kernel.*_dispatch_count()` accessors.
+- `src/emel/kernel/any.hpp` exposes q2/q3/q6 optimized/shared counters across
+ x86_64 and AArch64 kernel actors.
+- `tests/text/generator/lifecycle_tests.cpp` contains the
+ `quantized_contract` generator fixture and existing quantized contract tests.
+- `tools/paritychecker/parity_engines.cpp` prints `quantized_dispatch:` and
+ validates runtime quantized attribution during generation parity.
+- `tools/paritychecker/paritychecker_tests.cpp` already parses generation
+ attribution output.
+
+### Integration Points
+- `tests/text/generator/lifecycle_tests.cpp`: strengthen maintained generator
+ diagnostics assertions for x86_64 q2/q3/q6 optimized dispatch.
+- `tools/paritychecker/parity_engines.cpp`: require the x86_64 generation
+ parity path to show optimized q2/q3/q6 dispatch and zero shared q2/q3/q6
+ dispatch.
+- `tools/paritychecker/paritychecker_tests.cpp`: assert the emitted
+ `quantized_dispatch:` metrics match the x86_64 runtime contract.
+
+
+
+
+## Specific Ideas
+
+- The generator-level proof must distinguish two facts:
+ 1. f32/default fixtures do not claim quantized optimized dispatch.
+ 2. the quantized-contract fixture does claim optimized q2/q3/q6 dispatch on
+ x86_64 and does not fall back to shared q2/q3/q6 dispatch.
+- Paritychecker should fail if x86_64 generation parity succeeds numerically
+ while the optimized attribution counters are missing.
+
+
+
+
+## Active Next Scope
+
+- Phase 244: benchmark attribution and publication truth after Phase 243
+ runtime/parity proof is source-backed.
+
+
+
+---
+
+*Phase: 243-runtime-integration-and-parity-proof*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VALIDATION.md b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VALIDATION.md
new file mode 100644
index 00000000..654bcf49
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VALIDATION.md
@@ -0,0 +1,140 @@
+---
+phase: 243
+slug: runtime-integration-and-parity-proof
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 243 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | doctest, paritychecker, source scans, lint snapshot, quality gate |
+| Config file | `CMakeLists.txt`; `scripts/quality_gates.sh` |
+| Quick run command | `build/phase239/emel_tests_bin --test-case='generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback' --no-skipped-summary` |
+| Gate command | `EMEL_QUALITY_GATES_CHANGED_FILES="" EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` |
+| Current gate status | passed after approved benchmark and generation baseline snapshot updates |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 243-01-01 | XRT-01, XRT-03 | generator-chain route proof | focused quantized-contract generator doctest | green |
+| 243-01-02 | XRT-02, XRT-03 | paritychecker attribution proof | paritychecker tests and live reference generation parity | green; publication baselines refreshed after approval |
+| 243-01-03 | quality gate | scoped quality gate | `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS.
+
+```bash
+cmake --build build/paritychecker_zig --target paritychecker paritychecker_tests -j2
+```
+
+Result: PASS.
+
+```bash
+build/phase239/emel_tests_bin --test-case='generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback' --no-skipped-summary
+```
+
+Result: PASS. On x86_64 the maintained quantized-contract fixture reports positive
+optimized q2/q3/q6 dispatch counters and zero shared q2/q3/q6 counters.
+
+```bash
+build/paritychecker_zig/paritychecker_tests
+```
+
+Result: PASS.
+
+```bash
+build/paritychecker_zig/paritychecker_tests --test-case="paritychecker matches current maintained generation publication against live reference" --no-skipped-summary
+```
+
+Result: PASS.
+
+```bash
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=1
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=10
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=100
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=1000
+```
+
+Result: live EMEL/reference generation matched for all four token counts. The
+`--max-tokens=1` run also matched the checked-in baseline. At the time of this run,
+the `10`, `100`, and `1000` token runs exited nonzero only because their checked-in
+generation baselines still contained the previous stale publication text.
+
+```bash
+scripts/check_domain_boundaries.sh
+```
+
+Result: PASS.
+
+```bash
+rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker
+```
+
+Result: PASS, no unsupported x86 feature claims or compile flags found. `rg`
+returns exit 1 for this no-match scan.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+scripts/lint_snapshot.sh
+```
+
+Result: PASS. No lint snapshot update was made.
+
+```bash
+git diff --check
+```
+
+Result: PASS.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="src/emel/model/data.hpp,src/emel/model/gemma4/detail.cpp,src/emel/model/lfm2/detail.cpp,src/emel/model/qwen3/detail.cpp,src/emel/text/generator/detail.hpp,tests/model/loader/lifecycle_tests.cpp,tests/text/generator/detail_tests.cpp,tests/text/generator/lifecycle_tests.cpp,tools/paritychecker/parity_engines.cpp,tools/paritychecker/paritychecker_tests.cpp" \
+scripts/quality_gates.sh
+```
+
+Initial result before snapshot approval: the scoped gate passed all
+non-benchmark lanes:
+
+- `build`: PASS.
+- `test_with_coverage`: PASS. Changed-line coverage is `715/744` lines (`96.1%`)
+ and `171/240` branches (`71.2%`).
+- `paritychecker`: PASS.
+- `fuzz_smoke`: skipped because no fuzz-affecting files changed.
+- `lint_snapshot`: PASS without snapshot update.
+- `generate_docs`: PASS.
+
+The only failing lane was `bench_snapshot`: at that point the `kernel_x86_64`
+suite emitted 15 `kernel/x86_64/*` entries without approved baselines in
+`snapshots/bench/benchmarks.txt`. The lane passed after the approved baseline update.
+
+## Validation Sign-Off
+
+- [x] Generator-chain x86_64 optimized q2/q3/q6 dispatch has automated validation.
+- [x] Paritychecker x86_64 quantized attribution has automated validation.
+- [x] Live EMEL/reference generation matches for `1`, `10`, `100`, and `1000`
+ token runs.
+- [x] Domain-boundary and unsupported x86 feature scans pass.
+- [x] Coverage, paritychecker, lint, fuzz routing, and docs lanes pass in scoped gate.
+- [x] Snapshot updates were explicitly approved and applied.
+- [x] Scoped quality gate passes after approved benchmark baseline update.
+- [x] Maintained generation publication baselines are updated after explicit approval.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+ frontmatter.
+- [x] Rule-compliance evidence is recorded through public generator dispatch,
+ domain-boundary checks, unsupported feature scans, paritychecker attribution,
+ and lint.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VERIFICATION.md b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VERIFICATION.md
new file mode 100644
index 00000000..428278e6
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VERIFICATION.md
@@ -0,0 +1,60 @@
+# Phase 243 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| Shipped generator chain selects x86_64 optimized q2/q3/q6 paths | `tests/text/generator/lifecycle_tests.cpp` requires positive optimized q2/q3/q6 counters and zero shared q2/q3/q6 counters on the maintained quantized-contract fixture | PASS |
+| Runtime proof uses public machine dispatch and diagnostics | Generator lifecycle tests drive `process_event(...)` and `capture_diagnostics`; paritychecker reads maintained generator diagnostics instead of actor private helpers | PASS |
+| Paritychecker publishes x86_64 attribution | `tools/paritychecker/parity_engines.cpp` prints `quantized_dispatch:` counters and requires x86_64 native q2/q3/q6 optimized counters when those native tensor types are present | PASS |
+| Maintained generation parity covers 1, 10, 100, and 1000 token runs | Live EMEL/reference generation parity matched at `--max-tokens` 1, 10, 100, and 1000; the 10/100/1000 runs were initially blocked only by stale checked-in generation baselines, which were refreshed after approval | PASS |
+| Supported and fallback/no-claim behavior remain deterministic | `tools/paritychecker/paritychecker_tests.cpp` asserts x86 optimized counters are positive only when native tensor types are present and shared q2/q3/q6 counters stay zero | PASS |
+| Required quality gate | approved `kernel_x86_64` benchmark/parity snapshots landed; scoped quality gate passed all selected lanes | PASS |
+
+## Evidence Summary
+
+- `cmake --build build/phase239 --target emel_tests_bin -j2` passes.
+- `cmake --build build/paritychecker_zig --target paritychecker paritychecker_tests -j2`
+ passes.
+- Focused generator/model tests pass:
+ `generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback`,
+ `generator_generate_runs_native_generator_contract`,
+ `generator_detail_lfm2_attention_uses_neox_rope_layout`,
+ `generator_detail_qwen3_generator_applies_per_head_qk_norm_before_rope`,
+ `generator_detail_gemma4_generator_applies_per_head_qk_norm_before_rope`,
+ and the Qwen3, Gemma4, and LFM2 model hparam binding tests.
+- `build/paritychecker_zig/paritychecker_tests` passes.
+- `build/paritychecker_zig/paritychecker_tests --test-case="paritychecker matches current maintained generation publication against live reference" --no-skipped-summary`
+ passes.
+- Live generation parity:
+ - `--max-tokens 1`: EMEL and reference match and the checked-in baseline matches.
+  - `--max-tokens 10`: EMEL and reference match; checked-in generation baseline was stale at the time of the run.
+  - `--max-tokens 100`: EMEL and reference match; checked-in generation baseline was stale at the time of the run.
+  - `--max-tokens 1000`: EMEL and reference match; checked-in generation baseline was stale at the time of the run.
+- `scripts/check_domain_boundaries.sh` passes.
+- Unsupported x86 flag scan passes:
+ `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+ returns no matches.
+- `scripts/lint_snapshot.sh` passes with the maintained local tool PATH and without
+ updating snapshots.
+- `git diff --check` passes.
+- Scoped `scripts/quality_gates.sh` passes build, coverage, paritychecker, lint
+ snapshot, docs generation, and fuzz routing. Coverage evidence from the scoped gate:
+ `changed-line coverage: lines 715/744 (96.1%), branches 171/240 (71.2%)`.
+- Approved benchmark snapshots now include the `kernel/x86_64/*` benchmark suite
+ entries, and maintained LFM2 generation publication baselines are current for
+ `10`, `100`, and `1000` token runs.
+
+## Final Verification
+
+User approved snapshot updates. `scripts/bench.sh --snapshot --update
+--suite=kernel_x86_64` updated the benchmark baseline, maintained generation
+publication baselines were updated, and the changed-file scoped quality gate
+passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 243 is fully verified for `XRT-01`, `XRT-02`, and `XRT-03`.
diff --git a/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-PLAN.md b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-PLAN.md
new file mode 100644
index 00000000..2061c153
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-PLAN.md
@@ -0,0 +1,145 @@
+# Phase 244 Plan: Benchmark Attribution and Publication Truth
+
+## Goal
+
+Publish maintained benchmark and generation baseline evidence for the Ryzen
+AVX2/FMA path, with source-backed attribution that distinguishes x86_64 EMEL
+optimized execution from scalar/shared paths, ARM-first claims, and reference
+lane execution.
+
+## Tasks
+
+
+
+
+ - `scripts/bench.sh`
+ - `scripts/quality_gates.sh`
+ - `tools/bench/kernel/x86_64_bench.cpp`
+ - `tools/bench/bench_runner_registry.cpp`
+ - `tools/bench/bench_dependency_manifest.cpp`
+ - `snapshots/bench/benchmarks.txt`
+
+
+ - `.planning/phases/244-benchmark-attribution-and-publication-truth/244-VALIDATION.md`
+
+
+ Run the suite-scoped benchmark snapshot preflight without updating
+ snapshots:
+ `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64`.
+ Record the exact missing or stale `kernel/x86_64/*` snapshot entries and
+ confirm no unrelated benchmark suites are required for this phase.
+
+
+ - Validation records the `kernel_x86_64` preflight command and result.
+ - Any failure is limited to missing/stale maintained `kernel/x86_64/*`
+ snapshot entries.
+ - No snapshot file is changed by the preflight.
+
+
+
+
+
+ - `scripts/bench.sh`
+ - `tools/paritychecker/parity_runner.cpp`
+ - `tools/paritychecker/parity_engines.cpp`
+ - `snapshots/bench/benchmarks.txt`
+ - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt`
+ - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt`
+ - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt`
+
+
+ - `snapshots/bench/benchmarks.txt`
+ - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt`
+ - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt`
+ - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt`
+
+
+ Only after explicit user approval, update the maintained publication
+ snapshots:
+ `scripts/bench.sh --snapshot --update --suite=kernel_x86_64`, then run
+ paritychecker with `--write-generation-baseline` for the live-matching
+ LFM2 `10`, `100`, and `1000` token generation baselines.
+
+
+ - User approval for snapshot updates is recorded in the session.
+ - `snapshots/bench/benchmarks.txt` contains the new `kernel/x86_64/*`
+ maintained benchmark entries.
+ - The stale LFM2 maintained generation baselines for `10`, `100`, and
+ `1000` token runs match live EMEL/reference output.
+ - Snapshot diffs do not change unrelated benchmark or parity baselines.
+
+
+
+
+
+ - `.planning/REQUIREMENTS.md`
+ - `.planning/ROADMAP.md`
+ - `.planning/STATE.md`
+ - `.planning/PROJECT.md`
+ - `.planning/phases/244-benchmark-attribution-and-publication-truth/244-01-PLAN.md`
+
+
+ - `.planning/phases/244-benchmark-attribution-and-publication-truth/244-VERIFICATION.md`
+ - `.planning/phases/244-benchmark-attribution-and-publication-truth/244-VALIDATION.md`
+ - `.planning/phases/244-benchmark-attribution-and-publication-truth/244-01-SUMMARY.md`
+ - `.planning/REQUIREMENTS.md`
+ - `.planning/ROADMAP.md`
+ - `.planning/STATE.md`
+ - `.planning/PROJECT.md`
+
+
+ Re-run the changed-file scoped quality gate with
+ `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`. If it passes, mark
+ `XBN-01` and `XBN-02` complete and update the v1.27 progress ledger.
+ Keep completion tied to approved snapshots and passing gate evidence.
+
+
+ - `244-VERIFICATION.md`, `244-VALIDATION.md`, and `244-01-SUMMARY.md`
+ exist.
+ - The scoped quality gate result is recorded.
+ - `XBN-01` and `XBN-02` are not marked complete unless the approved
+ snapshot updates have landed and the scoped quality gate passes.
+
+
+
+
+## Verification
+
+1. Benchmark publication preflight:
+ `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64`.
+2. Approved benchmark snapshot update:
+ `scripts/bench.sh --snapshot --update --suite=kernel_x86_64`.
+3. Approved maintained generation baseline writes for LFM2 `10`, `100`, and
+ `1000` token runs using paritychecker `--write-generation-baseline`.
+4. Paritychecker maintained generation publication test:
+ `build/paritychecker_zig/paritychecker_tests --test-case="paritychecker matches current maintained generation publication against live reference" --no-skipped-summary`.
+5. `scripts/lint_snapshot.sh` without unapproved lint snapshot changes.
+6. Changed-file scoped `scripts/quality_gates.sh` with
+ `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md`.
+- Do not add new runtime routing, queueing, deferred dispatch, or actor
+ rewrites.
+- Do not add AVX-512, AVX-VNNI, AMX, BF16, native-FP16, GPU, or llama.cpp/ggml
+ runtime claims.
+- Keep benchmark and parity lanes separated. EMEL benchmark results must come
+ from EMEL-owned runtime code; reference results are comparison-only.
+- Do not update benchmark, parity, lint, or docs snapshots without explicit user
+ approval.
+- Do not mark publication requirements complete from planning artifacts alone.
+
+
+
+## Completion Criteria
+
+- `XBN-01` has maintained benchmark evidence that `tools/bench` runs x86_64
+ flash and quantized workloads through optimized paths with attribution
+ distinct from scalar/shared paths and reference-lane execution.
+- `XBN-02` has publication evidence that the host CPU, feature contract,
+ optimized counters, and reference-lane separation are represented truthfully.
+- The approved snapshot updates are limited to the maintained benchmark and
+ generation publication baselines required by this milestone.
+- The scoped quality gate passes after approved publication updates.
diff --git a/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-SUMMARY.md b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-SUMMARY.md
new file mode 100644
index 00000000..7f0c4cf1
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-SUMMARY.md
@@ -0,0 +1,46 @@
+---
+phase: 244
+status: passed
+requirements-completed:
+ - XBN-01
+ - XBN-02
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 244 Summary
+
+## What Changed
+
+- Added Phase 244 context and plan for the benchmark attribution and publication
+ truth closeout.
+- Ran the non-mutating `kernel_x86_64` benchmark snapshot preflight.
+- Captured the would-be `kernel_x86_64` benchmark entries and EMEL/reference
+ compare rows into `/tmp` without touching `snapshots/bench/`.
+- Recorded the exact missing benchmark baseline entries and the stale maintained
+ generation baseline files, then applied the approved snapshot updates.
+- Repaired the source-backed audit gap in `XBN-01` by adding counter-checked
+ `kernel_x86_64` benchmark entries for optimized x86_64 flash attention and
+ q2/q3/q6 quantized matmul.
+- Generated candidate LFM2 `10`, `100`, and `1000` token generation baselines in
+ `/tmp/emel-phase244-baselines.N7inir` to prove the pending publication writes
+ are executable without modifying checked-in snapshots.
+
+## Validation
+
+- `node .codex/get-shit-done/bin/gsd-tools.cjs init phase-op 244`: pass.
+- `node .codex/get-shit-done/bin/gsd-tools.cjs roadmap analyze`: pass.
+- `git diff --check`: pass.
+- `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64`: pass after
+ approved benchmark snapshot update and optimized benchmark repair.
+- Direct `bench_runner --mode=emel` with `EMEL_BENCH_SUITE=kernel_x86_64`:
+ pass, 19 benchmark entries including optimized flash and q2/q3/q6 entries.
+- Direct `bench_runner --mode=compare` with `EMEL_BENCH_SUITE=kernel_x86_64`:
+ pass, 19 EMEL/reference comparison rows.
+- Temp paritychecker generation baseline writes for LFM2 `10`, `100`, and
+ `1000` token runs: pass. Candidate diffs show stale checked-in snapshots lack
+ trace token IDs/score gaps and have old output lengths.
+
+## Closeout Status
+
+Phase 244 satisfies and verifies `XBN-01` and `XBN-02`.
diff --git a/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-CONTEXT.md b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-CONTEXT.md
new file mode 100644
index 00000000..250c595e
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-CONTEXT.md
@@ -0,0 +1,108 @@
+# Phase 244: Benchmark Attribution and Publication Truth - Context
+
+**Gathered:** 2026-06-25
+**Status:** Ready for planning
+**Mode:** Auto-generated (autonomous processor-support phase)
+
+
+## Phase Boundary
+
+Publish the maintained benchmark and documentation evidence for the Ryzen
+AVX2/FMA work from Phases 239-243. This phase does not add new kernels or widen
+the runtime contract. It closes the truthfulness loop by making the maintained
+benchmark snapshot and generation publication baselines match the source-backed
+x86_64 optimized path evidence, while keeping unsupported feature families as
+explicit no-claims.
+
+
+
+
+## Implementation Decisions
+
+### Benchmark Surface
+- Use the existing `tools/bench` maintained `kernel_x86_64` suite. It is wired
+ through `tools/bench/bench_runner_registry.cpp`,
+ `tools/bench/kernel/x86_64_bench.cpp`, and
+ `scripts/quality_gates.sh`.
+- The benchmark snapshot baseline is `snapshots/bench/benchmarks.txt`.
+- Do not update benchmark snapshots without explicit user approval.
+
+### Generation Publication Surface
+- Maintained generation publication baselines live under `snapshots/parity/`.
+- Phase 243 proved live EMEL/reference generation matches for `1`, `10`, `100`,
+ and `1000` token runs. The `10`, `100`, and `1000` publication baselines are
+ stale and need explicit approval before update.
+- Use paritychecker's existing `--write-generation-baseline <path>` support for
+ baseline writes. Do not rewrite parity baselines without explicit approval.
+
+### Truthfulness Rules
+- Published output must identify this host as AMD Ryzen 9 5950X with x86_64
+ AVX2, FMA, and F16C conversion support only.
+- Published output must not imply AVX-512, AVX-VNNI, AMX, BF16, native FP16, GPU,
+ or llama.cpp/ggml runtime acceleration.
+- Benchmark and parity lanes must remain separated: EMEL-owned code produces the
+ EMEL result, and llama.cpp/ggml remains comparison-only on the reference side.
+
+### Validation
+- Run `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` before any
+ approved update to confirm the current missing baseline set.
+- After explicit approval, run
+ `scripts/bench.sh --snapshot --update --suite=kernel_x86_64`.
+- After explicit approval, refresh stale maintained generation baselines for the
+ live-matching `10`, `100`, and `1000` token runs.
+- Re-run the changed-file scoped quality gate with
+ `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+
+
+
+## Existing Code Insights
+
+### Reusable Assets
+- `tools/bench/kernel/x86_64_bench.cpp` appends EMEL and reference
+ `kernel_x86_64` benchmark cases.
+- `tools/bench/bench_runner_registry.cpp` registers the `kernel_x86_64` suite.
+- `tools/bench/bench_dependency_manifest.cpp` maps `kernel_x86_64` to
+ `tools/bench/kernel/x86_64_bench.cpp`, `tools/bench/kernel/bench_common.hpp`,
+ and `src/emel/kernel`.
+- `scripts/bench.sh` supports suite-scoped snapshot updates and merges them into
+ `snapshots/bench/benchmarks.txt`.
+- `tools/paritychecker/parity_runner.cpp` supports
+ `--write-generation-baseline`.
+- `tools/paritychecker/parity_engines.cpp` computes the default maintained
+ generation baseline path under `snapshots/parity/`.
+
+### Integration Points
+- `snapshots/bench/benchmarks.txt`: approved `kernel_x86_64` snapshot entries.
+- `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt`
+- `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt`
+- `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt`
+- `.planning/REQUIREMENTS.md`, `.planning/ROADMAP.md`, `.planning/STATE.md`, and
+ `.planning/PROJECT.md` for final traceability after approved snapshot updates.
+
+
+
+
+## Specific Ideas
+
+- Treat `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` as the
+ publication preflight; it should fail only because the maintained snapshot
+ lacks the new `kernel/x86_64/*` entries.
+- Treat the generation baseline updates as publication baselines, not runtime
+ proof. Phase 243 already proved live EMEL/reference output equality.
+- Final closeout requires a clean scoped quality gate, not just artifact edits.
+
+
+
+
+## Active Next Scope
+
+- Get explicit snapshot approval, run the approved updates, rerun the scoped
+ quality gate, then mark `XBN-01` and `XBN-02` source/gate complete.
+
+
+
+---
+
+*Phase: 244-benchmark-attribution-and-publication-truth*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VALIDATION.md b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VALIDATION.md
new file mode 100644
index 00000000..1f71f839
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VALIDATION.md
@@ -0,0 +1,209 @@
+---
+phase: 244
+slug: benchmark-attribution-and-publication-truth
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 244 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | benchmark snapshot gate, paritychecker, source scans, quality gate |
+| Config file | `scripts/bench.sh`; `scripts/quality_gates.sh` |
+| Quick run command | `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` |
+| Gate command | `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` |
+| Current gate status | passed after approved benchmark and generation publication snapshot updates |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 244-01-01 | XBN-01, XBN-02 | benchmark preflight | `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` | green |
+| 244-01-02 | XBN-01, XBN-02 | approved snapshot writes | `scripts/bench.sh --snapshot --update --suite=kernel_x86_64` plus paritychecker baseline writes | green |
+| 244-01-03 | XBN-01 | source-backed audit gap repair | `tools/bench/kernel/x86_64_bench.cpp`; benchmark smoke and snapshot compare | green |
+| 244-01-04 | XBN-01, XBN-02 | scoped quality gate | `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+node .codex/get-shit-done/bin/gsd-tools.cjs init phase-op 244
+```
+
+Result: PASS. Phase 244 context and plan are present.
+
+```bash
+node .codex/get-shit-done/bin/gsd-tools.cjs roadmap analyze
+```
+
+Result: PASS. Phase 244 is planned with one plan; phases 239-243 are
+disk-complete.
+
+```bash
+git diff --check
+```
+
+Result: PASS.
+
+```bash
+scripts/bench.sh --snapshot --compare --suite=kernel_x86_64
+```
+
+Initial result before approval: baseline update required. The command configured
+and built the suite-scoped benchmark runner, then reported these missing
+maintained baselines:
+
+- `kernel/x86_64/op_sqrt`
+- `kernel/x86_64/op_div`
+- `kernel/x86_64/op_sin`
+- `kernel/x86_64/op_unary_neg`
+- `kernel/x86_64/op_unary_relu`
+- `kernel/x86_64/op_mul`
+- `kernel/x86_64/op_mul_mat`
+- `kernel/x86_64/op_sub`
+- `kernel/x86_64/op_add`
+- `kernel/x86_64/op_soft_max`
+- `kernel/x86_64/op_dup`
+- `kernel/x86_64/op_cos`
+- `kernel/x86_64/op_sqr`
+- `kernel/x86_64/op_unary_exp`
+- `kernel/x86_64/op_log`
+
+No snapshot update was made.
+
+Final result after approved snapshot update and optimized benchmark repair:
+PASS.
+
+```bash
+EMEL_BENCH_SUITE=kernel_x86_64 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=emel
+```
+
+Initial result: PASS. Output was captured in
+`/tmp/emel-phase244-kernel-x86-current.sl4mKm.txt`; no snapshot file was
+modified.
+
+| Benchmark entry | Candidate ns/op |
+|-----------------|-----------------|
+| `kernel/x86_64/op_add` | 71.500 |
+| `kernel/x86_64/op_cos` | 1450.490 |
+| `kernel/x86_64/op_div` | 114.500 |
+| `kernel/x86_64/op_dup` | 77.900 |
+| `kernel/x86_64/op_log` | 3081.090 |
+| `kernel/x86_64/op_mul` | 71.700 |
+| `kernel/x86_64/op_mul_mat` | 2584.890 |
+| `kernel/x86_64/op_sin` | 1664.700 |
+| `kernel/x86_64/op_soft_max` | 4816.490 |
+| `kernel/x86_64/op_sqr` | 78.300 |
+| `kernel/x86_64/op_sqrt` | 151.600 |
+| `kernel/x86_64/op_sub` | 84.600 |
+| `kernel/x86_64/op_unary_exp` | 3662.790 |
+| `kernel/x86_64/op_unary_neg` | 80.400 |
+| `kernel/x86_64/op_unary_relu` | 98.800 |
+
+Source-backed audit then found that the first approved publication only covered
+common f32/unary/matmul entries and did not prove the x86_64 optimized flash and
+q2/q3/q6 benchmark lanes. `tools/bench/kernel/x86_64_bench.cpp` was repaired to
+add four counter-checked optimized entries:
+
+| Benchmark entry | Proof |
+|-----------------|-------|
+| `kernel/x86_64/op_flash_attn_ext_decode_like` | Aborts unless `optimized_flash_dispatch_count()` increments and `shared_flash_dispatch_count()` does not |
+| `kernel/x86_64/op_mul_mat_q2_k_q8_k` | Aborts unless `optimized_q2_dispatch_count()` increments and `shared_q2_dispatch_count()` does not |
+| `kernel/x86_64/op_mul_mat_q3_k_q8_k` | Aborts unless `optimized_q3_dispatch_count()` increments and `shared_q3_dispatch_count()` does not |
+| `kernel/x86_64/op_mul_mat_q6_k_q8_k` | Aborts unless `optimized_q6_dispatch_count()` increments and `shared_q6_dispatch_count()` does not |
+
+```bash
+EMEL_BENCH_SUITE=kernel_x86_64 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=compare
+```
+
+Initial result: PASS. Output was captured in
+`/tmp/emel-phase244-kernel-x86-compare.ZbJiE5.txt`; no snapshot file was
+modified. After repair, the compare output contains 19 EMEL/reference rows,
+including optimized flash and q2/q3/q6 entries, proving the reference/shared
+lane remains separate from the EMEL-owned optimized benchmark lane.
+
+Representative compare rows from that run:
+
+| Benchmark entry | EMEL ns/op | Reference ns/op | Ratio |
+|-----------------|------------|-----------------|-------|
+| `kernel/x86_64/op_add` | 125.700 | 397.700 | 0.316x |
+| `kernel/x86_64/op_mul_mat` | 2633.790 | 7301.070 | 0.361x |
+| `kernel/x86_64/op_soft_max` | 4898.180 | 1132.900 | 4.324x |
+| `kernel/x86_64/op_unary_exp` | 3773.590 | 2059.090 | 1.833x |
+
+```bash
+EMEL_BENCH_SUITE=kernel_x86_64 EMEL_BENCH_ITERS=1 EMEL_BENCH_RUNS=1 EMEL_BENCH_WARMUP_ITERS=0 EMEL_BENCH_WARMUP_RUNS=0 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=emel
+EMEL_BENCH_SUITE=kernel_x86_64 EMEL_BENCH_ITERS=1 EMEL_BENCH_RUNS=1 EMEL_BENCH_WARMUP_ITERS=0 EMEL_BENCH_WARMUP_RUNS=0 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=compare
+```
+
+Result: PASS. The optimized EMEL smoke emitted 19 `kernel/x86_64/*` entries,
+including the four counter-checked optimized entries. The compare smoke emitted
+matching EMEL/reference rows for all 19 entries.
+
+```bash
+scripts/bench.sh --snapshot --update --suite=kernel_x86_64
+```
+
+Result: PASS after explicit user approval. The benchmark snapshot baseline was
+merged into `snapshots/bench/benchmarks.txt` and later refreshed after the
+source-backed audit repair to include all 19 maintained `kernel/x86_64/*`
+entries.
+
+```bash
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=10 --write-generation-baseline snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=100 --write-generation-baseline snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=1000 --write-generation-baseline snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt
+```
+
+Result: PASS after explicit user approval. The maintained generation
+publication baselines were updated in `snapshots/parity/`.
+
+```bash
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=10 --write-generation-baseline /tmp/emel-phase244-baselines.N7inir/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=100 --write-generation-baseline /tmp/emel-phase244-baselines.N7inir/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=1000 --write-generation-baseline /tmp/emel-phase244-baselines.N7inir/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt
+```
+
+Result: PASS. These writes targeted `/tmp`, not checked-in snapshots.
+
+| Max tokens | Generated tokens | Output bytes | Optimized flash | Optimized q6 | Shared q6 |
+|------------|------------------|--------------|-----------------|--------------|-----------|
+| 10 | 10 | 20 | 228 | 390 | 0 |
+| 100 | 100 | 248 | 768 | 1380 | 0 |
+| 1000 | 1000 | 2498 | 6168 | 11280 | 0 |
+
+Candidate-vs-snapshot diff summary:
+
+| Max tokens | Stale snapshot | Candidate |
+|------------|----------------|-----------|
+| 10 | `output_length=22`, `trace_token_count=0` | `output_length=20`, `trace_token_count=10`, token IDs and score gaps populated |
+| 100 | `output_length=277`, `trace_token_count=0` | `output_length=248`, `trace_token_count=100`, token IDs and score gaps populated |
+| 1000 | `output_length=2866`, `trace_token_count=0` | `output_length=2498`, `trace_token_count=1000`, token IDs and score gaps populated |
+
+## Validation Sign-Off
+
+- [x] Phase 244 context and plan exist.
+- [x] Benchmark preflight was run without snapshot updates.
+- [x] Missing `kernel/x86_64/*` benchmark baseline entries are identified.
+- [x] Candidate benchmark entries and EMEL/reference compare rows were captured
+ under `/tmp` without modifying `snapshots/bench/`.
+- [x] Stale maintained generation publication baselines are identified from
+ Phase 243 live parity evidence.
+- [x] Candidate generation publication baselines were written to `/tmp` and
+ diffed against checked-in stale snapshots without modifying `snapshots/`.
+- [x] Benchmark snapshot update is approved and applied.
+- [x] Source-backed audit gap for `XBN-01` is repaired by counter-checked
+ optimized flash and q2/q3/q6 benchmark entries.
+- [x] Maintained generation baseline updates are approved and applied.
+- [x] Scoped quality gate passes after approved publication updates.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+ frontmatter.
+- [x] Rule-compliance evidence is recorded through suite-scoped benchmark
+ commands, EMEL/reference lane separation, publication snapshot diffs, and the
+ scoped quality gate.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VERIFICATION.md b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VERIFICATION.md
new file mode 100644
index 00000000..a9c26ac2
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VERIFICATION.md
@@ -0,0 +1,97 @@
+# Phase 244 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| Maintained `kernel_x86_64` benchmark suite is wired | `tools/bench/bench_runner_registry.cpp`, `tools/bench/kernel/x86_64_bench.cpp`, and `scripts/quality_gates.sh` expose and select the suite | PASS |
+| Optimized x86_64 flash and quantized paths are benchmarked | `tools/bench/kernel/x86_64_bench.cpp` publishes `op_flash_attn_ext_decode_like`, `op_mul_mat_q2_k_q8_k`, `op_mul_mat_q3_k_q8_k`, and `op_mul_mat_q6_k_q8_k` entries that abort if the optimized actor counters do not advance | PASS |
+| Benchmark preflight is suite-scoped and non-mutating | `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` ran and did not update snapshots | PASS |
+| Publication baseline updated | `scripts/bench.sh --snapshot --update --suite=kernel_x86_64` merged 19 `kernel/x86_64/*` entries into `snapshots/bench/benchmarks.txt` | PASS |
+| Generation publication baselines updated | paritychecker wrote the maintained LFM2 `10`, `100`, and `1000` token baselines under `snapshots/parity/` | PASS |
+| Unsupported feature claims remain excluded | Phase 243 unsupported x86 flag scan found no AVX-512, AVX-VNNI, AMX, BF16, or native FP16 compile-flag claims | PASS |
+| Required quality gate | changed-file scoped quality gate passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64` | PASS |
+
+## Evidence Summary
+
+- `node .codex/get-shit-done/bin/gsd-tools.cjs init phase-op 244` reports
+ context and plan present.
+- `node .codex/get-shit-done/bin/gsd-tools.cjs roadmap analyze` reports Phase
+ 244 as planned with one plan.
+- `git diff --check` passes.
+- Initial `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64`
+ configured and built the suite-scoped benchmark runner, then failed only
+ because the maintained benchmark baseline lacked the first 15 common
+ `kernel/x86_64/*` entries:
+ - `kernel/x86_64/op_sqrt`
+ - `kernel/x86_64/op_div`
+ - `kernel/x86_64/op_sin`
+ - `kernel/x86_64/op_unary_neg`
+ - `kernel/x86_64/op_unary_relu`
+ - `kernel/x86_64/op_mul`
+ - `kernel/x86_64/op_mul_mat`
+ - `kernel/x86_64/op_sub`
+ - `kernel/x86_64/op_add`
+ - `kernel/x86_64/op_soft_max`
+ - `kernel/x86_64/op_dup`
+ - `kernel/x86_64/op_cos`
+ - `kernel/x86_64/op_sqr`
+ - `kernel/x86_64/op_unary_exp`
+ - `kernel/x86_64/op_log`
+- The preflight also prints `error: no benchmark entries matched selected suite`
+ because the selected suite has no existing baseline entries to compare yet.
+ That is a publication-baseline absence, not a runtime execution failure.
+- Source-backed milestone audit found an `XBN-01` gap: those first 15 common
+ entries did not prove the x86_64 optimized flash and q2/q3/q6 benchmark
+ lanes. `tools/bench/kernel/x86_64_bench.cpp` was repaired so the maintained
+ `kernel_x86_64` suite now also publishes counter-checked optimized-path
+ entries for:
+ - `kernel/x86_64/op_flash_attn_ext_decode_like`
+ - `kernel/x86_64/op_mul_mat_q2_k_q8_k`
+ - `kernel/x86_64/op_mul_mat_q3_k_q8_k`
+ - `kernel/x86_64/op_mul_mat_q6_k_q8_k`
+- Direct benchmark smoke passed:
+ `EMEL_BENCH_SUITE=kernel_x86_64 EMEL_BENCH_ITERS=1 EMEL_BENCH_RUNS=1 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=emel`
+ emitted 19 `kernel/x86_64/*` entries, including the optimized flash and
+ q2/q3/q6 entries.
+- Direct benchmark compare smoke passed with the same 19 EMEL/reference rows:
+ `EMEL_BENCH_SUITE=kernel_x86_64 EMEL_BENCH_ITERS=1 EMEL_BENCH_RUNS=1 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=compare`.
+- Final `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` passed
+ after the approved snapshot update and includes the optimized flash and
+ q2/q3/q6 entries.
+- Published benchmark entry names are:
+ `op_add`, `op_cos`, `op_div`, `op_dup`, `op_log`, `op_mul`, `op_mul_mat`,
+ `op_mul_mat_q2_k_q8_k`, `op_mul_mat_q3_k_q8_k`,
+ `op_mul_mat_q6_k_q8_k`, `op_flash_attn_ext_decode_like`, `op_sin`,
+ `op_soft_max`, `op_sqr`, `op_sqrt`, `op_sub`, `op_unary_exp`,
+ `op_unary_neg`, and `op_unary_relu`.
+- Temporary candidate writes of the maintained generation baselines succeeded
+ without touching `snapshots/parity/`:
+ - `--max-tokens=10`: status 0, generated 10 tokens, output 20 bytes,
+ optimized flash 228, optimized q6 390, shared q6 0.
+ - `--max-tokens=100`: status 0, generated 100 tokens, output 248 bytes,
+ optimized flash 768, optimized q6 1380, shared q6 0.
+ - `--max-tokens=1000`: status 0, generated 1000 tokens, output 2498 bytes,
+ optimized flash 6168, optimized q6 11280, shared q6 0.
+- Candidate-vs-snapshot diffs show the checked-in stale baselines have
+ `trace_token_count=0` while candidates include the live token IDs and score
+ gaps. Output lengths change from `22 -> 20`, `277 -> 248`, and `2866 -> 2498`
+ for `10`, `100`, and `1000` tokens respectively.
+
+## Final Verification
+
+User approved snapshot updates. After the source-backed audit exposed the
+missing optimized benchmark entries, `tools/bench/kernel/x86_64_bench.cpp` was
+repaired and `scripts/bench.sh --snapshot --update --suite=kernel_x86_64`
+updated `snapshots/bench/benchmarks.txt`. Paritychecker updated the maintained
+LFM2 `10`, `100`, and `1000` token publication baselines, the focused parity
+publication test passed, `scripts/bench.sh --snapshot --compare
+--suite=kernel_x86_64` passed, and the changed-file scoped quality gate passed
+with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 244 is fully verified for `XBN-01` and `XBN-02`.
diff --git a/.planning/quick/260401-ejm-add-non-blocking-benchmark-binary-size-c/SUMMARY.md b/.planning/quick/260401-ejm-add-non-blocking-benchmark-binary-size-c/SUMMARY.md
new file mode 100644
index 00000000..3f1d74e1
--- /dev/null
+++ b/.planning/quick/260401-ejm-add-non-blocking-benchmark-binary-size-c/SUMMARY.md
@@ -0,0 +1,9 @@
+---
+status: complete
+completed: 2026-04-01
+summary: 260401-ejm-SUMMARY.md
+---
+
+# Quick Task 260401-ejm Status
+
+Complete. The detailed summary remains in `260401-ejm-SUMMARY.md`.
diff --git a/.planning/todos/pending/2026-04-02-move-eager-quant-prepack-into-generator-initializer.md b/.planning/todos/backlog/2026-04-02-move-eager-quant-prepack-into-generator-initializer.md
similarity index 100%
rename from .planning/todos/pending/2026-04-02-move-eager-quant-prepack-into-generator-initializer.md
rename to .planning/todos/backlog/2026-04-02-move-eager-quant-prepack-into-generator-initializer.md
diff --git a/.planning/todos/pending/2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md b/.planning/todos/backlog/2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md
similarity index 100%
rename from .planning/todos/pending/2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md
rename to .planning/todos/backlog/2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md
diff --git a/.planning/todos/pending/2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md b/.planning/todos/backlog/2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md
similarity index 100%
rename from .planning/todos/pending/2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md
rename to .planning/todos/backlog/2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md
diff --git a/.planning/todos/pending/2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md b/.planning/todos/backlog/2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md
similarity index 100%
rename from .planning/todos/pending/2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md
rename to .planning/todos/backlog/2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md
diff --git a/AGENTS.md b/AGENTS.md
index 3c903b7b..1d7bc325 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -9,6 +9,11 @@ ALWAYS follow the RTC actor model and no-queue invariant from `docs/rules/sml.ru
NEVER use `sml::process_queue`, `sml::defer_queue`, or any mailbox/post-for-later
mechanism.
ALWAYS keep dispatch run-to-completion and single-writer per actor.
+ALWAYS treat coroutine or `async`-named dispatch APIs semantically: async is not
+deferred by definition, and `process_event_async` MAY be RTC when completion is
+driven and observed before the top-level dispatch returns.
+NEVER let coroutine continuations, incomplete tasks, scheduler work items, or
+callbacks escape the RTC boundary as hidden deferred work.
NEVER call an actor's own `process_event` from guards/actions/entry/exit.
ALWAYS model internal multi-step flows with `sml::completion`,
anonymous transitions, and/or entry actions.
@@ -23,7 +28,10 @@ ALWAYS implement bulk numeric iteration in allocation-free action/detail kernels
within a single transition per phase.
NEVER copy event payload into context just to bridge internal phases.
ALWAYS keep guards pure predicates of `(event, context)` with no side effects.
-ALWAYS keep actions bounded and non-blocking during dispatch.
+ALWAYS keep actions bounded during dispatch.
+ONLY allow an action-level scheduler fork/join wait when it joins already
+submitted child actor dispatches before the action returns, preserves RTC,
+does not re-enter the same actor, and leaves no hidden deferred work.
ALWAYS keep hot-path actions allocation-free.
ALWAYS keep any allowed one-time construction or initialization heap
allocation before any `process_event(...)` dispatch.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b69c9bdf..e54c97bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,6 +10,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
include(CheckCXXCompilerFlag)
+include(CheckCXXSourceRuns)
if(MSVC)
add_compile_options(/W4 /WX)
@@ -22,6 +23,9 @@ option(EMEL_ENABLE_FUZZ "Build fuzz targets" OFF)
option(EMEL_ENABLE_AARCH64_HOST_FEATURES
"Enable host-tuned AArch64 compile flags for EMEL-owned C++ code"
ON)
+option(EMEL_ENABLE_X86_64_HOST_FEATURES
+ "Enable host-tuned x86_64 AVX2/FMA/F16C compile flags for EMEL-owned C++ code"
+ ON)
include(FetchContent)
include(cmake/sml_version.cmake)
@@ -67,6 +71,50 @@ if(NOT EMEL_AARCH64_HOST_CXX_FLAG STREQUAL "")
)
endif()
+set(EMEL_X86_64_HOST_CXX_FLAGS "")
+if(EMEL_ENABLE_X86_64_HOST_FEATURES AND NOT CMAKE_CROSSCOMPILING AND NOT MSVC)
+ string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" EMEL_SYSTEM_PROCESSOR_LOWER)
+ if(EMEL_SYSTEM_PROCESSOR_LOWER MATCHES "^(x86_64|amd64)$")
+ check_cxx_compiler_flag("-mavx2" EMEL_HAS_X86_64_AVX2_FLAG)
+ check_cxx_compiler_flag("-mfma" EMEL_HAS_X86_64_FMA_FLAG)
+ check_cxx_compiler_flag("-mf16c" EMEL_HAS_X86_64_F16C_FLAG)
+ # Compiler acceptance alone is not enough: these flags let the compiler
+ # emit AVX2 in every consumer TU, bypassing the runtime CPUID guards in
+ # kernel/x86_64. Probe the build host CPU (compiled without the flags) so
+ # host-tuned codegen is only enabled where it can actually execute.
+ check_cxx_source_runs("
+ #include <cpuid.h>
+ #include <cstdint>
+ int main() {
+ unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+ if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { return 1; }
+ const bool fma = (ecx & (1u << 12)) != 0u;
+ const bool f16c = (ecx & (1u << 29)) != 0u;
+ const bool osxsave = (ecx & (1u << 27)) != 0u;
+ if (!fma || !f16c || !osxsave) { return 1; }
+ std::uint32_t xcr0_lo = 0, xcr0_hi = 0;
+ __asm__ volatile(\"xgetbv\" : \"=a\"(xcr0_lo), \"=d\"(xcr0_hi) : \"c\"(0));
+ if ((xcr0_lo & 0x6u) != 0x6u) { return 1; }
+ if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) { return 1; }
+ return ((ebx & (1u << 5)) != 0u) ? 0 : 1;
+ }" EMEL_X86_64_HOST_RUNS_AVX2_FMA_F16C)
+ if(EMEL_HAS_X86_64_AVX2_FLAG AND EMEL_HAS_X86_64_FMA_FLAG AND
+ EMEL_HAS_X86_64_F16C_FLAG AND EMEL_X86_64_HOST_RUNS_AVX2_FMA_F16C)
+ list(APPEND EMEL_X86_64_HOST_CXX_FLAGS "-mavx2" "-mfma" "-mf16c")
+ endif()
+ endif()
+endif()
+
+if(EMEL_X86_64_HOST_CXX_FLAGS)
+ message(STATUS "EMEL enabling x86_64 host compile flags: ${EMEL_X86_64_HOST_CXX_FLAGS}")
+ foreach(EMEL_X86_64_HOST_CXX_FLAG IN LISTS EMEL_X86_64_HOST_CXX_FLAGS)
+ target_compile_options(emel_core
+ INTERFACE
+ "$<$<COMPILE_LANGUAGE:CXX>:${EMEL_X86_64_HOST_CXX_FLAG}>"
+ )
+ endforeach()
+endif()
+
add_library(emel STATIC
src/emel/io/mmap/actions.cpp
src/emel/model/architecture/detail.cpp
@@ -115,6 +163,8 @@ if(EMEL_ENABLE_TESTS)
tests/text/generator/lifecycle_tests.cpp
tests/text/generator/action_guard_tests.cpp
tests/text/generator/detail_tests.cpp
+ tests/text/generator/parallel_matmul_tests.cpp
+ tests/text/generator/decode_wavefront/lifecycle_tests.cpp
tests/text/generator/initializer/lifecycle_tests.cpp
tests/text/generator/prefill/lifecycle_tests.cpp
tests/diarization/request/lifecycle_tests.cpp
diff --git a/coroutine-plan.md b/coroutine-plan.md
new file mode 100644
index 00000000..61fbd87a
--- /dev/null
+++ b/coroutine-plan.md
@@ -0,0 +1,667 @@
+# Coroutine Plan
+
+status: decode wavefront reserved-compute path measured
+owner: emel
+last updated: 2026-06-26
+
+## Decision
+
+Use `co_sm` first at the graph execution boundary.
+
+- Infrastructure surface: `src/emel/sm.hpp`
+- First inference actor: `src/emel/graph/processor/sm.hpp`
+- First current consumer: `src/emel/graph/sm.hpp` through `action::request_execute`
+- Experimental payoff path: standalone `src/emel/text/generator/decode_wavefront/**`
+ using graph-owned reserved compute
+
+This is not a plan to make one SML dispatch faster. The useful speedup path is to make graph
+execution schedulable at a bounded phase boundary so a later decode wavefront can keep compatible
+sequences ready for the same kernel route and weight stream. Single-request latency must remain
+neutral before the wavefront path is allowed to ship.
+
+Review update: production `text/generator` decode remains on direct graph dispatch. The current
+decode wavefront actor is a standalone component and benchmark target until a maintained
+multi-lane integration proves a benefit without regressing batch-1 latency.
+
+`async` is not deferred by definition. An async/coroutine dispatch is compatible with the EMEL
+RTC actor model when completion is driven and observed before the enclosing top-level dispatch
+returns. The forbidden case is hidden work escaping the RTC boundary: an incomplete task,
+continuation, scheduler work item, callback handle, mailbox entry, or post-for-later queue.
+
+## Source-Backed Current State
+
+- `stateforward::sml` is a namespace alias to the underlying SML implementation in the configured
+ Stateforward dependency, so EMEL code must use `stateforward::sml::utility::co_sm` rather than
+ naming `boost::sml` directly.
+- `src/emel/sm.hpp` now includes `stateforward/sml/utility/co_sm.hpp`, exposes `emel::co_sm`,
+ `emel::bool_task`, scheduler policy aliases, and a fixed no-heap coroutine allocator.
+- `emel::co_sm` defaults to `emel::policy::coroutine_scheduler`
+ and `emel::policy::coroutine_allocator` (template arguments elided).
+- `src/emel/graph/processor/sm.hpp` now inherits from `emel::co_sm` and keeps its public `process_event(const event::execute &)` wrapper
+ synchronous by driving `process_event_async(...).result()` to completion before returning.
+- `src/emel/graph/actions.hpp` dispatches graph execution through
+ `ctx.processor_actor.process_event_async(request).result()` within the same graph RTC chain.
+- `src/emel/graph/sm.hpp` now exposes `event::compute_reserved`, an internal reserved-compute
+ path that reuses the graph reservation output and enters processor execution without running
+ the graph assembler path again.
+- `src/emel/text/generator/decode_wavefront/sm.hpp` uses bounded static-scheduler `co_sm` and
+ dispatches compatible lanes through graph-owned `compute_reserved` events.
+- `src/emel/text/generator/actions.hpp` no longer routes single-lane production decode through
+ the wavefront actor; it dispatches the already-selected graph compute request directly.
+- `tests/sm/sm_policy_tests.cpp` covers the wrapper surface: default inline scheduler, sync
+ dispatch, inline `process_event_async(...).result()`, error normalization, context injection,
+ scheduler access, and fixed allocator exhaustion.
+- `docs/rules/sml.rules.md` and `AGENTS.md` now explicitly allow RTC async/coroutine dispatch
+ while forbidding hidden deferred work.
+- `docs/third_party/sml.md` now documents the inline `emel::co_sm` default.
+
+## Invariants
+
+- No `sml::process_queue`, `sml::defer_queue`, user mailbox, background worker, or hidden
+ post-for-later mechanism.
+- No dynamic allocation during dispatch. Coroutine frames must use fixed storage or reject the
+ dispatch; no heap fallback is acceptable in hot paths.
+- No runtime behavior selection in actions, detail helpers, coroutine bodies, or awaitables.
+ Runtime decisions stay in guards and transition rows.
+- No coroutine use inside kernels, logits scans, sampler loops, tokenizer inner loops, renderer
+ loops, tensor-element loops, packing, quant/dequant, or matmul/attention numeric loops.
+- No public API exposure of coroutine tasks or scheduler internals.
+- No performance claim from `co_sm` adoption alone. Claims require benchmark evidence.
+- Snapshot updates require explicit user consent.
+
+## Completed Work
+
+### Completed 0: Coroutine Surface
+
+Evidence:
+
+- `src/emel/sm.hpp` exposes the EMEL wrapper around Stateforward's utility `co_sm`.
+- `emel::policy::fixed_coroutine_allocator` returns `nullptr` on pool exhaustion instead of
+ falling back to heap allocation.
+- Existing `emel::sm` users are untouched.
+- `emel::co_sm` mirrors the existing contextless/contextful wrapper shape and normalizes
+ `error_out` results like `emel::sm`.
+- `process_event_async` in the EMEL wrapper observes completion before returning and always
+ returns a normalized immediate `emel::bool_task`; incomplete scheduler work is not allowed to
+ escape the RTC boundary.
+
+Validation already run:
+
+```bash
+cmake --build build/zig --target emel_tests_bin -j2
+ctest --test-dir build/zig -R '^emel_tests_sm$' --output-on-failure
+```
+
+Result: passed.
+
+### Completed 1: No-Op Graph Processor Conversion
+
+Evidence:
+
+- `src/emel/graph/processor/sm.hpp` uses `emel::co_sm` with an inline scheduler.
+- The public graph processor `event::execute` entrypoint still creates the same
+ `event::execute_ctx` and `event::execute_step`, drives the inline async base dispatch to
+ completion, and returns only after the RTC chain finishes.
+- `src/emel/graph/actions.hpp` uses the processor async execute wrapper and observes completion
+ before the graph compute action returns.
+- No transition rows were moved into coroutine bodies.
+
+Validation already run:
+
+```bash
+ctest --test-dir build/zig -R '^emel_tests_kernel_and_graph$' --output-on-failure
+```
+
+Result: passed.
+
+## Quality Gate Status
+
+Current status: focused unit tests and the corrected decode-wavefront benchmark pass; the repo
+quality gate does not yet pass for this expanded wavefront change set. The strict LFM2 generation
+comparison now correctly fails on x86_64 because the runtime uses the shared Q4 fallback instead of
+an optimized Q4 kernel.
+
+A current changed-file scoped gate was run with
+`EMEL_QUALITY_GATES_BENCH_SUITE=decode_wavefront` and the temporary `clang-format` shim in
+`/tmp/emel-clang-format-venv/bin`. It selected the decode wavefront benchmark runner, passed the
+legacy SML scan, Zig build, dependency manifest freshness checks, parity skip, and fuzz skip, then
+failed on the snapshot blockers below.
+
+Gate blockers:
+
+- `bench_snapshot`: failed because the new focused benchmark rows
+ `decode_wavefront/batch1`, `decode_wavefront/batch4`, and `decode_wavefront/batch8` have no
+ approved baseline in `snapshots/bench/benchmarks_compare.txt`.
+- `lint_snapshot`: failed because new source/test files under
+ `src/emel/text/generator/decode_wavefront/**` and
+ `tests/text/generator/decode_wavefront/lifecycle_tests.cpp` are not yet listed in
+ `snapshots/lint/clang_format.txt`.
+- Snapshot updates require explicit user approval, so `snapshots/bench/benchmarks_compare.txt`,
+ `snapshots/lint/clang_format.txt`, and `snapshots/quality_gates/timing.txt` were not updated.
+
+Code-owned review fixes completed after the first failed gate:
+
+- `tools/bench/bench_runner.cpp` now treats x86_64 and AArch64 as optimized flash hosts, matching
+ existing generator tests and runtime counters.
+- LFM2 generation quantized evidence validation again requires optimized Q4 and optimized Q6
+ evidence on all hosts, with no shared fallback and no unrelated Q2/Q3/Q8 routes.
+- `src/emel/kernel/x86_64/**` now reports shared Q4 fallback dispatches, so a missing optimized Q4
+ path is visible in generation evidence instead of looking like no Q4 path ran.
+
+Validation for those fixes:
+
+```bash
+EMEL_BENCH_ITERS=100 \
+EMEL_BENCH_RUNS=3 \
+EMEL_BENCH_WARMUP_ITERS=10 \
+EMEL_BENCH_WARMUP_RUNS=1 \
+scripts/bench.sh --compare --suite=generation
+```
+
+Result: failed as expected on this x86_64 host because optimized Q4 is still missing. The run
+reported `optimized_q4_dispatch_calls=0`, `shared_q4_dispatch_calls=2378`,
+`optimized_q6_dispatch_calls=291`, and `shared_q6_dispatch_calls=0`.
+
+Current decode wavefront benchmark evidence after review fixes:
+
+```text
+decode_wavefront/batch1 emel.cpp 417.890 ns/op, reserved-scalar-baseline 340.800 ns/op, ratio=1.226x
+decode_wavefront/batch4 emel.cpp 1533.400 ns/op, reserved-scalar-baseline 1355.200 ns/op, ratio=1.131x
+decode_wavefront/batch8 emel.cpp 2930.290 ns/op, reserved-scalar-baseline 2713.990 ns/op, ratio=1.080x
+```
+
+Interpretation: the previous apparent speedup was from comparing reserved compute against full
+graph assemble+compute. Against direct per-lane reserved compute, the current wavefront component
+is slower in this fixture and is not ready for production generator integration.
+
+```text
+src/emel/text/generator/decode_wavefront/actions.hpp
+src/emel/text/generator/decode_wavefront/context.hpp
+src/emel/text/generator/decode_wavefront/errors.hpp
+src/emel/text/generator/decode_wavefront/events.hpp
+src/emel/text/generator/decode_wavefront/guards.hpp
+src/emel/text/generator/decode_wavefront/sm.hpp
+tests/text/generator/decode_wavefront/lifecycle_tests.cpp
+```
+
+Gate command:
+
+```bash
+PATH="/tmp/emel-clang-format-venv/bin:$PATH" \
+EMEL_QUALITY_GATES_CHANGED_FILES="AGENTS.md \
+docs/rules/sml.rules.md \
+docs/third_party/sml.md \
+src/emel/sm.hpp \
+src/emel/graph/actions.hpp \
+src/emel/graph/context.hpp \
+src/emel/graph/events.hpp \
+src/emel/graph/guards.hpp \
+src/emel/graph/processor/sm.hpp \
+src/emel/graph/sm.hpp \
+src/emel/text/generator/actions.hpp \
+src/emel/text/generator/context.hpp \
+src/emel/text/generator/decode_wavefront/actions.hpp \
+src/emel/text/generator/decode_wavefront/context.hpp \
+src/emel/text/generator/decode_wavefront/errors.hpp \
+src/emel/text/generator/decode_wavefront/events.hpp \
+src/emel/text/generator/decode_wavefront/guards.hpp \
+src/emel/text/generator/decode_wavefront/sm.hpp \
+tests/sm/sm_policy_tests.cpp \
+tests/graph/graph_tests.cpp \
+tests/graph/processor/processor_action_branch_tests.cpp \
+tests/text/generator/decode_wavefront/lifecycle_tests.cpp \
+tools/bench/CMakeLists.txt \
+tools/bench/bench_cases.hpp \
+tools/bench/bench_dependency_manifest.cpp \
+tools/bench/bench_disabled_cases.cpp \
+tools/bench/bench_runner.cpp \
+tools/bench/bench_runner_registry.cpp \
+tools/bench/dependency_manifest.txt \
+tools/bench/graph/processor_bench.cpp \
+tools/bench/text/generator/decode_wavefront_bench.cpp \
+coroutine-plan.md" \
+EMEL_QUALITY_GATES_BENCH_SUITE=decode_wavefront \
+ scripts/quality_gates.sh
+```
+
+Result: failed only for the snapshot-baseline blockers above.
+
+### Completed 2: Graph Processor Neutrality Benchmark
+
+Goal: prove the no-op `co_sm` graph processor is neutral before adding any async call surface or
+decode wavefront behavior.
+
+Evidence:
+
+- `tools/bench/graph/processor_bench.cpp` adds a focused `graph_processor` suite.
+- The EMEL lane uses the current `emel::graph::processor::sm` inline `co_sm` wrapper.
+- The reference lane uses a benchmark-local `emel::sm`
+ wrapper over the same transition table.
+- Cases cover invalid request rejection, reused-buffer success, allocation-required success,
+ lifecycle gate/publish/release, and done/error callback publication.
+- `tools/bench/CMakeLists.txt`, `tools/bench/bench_cases.hpp`,
+ `tools/bench/bench_disabled_cases.cpp`, and `tools/bench/bench_runner_registry.cpp` register
+ the suite without requiring llama.cpp.
+- `tools/bench/bench_runner.cpp` prints graph-processor compare rows as `reference-baseline`
+ instead of `llama.cpp`.
+
+Validation run:
+
+```bash
+EMEL_BENCH_ITERS=200000 \
+EMEL_BENCH_RUNS=9 \
+EMEL_BENCH_WARMUP_ITERS=10000 \
+EMEL_BENCH_WARMUP_RUNS=1 \
+scripts/bench.sh --compare --suite=graph_processor
+```
+
+Result:
+
+| case | inline `co_sm` | `emel::sm` baseline | ratio |
+| --- | ---: | ---: | ---: |
+| `graph/processor_alloc` | 350.723 ns/op | 357.034 ns/op | 0.982x |
+| `graph/processor_invalid` | 24.942 ns/op | 24.958 ns/op | 0.999x |
+| `graph/processor_reused` | 305.952 ns/op | 315.271 ns/op | 0.970x |
+
+Interpretation:
+
+- The no-op graph processor `co_sm` conversion passes the neutrality checkpoint.
+- The benchmark has low-single-digit run-to-run movement, including earlier samples where
+ successful dispatch was slightly slower and later samples where it was slightly faster.
+- Treat the result as neutral infrastructure, not an inference speedup claim.
+- Do not expand this no-op conversion into generator/decode paths as a performance feature.
+- Use the benchmark as the regression guard for future coroutine candidates.
+
+Generation comparison still required before any inference-throughput claim:
+
+```bash
+EMEL_BENCH_ITERS=1 \
+EMEL_BENCH_RUNS=1 \
+EMEL_BENCH_WARMUP_ITERS=0 \
+EMEL_BENCH_WARMUP_RUNS=0 \
+EMEL_GENERATION_WORKLOAD_ID=qwen3_single_user_hello_max_tokens_1_v1 \
+scripts/bench.sh --compare --suite=generation
+```
+
+## Phase 3: RTC `process_event_async` Surface
+
+Goal: make graph execution callable through `process_event_async` without changing generator or
+graph RTC semantics.
+
+Status: completed and measured. This is not an inference speedup by itself.
+
+Evidence:
+
+- `processor::sm::process_event(const event::execute &)` delegates to
+ `process_event_async(ev).result()`.
+- `processor::sm::process_event_async(const event::execute &)` preserves the dispatch-local
+ `execute_ctx` / `execute_step` handoff and returns an immediate `emel::bool_task`.
+- `graph::action::request_execute` calls
+ `ctx.processor_actor.process_event_async(request).result()`.
+- `graph_processor_process_event_async_execute_completes_in_rtc` proves the public async execute
+ wrapper publishes callbacks and output before `.result()` is observed.
+- The existing `graph_machine_compute_lifecycle_dispatch_is_alloc_free` test now covers graph
+ compute through the async processor execution path.
+
+Acceptance:
+
+- `graph::sm` and `text::generator::sm` remain synchronous RTC actors.
+- No generated token can observe partially completed graph work.
+- No hidden scheduler work survives return from the top-level dispatch.
+- Allocator counters show zero graph compute hot-path heap allocation.
+
+Validation run:
+
+```bash
+cmake --build build/zig --target emel_tests_bin -j2
+ctest --test-dir build/zig -R '^emel_tests_sm$' --output-on-failure
+ctest --test-dir build/zig -R '^emel_tests_kernel_and_graph$' --output-on-failure
+EMEL_BENCH_ITERS=200000 \
+EMEL_BENCH_RUNS=9 \
+EMEL_BENCH_WARMUP_ITERS=10000 \
+EMEL_BENCH_WARMUP_RUNS=1 \
+scripts/bench.sh --compare --suite=graph_processor
+```
+
+Result from the latest repeated sample:
+
+| case | async `co_sm` graph processor | `emel::sm` baseline | ratio |
+| --- | ---: | ---: | ---: |
+| `graph/processor_alloc` | 349.161 ns/op | 344.381 ns/op | 1.014x |
+| `graph/processor_invalid` | 32.150 ns/op | 26.940 ns/op | 1.193x |
+| `graph/processor_reused` | 316.821 ns/op | 310.523 ns/op | 1.020x |
+
+Interpretation:
+
+- The coroutine-driven graph processor is implemented and remains RTC.
+- Successful dispatch has a low-single-digit overhead against the direct `emel::sm` baseline.
+- Invalid rejection is materially slower because the async wrapper overhead dominates a tiny
+ failure path.
+- This phase proves semantics and establishes a benchmark guard; it does not speed inference.
+
+## Completed 4: Decode Wavefront Driver
+
+Goal: turn `co_sm` into an inference speedup by batching compatible decode work across sequences.
+
+Status: completed for the bounded graph-compute wavefront target.
+
+Evidence:
+
+- `src/emel/text/generator/decode_wavefront/**` defines a bounded generator-owned wavefront
+ actor with explicit lane stages for up to 8 lanes.
+- Guards require compatible model identity, backend identity, kernel kind, attention mode, kernel
+ route, output contract, dtype/layout contract, quantized contract, step size, and token count.
+- The wavefront path does not share mutable lane context. Each lane carries its own graph actor,
+ graph compute request, compatibility key, and acceptance flag.
+- The graph wrapper owns the optimization boundary through `event::compute_reserved`; wavefront
+ does not reach into graph actions or assembler internals.
+- Reserved compute requires a successful graph reservation first, seeds the compute output from the
+ reservation, and skips normal graph assemble hints.
+- `process_event_async` on the wavefront observes the bounded static-scheduler dispatch before
+ returning and exposes a normalized immediate `emel::bool_task`, preserving RTC semantics.
+- No background worker, mailbox, `defer_queue`, `process_queue`, or hidden post-return work was
+ introduced.
+
+Acceptance:
+
+- Deterministic dispatch and first-lane failure behavior are covered by decode wavefront tests.
+- Scheduler depth is bounded by `fifo_scheduler<16u, 64u>` and lane count is bounded by
+ `event::k_max_lanes == 8`.
+- Batch-1/4/8 performance is not accepted for production integration yet. The corrected
+ reserved-scalar baseline shows current wavefront overhead rather than a grouping win.
+
+Validation run:
+
+```bash
+cmake --build build/zig --target emel_tests_bin -j2
+./build/zig/emel_tests_bin --no-breaks --test-case="*graph*"
+./build/zig/emel_tests_bin --no-breaks --test-case="*x86_64*q4*"
+./build/zig/emel_tests_bin --no-breaks --test-case="co_sm*"
+./build/zig/emel_tests_bin --no-breaks --test-case="decode wavefront*"
+ctest --test-dir build/zig -R '^emel_tests_kernel_and_graph$' --output-on-failure
+ctest --test-dir build/zig -R '^emel_tests_generator_and_runtime$' --output-on-failure
+ctest --test-dir build/zig -R '^emel_tests_sm$' --output-on-failure
+EMEL_BENCH_ITERS=2000 \
+EMEL_BENCH_RUNS=5 \
+EMEL_BENCH_WARMUP_ITERS=100 \
+EMEL_BENCH_WARMUP_RUNS=1 \
+scripts/bench.sh --suite=decode_wavefront --compare
+```
+
+Result:
+
+| case | wavefront path | reserved-scalar baseline | ratio |
+| --- | ---: | ---: | ---: |
+| `decode_wavefront/batch1` | 417.890 ns/op | 340.800 ns/op | 1.226x |
+| `decode_wavefront/batch4` | 1533.400 ns/op | 1355.200 ns/op | 1.131x |
+| `decode_wavefront/batch8` | 2930.290 ns/op | 2713.990 ns/op | 1.080x |
+
+Interpretation:
+
+- The first useful `co_sm` target remains the graph execution boundary, but the current
+ decode-wavefront component is not a production win.
+- The earlier speedup claim was an apples-to-oranges comparison against full graph compute. The
+ corrected benchmark measures wavefront orchestration against direct reserved graph dispatch.
+- The current implementation still drains inside the top-level RTC call; that is intentional for
+ actor correctness. Future external completion backends must suspend only at explicit phase
+ boundaries without changing public graph/generator semantics.
+
+## Completed 5: Thread-Pool Scheduler Win Over Single-SM and llama.cpp
+
+Goal: get `co_sm` + `thread_pool_scheduler` to beat the single-`sm` baseline and
+llama.cpp on a realistic parallel decode workload (independent per-lane GEMVs).
+
+Status: achieved at small per-lane dims (the inter-op-parallelism regime), with a
+prerequisite scheduler deadlock fixed along the way.
+
+### Scheduler correctness fix (prerequisite)
+
+`emel::policy::thread_pool_scheduler` deadlocked under rapid repeated fork/join,
+most readily when lane count == worker count (the decode wavefront's 8-lane,
+8-worker config). A single dispatch rarely hit it, so existing tests passed while
+the bug was latent. There were two independent races in the join latch:
+
+1. `join_group`'s close/complete handshake was a Dekker pattern with
+ release/acquire ordering. StoreLoad reordering let `wait()` miss the final
+ completion while the last completer missed the close, stranding the wakeup.
+2. The per-group `std::binary_semaphore` could be destroyed during `release()`:
+   groups are stack-reused each round, so the waiter could return and destroy the
+   group while the last completer was still inside libc++ `release()`/notify
+   (use-after-free). `run_or_schedule_and_wait`'s local `done` semaphore had the same flaw.
+
+Fix: replaced the semaphore/closed handshake with a lifetime-safe spin-join on
+`pending_` (a completer's last touch is its decrement, so `wait()` returns only
+after every completer is done); `run_or_schedule_and_wait` spins on a local
+`atomic` the worker sets last. Added `emel::policy::cpu_relax()`. New
+regression test `thread_pool_scheduler_ref_fork_join_survives_rapid_repeated_rounds`
+(20000 rounds x 8 lanes). Validated: 80M+ batch-8 fork/joins with 0 stalls.
+
+### Warm worker loop
+
+Workers now spin-claim a wake permit (bounded `k_idle_spin_budget`) before
+falling back to a blocking acquire, keeping the pool warm across a burst of
+fork/joins and removing ~3-4us of resleep/wakeup latency per round (the same
+warm-polling strategy ggml's threadpool uses). The permit-per-task invariant is
+preserved, so there is no drift or lost-wakeup regression.
+
+### Evidence (Ryzen 9 5950X, 16C/32T; `-O3 -march=native`)
+
+In-repo, via the actual production decode wavefront actor (thread-pool co_sm)
+through `scripts/bench.sh --compare --suite=decode_wavefront`. New realistic
+cases run a decode-representative f32 GEMV (`y = W@x`, dim x dim, independent
+weights per lane); the `gemv_*` cases compare against the single graph sm
+(reserved scalar), the `ggml_*` cases against a ggml reference (independent
+mul_mat lanes, warm threadpool, same core budget). Default `dim=256`:
+
+| case (batch 8) | emel.cpp | baseline | result |
+| --- | ---: | ---: | --- |
+| `gemv_batch8` (vs single-sm) | ~8.4-10.5 us | ~33 us | 3.1-3.9x faster |
+| `ggml_batch8` (vs llama.cpp) | ~9.1-9.3 us | ~12.7 us | 1.37-1.41x faster (3 runs) |
+
+The single-sm win holds at every dim (3-6x). The llama.cpp win holds at small
+dims (dim<=256, ~1.4x) where inter-op parallelism dominates ggml's intra-op
+threading; the crossover is ~dim 384-512, where ggml's hand-optimized blocked
+kernel catches up. That crossover is a kernel-quality matter, not a scheduling
+one: the scheduler delivers near-linear parallelism (up to 7.85x on 8 lanes for
+bare GEMV lanes), and its overhead floor is ~5-10us (idle-pool worker wakeup),
+dwarfed by real decode work.
+
+Contrast: the original trivial-work cases (`batch8`, kernel = `*calls += 1`)
+still show the wavefront 2-4x slower, confirming the earlier "wavefront is
+slower" result was a fixture artifact, not a scheduler limitation.
+
+Caveat: EMEL's inter-op parallelism beats ggml only for independent-weight lanes
+(concurrent requests / multi-model / MoE). For shared-weight batched decode, ggml
+batches sequences into one GEMM and reuses the weight stream (bandwidth-optimal);
+the comparison here is framed as independent-weight concurrent lanes accordingly.
+
+## Completed 6: Parallel Matmul Cutover (view-sliced lanes, ith/nth removed)
+
+Goal: turn the proven thread-pool fork/join into a maintained single-request hot-path win for
+prefill GEMM and per-token decode GEMV, and remove ggml's `ith`/`nth` thread-partition fields so
+view slicing is the only parallelism model in the kernel event contract.
+
+Decision record:
+
+- `ith`/`nth` were declared on every kernel op event but never used for partitioning; the only
+ read site was a validator that rejected anything but `ith==0 && nth==1`. The architecture-native
+ slice descriptor is the `tensor_view` itself: a row slice is a smaller view, the kernel computes
+ whatever its views describe, and partition policy lives only at the orchestration fork site.
+ The fields and their validator clause are deleted; events remain complete work descriptions
+ with no thread identity.
+
+Implementation:
+
+- `src/emel/text/generator/detail.hpp` owns the lanes: `k_matmul_lanes == 8` per-lane
+ `emel::kernel::sm` actors plus an `std::optional` `thread_pool_scheduler<8, 16, 128>` engaged
+ once in `prepare()`. A parallel dispatch forks one logical mul_mat into pack-group-aligned
+ contiguous row-slice events (`compute_matmul_row_slices`, groups of 8 for `*_x8`, 4 for
+ `*_x4`, 1 otherwise), the caller computes slice 0 while pool workers compute the rest, and the
+ join completes before the action returns. Rejected submits run the same slice inline, so a
+ pool-worker caller (wavefront lane) degrades to serial automatically — lanes-first composition.
+- Lane mode is a compile-time template parameter threaded through the matmul helpers and
+ runners; route guards choose it: prefill parallel rows require the pool engaged and
+ `prompt_token_count >= k_parallel_min_prefill_tokens` (8), decode parallel rows require
+ `n_embd >= k_parallel_min_gemv_dim` (1024) so tiny models keep the scalar route. Parallel
+ contract rows sit above their serial siblings for flash materialized + preselected prefill
+ chunk routes and flash materialized + preselected decode scalar routes.
+- Evidence counters live per kernel actor, so audit reads sum the primary kernel and every lane
+ (`compute_kernel_counter_total`); `kernel_dispatch_calls` and route counters stay once per
+ logical matmul at the fork site.
+
+Evidence (Ryzen 9 5950X, zig c++ -O3 -mavx2 -mfma):
+
+```text
+parallel_matmul/gemv_f32 emel.cpp 285062 ns/op, reference-baseline 2419432 ns/op, ratio=0.118x
+parallel_matmul/gemv_q8_0 emel.cpp 54841 ns/op, reference-baseline 371154 ns/op, ratio=0.148x
+parallel_matmul/gemv_q4_k emel.cpp 68648 ns/op, reference-baseline 470182 ns/op, ratio=0.146x
+parallel_matmul/gemv_q6_k emel.cpp 43786 ns/op, reference-baseline 282983 ns/op, ratio=0.155x
+parallel_matmul/gemm8_f32 emel.cpp 156122 ns/op, reference-baseline 1265074 ns/op, ratio=0.123x
+```
+
+Interpretation (ratio = emel.cpp time / baseline time): 6.5x-8.5x over serial single-kernel
+dispatch at dim 2048 across dtypes — the inter-op fork/join regime delivering near-linear scaling
+on 8 lanes, now on the maintained matmul dispatch path rather than a standalone component.
+
+Correctness:
+
+- Row slices write disjoint dst rows and reorder no reductions, so parallel output is
+ bit-identical to serial; `tests/text/generator/parallel_matmul_tests.cpp` proves slice
+ arithmetic, group alignment, and f32/q8_0 serial-vs-parallel byte equality, and the full
+ generator fixture suites pass with prompts >= 8 tokens taking the parallel prefill route.
+- The strict LFM2 x86_64 generation evidence failure is unchanged and pre-existing
+ (`optimized_q4_dispatch_calls=0 shared_q4_dispatch_calls=2378 optimized_q6_dispatch_calls=291`
+ — identical counts to the pre-cutover run, which also proves lane-counter aggregation
+ preserves evidence exactly).
+
+## Completed 7: Matched-Thread llama.cpp Comparison Lanes
+
+Goal: measure the view-sliced parallel matmul against llama.cpp/ggml at the same core budget
+instead of only against EMEL's own serial dispatch, per the reference-comparison rules.
+
+Kernel-class lane (`parallel_matmul/ggml_*` cases): the EMEL side runs the identical 8-lane
+fork/join as the plain-named cases; the reference side runs the same logical matmul as one ggml
+`mul_mat` node on a warm 8-thread ggml threadpool (the decode_wavefront reference pattern).
+Operand class is plain GGUF-native blocks on BOTH sides — this exercises EMEL's shared
+(non-repacked) kernels, not the repacked x4/x8 kernels the production decode routes dispatch
+after `prepare()`.
+
+Evidence (Ryzen 9 5950X, dim 2048, 8 threads both sides, iter=2000 runs=5):
+
+```text
+parallel_matmul/ggml_gemm8_f32 emel.cpp 170008 ns/op, llama.cpp 201689 ns/op, ratio=0.843x
+parallel_matmul/ggml_gemv_f32 emel.cpp 284070 ns/op, llama.cpp 29684 ns/op, ratio=9.570x
+parallel_matmul/ggml_gemv_q8_0 emel.cpp 55579 ns/op, llama.cpp 20757 ns/op, ratio=2.678x
+parallel_matmul/ggml_gemv_q4_k emel.cpp 68174 ns/op, llama.cpp 16335 ns/op, ratio=4.173x
+parallel_matmul/ggml_gemv_q6_k emel.cpp 44451 ns/op, llama.cpp 17805 ns/op, ratio=2.496x
+```
+
+Interpretation:
+
+- Prefill-shape GEMM: EMEL's inter-actor row slicing beats ggml's intra-op chunking (0.843x).
+- Decode-shape GEMV: the gap is per-kernel arithmetic on plain blocks, not orchestration — the
+  EMEL parallel numbers track serial/8 scaling (6.5x-8.5x), and the serial kernels trail ggml's
+  vec_dot by roughly the same factors (plain-q4_k ~4x, plain-q8_0 ~2.7x, f32 GEMV ~10x,
+  consistent with the known missing optimized plain-Q4 kernel and a near-scalar f32 GEMV path).
+- These rows are a kernel-class comparison and must not be quoted as production decode numbers:
+ production decode dispatches the repacked x4/x8 kernels. The production-class cross-engine
+ number is the end-to-end generation compare at matched reference threads (in progress; blocked
+ for LFM2-architecture fixtures on x86 by the same strict Q4 evidence audit until either the
+ optimized plain-Q4 kernel lands or the audit gains a quant-class-aware LFM2+Q8_0 branch).
+
+Follow-up work surfaced by this comparison: optimized plain-Q4 GEMV kernel (already the known
+blocker), SIMD f32 GEMV, and a repacked-operand cross-engine lane once llama.cpp's x86 repack
+path is wired into the reference fixture.
+
+End-to-end lane (LFM2.5-230M-Q8_0, prompt "hello", `EMEL_BENCH_REFERENCE_THREADS=8` so the
+reference runs the same 8-core budget as EMEL's lane pool; 1-thread reference rows kept for
+context):
+
+```text
+max_tokens_1 emel.cpp 443.0 ms/op, llama.cpp 27.1 ms/op (8t), ratio=16.337x [1t: 130.7 ms, 3.407x]
+max_tokens_100 emel.cpp 1002.6 ms/op, llama.cpp 514.0 ms/op (8t), ratio=1.950x
+```
+
+Decomposition (t100 minus t1, over 99 decode tokens): EMEL ~5.65 ms/token vs llama.cpp
+~4.92 ms/token — steady-state decode is within ~1.15x at matched threads, consistent with both
+sides being memory-bandwidth-bound on q8_0 GEMV at this model size. The end-to-end gap is
+concentrated in the first-token path: EMEL spends ~443 ms before the first sampled token where
+llama.cpp spends ~27 ms at 8 threads. The "hello" prompt is only a handful of tokens, so this is
+not GEMM volume; profiling the EMEL first-token/session path (prompt format, tokenize, prefill
+graph build/plan, first dispatch) is the top follow-up from this lane.
+
+## Phase 5: External Completion Backends
+
+Goal: use coroutine suspension only where an actual external completion source exists.
+
+Valid future boundaries:
+
+- accelerator submission/completion
+- OS-backed cold-load or staged-read completion
+- platform DMA/NPU completion
+
+Invalid boundaries:
+
+- CPU AArch64 or x86_64 kernel inner loops
+- software prefetch
+- per-token sampler loops
+- synthetic sleep/poll loops
+
+Acceptance:
+
+- Async backend routes are explicit guarded transitions.
+- Synchronous CPU routes remain direct and are not slower.
+- The EMEL lane remains EMEL-owned; no llama.cpp/ggml object is shared into runtime execution.
+
+## Validation Checklist
+
+Before claiming the graph processor coroutine phase is complete:
+
+- `emel_tests_sm` passes.
+- `emel_tests_kernel_and_graph` passes.
+- Changed-file scoped quality gate passes.
+- A graph processor benchmark exists and compares `emel::sm` vs async inline `emel::co_sm`.
+- No snapshots were updated without explicit approval.
+- Code review confirms no hidden runtime behavior selection was moved into coroutine bodies,
+ awaitables, actions, or detail helpers.
+
+Before making any broader maintained-generation throughput claim:
+
+- Generation benchmark evidence exists for single-token and multi-token maintained decode
+ workloads, not only the focused decode wavefront microbenchmark.
+- Snapshot/parity checks cover maintained generation fixtures.
+- A wavefront or external-completion implementation exists at the maintained generation entrypoint,
+ not only as a callable component.
+
+## Risk Register
+
+- `co_sm` adds overhead but no speedup.
+ Detection: graph processor benchmark.
+ Response: keep graph processor on `emel::sm` and use `co_sm` only where it preserves RTC or
+ where a later completion is modeled as an explicit external event.
+- Scheduler becomes a hidden queue.
+ Detection: SML rule review, callback-order tests, and escaping-task tests.
+ Response: restrict inference paths to inline/immediate-drain scheduling until a bounded driver
+ is explicitly approved.
+- Coroutine frame allocates on hot path.
+ Detection: allocator accounting and allocation instrumentation.
+ Response: increase fixed pool or reject dispatch; never fall back to heap in hot paths.
+- Awaitable chooses behavior.
+ Detection: action/guard branch tests and code review.
+ Response: move choice into guards and transition rows.
+- Decode batching changes outputs.
+ Detection: generation parity tests and compare summaries.
+ Response: fix lane ordering and publication; do not accept drift as a performance tradeoff.
+- Kernel route mismatch weakens parity claims.
+ Detection: diagnostics and runtime contract counters.
+ Response: benchmark only equivalent operand paths.
+
+## Decision Checkpoints
+
+- After graph processor measurement: continue only if the no-op `co_sm` conversion is neutral.
+- After RTC `process_event_async` wiring: continue only if RTC semantics and allocation
+ guarantees are proven.
+- After decode wavefront implementation: focused correctness passed, corrected performance did not.
+ Next checkpoint is either a real multi-lane scheduler integration that amortizes the overhead or
+ a different coroutine boundary with external completion work.
diff --git a/docs/rules/sml.rules.md b/docs/rules/sml.rules.md
index fa402701..9f1c9cd7 100644
--- a/docs/rules/sml.rules.md
+++ b/docs/rules/sml.rules.md
@@ -7,7 +7,10 @@ remaining synchronous run-to-completion (RTC) and using no message queue.
these rules apply to:
- stateforward.SML state machines (`stateforward::sml::sm<...>`) and their composition (composite state machines, orthogonal regions).
-- synchronous dispatch only (no background workers, no mailboxes, no async buffering).
+- synchronous RTC semantics only (no background workers, no mailboxes, no deferred buffering).
+- coroutine or `async`-named APIs when the caller observes quiescence before
+  the top-level dispatch returns. an `async` name does not by itself imply
+  deferral; hidden retention of work for later is what violates RTC/no-queue semantics.
the rules assume the project-pinned stateforward.SML semantics as implemented in the
local header and utility dispatch table, including typed completion propagation
@@ -25,6 +28,10 @@ primary sources consulted (non-exhaustive)
exposed events are immutable. internal-only events that are not publicly
exposed MAY carry mutable fields when needed for synchronous RTC handoff.
- RTC chain: the complete, synchronous computation triggered by one top-level dispatch call, including SML internal anonymous transitions.
+- RTC async/coroutine dispatch: a coroutine-backed dispatch whose completion is
+ driven and observed within the same RTC chain. such a call may expose a task
+ object internally, but no incomplete task or continuation may escape the RTC
+ boundary unless the later completion is modeled as an explicit external event.
- quiescence: a stable configuration where no further internal (anonymous) transitions are enabled.
- orchestrator: the external driver that calls `process_event` on actors and provides time and ordering.
- no message queue: no SML `process_queue`, no SML `defer_queue`, no user mailbox, and no “post for later” mechanism.
@@ -36,6 +43,11 @@ primary sources consulted (non-exhaustive)
4. single-writer invariant: during any RTC chain, exactly one thread MUST be executing inside any given actor’s `process_event`.
5. allocation invariant: no dynamic allocation (heap) MUST occur during dispatch (guards/actions/entry/exit/anonymous progress).
6. bounded-work invariant: each top-level dispatch MUST have a provable upper bound on executed transitions and on total work.
+7. coroutine invariant: `process_event_async` or other coroutine-backed dispatch
+ surfaces MAY be used only when their completion is immediate or driven to
+ quiescence before the caller returns from the enclosing RTC dispatch. a
+ scheduler may sequence continuations inside that chain, but MUST NOT retain
+ work as hidden deferred state.
## 4. event model
1. event types SHOULD be small, trivially copyable, and contain only immutable payload.
@@ -86,7 +98,12 @@ primary sources consulted (non-exhaustive)
3. for an external transition with entry/exit enabled, the order MUST be: guard, on-exit, state update, action, on-entry. this follows `transition<...>::execute` which calls `on_exit`, updates current state, executes action, then calls `on_entry`. (source: `stateforward/sml.hpp`, `transition, state, event, G, A>::execute`.)
### real-time and determinism constraints
-4. guards and actions MUST be bounded time and MUST NOT block (no I/O waits, no mutex waits, no sleeps).
+4. guards and actions MUST be bounded time and MUST NOT block on external
+ resources (no I/O waits, no mutex waits, no sleeps). an action MAY perform a
+ bounded RTC scheduler fork/join wait only after submitting child actor
+ dispatches whose completion is joined before the action returns; the join
+ MUST preserve single-writer per actor, MUST NOT re-enter the same actor, and
+ MUST NOT leave hidden deferred work.
5. guards and actions MUST NOT allocate. if an action MUST allocate for rare paths (e.g., error reporting), it MUST do so outside dispatch and only pass references into dispatch.
6. guards MUST NOT read wall-clock time. time MUST be provided explicitly via events (section 10).
7. actions MUST NOT contain orchestration branching or validation logic. any runtime control-flow
@@ -102,7 +119,7 @@ primary sources consulted (non-exhaustive)
transitions or explicit choices/states in the transition graph. only compile-time conditionals
(e.g., `if constexpr`, `#if`) are allowed inside actions, member methods, or functions called
from actions/member methods.
-9. actions SHOULD be short. long-running work MUST be split:
+9. actions SHOULD be short. long-running external work MUST be split:
- action initiates work and transitions to a “waiting” state.
- A later external event represents completion (still no queues).
10. actions SHOULD be `noexcept` in production builds. if exceptions are enabled, the system MUST define a hard policy for exception events and document action-throws semantics (overview page notes different semantics for guard-throws vs action-throws).
diff --git a/docs/third_party/sml.md b/docs/third_party/sml.md
index d8ad7e04..814bb43c 100644
--- a/docs/third_party/sml.md
+++ b/docs/third_party/sml.md
@@ -1718,15 +1718,16 @@ sml::sm, sml::thread_safe>
### emel extension: coroutine scheduler policy
`emel::co_sm` supports a scheduler policy in addition to SML policies.
-default is:
+default is the inline scheduler to preserve EMEL's synchronous RTC/no-queue actor
+contract:
```cpp
-emel::policy::coroutine_scheduler>
+emel::policy::coroutine_scheduler
```
```cpp
using inline_policy = emel::policy::coroutine_scheduler;
-emel::co_sm co;
+emel::co_sm co;
```
custom scheduler requirement:
diff --git a/scripts/quality_gates.sh b/scripts/quality_gates.sh
index f60d2e9b..ac3ebef3 100755
--- a/scripts/quality_gates.sh
+++ b/scripts/quality_gates.sh
@@ -361,7 +361,7 @@ bench_suite_supported_for_host() {
kernel_aarch64)
[[ "$host_arch" == "aarch64" || "$host_arch" == "arm64" ]]
;;
- sm_any)
+ sm_any|sm_scheduler)
[[ -n "${EMEL_BENCH_INTERNAL:-}" && "${EMEL_BENCH_INTERNAL:-}" != "0" ]]
;;
*)
@@ -996,6 +996,17 @@ run_benchmark_gates() {
bench_warmup_runs="${EMEL_QUALITY_GATES_DIARIZATION_BENCH_WARMUP_RUNS:-1}"
bench_tolerance="${EMEL_QUALITY_GATES_DIARIZATION_BENCH_TOLERANCE:-0.30}"
;;
+ parallel_matmul)
+ # Thread-pool fork/join cases need amortized measurement windows or
+ # post-build CPU contention dominates the short default runs.
+ bench_iters="${EMEL_QUALITY_GATES_PARALLEL_MATMUL_BENCH_ITERS:-2000}"
+ bench_runs="${EMEL_QUALITY_GATES_PARALLEL_MATMUL_BENCH_RUNS:-5}"
+ bench_warmup_iters="${EMEL_QUALITY_GATES_PARALLEL_MATMUL_BENCH_WARMUP_ITERS:-200}"
+ bench_warmup_runs="${EMEL_QUALITY_GATES_PARALLEL_MATMUL_BENCH_WARMUP_RUNS:-1}"
+ ;;
+ sm_any|sm_scheduler)
+ bench_extra_env+=(EMEL_BENCH_INTERNAL=1)
+ ;;
whisper_compare)
if run_step_allow_fail "bench_snapshot_${suite}" \
"$ROOT_DIR/scripts/bench_whisper_compare.sh"; then
diff --git a/scripts/test_with_coverage.sh b/scripts/test_with_coverage.sh
index 3b3e85b0..eb1a8216 100755
--- a/scripts/test_with_coverage.sh
+++ b/scripts/test_with_coverage.sh
@@ -6,6 +6,7 @@ BRANCH_COVERAGE_MIN="${BRANCH_COVERAGE_MIN:-50}"
COVERAGE_BUILD_DIR="${EMEL_COVERAGE_BUILD_DIR:-build/coverage}"
COVERAGE_CLEAN="${EMEL_COVERAGE_CLEAN:-0}"
COVERAGE_CHANGED_ONLY="${EMEL_COVERAGE_CHANGED_ONLY:-0}"
+COVERAGE_CHANGED_LINE_ONLY="${EMEL_COVERAGE_CHANGED_LINE_ONLY:-1}"
COVERAGE_BASE_REF="${EMEL_COVERAGE_BASE_REF:-origin/main}"
COVERAGE_CHANGED_FILES="${EMEL_COVERAGE_CHANGED_FILES:-}"
COVERAGE_TEST_REGEX="${EMEL_COVERAGE_TEST_REGEX:-}"
@@ -27,7 +28,12 @@ if ! command -v llvm-cov >/dev/null 2>&1 || ! command -v llvm-profdata >/dev/nul
done
fi
-for tool in cmake ctest gcovr clang-format llvm-cov llvm-profdata gcc g++; do
+required_tools=(cmake ctest gcovr clang-format llvm-cov llvm-profdata gcc g++)
+if [[ "$COVERAGE_CHANGED_ONLY" == "1" && "$COVERAGE_CHANGED_LINE_ONLY" != "0" ]]; then
+ required_tools+=(python3)
+fi
+
+for tool in "${required_tools[@]}"; do
if ! command -v "$tool" >/dev/null 2>&1; then
echo "error: required tool missing: $tool" >&2
exit 1
@@ -49,6 +55,7 @@ changed_shards=()
selected_test_dirs=()
selected_test_sources=()
unknown_changed_src=0
+coverage_base_ref_resolved="$COVERAGE_BASE_REF"
is_coverage_excluded_src_file() {
local file="$1"
@@ -167,6 +174,7 @@ if [[ "$COVERAGE_CHANGED_ONLY" == "1" ]]; then
echo "warning: unable to resolve coverage base ref, using HEAD" >&2
fi
fi
+ coverage_base_ref_resolved="$base_ref"
if [[ -n "$COVERAGE_CHANGED_FILES" ]]; then
while IFS= read -r file; do
@@ -277,8 +285,8 @@ cmake -S . -B "$COVERAGE_BUILD_DIR" -G Ninja \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_C_COMPILER=gcc \
-DCMAKE_CXX_COMPILER=g++ \
- -DCMAKE_C_FLAGS="--coverage -O0" \
- -DCMAKE_CXX_FLAGS="--coverage -O0" \
+ -DCMAKE_C_FLAGS="--coverage -O0 -fprofile-update=atomic" \
+ -DCMAKE_CXX_FLAGS="--coverage -O0 -fprofile-update=atomic" \
-DCMAKE_EXE_LINKER_FLAGS="--coverage" \
-DEMEL_TEST_EXTRA_ARG="$COVERAGE_TEST_EXTRA_ARG" \
-DEMEL_TEST_SHARDS="$COVERAGE_TEST_SHARDS"
@@ -319,6 +327,187 @@ if [[ "$COVERAGE_CHANGED_ONLY" == "1" &&
fi
fi
+collect_changed_lines() {
+  # Emit "<path>\t<line>" rows into $1 for every added/changed line of the files given after it.
+  local output_file="$1"
+  shift
+  : > "$output_file"
+  if [[ "$COVERAGE_CHANGED_ONLY" != "1" || "$COVERAGE_CHANGED_LINE_ONLY" == "0" ]]; then
+    return 0
+  fi
+
+  python3 - "$coverage_base_ref_resolved" "$output_file" "$@" <<'PY'
+import pathlib
+import re
+import subprocess
+import sys
+
+base_ref = sys.argv[1]
+output_path = pathlib.Path(sys.argv[2])
+files = sys.argv[3:]
+hunk_re = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")  # group(1): first new-file line
+changed = {}
+
+
+def run_git(args):
+    """Return stdout of `git <args>`, or "" when git cannot be launched."""
+    try:
+        # check=False + discarded stderr: a failing git command just yields whatever stdout it wrote
+        return subprocess.run(
+            ["git", *args], check=False, text=True,
+            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
+        ).stdout
+    except OSError:
+        return ""
+
+
+def add_line(path, line):
+    """Mark a 1-based line of `path` as changed; non-positive numbers are ignored."""
+    if line > 0:
+        changed.setdefault(path, set()).add(line)
+
+
+def parse_diff(text):
+    """Record the new-file line number of every added line in unified diff text."""
+    current_file = None
+    new_line = None  # next new-file line number; None while outside any hunk
+    for raw in text.splitlines():
+        if raw.startswith("diff "):  # next file section: back to header parsing
+            current_file, new_line = None, None
+            continue
+        match = hunk_re.match(raw)
+        if match:
+            new_line = int(match.group(1))
+            continue
+        if new_line is None:  # header area: "+++ b/<path>" names the file, other "+++ " clears it
+            if raw.startswith("+++ "):
+                current_file = raw[6:] if raw.startswith("+++ b/") else None
+            continue
+        if current_file is None:
+            continue
+        if raw.startswith("+"):  # in a hunk every "+" line is an addition, even "+++i;"
+            add_line(current_file, new_line)
+            new_line += 1
+        elif raw.startswith(" "):
+            new_line += 1
+
+
+for path in files:
+    if base_ref != "HEAD":  # committed changes since the merge base with base_ref
+        parse_diff(run_git(["diff", "--unified=0", f"{base_ref}...HEAD", "--", path]))
+    parse_diff(run_git(["diff", "--unified=0", "--", path]))  # unstaged changes
+    parse_diff(run_git(["diff", "--cached", "--unified=0", "--", path]))  # staged changes
+
+    if run_git(["ls-files", "--others", "--exclude-standard", "--", path]).strip():
+        try:  # untracked file: every line counts as changed
+            line_count = len(pathlib.Path(path).read_text(errors="ignore").splitlines())
+        except OSError:
+            line_count = 0
+        for line in range(1, line_count + 1):
+            add_line(path, line)
+
+with output_path.open("w", encoding="utf-8") as output:
+    for path in sorted(changed):
+        for line in sorted(changed[path]):
+            output.write(f"{path}\t{line}\n")
+PY
+}
+
+enforce_changed_line_coverage() {
+  # Return non-zero when the changed lines listed in $1 are covered below LINE_COVERAGE_MIN /
+  # BRANCH_COVERAGE_MIN percent according to the gcovr JSON report in $2.
+  local changed_lines_file="$1"
+  local coverage_json="$2"
+
+  python3 - "$changed_lines_file" "$coverage_json" "$LINE_COVERAGE_MIN" \
+    "$BRANCH_COVERAGE_MIN" <<'PY'
+import json
+import pathlib
+import sys
+
+changed_lines_path = pathlib.Path(sys.argv[1])
+coverage_json_path = pathlib.Path(sys.argv[2])
+line_min = float(sys.argv[3])
+branch_min = float(sys.argv[4])
+
+# changed[path] = set of changed line numbers, read from "<path>\t<line>" rows
+changed = {}
+for raw in changed_lines_path.read_text(encoding="utf-8").splitlines():
+    if not raw:
+        continue
+    path, line_text = raw.split("\t", 1)
+    changed.setdefault(path, set()).add(int(line_text))
+
+with coverage_json_path.open(encoding="utf-8") as coverage_file:
+    report = json.load(coverage_file)
+
+# line_records[path][line_number] = gcovr JSON line record for that source line
+line_records = {}
+for file_record in report.get("files", []):
+    path = file_record.get("file", "")
+    records = {}
+    for line_record in file_record.get("lines", []):
+        records[int(line_record["line_number"])] = line_record
+    line_records[path] = records
+
+line_total = 0
+line_covered = 0
+branch_total = 0
+branch_covered = 0
+missing_lines = []
+
+for path in sorted(changed):
+    records = line_records.get(path, {})
+    for line_number in sorted(changed[path]):
+        record = records.get(line_number)
+        if record is None or record.get("gcovr/excluded") or record.get("gcovr/noncode"):
+            continue  # no coverage record, or gcovr flagged the line as excluded / non-code
+        line_total += 1
+        if int(record.get("count", 0)) > 0:
+            line_covered += 1
+        else:
+            missing_lines.append(f"{path}:{line_number}")
+        for branch in record.get("branches", []):
+            branch_total += 1
+            if int(branch.get("count", 0)) > 0:
+                branch_covered += 1
+
+if line_total == 0:
+    print("changed-line coverage: no executable changed lines found")
+    sys.exit(0)
+
+line_percent = (line_covered * 100.0) / line_total
+if branch_total == 0:
+    branch_percent = 100.0  # no branches on the changed lines: treat as fully covered
+else:
+    branch_percent = (branch_covered * 100.0) / branch_total
+
+print(
+    "changed-line coverage: "
+    f"lines {line_covered}/{line_total} ({line_percent:.1f}%), "
+    f"branches {branch_covered}/{branch_total} ({branch_percent:.1f}%)"
+)
+
+if missing_lines:
+    preview = ", ".join(missing_lines[:20])  # cap the listing at 20 locations
+    if len(missing_lines) > 20:
+        preview += f", ... +{len(missing_lines) - 20} more"
+    print(f"changed-line coverage missing: {preview}", file=sys.stderr)
+
+failed = False
+if line_percent + 1e-9 < line_min:  # small epsilon so a value equal to the minimum passes
+    print(f"error: changed-line coverage {line_percent:.1f}% below required {line_min:.1f}%",
+          file=sys.stderr)
+    failed = True
+if branch_percent + 1e-9 < branch_min:
+    print(f"error: changed-branch coverage {branch_percent:.1f}% below required {branch_min:.1f}%",
+          file=sys.stderr)
+    failed = True
+
+sys.exit(1 if failed else 0)
+PY
+}
+
cpu_count=2
if command -v nproc >/dev/null 2>&1; then
cpu_count="$(nproc)"
@@ -406,18 +595,43 @@ fi
echo "enforcing coverage thresholds: line >= ${LINE_COVERAGE_MIN}%, branch >= ${BRANCH_COVERAGE_MIN}%"
-gcovr \
- --root . \
- -j "$COVERAGE_GCOV_JOBS" \
- "${coverage_filters[@]}" \
- --exclude tests \
- --exclude 'src/emel/.*/sm.hpp' \
- --gcov-ignore-errors no_working_dir_found \
- --gcov-ignore-parse-errors suspicious_hits.warn_once_per_file \
- --exclude-throw-branches \
- --exclude-unreachable-branches \
- --txt-summary \
- --print-summary \
- --fail-under-line "$LINE_COVERAGE_MIN" \
- --fail-under-branch "$BRANCH_COVERAGE_MIN" \
- "${coverage_search_paths[@]}"
+if [[ "$COVERAGE_CHANGED_ONLY" == "1" && "$COVERAGE_CHANGED_LINE_ONLY" != "0" ]]; then  # changed-line mode: gcovr writes a JSON report; thresholds are checked on it by enforce_changed_line_coverage
+ changed_lines_file="$COVERAGE_BUILD_DIR/changed-lines.tsv"
+ coverage_json="$COVERAGE_BUILD_DIR/coverage.json"
+ collect_changed_lines "$changed_lines_file" "${changed_files[@]}"
+ gcovr \
+ --root . \
+ -j "$COVERAGE_GCOV_JOBS" \
+ "${coverage_filters[@]}" \
+ --exclude tests \
+ --exclude 'src/emel/.*/sm.hpp' \
+ --gcov-ignore-errors no_working_dir_found \
+ --gcov-ignore-parse-errors suspicious_hits.warn_once_per_file \
+ --gcov-ignore-parse-errors negative_hits.warn_once_per_file \
+ --merge-mode-functions separate \
+ --exclude-throw-branches \
+ --exclude-unreachable-branches \
+ --txt-summary \
+ --print-summary \
+ --json "$coverage_json" \
+ "${coverage_search_paths[@]}"
+ enforce_changed_line_coverage "$changed_lines_file" "$coverage_json"
+else  # otherwise gcovr enforces the thresholds itself via --fail-under-line / --fail-under-branch
+ gcovr \
+ --root . \
+ -j "$COVERAGE_GCOV_JOBS" \
+ "${coverage_filters[@]}" \
+ --exclude tests \
+ --exclude 'src/emel/.*/sm.hpp' \
+ --gcov-ignore-errors no_working_dir_found \
+ --gcov-ignore-parse-errors suspicious_hits.warn_once_per_file \
+ --gcov-ignore-parse-errors negative_hits.warn_once_per_file \
+ --merge-mode-functions separate \
+ --exclude-throw-branches \
+ --exclude-unreachable-branches \
+ --txt-summary \
+ --print-summary \
+ --fail-under-line "$LINE_COVERAGE_MIN" \
+ --fail-under-branch "$BRANCH_COVERAGE_MIN" \
+ "${coverage_search_paths[@]}"
+fi
diff --git a/snapshots/bench/benchmarks.txt b/snapshots/bench/benchmarks.txt
index 602d5759..b94ceaa0 100644
--- a/snapshots/bench/benchmarks.txt
+++ b/snapshots/bench/benchmarks.txt
@@ -1,5 +1,5 @@
# ref=c5a3bc39b1b0fe56954c6adb99e89b25d5e7b9cb
-# toolchain=/opt/homebrew/bin/zig
+# toolchain=/shared/zig/zig
# benchmark_config: iterations=100 runs=3 sample_policy=median warmup_iterations=10 warmup_runs=1 generation_iterations=1 generation_runs=3 generation_warmup_iterations=0 generation_warmup_runs=0
batch/planner_equal ns_per_op=1467.910 iter=100 runs=3
batch/planner_seq ns_per_op=1736.250 iter=100 runs=3
@@ -84,3 +84,32 @@ tokenizer/preprocessor_ugm_long ns_per_op=4188.750 iter=100 runs=3
tokenizer/preprocessor_ugm_short ns_per_op=2497.080 iter=100 runs=3
tokenizer/preprocessor_wpm_long ns_per_op=4037.500 iter=100 runs=3
tokenizer/preprocessor_wpm_short ns_per_op=2505.840 iter=100 runs=3
+kernel/x86_64/op_add ns_per_op=122.000 iter=100 runs=3
+kernel/x86_64/op_cos ns_per_op=1926.490 iter=100 runs=3
+kernel/x86_64/op_div ns_per_op=118.500 iter=100 runs=3
+kernel/x86_64/op_dup ns_per_op=79.100 iter=100 runs=3
+kernel/x86_64/op_log ns_per_op=3055.990 iter=100 runs=3
+kernel/x86_64/op_mul ns_per_op=123.000 iter=100 runs=3
+kernel/x86_64/op_mul_mat ns_per_op=2485.590 iter=100 runs=3
+kernel/x86_64/op_sin ns_per_op=1619.190 iter=100 runs=3
+kernel/x86_64/op_soft_max ns_per_op=4899.380 iter=100 runs=3
+kernel/x86_64/op_sqr ns_per_op=82.100 iter=100 runs=3
+kernel/x86_64/op_sqrt ns_per_op=151.400 iter=100 runs=3
+kernel/x86_64/op_sub ns_per_op=100.500 iter=100 runs=3
+kernel/x86_64/op_unary_exp ns_per_op=3729.890 iter=100 runs=3
+kernel/x86_64/op_unary_neg ns_per_op=81.300 iter=100 runs=3
+kernel/x86_64/op_unary_relu ns_per_op=113.300 iter=100 runs=3
+kernel/x86_64/op_flash_attn_ext_decode_like ns_per_op=172.200 iter=100 runs=3
+kernel/x86_64/op_mul_mat_q2_k_q8_k ns_per_op=1351.800 iter=100 runs=3
+kernel/x86_64/op_mul_mat_q3_k_q8_k ns_per_op=1419.290 iter=100 runs=3
+kernel/x86_64/op_mul_mat_q6_k_q8_k ns_per_op=1345.190 iter=100 runs=3
+parallel_matmul/gemm8_f32 ns_per_op=168915.442 iter=2000 runs=5
+parallel_matmul/gemv_f32 ns_per_op=298427.410 iter=2000 runs=5
+parallel_matmul/gemv_q4_k ns_per_op=69830.877 iter=2000 runs=5
+parallel_matmul/gemv_q6_k ns_per_op=49344.404 iter=2000 runs=5
+parallel_matmul/gemv_q8_0 ns_per_op=55436.694 iter=2000 runs=5
+parallel_matmul/ggml_gemm8_f32 ns_per_op=169618.054 iter=2000 runs=5
+parallel_matmul/ggml_gemv_f32 ns_per_op=288998.998 iter=2000 runs=5
+parallel_matmul/ggml_gemv_q4_k ns_per_op=68885.360 iter=2000 runs=5
+parallel_matmul/ggml_gemv_q6_k ns_per_op=51010.082 iter=2000 runs=5
+parallel_matmul/ggml_gemv_q8_0 ns_per_op=55113.300 iter=2000 runs=5
diff --git a/snapshots/bench/benchmarks_compare.txt b/snapshots/bench/benchmarks_compare.txt
index 78a79922..4e6c06f8 100644
--- a/snapshots/bench/benchmarks_compare.txt
+++ b/snapshots/bench/benchmarks_compare.txt
@@ -59,6 +59,16 @@ logits/validator_sml/vocab_32000 emel.cpp 24274.209 ns/op, llama.cpp 23702.209 n
memory/hybrid_full emel.cpp 444.458 ns/op, llama.cpp 34307.000 ns/op, ratio=0.013x
memory/kv_full emel.cpp 131.833 ns/op, llama.cpp 33199.750 ns/op, ratio=0.004x
memory/recurrent_full emel.cpp 148.625 ns/op, llama.cpp 4460.167 ns/op, ratio=0.033x
+parallel_matmul/gemm8_f32 emel.cpp 161865.350 ns/op, reference-baseline 1249248.735 ns/op, ratio=0.130x
+parallel_matmul/gemv_f32 emel.cpp 291074.740 ns/op, reference-baseline 2404805.169 ns/op, ratio=0.121x
+parallel_matmul/gemv_q4_k emel.cpp 68127.424 ns/op, reference-baseline 469031.650 ns/op, ratio=0.145x
+parallel_matmul/gemv_q6_k emel.cpp 43589.802 ns/op, reference-baseline 284102.823 ns/op, ratio=0.153x
+parallel_matmul/gemv_q8_0 emel.cpp 55314.849 ns/op, reference-baseline 371634.264 ns/op, ratio=0.149x
+parallel_matmul/ggml_gemm8_f32 emel.cpp 170008.552 ns/op, llama.cpp 201689.476 ns/op, ratio=0.843x
+parallel_matmul/ggml_gemv_f32 emel.cpp 284070.253 ns/op, llama.cpp 29684.396 ns/op, ratio=9.570x
+parallel_matmul/ggml_gemv_q4_k emel.cpp 68174.144 ns/op, llama.cpp 16335.825 ns/op, ratio=4.173x
+parallel_matmul/ggml_gemv_q6_k emel.cpp 44451.588 ns/op, llama.cpp 17805.679 ns/op, ratio=2.496x
+parallel_matmul/ggml_gemv_q8_0 emel.cpp 55579.414 ns/op, llama.cpp 20757.682 ns/op, ratio=2.678x
text/encoders/bpe_long emel.cpp 65.417 ns/op, llama.cpp 66.333 ns/op, ratio=0.986x
text/encoders/bpe_short emel.cpp 58.416 ns/op, llama.cpp 57.000 ns/op, ratio=1.025x
text/encoders/fallback_long emel.cpp 2432.500 ns/op, llama.cpp 2504.917 ns/op, ratio=0.971x
diff --git a/snapshots/lint/clang_format.txt b/snapshots/lint/clang_format.txt
index cec88f38..25646feb 100644
--- a/snapshots/lint/clang_format.txt
+++ b/snapshots/lint/clang_format.txt
@@ -448,6 +448,12 @@ src/emel/text/formatter/guards.hpp
src/emel/text/formatter/sm.hpp
src/emel/text/generator/actions.hpp
src/emel/text/generator/context.hpp
+src/emel/text/generator/decode_wavefront/actions.hpp
+src/emel/text/generator/decode_wavefront/context.hpp
+src/emel/text/generator/decode_wavefront/errors.hpp
+src/emel/text/generator/decode_wavefront/events.hpp
+src/emel/text/generator/decode_wavefront/guards.hpp
+src/emel/text/generator/decode_wavefront/sm.hpp
src/emel/text/generator/detail.hpp
src/emel/text/generator/errors.hpp
src/emel/text/generator/events.hpp
@@ -610,6 +616,7 @@ tests/text/encoders/test_support.hpp
tests/text/encoders/ugm_tests.cpp
tests/text/encoders/wpm_tests.cpp
tests/text/generator/action_guard_tests.cpp
+tests/text/generator/decode_wavefront/lifecycle_tests.cpp
tests/text/generator/detail_tests.cpp
tests/text/generator/initializer/lifecycle_tests.cpp
tests/text/generator/legacy_compatibility_tests.cpp
diff --git a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt
index 2cb67859..3c61e559 100644
--- a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt
+++ b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt
@@ -4,8 +4,8 @@ fixture=LFM2.5-1.2B-Thinking-Q4_K_M.gguf
prompt_hex=68656c6c6f
max_tokens=10
tokens_generated=10
-output_length=22
-trace_token_count=0
-token_ids=
-top_score_gaps=
-output_hex=3c7c696d5f656e647c3ec48a3c7c696d5f7374617274
+output_length=20
+trace_token_count=10
+token_ids=537,601,834,36171,601,539,834,36509,601,539
+top_score_gaps=1.63354683,0.24605751,1.70043945,0.368387222,2.36488152,1.36843204,1.18816185,0.351781845,0.625753403,0.952269554
+output_hex=3c7c696d5f73746172747c3e696d5f656e647c3e
diff --git a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt
index 3d362263..7e8d2737 100644
--- a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt
+++ b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt
@@ -4,8 +4,8 @@ fixture=LFM2.5-1.2B-Thinking-Q4_K_M.gguf
prompt_hex=68656c6c6f
max_tokens=100
tokens_generated=100
-output_length=277
-trace_token_count=0
-token_ids=
-top_score_gaps=
-output_hex=3c7c696d5f656e647c3ec48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e
+output_length=248
+trace_token_count=100
+token_ids=537,601,834,36171,601,539,834,36509,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171
+top_score_gaps=1.63354683,0.24605751,1.70043945,0.368387222,2.36488152,1.36843204,1.18816185,0.351781845,0.625753403,0.952269554,0.768082619,2.13955879,2.69776535,1.23970318,2.68403435,1.13820744,2.23869896,2.92936611,4.29071426,3.37551308,2.9721241,3.5968895,4.17213821,3.48550129,3.24224091,3.93832588,4.05487919,3.19284058,3.71522141,4.33257675,3.83996201,3.52476025,4.02193642,4.62951469,3.52591896,3.95395088,4.03614426,4.70890999,3.40218353,3.98209953,4.25673676,5.17098618,3.2824955,4.25911903,4.17467308,5.44502449,3.17495728,4.76317406,4.16145325,5.78856754,2.96655083,4.8255949,4.3292141,6.17603111,2.71598053,5.10662651,4.45320892,6.61788273,2.92082787,5.51133347,4.63360023,7.34527493,3.01860237,5.74898243,4.75855064,7.53805542,3.2661705,5.98014736,4.62242317,7.30112839,3.09925079,6.13972569,4.70379066,7.18179703,3.05636215,6.00573921,4.72933769,7.15433216,2.95344353,6.03799915,4.67574883,6.99351788,2.87396622,5.88299274,4.66538811,6.88072109,2.78374863,5.97683144,4.68003654,7.06533813,2.74716759,6.03071404,4.81233406,6.91018963,2.93097115,6.01890755,4.90971947,7.00140572,3.00832558,6.27520561
+output_hex=3c7c696d5f73746172747c3e696d5f656e647c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f7374617274
diff --git a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt
index 3edf9e63..0c445906 100644
--- a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt
+++ b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt
@@ -4,8 +4,8 @@ fixture=LFM2.5-1.2B-Thinking-Q4_K_M.gguf
prompt_hex=68656c6c6f
max_tokens=1000
tokens_generated=1000
-output_length=2866
-trace_token_count=0
-token_ids=
-top_score_gaps=
-output_hex=3c7c696d5f656e647c3ec48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e61737369
7374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d
5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c
+output_length=2498
+trace_token_count=1000
+token_ids=537,601,834,36171,601,539,834,36509,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,8
34,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834
,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171
+top_score_gaps=1.63354683,0.24605751,1.70043945,0.368387222,2.36488152,1.36843204,1.18816185,0.351781845,0.625753403,0.952269554,0.768082619,2.13955879,2.69776535,1.23970318,2.68403435,1.13820744,2.23869896,2.92936611,4.29071426,3.37551308,2.9721241,3.5968895,4.17213821,3.48550129,3.24224091,3.93832588,4.05487919,3.19284058,3.71522141,4.33257675,3.83996201,3.52476025,4.02193642,4.62951469,3.52591896,3.95395088,4.03614426,4.70890999,3.40218353,3.98209953,4.25673676,5.17098618,3.2824955,4.25911903,4.17467308,5.44502449,3.17495728,4.76317406,4.16145325,5.78856754,2.96655083,4.8255949,4.3292141,6.17603111,2.71598053,5.10662651,4.45320892,6.61788273,2.92082787,5.51133347,4.63360023,7.34527493,3.01860237,5.74898243,4.75855064,7.53805542,3.2661705,5.98014736,4.62242317,7.30112839,3.09925079,6.13972569,4.70379066,7.18179703,3.05636215,6.00573921,4.72933769,7.15433216,2.95344353,6.03799915,4.67574883,6.99351788,2.87396622,5.88299274,4.66538811,6.88072109,2.78374863,5.97683144,4.68003654,7.06533813,2.74716759,6.03071404,4.81233406,6.91018963,2.93097115,6.01890755,4.90971947,7.00140572,3.00832558,6.27520561,5.15449142,7.17754936,2.92182541,6.29348278,5.18918228,7.24613285,3.15817451,6.33448505,4.93919563,7.18188667,3.37161064,6.30781746,4.91264725,7.07508183,3.39411163,6.04449177,4.82236099,7.04416466,3.43402481,6.01409435,4.9267807,6.96591759,3.58568573,6.17149067,4.95745277,6.97708702,3.37389755,6.09702206,4.95066452,6.97212124,3.60499191,6.09376431,4.96656036,7.05609894,3.57179451,6.17537308,5.1514473,7.04147434,3.77630997,6.13268852,5.09565926,6.96484566,3.54657745,6.01200008,4.94233513,6.86031246,3.45708466,6.03563213,4.89934349,6.94096279,3.4433136,5.99887848,4.86978531,6.92713928,3.53595161,6.00555229,4.82303429,6.73809147,3.60980988,6.15188026,4.89002609,6.76029205,3.54994011,6.22898483,5.07240486,6.92019844,3.61861992,6.35840511,5.10358047,7.30640602,3.76493835,6.4697628,5.11379051,7.22468185,3.70443726,6.5120163,5.46486664,7.34616852,3.78965569,6.55810165,5.45630836
,7.16510868,3.69276428,6.49792957,5.34859467,7.05204105,3.94773293,6.51829338,5.21468735,6.96044922,3.90048409,6.4548893,5.10279465,7.03608704,3.78993988,6.43599892,5.17170334,7.01614761,3.87434769,6.62017727,5.17409325,7.15720749,3.97450638,6.61591053,5.25351143,7.26149273,3.98797417,6.64940262,5.34532928,7.32811165,3.98655319,6.69561577,5.16225433,7.27540207,4.00711823,6.64959717,5.02927208,7.42131424,4.10794449,6.3504343,4.87427139,7.13493919,3.94173241,6.30237961,4.84730148,7.09026337,3.92112732,6.32969284,4.62829781,7.07922745,4.10039711,6.18699265,4.70614624,7.1133213,4.20389748,6.15161514,4.83810043,7.41781616,4.01337242,6.23406219,4.75249863,7.52049255,4.09392929,6.25523186,4.89223289,7.58530617,4.00470161,6.19423866,4.96609688,7.46766281,4.12454033,6.1914463,5.01087952,7.41868401,4.23431969,6.29924107,4.94741821,7.5768671,4.05877686,6.26094437,4.80808067,7.63875198,3.96557426,6.43565464,4.96214485,7.78561115,4.05957985,6.38298512,5.1269722,7.91959,3.92103386,6.44330025,5.11893082,8.00494957,4.28379822,6.6187439,5.38516235,8.07876205,4.15754318,6.54335213,5.42339897,8.25018501,4.3666153,6.68294811,5.56444168,8.14664459,4.17339706,6.61001778,5.40484047,8.02812004,4.36603737,6.56625366,5.28453064,8.07682133,4.40487862,6.56387424,5.29524231,8.02450943,4.54405975,6.52717209,5.29816246,7.93931389,4.51352692,6.57671928,5.28840065,7.92730904,4.34979439,6.62380219,5.32158279,8.08699226,4.45905304,6.74461555,5.37109566,8.01555252,4.28594398,6.71599197,5.35267067,8.05058193,4.37354851,6.65527153,5.33983231,8.03376389,4.39024544,6.70316219,5.38566208,8.02507591,4.35650253,6.66013336,5.26818275,7.8992281,4.32999611,6.63684082,5.07611465,7.84050751,4.19766235,6.50216198,5.080616,7.63294029,4.28756142,6.36629295,5.14682388,7.84460831,4.37177849,6.39452362,5.16367912,7.92552376,4.41466904,6.38336849,4.97603226,7.80973053,4.11863708,6.37950706,5.05570221,7.71188354,4.16390991,6.27833557,5.06598091,7.64024639,4.0553093,6.19119263,4.87779999,7.41340923,4.0946579,6.01751041,4.
71178055,7.21930695,4.10017014,5.8859024,4.61994362,7.07185936,4.1017189,6.08514118,4.74275017,7.07374763,4.07217216,6.06008625,4.8759613,7.2887888,4.13499832,6.08717442,4.74877548,7.35038948,4.08800316,6.06817341,4.89426994,7.41510391,4.19381905,6.14213562,4.93709755,7.34150124,4.20363045,6.1820755,4.94212151,7.2675581,4.21402168,6.01086426,4.76105499,7.25420189,4.28453445,6.07355881,4.79317093,7.27148342,4.3359375,6.00758171,4.73082161,7.25371647,4.21537971,6.1421833,4.83570862,7.23797989,4.33719826,6.19045448,4.82212067,7.38318729,4.44676781,6.36742401,5.06836319,7.51957321,4.25700378,6.49521255,5.04674339,7.74581528,4.51745796,6.58267212,5.09534645,7.86675549,4.37240219,6.53255749,5.19275475,7.81407547,4.57707977,6.70201206,5.31071663,7.83781242,4.52520943,6.77231121,5.24424362,7.89129162,4.45015717,6.80490875,5.34228516,7.96236897,4.61785316,6.88516998,5.5496273,8.11745071,4.55598831,6.85617638,5.71512413,8.24121666,4.6751976,6.83379936,5.54876709,8.29695511,4.87487793,6.82317066,5.6565876,8.33533955,4.762537,6.85702896,5.7570076,8.25852871,4.78178787,6.78629494,5.55778885,8.11906719,4.96131325,6.81518841,5.51818466,8.08282566,4.86157417,6.61240959,5.38890076,8.08644199,4.86723518,6.59597778,5.24499702,7.97181511,5.01809883,6.68584538,5.36872482,8.0396328,5.19913483,6.58750725,5.40529823,8.18553162,5.18746948,6.59198666,5.58506775,8.21709538,5.31076431,6.60499001,5.49154663,8.25622177,5.27767372,6.74274158,5.57793808,8.26787472,5.22988892,6.64479065,5.36212921,8.06951714,5.23572159,6.41202354,5.37818527,8.13205624,5.14180756,6.52160168,5.34617615,8.12040615,5.107687,6.32893276,5.25270462,8.17889786,5.06069946,6.37924767,5.24970627,8.05556679,5.06832504,6.30836868,5.28464127,8.16378784,5.03841019,6.41894913,5.33346939,8.22968292,5.06711388,6.51340485,5.20555687,8.22129726,5.09238243,6.48105621,5.35327721,8.19785118,4.97625923,6.47792244,5.21417236,8.16148949,5.10004044,6.36968708,5.15242386,8.18161964,5.25624275,6.42994118,5.04351425,8.07405853,5.14030075,6.3617
2295,5.17052841,8.07892609,5.3619957,6.40137672,5.1654892,8.08720398,5.26086998,6.52788067,5.17366791,8.21195412,5.31811333,6.3486805,5.2519722,8.13913536,5.1263504,6.48794174,5.15417099,8.24138641,5.19828033,6.4694109,5.15841675,8.11807919,5.26512909,6.4569788,5.02939224,8.0028286,5.27293015,6.41951847,5.05481339,7.95901394,5.13341141,6.40538025,5.09624863,7.89557838,5.10813522,6.39192867,4.94404411,8.05454636,5.09833908,6.39883041,5.02837563,7.94150639,5.11763573,6.47364807,5.03023148,8.06495667,5.14579773,6.60127258,5.12008476,8.17419147,5.19213867,6.64751625,5.22068787,8.2072897,5.22324181,6.7547102,5.06778336,8.21466637,5.1476593,6.48773956,5.14493179,8.07195187,5.22911453,6.43293762,5.0694809,8.07123947,5.09415245,6.44200325,4.99944496,8.02376366,5.20451736,6.44797516,4.95704269,7.89868736,5.18758774,6.41598701,5.11302948,7.97619629,5.19637299,6.46459389,5.03109741,8.1726284,5.26836586,6.57701588,5.15589333,8.15628338,5.35698318,6.62969398,5.16781044,8.20305729,5.29644585,6.56591606,5.08177757,8.19900513,5.2761116,6.591362,5.2937355,8.13578987,5.49449158,6.57679653,5.1789093,8.11975861,5.58026123,6.61626053,5.2122612,8.24708176,5.47430611,6.65134621,5.20296288,8.35219193,5.40405083,6.84568024,5.10887718,8.35218906,5.3861866,6.80909538,5.36335564,8.45079422,5.4583149,6.89624214,5.41325378,8.56411076,5.50897408,6.94883919,5.44528198,8.51302719,5.48005104,6.98685455,5.4705162,8.64134502,5.42014885,7.02066708,5.47264862,8.60786057,5.440485,6.98165894,5.46196365,8.65524483,5.47345161,7.0641737,5.43378448,8.56656456,5.54297066,7.14050674,5.41973114,8.63925457,5.5287323,7.16239738,5.47761536,8.63204002,5.67994881,7.16528511,5.51246071,8.77296448,5.6181469,7.21063805,5.43946266,8.72153854,5.63790512,7.20365524,5.50118637,8.6414566,5.72280502,6.98625469,5.51038742,8.62572098,5.6904583,6.98760986,5.48636055,8.46716499,5.63944435,6.90142155,5.47950363,8.37626266,5.78129387,6.63391399,5.22815704,8.26700592,5.74568748,6.68688488,5.30966568,8.28059769,5.69919205,6.70224953,
5.1866684,8.19664383,5.7410202,6.61538696,5.24163437,8.27741814,5.77989006,6.60492134,5.29459763,8.27816582,5.64590836,6.62918377,5.17449951,8.26740456,5.76762009,6.64905167,5.04925537,8.10276413,5.65835571,6.41890526,4.85007858,7.92645931,5.69874954,6.28791809,4.78050423,7.70150566,5.51111221,6.37421513,4.91578293,7.76987171,5.54414558,6.41958523,4.92844582,7.89545631,5.41973686,6.45242977,4.99083138,8.0347271,5.54695511,6.4104414,4.98251724,8.00759697,5.54224396,6.41043377,5.18890572,8.16882515,5.6139183,6.51286221,5.11805153,7.98017502,5.47458076,6.45481491,5.00892639,7.92031479,5.53252602,6.34074783,5.01001358,7.86529732,5.5212059,6.27339554,4.8295517,7.80489445,5.48496819,6.38019276,4.91117477,7.85472488,5.46661568,6.35686398,4.96008873,7.95297813,5.53714943,6.45419502,5.02433777,7.99053764,5.52077484,6.65921688,5.04179001,8.10065174,5.76436234,6.63166809,4.91928482,8.14009285,5.56265068,6.61696434,5.22071648,8.18057156,5.57130432,6.52271652,4.9258461,8.08891582,5.7355423,6.62742043,4.98779869,8.08633423,5.59970856,6.66469955,4.90749359,8.02793503,5.61119843,6.56915665,4.92798805,8.08284664,5.72509956,6.68988895,5.10199356,8.29250145,5.66152763,6.5911274,5.04431534,8.24588776,5.72471809,6.72398663,5.08971596,8.247612,5.71876907,6.70418739,5.29846573,8.16291046,5.83738327,6.78546906,5.13676262,8.23707771,5.84209824,6.67313766,5.29277229,8.18377113,5.82172585,6.6520443,5.18866539,8.11068916,5.83301926,6.57429028,5.11104965,8.06363964,5.78905678,6.72797871,5.11105347,8.08338928,5.80940437,6.68347931,5.19134521,8.2475853,6.05075264,6.61704063,5.29603195,8.19070435,5.89323235,6.66639137,5.30723,8.33905029,5.90149689,6.68814278,5.35193825,8.28684425,6.02704048,6.58704472,5.25161743,8.17722416,6.02680397,6.42173862,5.25564766,8.05768776,6.12816048,6.42662621,5.07863617,7.90750504,6.17559433,6.57017326,5.1002388,7.90670586,6.02440643,6.4688406,4.95903778,7.97939873,6.01169014,6.42444897,5.18427849,8.06044006,5.9139576,6.59120178,5.27438545,8.17666817,6.01253319,6.65959
167,5.18017006,8.22201443,6.04235649,6.60766792,5.30283356,8.2788372,6.03053665,6.60948467,5.34325409,8.31509209,5.8453598,6.69171238,5.33260155,8.20406818,5.683918,6.56695843,5.30281258,8.13492966,5.75349617,6.55113983,5.34417725,8.1326561,5.78568649,6.57105255,5.3273201,8.15877342,5.76834869,6.64725208,5.37774849,8.17119789,5.89293289,6.57474041,5.41365814,8.36868477,5.68873978,6.58434677,5.48419189,8.34781837,5.85387421,6.64115715,5.38439941,8.27522278,5.91314125,6.70641136,5.13799477,8.17845726,5.75364876,6.70098782,5.13040352,8.00018501,5.96792984,6.62576485,5.03642464,7.94652843,5.92035866,6.63366604,5.10318375,7.9744215,5.82936859,6.61807251,5.1090126,7.96936798,5.85948563,6.58807373,5.18238449,8.03250885,5.87504005,6.6042366,5.2796917,8.13493443,5.85573006,6.60693741,5.27525139,8.21286392,5.98149109,6.71523666,5.17165375,8.24064732,5.87898636,6.64641953,5.19127846,8.21217346,5.90829086,6.62609482
+output_hex=3c7c696d5f73746172747c3e696d5f656e647c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73
746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73
746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f7374617274
diff --git a/snapshots/quality_gates/timing.txt b/snapshots/quality_gates/timing.txt
index 89282747..f1d9146f 100644
--- a/snapshots/quality_gates/timing.txt
+++ b/snapshots/quality_gates/timing.txt
@@ -1,37 +1,9 @@
# quality_gates timing (seconds)
-domain_boundaries 1
-legacy_sml_surface 3
+domain_boundaries 0
+legacy_sml_surface 1
build_with_zig 0
-bench_snapshot_gbnf_rule_parser 4
-bench_snapshot_jinja_formatter 7
-bench_snapshot_jinja_parser 5
-bench_snapshot_logits_sampler 2
-bench_snapshot_logits_validator 7
-bench_snapshot_kernel_aarch64 4
-bench_snapshot_batch_planner 3
-bench_snapshot_memory_kv 6
-bench_snapshot_memory_recurrent 7
-bench_snapshot_memory_hybrid 4
-bench_snapshot_generation 190
-bench_snapshot_diarization_sortformer 5
-bench_snapshot_flash_attention 4
-bench_snapshot_tokenizer_preprocessor_bpe 1
-bench_snapshot_tokenizer_preprocessor_spm 1
-bench_snapshot_tokenizer_preprocessor_ugm 0
-bench_snapshot_tokenizer_preprocessor_wpm 1
-bench_snapshot_tokenizer_preprocessor_rwkv 1
-bench_snapshot_tokenizer_preprocessor_plamo2 1
-bench_snapshot_encoder_bpe 0
-bench_snapshot_encoder_spm 38
-bench_snapshot_encoder_wpm 1
-bench_snapshot_encoder_ugm 14
-bench_snapshot_encoder_rwkv 10
-bench_snapshot_encoder_plamo2 1
-bench_snapshot_encoder_fallback 0
-bench_snapshot_tokenizer 62
-test_with_coverage 422
-paritychecker 16
-fuzz_smoke 45
-lint_snapshot 16
-generate_docs 2
-total 887
+bench_snapshot 47
+test_with_coverage 1476
+paritychecker 2
+fuzz_smoke 0
+total 1477
diff --git a/src/emel/diarization/sortformer/detail.cpp b/src/emel/diarization/sortformer/detail.cpp
index ac61ff01..c863afbf 100644
--- a/src/emel/diarization/sortformer/detail.cpp
+++ b/src/emel/diarization/sortformer/detail.cpp
@@ -7,6 +7,7 @@
#endif
#include "emel/kernel/aarch64/actions.hpp"
+#include "emel/kernel/sm.hpp"
#include "emel/kernel/detail.hpp"
#include "emel/kernel/events.hpp"
@@ -36,7 +37,8 @@ float32x4_t compute_neon_fma(const float32x4_t acc,
}
#endif
-bool run_dense_matmul(std::span input,
+bool run_dense_matmul(emel::kernel::sm & kernel,
+ std::span input,
std::span weights,
std::span output) noexcept {
const uint64_t row_bytes = sizeof(float) * static_cast(input.size());
@@ -52,7 +54,7 @@ bool run_dense_matmul(std::span input,
1u,
},
.nb = {
- 1u,
+ sizeof(float),
row_bytes,
row_bytes * static_cast(output.size()),
row_bytes * static_cast(output.size()),
@@ -90,22 +92,16 @@ bool run_dense_matmul(std::span input,
sizeof(float) * static_cast(output.size()),
},
},
- .nth = 1u,
};
-#if defined(__aarch64__) || defined(__ARM_NEON)
- if (!emel::kernel::aarch64::detail::execute_neon_mul_mat(request)) {
- return false;
- }
-#else
- if (!emel::kernel::detail::run_mul_mat(request)) {
+ if (!kernel.process_event(request)) {
return false;
}
-#endif
return true;
}
-bool run_dense_batch_matmul_from_transposed(std::span transposed_input,
+bool run_dense_batch_matmul_from_transposed(emel::kernel::sm & kernel,
+ std::span transposed_input,
const size_t row_count,
const size_t input_dim,
std::span weights,
@@ -125,7 +121,7 @@ bool run_dense_batch_matmul_from_transposed(std::span transposed_in
1u,
},
.nb = {
- 1u,
+ sizeof(float),
input_row_bytes,
input_row_bytes * static_cast(output_dim),
input_row_bytes * static_cast(output_dim),
@@ -163,23 +159,17 @@ bool run_dense_batch_matmul_from_transposed(std::span transposed_in
frame_row_bytes * static_cast(output_dim),
},
},
- .nth = 1u,
};
-#if defined(__aarch64__) || defined(__ARM_NEON)
- if (!emel::kernel::aarch64::detail::execute_neon_mul_mat(request)) {
- return false;
- }
-#else
- if (!emel::kernel::detail::run_mul_mat(request)) {
+ if (!kernel.process_event(request)) {
return false;
}
-#endif
return true;
}
bool run_dense_batch_matmul_from_transposed_prepared(
+ emel::kernel::sm & kernel,
std::span transposed_input,
const size_t row_count,
const size_t input_dim,
@@ -201,7 +191,7 @@ bool run_dense_batch_matmul_from_transposed_prepared(
1u,
},
.nb = {
- 1u,
+ sizeof(float),
input_row_bytes,
input_row_bytes * static_cast(output_dim),
input_row_bytes * static_cast(output_dim),
@@ -239,10 +229,10 @@ bool run_dense_batch_matmul_from_transposed_prepared(
frame_row_bytes * static_cast(output_dim),
},
},
- .nth = 1u,
};
#if defined(__aarch64__) || defined(__ARM_NEON)
+ (void)kernel;
if (!emel::kernel::aarch64::detail::execute_neon_mul_mat_prepared_f32_lhs_4row(
request,
cache.lhs_4row.data(),
@@ -250,7 +240,8 @@ bool run_dense_batch_matmul_from_transposed_prepared(
return false;
}
#else
- if (!emel::kernel::detail::run_mul_mat(request)) {
+ (void)cache;
+ if (!kernel.process_event(request)) {
return false;
}
#endif
@@ -458,7 +449,8 @@ bool prepare_dense_weight_cache(std::span weights,
return true;
}
-bool compute_dense(std::span input,
+bool compute_dense(emel::kernel::sm & kernel,
+ std::span input,
std::span weights,
std::span bias,
std::span output) noexcept {
@@ -467,7 +459,7 @@ bool compute_dense(std::span input,
return false;
}
- if (!run_dense_matmul(input, weights, output)) {
+ if (!run_dense_matmul(kernel, input, weights, output)) {
return false;
}
@@ -478,14 +470,15 @@ bool compute_dense(std::span input,
return true;
}
-bool compute_dense_without_bias(std::span input,
+bool compute_dense_without_bias(emel::kernel::sm & kernel,
+ std::span input,
std::span weights,
std::span output) noexcept {
if (input.empty() || output.empty() || weights.size() != input.size() * output.size()) {
return false;
}
- return run_dense_matmul(input, weights, output);
+ return run_dense_matmul(kernel, input, weights, output);
}
bool transpose_dense_input(std::span input_rows,
@@ -508,7 +501,8 @@ bool transpose_dense_input(std::span input_rows,
return true;
}
-bool compute_dense_batch(std::span input_rows,
+bool compute_dense_batch(emel::kernel::sm & kernel,
+ std::span input_rows,
const size_t row_count,
const size_t input_dim,
std::span weights,
@@ -534,7 +528,8 @@ bool compute_dense_batch(std::span input_rows,
}
}
- if (!run_dense_batch_matmul_from_transposed(transposed_input,
+ if (!run_dense_batch_matmul_from_transposed(kernel,
+ transposed_input,
row_count,
input_dim,
weights,
@@ -553,7 +548,8 @@ bool compute_dense_batch(std::span input_rows,
return true;
}
-bool compute_dense_batch_prepared(std::span input_rows,
+bool compute_dense_batch_prepared(emel::kernel::sm & kernel,
+ std::span input_rows,
const size_t row_count,
const size_t input_dim,
std::span weights,
@@ -583,7 +579,8 @@ bool compute_dense_batch_prepared(std::span input_rows,
}
}
- if (!run_dense_batch_matmul_from_transposed_prepared(transposed_input,
+ if (!run_dense_batch_matmul_from_transposed_prepared(kernel,
+ transposed_input,
row_count,
input_dim,
weights,
@@ -603,7 +600,8 @@ bool compute_dense_batch_prepared(std::span input_rows,
return true;
}
-bool compute_dense_batch_residual_prepared(std::span input_rows,
+bool compute_dense_batch_residual_prepared(emel::kernel::sm & kernel,
+ std::span input_rows,
const size_t row_count,
const size_t input_dim,
std::span weights,
@@ -632,7 +630,8 @@ bool compute_dense_batch_residual_prepared(std::span input_rows,
return false;
}
- return compute_dense_batch_from_transposed_scaled_residual_prepared(transposed_input,
+ return compute_dense_batch_from_transposed_scaled_residual_prepared(kernel,
+ transposed_input,
row_count,
input_dim,
weights,
@@ -645,7 +644,8 @@ bool compute_dense_batch_residual_prepared(std::span input_rows,
output_rows);
}
-bool compute_dense_batch_without_bias(std::span input_rows,
+bool compute_dense_batch_without_bias(emel::kernel::sm & kernel,
+ std::span input_rows,
const size_t row_count,
const size_t input_dim,
std::span weights,
@@ -669,7 +669,8 @@ bool compute_dense_batch_without_bias(std::span input_rows,
}
}
- if (!run_dense_batch_matmul_from_transposed(transposed_input,
+ if (!run_dense_batch_matmul_from_transposed(kernel,
+ transposed_input,
row_count,
input_dim,
weights,
@@ -688,7 +689,8 @@ bool compute_dense_batch_without_bias(std::span input_rows,
return true;
}
-bool compute_dense_batch_without_bias_prepared(std::span input_rows,
+bool compute_dense_batch_without_bias_prepared(emel::kernel::sm & kernel,
+ std::span input_rows,
const size_t row_count,
const size_t input_dim,
std::span weights,
@@ -716,7 +718,8 @@ bool compute_dense_batch_without_bias_prepared(std::span input_rows
}
}
- if (!run_dense_batch_matmul_from_transposed_prepared(transposed_input,
+ if (!run_dense_batch_matmul_from_transposed_prepared(kernel,
+ transposed_input,
row_count,
input_dim,
weights,
@@ -736,7 +739,8 @@ bool compute_dense_batch_without_bias_prepared(std::span input_rows
return true;
}
-bool compute_dense_batch_to_transposed(std::span input_rows,
+bool compute_dense_batch_to_transposed(emel::kernel::sm & kernel,
+ std::span input_rows,
const size_t row_count,
const size_t input_dim,
std::span weights,
@@ -760,7 +764,8 @@ bool compute_dense_batch_to_transposed(std::span input_rows,
}
}
- if (!run_dense_batch_matmul_from_transposed(transposed_input,
+ if (!run_dense_batch_matmul_from_transposed(kernel,
+ transposed_input,
row_count,
input_dim,
weights,
@@ -780,7 +785,8 @@ bool compute_dense_batch_to_transposed(std::span input_rows,
return true;
}
-bool compute_dense_batch_to_transposed_prepared(std::span input_rows,
+bool compute_dense_batch_to_transposed_prepared(emel::kernel::sm & kernel,
+ std::span input_rows,
const size_t row_count,
const size_t input_dim,
std::span weights,
@@ -808,7 +814,8 @@ bool compute_dense_batch_to_transposed_prepared(std::span input_row
}
}
- if (!run_dense_batch_matmul_from_transposed_prepared(transposed_input,
+ if (!run_dense_batch_matmul_from_transposed_prepared(kernel,
+ transposed_input,
row_count,
input_dim,
weights,
@@ -829,7 +836,8 @@ bool compute_dense_batch_to_transposed_prepared(std::span input_row
return true;
}
-bool compute_dense_batch_from_transposed(std::span transposed_input,
+bool compute_dense_batch_from_transposed(emel::kernel::sm & kernel,
+ std::span transposed_input,
const size_t row_count,
const size_t input_dim,
std::span weights,
@@ -846,7 +854,8 @@ bool compute_dense_batch_from_transposed(std::span transposed_input
return false;
}
- if (!run_dense_batch_matmul_from_transposed(transposed_input,
+ if (!run_dense_batch_matmul_from_transposed(kernel,
+ transposed_input,
row_count,
input_dim,
weights,
@@ -865,7 +874,8 @@ bool compute_dense_batch_from_transposed(std::span transposed_input
return true;
}
-bool compute_dense_batch_from_transposed_prepared(std::span transposed_input,
+bool compute_dense_batch_from_transposed_prepared(emel::kernel::sm & kernel,
+ std::span transposed_input,
const size_t row_count,
const size_t input_dim,
std::span weights,
@@ -886,7 +896,8 @@ bool compute_dense_batch_from_transposed_prepared(std::span transpo
return false;
}
- if (!run_dense_batch_matmul_from_transposed_prepared(transposed_input,
+ if (!run_dense_batch_matmul_from_transposed_prepared(kernel,
+ transposed_input,
row_count,
input_dim,
weights,
@@ -907,6 +918,7 @@ bool compute_dense_batch_from_transposed_prepared(std::span transpo
}
bool compute_dense_batch_from_transposed_scaled_residual_prepared(
+ emel::kernel::sm & kernel,
std::span transposed_input,
const size_t row_count,
const size_t input_dim,
@@ -931,7 +943,8 @@ bool compute_dense_batch_from_transposed_scaled_residual_prepared(
return false;
}
- if (!run_dense_batch_matmul_from_transposed_prepared(transposed_input,
+ if (!run_dense_batch_matmul_from_transposed_prepared(kernel,
+ transposed_input,
row_count,
input_dim,
weights,
diff --git a/src/emel/diarization/sortformer/detail.hpp b/src/emel/diarization/sortformer/detail.hpp
index 01828c90..beadc02c 100644
--- a/src/emel/diarization/sortformer/detail.hpp
+++ b/src/emel/diarization/sortformer/detail.hpp
@@ -4,6 +4,8 @@
#include
#include
+#include "emel/kernel/sm.hpp"
+
namespace emel::diarization::sortformer::detail {
struct dense_weight_cache {
@@ -34,12 +36,14 @@ bool prepare_dense_weight_cache(std::span weights,
size_t output_dim,
dense_weight_cache & cache) noexcept;
-bool compute_dense(std::span input,
+bool compute_dense(emel::kernel::sm & kernel,
+ std::span input,
std::span weights,
std::span bias,
std::span output) noexcept;
-bool compute_dense_without_bias(std::span input,
+bool compute_dense_without_bias(emel::kernel::sm & kernel,
+ std::span input,
std::span weights,
std::span output) noexcept;
@@ -48,7 +52,8 @@ bool transpose_dense_input(std::span input_rows,
size_t input_dim,
std::span transposed_input) noexcept;
-bool compute_dense_batch(std::span input_rows,
+bool compute_dense_batch(emel::kernel::sm & kernel,
+ std::span input_rows,
size_t row_count,
size_t input_dim,
std::span weights,
@@ -58,7 +63,8 @@ bool compute_dense_batch(std::span input_rows,
std::span transposed_output,
std::span output_rows) noexcept;
-bool compute_dense_batch_prepared(std::span input_rows,
+bool compute_dense_batch_prepared(emel::kernel::sm & kernel,
+ std::span input_rows,
size_t row_count,
size_t input_dim,
std::span weights,
@@ -69,7 +75,8 @@ bool compute_dense_batch_prepared(std::span input_rows,
std::span transposed_output,
std::span output_rows) noexcept;
-bool compute_dense_batch_residual_prepared(std::span input_rows,
+bool compute_dense_batch_residual_prepared(emel::kernel::sm & kernel,
+ std::span input_rows,
size_t row_count,
size_t input_dim,
std::span weights,
@@ -81,7 +88,8 @@ bool compute_dense_batch_residual_prepared(std::span input_rows,
std::span transposed_output,
std::span output_rows) noexcept;
-bool compute_dense_batch_without_bias(std::span input_rows,
+bool compute_dense_batch_without_bias(emel::kernel::sm & kernel,
+ std::span input_rows,
size_t row_count,
size_t input_dim,
std::span weights,
@@ -90,7 +98,8 @@ bool compute_dense_batch_without_bias(std::span input_rows,
std::span transposed_output,
std::span output_rows) noexcept;
-bool compute_dense_batch_without_bias_prepared(std::span input_rows,
+bool compute_dense_batch_without_bias_prepared(emel::kernel::sm & kernel,
+ std::span input_rows,
size_t row_count,
size_t input_dim,
std::span weights,
@@ -100,7 +109,8 @@ bool compute_dense_batch_without_bias_prepared(std::span input_rows
std::span transposed_output,
std::span output_rows) noexcept;
-bool compute_dense_batch_to_transposed(std::span input_rows,
+bool compute_dense_batch_to_transposed(emel::kernel::sm & kernel,
+ std::span input_rows,
size_t row_count,
size_t input_dim,
std::span weights,
@@ -109,7 +119,8 @@ bool compute_dense_batch_to_transposed(std::span input_rows,
std::span transposed_input,
std::span transposed_output) noexcept;
-bool compute_dense_batch_to_transposed_prepared(std::span input_rows,
+bool compute_dense_batch_to_transposed_prepared(emel::kernel::sm & kernel,
+ std::span input_rows,
size_t row_count,
size_t input_dim,
std::span weights,
@@ -119,7 +130,8 @@ bool compute_dense_batch_to_transposed_prepared(std::span input_row
std::span transposed_input,
std::span transposed_output) noexcept;
-bool compute_dense_batch_from_transposed(std::span transposed_input,
+bool compute_dense_batch_from_transposed(emel::kernel::sm & kernel,
+ std::span transposed_input,
size_t row_count,
size_t input_dim,
std::span weights,
@@ -128,7 +140,8 @@ bool compute_dense_batch_from_transposed(std::span transposed_input
std::span transposed_output,
std::span output_rows) noexcept;
-bool compute_dense_batch_from_transposed_prepared(std::span transposed_input,
+bool compute_dense_batch_from_transposed_prepared(emel::kernel::sm & kernel,
+ std::span transposed_input,
size_t row_count,
size_t input_dim,
std::span weights,
@@ -139,6 +152,7 @@ bool compute_dense_batch_from_transposed_prepared(std::span transpo
std::span output_rows) noexcept;
bool compute_dense_batch_from_transposed_scaled_residual_prepared(
+ emel::kernel::sm & kernel,
std::span transposed_input,
size_t row_count,
size_t input_dim,
diff --git a/src/emel/diarization/sortformer/encoder/detail.cpp b/src/emel/diarization/sortformer/encoder/detail.cpp
index d597dc4c..6f39c13a 100644
--- a/src/emel/diarization/sortformer/encoder/detail.cpp
+++ b/src/emel/diarization/sortformer/encoder/detail.cpp
@@ -294,6 +294,7 @@ bool compute_pointwise_row(std::span input,
pre_encoder_workspace & workspace,
std::span output) noexcept {
if (!emel::diarization::sortformer::detail::compute_dense_batch(
+ workspace.kernel,
input,
static_cast(freq_count),
static_cast(k_pre_channel_count),
@@ -411,6 +412,7 @@ bool compute_position_projection(
pre_encoder_workspace & workspace,
std::span output) noexcept {
return emel::diarization::sortformer::detail::compute_dense_batch_without_bias_prepared(
+ workspace.kernel,
positions,
static_cast(k_relative_position_count),
static_cast(k_model_dim),
@@ -446,6 +448,7 @@ bool compute_feed_forward_block(
}
if (!emel::diarization::sortformer::detail::compute_dense_batch_to_transposed_prepared(
+ workspace.kernel,
fixed_span(workspace.layer_norm),
static_cast(k_frame_count),
static_cast(k_model_dim),
@@ -467,6 +470,7 @@ bool compute_feed_forward_block(
if (!emel::diarization::sortformer::detail::
compute_dense_batch_from_transposed_scaled_residual_prepared(
+ workspace.kernel,
std::span{workspace.dense_transposed_output.data(),
feed_forward_value_count},
static_cast(k_frame_count),
@@ -591,6 +595,7 @@ bool compute_attention_block(
static_cast(k_model_dim),
qkv_transposed) ||
!emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+ workspace.kernel,
qkv_transposed,
static_cast(k_frame_count),
static_cast(k_model_dim),
@@ -601,6 +606,7 @@ bool compute_attention_block(
workspace.dense_transposed_output,
fixed_span(workspace.query)) ||
!emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+ workspace.kernel,
qkv_transposed,
static_cast(k_frame_count),
static_cast(k_model_dim),
@@ -611,6 +617,7 @@ bool compute_attention_block(
workspace.dense_transposed_output,
fixed_span(workspace.key)) ||
!emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+ workspace.kernel,
qkv_transposed,
static_cast(k_frame_count),
static_cast(k_model_dim),
@@ -653,6 +660,7 @@ bool compute_attention_block(
}
if (!emel::diarization::sortformer::detail::compute_dense_batch_residual_prepared(
+ workspace.kernel,
fixed_span(workspace.layer_result),
static_cast(k_frame_count),
static_cast(k_model_dim),
@@ -702,6 +710,7 @@ bool compute_convolution_block(
static_cast(k_frame_count * 2 * k_model_dim),
};
if (!emel::diarization::sortformer::detail::compute_dense_batch_prepared(
+ workspace.kernel,
fixed_span(workspace.layer_norm),
static_cast(k_frame_count),
static_cast(k_model_dim),
@@ -751,6 +760,7 @@ bool compute_convolution_block(
}
if (!emel::diarization::sortformer::detail::compute_dense_batch_residual_prepared(
+ workspace.kernel,
fixed_span(workspace.layer_output),
static_cast(k_frame_count),
static_cast(k_model_dim),
@@ -1200,6 +1210,7 @@ bool compute_encoder_frames_from_features(
}
if (!emel::diarization::sortformer::detail::compute_dense_batch_prepared(
+ workspace.kernel,
workspace.pre_encoder_rows,
static_cast(k_frame_count),
static_cast(k_pre_expanded_dim),
diff --git a/src/emel/diarization/sortformer/encoder/detail.hpp b/src/emel/diarization/sortformer/encoder/detail.hpp
index ed03ff83..ccc78e36 100644
--- a/src/emel/diarization/sortformer/encoder/detail.hpp
+++ b/src/emel/diarization/sortformer/encoder/detail.hpp
@@ -8,6 +8,7 @@
#include
#include "emel/diarization/sortformer/detail.hpp"
+#include "emel/kernel/sm.hpp"
#include "emel/diarization/sortformer/encoder/feature_extractor/detail.hpp"
#include "emel/model/data.hpp"
@@ -69,6 +70,8 @@ struct contract {
struct pre_encoder_workspace {
pre_encoder_workspace();
+ emel::kernel::sm kernel{emel::kernel::detect_host_kind()};
+
std::array, 3> conv0_rows = {};
std::array, 3> stage1_rows = {};
std::array stage1_depthwise = {};
diff --git a/src/emel/diarization/sortformer/executor/actions.hpp b/src/emel/diarization/sortformer/executor/actions.hpp
index 7bfee84e..f1c593fb 100644
--- a/src/emel/diarization/sortformer/executor/actions.hpp
+++ b/src/emel/diarization/sortformer/executor/actions.hpp
@@ -141,6 +141,7 @@ inline bool compute_encoder_projection_stage(
*ctx.modules.encoder_projection_bias.tensor);
return modules_detail::compute_encoder_projection_batch(
+ ctx.transformer_workspace.kernel,
runtime_ev.request.encoder_frames,
static_cast(detail::k_frame_count),
encoder_projection_weight,
diff --git a/src/emel/diarization/sortformer/modules/detail.cpp b/src/emel/diarization/sortformer/modules/detail.cpp
index aee0892c..2e944785 100644
--- a/src/emel/diarization/sortformer/modules/detail.cpp
+++ b/src/emel/diarization/sortformer/modules/detail.cpp
@@ -94,11 +94,13 @@ bool bind_contract(const emel::model::data & model_data,
return true;
}
-bool compute_encoder_projection(std::span encoder_frame,
+bool compute_encoder_projection(emel::kernel::sm & kernel,
+ std::span encoder_frame,
std::span weights,
std::span bias,
std::span hidden_out) noexcept {
- return emel::diarization::sortformer::detail::compute_dense(encoder_frame,
+ return emel::diarization::sortformer::detail::compute_dense(kernel,
+ encoder_frame,
weights,
bias,
hidden_out);
@@ -115,6 +117,7 @@ bool prepare_encoder_projection_weight_cache(
}
bool compute_encoder_projection_batch(
+ emel::kernel::sm & kernel,
std::span encoder_frames,
const size_t frame_count,
std::span weights,
@@ -124,6 +127,7 @@ bool compute_encoder_projection_batch(
std::span transposed_output,
std::span hidden_out) noexcept {
return emel::diarization::sortformer::detail::compute_dense_batch_prepared(
+ kernel,
encoder_frames,
frame_count,
static_cast(k_encoder_dim),
diff --git a/src/emel/diarization/sortformer/modules/detail.hpp b/src/emel/diarization/sortformer/modules/detail.hpp
index 4efcdd0f..85cbac48 100644
--- a/src/emel/diarization/sortformer/modules/detail.hpp
+++ b/src/emel/diarization/sortformer/modules/detail.hpp
@@ -7,6 +7,7 @@
#include
#include "emel/diarization/sortformer/detail.hpp"
+#include "emel/kernel/sm.hpp"
#include "emel/model/data.hpp"
namespace emel::diarization::sortformer::modules::detail {
@@ -39,7 +40,8 @@ struct contract {
bool bind_contract(const emel::model::data & model_data,
contract & contract_out) noexcept;
-bool compute_encoder_projection(std::span encoder_frame,
+bool compute_encoder_projection(emel::kernel::sm & kernel,
+ std::span encoder_frame,
std::span weights,
std::span bias,
std::span hidden_out) noexcept;
@@ -49,6 +51,7 @@ bool prepare_encoder_projection_weight_cache(
emel::diarization::sortformer::detail::dense_weight_cache & cache) noexcept;
bool compute_encoder_projection_batch(
+ emel::kernel::sm & kernel,
std::span encoder_frames,
size_t frame_count,
std::span weights,
diff --git a/src/emel/diarization/sortformer/output/detail.cpp b/src/emel/diarization/sortformer/output/detail.cpp
index a0b6e457..f7658607 100644
--- a/src/emel/diarization/sortformer/output/detail.cpp
+++ b/src/emel/diarization/sortformer/output/detail.cpp
@@ -49,6 +49,7 @@ bool append_segment(std::span segments_out,
} // namespace
bool compute_speaker_probabilities(
+ emel::kernel::sm & kernel,
std::span hidden_frames,
const emel::diarization::sortformer::modules::detail::contract & modules_contract,
std::span probabilities_out) noexcept {
@@ -82,6 +83,7 @@ bool compute_speaker_probabilities(
}
if (!emel::diarization::sortformer::detail::compute_dense(
+ kernel,
intermediate, frame_hidden_weights, frame_hidden_bias, frame_hidden)) {
return false;
}
@@ -90,7 +92,7 @@ bool compute_speaker_probabilities(
intermediate[index] = relu(frame_hidden[index]);
}
- if (!emel::diarization::sortformer::detail::compute_dense(intermediate, weights, bias, logits)) {
+ if (!emel::diarization::sortformer::detail::compute_dense(kernel, intermediate, weights, bias, logits)) {
return false;
}
diff --git a/src/emel/diarization/sortformer/output/detail.hpp b/src/emel/diarization/sortformer/output/detail.hpp
index 2986134c..7b873aeb 100644
--- a/src/emel/diarization/sortformer/output/detail.hpp
+++ b/src/emel/diarization/sortformer/output/detail.hpp
@@ -6,6 +6,7 @@
#include
#include "emel/diarization/sortformer/modules/detail.hpp"
+#include "emel/kernel/sm.hpp"
namespace emel::diarization::sortformer::output::detail {
@@ -27,6 +28,7 @@ struct segment_record {
};
bool compute_speaker_probabilities(
+ emel::kernel::sm & kernel,
std::span hidden_frames,
const emel::diarization::sortformer::modules::detail::contract & modules_contract,
std::span probabilities_out) noexcept;
diff --git a/src/emel/diarization/sortformer/pipeline/actions.hpp b/src/emel/diarization/sortformer/pipeline/actions.hpp
index 20c8aa70..4cfb171d 100644
--- a/src/emel/diarization/sortformer/pipeline/actions.hpp
+++ b/src/emel/diarization/sortformer/pipeline/actions.hpp
@@ -133,7 +133,8 @@ struct effect_compute_probabilities {
static_cast(detail::k_required_probability_value_count));
const bool probability_ok =
emel::diarization::sortformer::output::detail::compute_speaker_probabilities(
- ctx.hidden, ctx.modules, probability_output);
+ ctx.encoder_workspace.kernel, ctx.hidden, ctx.modules,
+ probability_output);
effect_store_kernel_result(runtime_ev.ctx, probability_ok);
runtime_ev.request.probability_count_out =
detail::k_required_probability_value_count *
diff --git a/src/emel/diarization/sortformer/transformer/detail.cpp b/src/emel/diarization/sortformer/transformer/detail.cpp
index 44c27f80..4807b58c 100644
--- a/src/emel/diarization/sortformer/transformer/detail.cpp
+++ b/src/emel/diarization/sortformer/transformer/detail.cpp
@@ -323,6 +323,7 @@ bool compute_transformer_layer(
static_cast(k_hidden_dim),
qkv_transposed) ||
!emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+ workspace.kernel,
qkv_transposed,
static_cast(frame_count),
static_cast(k_hidden_dim),
@@ -333,6 +334,7 @@ bool compute_transformer_layer(
workspace.dense_transposed_output,
std::span{workspace.query.data(), frame_value_count}) ||
!emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+ workspace.kernel,
qkv_transposed,
static_cast(frame_count),
static_cast(k_hidden_dim),
@@ -343,6 +345,7 @@ bool compute_transformer_layer(
workspace.dense_transposed_output,
std::span{workspace.key.data(), frame_value_count}) ||
!emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+ workspace.kernel,
qkv_transposed,
static_cast(frame_count),
static_cast(k_hidden_dim),
@@ -375,6 +378,7 @@ bool compute_transformer_layer(
}
if (!emel::diarization::sortformer::detail::compute_dense_batch_residual_prepared(
+ workspace.kernel,
std::span{workspace.first_norm.data(), frame_value_count},
static_cast(frame_count),
static_cast(k_hidden_dim),
@@ -404,6 +408,7 @@ bool compute_transformer_layer(
}
if (!emel::diarization::sortformer::detail::compute_dense_batch_prepared(
+ workspace.kernel,
std::span{workspace.first_norm.data(), frame_value_count},
static_cast(frame_count),
static_cast(k_hidden_dim),
@@ -426,6 +431,7 @@ bool compute_transformer_layer(
}
if (!emel::diarization::sortformer::detail::compute_dense_batch_residual_prepared(
+ workspace.kernel,
std::span