diff --git a/.codex/config.toml b/.codex/config.toml
index 3e0fa368..ffb51eba 100644
--- a/.codex/config.toml
+++ b/.codex/config.toml
@@ -6,124 +6,124 @@ codex_hooks = true
 
 [agents.gsd-advisor-researcher]
 description = "Researches a single gray area decision and returns a structured comparison table with rationale. Spawned by discuss-phase advisor mode."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-advisor-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-advisor-researcher.toml"
 
 [agents.gsd-ai-researcher]
 description = "Researches a chosen AI framework's official docs to produce implementation-ready guidance — best practices, syntax, core patterns, and pitfalls distilled for the specific use case. Writes the Framework Quick Reference and Implementation Guidance sections of AI-SPEC.md. Spawned by /gsd-ai-integration-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-ai-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-ai-researcher.toml"
 
 [agents.gsd-assumptions-analyzer]
 description = "Deeply analyzes codebase for a phase and returns structured assumptions with evidence. Spawned by discuss-phase assumptions mode."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-assumptions-analyzer.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-assumptions-analyzer.toml"
 
 [agents.gsd-code-fixer]
 description = "Applies fixes to code review findings from REVIEW.md. Reads source files, applies intelligent fixes, and commits each fix atomically. Spawned by /gsd-code-review-fix."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-code-fixer.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-code-fixer.toml"
 
 [agents.gsd-code-reviewer]
 description = "Reviews source files for bugs, security issues, and code quality problems. Produces structured REVIEW.md with severity-classified findings. Spawned by /gsd-code-review."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-code-reviewer.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-code-reviewer.toml"
 
 [agents.gsd-codebase-mapper]
 description = "Explores codebase and writes structured analysis documents. Spawned by map-codebase with a focus area (tech, arch, quality, concerns). Writes documents directly to reduce orchestrator context load."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-codebase-mapper.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-codebase-mapper.toml"
 
 [agents.gsd-debug-session-manager]
 description = "Manages multi-cycle /gsd-debug checkpoint and continuation loop in isolated context. Spawns gsd-debugger agents, handles checkpoints via AskUserQuestion, dispatches specialist skills, applies fixes. Returns compact summary to main context. Spawned by /gsd-debug command."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-debug-session-manager.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-debug-session-manager.toml"
 
 [agents.gsd-debugger]
 description = "Investigates bugs using scientific method, manages debug sessions, handles checkpoints. Spawned by /gsd-debug orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-debugger.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-debugger.toml"
 
 [agents.gsd-doc-verifier]
 description = "Verifies factual claims in generated docs against the live codebase. Returns structured JSON per doc."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-doc-verifier.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-doc-verifier.toml"
 
 [agents.gsd-doc-writer]
 description = "Writes and updates project documentation. Spawned with a doc_assignment block specifying doc type, mode (create/update/supplement), and project context."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-doc-writer.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-doc-writer.toml"
 
 [agents.gsd-domain-researcher]
 description = "Researches the business domain and real-world application context of the AI system being built. Surfaces domain expert evaluation criteria, industry-specific failure modes, regulatory context, and what \"good\" looks like for practitioners in this field — before the eval-planner turns it into measurable rubrics. Spawned by /gsd-ai-integration-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-domain-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-domain-researcher.toml"
 
 [agents.gsd-eval-auditor]
 description = "Retroactive audit of an implemented AI phase's evaluation coverage. Checks implementation against the AI-SPEC.md evaluation plan. Scores each eval dimension as COVERED/PARTIAL/MISSING. Produces a scored EVAL-REVIEW.md with findings, gaps, and remediation guidance. Spawned by /gsd-eval-review orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-eval-auditor.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-eval-auditor.toml"
 
 [agents.gsd-eval-planner]
 description = "Designs a structured evaluation strategy for an AI phase. Identifies critical failure modes, selects eval dimensions with rubrics, recommends tooling, and specifies the reference dataset. Writes the Evaluation Strategy, Guardrails, and Production Monitoring sections of AI-SPEC.md. Spawned by /gsd-ai-integration-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-eval-planner.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-eval-planner.toml"
 
 [agents.gsd-executor]
 description = "Executes GSD plans with atomic commits, deviation handling, checkpoint protocols, and state management. Spawned by execute-phase orchestrator or execute-plan command."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-executor.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-executor.toml"
 
 [agents.gsd-framework-selector]
 description = "Presents an interactive decision matrix to surface the right AI/LLM framework for the user's specific use case. Produces a scored recommendation with rationale. Spawned by /gsd-ai-integration-phase and /gsd-select-framework orchestrators."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-framework-selector.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-framework-selector.toml"
 
 [agents.gsd-integration-checker]
 description = "Verifies cross-phase integration and E2E flows. Checks that phases connect properly and user workflows complete end-to-end."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-integration-checker.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-integration-checker.toml"
 
 [agents.gsd-intel-updater]
 description = "Analyzes codebase and writes structured intel files to .planning/intel/."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-intel-updater.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-intel-updater.toml"
 
 [agents.gsd-nyquist-auditor]
 description = "Fills Nyquist validation gaps by generating tests and verifying coverage for phase requirements"
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-nyquist-auditor.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-nyquist-auditor.toml"
 
 [agents.gsd-pattern-mapper]
 description = "Analyzes codebase for existing patterns and produces PATTERNS.md mapping new files to closest analogs. Read-only codebase analysis spawned by /gsd-plan-phase orchestrator before planning."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-pattern-mapper.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-pattern-mapper.toml"
 
 [agents.gsd-phase-researcher]
 description = "Researches how to implement a phase before planning. Produces RESEARCH.md consumed by gsd-planner. Spawned by /gsd-plan-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-phase-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-phase-researcher.toml"
 
 [agents.gsd-plan-checker]
 description = "Verifies plans will achieve phase goal before execution. Goal-backward analysis of plan quality. Spawned by /gsd-plan-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-plan-checker.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-plan-checker.toml"
 
 [agents.gsd-planner]
 description = "Creates executable phase plans with task breakdown, dependency analysis, and goal-backward verification. Spawned by /gsd-plan-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-planner.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-planner.toml"
 
 [agents.gsd-project-researcher]
 description = "Researches domain ecosystem before roadmap creation. Produces files in .planning/research/ consumed during roadmap creation. Spawned by /gsd-new-project or /gsd-new-milestone orchestrators."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-project-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-project-researcher.toml"
 
 [agents.gsd-research-synthesizer]
 description = "Synthesizes research outputs from parallel researcher agents into SUMMARY.md. Spawned by /gsd-new-project after 4 researcher agents complete."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-research-synthesizer.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-research-synthesizer.toml"
 
 [agents.gsd-roadmapper]
 description = "Creates project roadmaps with phase breakdown, requirement mapping, success criteria derivation, and coverage validation. Spawned by /gsd-new-project orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-roadmapper.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-roadmapper.toml"
 
 [agents.gsd-security-auditor]
 description = "Verifies threat mitigations from PLAN.md threat model exist in implemented code. Produces SECURITY.md. Spawned by /gsd-secure-phase."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-security-auditor.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-security-auditor.toml"
 
 [agents.gsd-ui-auditor]
 description = "Retroactive 6-pillar visual audit of implemented frontend code. Produces scored UI-REVIEW.md. Spawned by /gsd-ui-review orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-ui-auditor.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-ui-auditor.toml"
 
 [agents.gsd-ui-checker]
 description = "Validates UI-SPEC.md design contracts against 6 quality dimensions. Produces BLOCK/FLAG/PASS verdicts. Spawned by /gsd-ui-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-ui-checker.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-ui-checker.toml"
 
 [agents.gsd-ui-researcher]
 description = "Produces UI-SPEC.md design contract for frontend phases. Reads upstream artifacts, detects design system state, asks only unanswered questions. Spawned by /gsd-ui-phase orchestrator."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-ui-researcher.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-ui-researcher.toml"
 
 [agents.gsd-user-profiler]
 description = "Analyzes extracted session messages across 8 behavioral dimensions to produce a scored developer profile with confidence levels and evidence. Spawned by profile orchestration workflows."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-user-profiler.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-user-profiler.toml"
 
 [agents.gsd-verifier]
 description = "Verifies phase goal achievement through goal-backward analysis. Checks codebase delivers what phase promised, not just that tasks completed. Creates VERIFICATION.md report."
-config_file = "/Users/gabrielwillen/VSCode/stateforward/emel/emel.cpp/.codex/agents/gsd-verifier.toml"
+config_file = "/shared/stateforward/emel.cpp/.codex/agents/gsd-verifier.toml"
diff --git a/.planning/MILESTONES.md b/.planning/MILESTONES.md
index 8e62151b..82b359eb 100644
--- a/.planning/MILESTONES.md
+++ b/.planning/MILESTONES.md
@@ -1,5 +1,72 @@
 # Project Milestones: EMEL
 
+## v1.27 Ryzen AVX2/FMA Kernel Support (Shipped: 2026-06-25)
+
+**Phases completed:** 6 phases, 6 plans, 0 tasks
+
+**Key accomplishments:**
+
+- Added a source-backed x86_64 host feature contract for AVX2, FMA, and F16C
+  while explicitly no-claiming AVX-512, AVX-VNNI, AMX, BF16, native FP16, GPU,
+  and broader x86 feature families.
+
+- Added EMEL-owned AVX2/FMA/F16C flash-attention execution for supported
+  x86_64 requests with deterministic shared fallback/no-claim behavior.
+
+- Added EMEL-owned AVX2/FMA q2_K, q3_K, and q6_K x q8_K hot-path kernels with
+  block-native operand flow, no whole-tensor f32 dequant substitution, and
+  allocation-free supported dispatch proof.
+
+- Integrated the optimized x86_64 path through the maintained generator ->
+  graph -> processor -> kernel chain and paritychecker attribution for `1`,
+  `10`, `100`, and `1000` token generation.
+
+- Published truthful `kernel_x86_64` benchmark evidence with counter-checked
+  optimized flash and q2/q3/q6 rows separated from shared/scalar/reference
+  lanes.
+
+**Audit:** Final source-backed audit passed with 13/13 requirements satisfied.
+The initial `XBN-01` benchmark attribution gap was closed before archive, and
+the x86_64 unary SML rule debt was removed from the milestone path.
+
+---
+
+## v1.26 I/O Staged Read Loading Strategy (Shipped: 2026-05-08)
+
+**Phases completed:** 12 phases, 12 plans, 0 tasks
+
+**Key accomplishments:**
+
+- Established the canonical `src/emel/io/staged_read` Stateforward.SML strategy
+  actor under the existing `emel/io` boundary.
+
+- Modeled source span, target window, stage sizing, platform/resource, and
+  validation behavior through explicit guards/transitions before accepting
+  staged copy work.
+
+- Implemented deterministic source-span staged copy semantics with monotonic
+  progress, explicit terminal success, and named deterministic error categories.
+
+- Integrated staged loading through public tensor-to-I/O dispatch while keeping
+  `model/tensor` as the sole tensor load/bind/evict/residency owner.
+
+- Kept maintained loader, benchmark, paritychecker, and embedded probe lanes on
+  public runtime contracts with no actor-internal reach-through.
+
+- Closed the source-backed audit by repairing direct tensor staged-load
+  nonzero-offset source-window behavior and reconciling closeout artifact plus
+  embedded-probe reporting truth.
+
+**Audit:** Final source-backed audit passed with all active requirements
+satisfied after Phase 237 and Phase 238 gap closure. `ESG-02B` remains
+deferred/future because real file open/seek/read and per-stage short-read
+taxonomy requires a separately approved file-backed staged-read source path.
+
+**Known deferred items at close:** `ESG-02B` plus the carried-forward deferred
+items listed in STATE.md.
+
+---
+
 ## v1.25 I/O Read Loading Strategy (Shipped: 2026-05-06)
 
 **Phases completed:** 16 phases, 21 plans, 12 tasks
diff --git a/.planning/PROJECT.md b/.planning/PROJECT.md
index bfda3a07..69dfda88 100644
--- a/.planning/PROJECT.md
+++ b/.planning/PROJECT.md
@@ -16,61 +16,83 @@ before widening API surface or model scope.
 
 ## Current State
 
-Current milestone: `v1.26 I/O Staged Read Loading Strategy` (GitHub issue #63)
-
-Latest shipped milestone: `v1.25 I/O Read Loading Strategy`
-
-Status: `v1.25` shipped on 2026-05-06 after Phase 225 review-fix cleanup and a
-refreshed source-backed milestone audit passed. The repo now ships the issue #62 read/copy
-strategy path beneath the existing `src/emel/io` boundary while preserving
-tensor-owned residency. `src/emel/io/read` is the canonical read/copy actor,
-`model/tensor` owns the target buffer and residency commit, and maintained
-loader/tool lanes select/report read/copy through public runtime surfaces.
-Maintained source-byte loading for benchmark, paritychecker, and embedded probe
-evidence now uses the public `emel::io::source::load_file_bytes` setup-time
-contract instead of actor-internal `io/read/detail.hpp`. Phase 224 also
-confirmed Phase 214 is historical, clarified the direct `request_read_load`
-coverage shape, and captured fresh passing `emel_tests_io` evidence before
-archive.
-
-Planning for `v1.26` (issue #63) adds a bounded `src/emel/io/staged_read`
-Stateforward.SML actor for constrained-memory chunked/staged tensor loads,
-integrated through the existing tensor-to-I/O boundary from issue #60, without
-moving tensor residency ownership out of `model/tensor` and without cooperative
-coroutine scheduling unless separately approved.
-
-## Current Milestone: v1.26 I/O Staged Read Loading Strategy
+Current milestone: none.
+
+Latest shipped milestone: `v1.27 Ryzen AVX2/FMA Kernel Support`
+
+Status: `v1.27` shipped on 2026-06-25 for this host CPU, an AMD Ryzen 9 5950X.
+The practical native feature contract is x86_64 AVX2 plus FMA, with F16C
+conversion support only. Phases 239-244 are verified for the host contract,
+optimized flash attention, q2_K/q3_K, q6_K, allocation-free quantized hot-path
+contract, runtime integration, maintained parity attribution, and truthful
+benchmark publication. Approved snapshot updates landed for the
+`kernel_x86_64` benchmark baseline and the maintained LFM2 generation
+publication baselines. The source-backed milestone audit passed after repairing
+the `XBN-01` optimized benchmark attribution gap and removing the x86_64 unary
+SML rule debt; the scoped quality gate passed after those updates.
+
+## Latest Shipped Milestone: v1.27 Ryzen AVX2/FMA Kernel Support
+
+**Goal:** Bring the maintained x86_64 runtime path on this Ryzen host up to the
+same standard as the earlier NEON/AArch64 support: EMEL-owned AVX2/FMA flash
+and quantized hot-path kernels, explicit runtime attribution, maintained parity
+proof, and truthful benchmark publication.
+
+**Source:** User request on 2026-06-25: "add support for this processor exactly
+how NEON was added"; local host inspection reports AMD Ryzen 9 5950X with
+AVX2, FMA, and F16C, and without AVX-512/VNNI/AMX/BF16/native FP16.
+
+**Target features:**
+- x86_64 host feature contract that detects and publishes AVX2, FMA, and F16C
+  availability while explicitly marking unsupported feature families as no-claim.
+- Host-tuned x86_64 build/config support analogous to the AArch64 host-feature
+  switch, without requiring unsupported AVX-512, AVX-VNNI, AMX, BF16, or native
+  FP16 instructions.
+- EMEL-owned AVX2/FMA flash-attention implementation for supported x86_64
+  requests, with deterministic fallback/no-claim behavior for unsupported
+  shapes and operand contracts.
+- EMEL-owned AVX2/FMA `q2_K/q3_K/q6_K x q8_K` hot-path kernels using the same
+  effective operand class as the reference path, with zero hot-path allocation
+  and no whole-tensor dequantize-to-f32 substitution.
+- Runtime integration through the shipped generator -> graph -> processor ->
+  kernel chain, with counters/attribution, maintained `1/10/100/1000` parity
+  proof, and source-backed tests for supported and fallback behavior.
+- Benchmark and documentation publication that truthfully distinguishes x86_64
+  Ryzen evidence from ARM-first claims and from reference-lane results.
+
+**Archive:** `.planning/milestones/v1.27-ROADMAP.md`
+**Requirements:** `.planning/milestones/v1.27-REQUIREMENTS.md`
+**Audit:** `.planning/milestones/v1.27-MILESTONE-AUDIT.md`
+
+## Latest Shipped Milestone: v1.26 I/O Staged Read Loading Strategy
 
 **Goal:** Add a dedicated `io/staged_read` Stateforward.SML strategy actor under
 `src/emel/io` so tensor-owned model loading can request bounded staging/chunked
-read residency into caller-owned target memory through the public I/O boundary,
-without folding mmap, full-span single-shot read/copy internals, async
-cooperative scheduling, or device strategy behavior into this issue.
+read residency into caller-owned target memory through the public I/O boundary.
 
 **Source:** GitHub issue #63, "Add io/staged_read state machine for
-constrained-memory tensor loading" (depends on issue #60 boundary; follows
-v1.24 mmap and v1.25 read/copy strategy milestones).
+constrained-memory tensor loading"
 
-**Target features:**
-- Dedicated `src/emel/io/staged_read` machine: events, guards, actions, context,
-  errors, and public aliases following `AGENTS.md` / `docs/rules/sml.rules.md`
-  (destination-first transitions, no dispatch-local context handoff, explicit
-  guard-modeled validation and chunk/stage policy).
-- Tensor-to-I/O integration that lets `model/tensor` request staged read loading
-  through the public `emel/io` boundary while `model/tensor` remains the
-  residency lifecycle owner; the staged strategy never takes residency
-  ownership of the target tensor buffer.
-- Deterministic multi-stage progress: explicit success, chunk/short-read,
-  validation, platform-unsupported, and file errors surfaced through states and
-  events without hidden behavior selection in actions or `detail` helpers.
-- RTC-safe externalization of blocking filesystem work per project conventions;
-  bounded transient resources per stage; no handle pool retained across dispatch
-  boundaries beyond what prior I/O actors allow.
-- Maintained tests, docs, lint snapshots, benchmark snapshots, and model
-  artifacts updated from maintained commands when required; public reporting
-  reflects actual staged-read runtime usage.
-
-## Latest Shipped Milestone: v1.25 I/O Read Loading Strategy
+**Shipped:** 2026-05-08
+
+**Delivered:**
+- Added the canonical `src/emel/io/staged_read` Stateforward.SML actor with
+  guard-modeled source span, target window, stage sizing, and platform/resource
+  validation.
+- Implemented deterministic source-span staged copy semantics with monotonic
+  forward progress, explicit terminal success, and named deterministic errors.
+- Preserved tensor-owned residency: `model/tensor` dispatches through public
+  `emel/io` events and remains the load/bind/evict/residency owner.
+- Kept maintained loader, benchmark, paritychecker, and embedded probe lanes on
+  public runtime contracts, with no reach-through into actor internals.
+- Closed the source-backed audit by repairing direct tensor staged-load
+  nonzero-offset behavior and reconciling closeout artifact/reporting truth.
+
+**Audit:** Final source-backed audit passed with all active requirements
+satisfied. `ESG-02B` remains deferred/future for a separately approved
+file-backed staged-read source path.
+
+## Previous Shipped Milestone: v1.25 I/O Read Loading Strategy
 
 **Goal:** Add a dedicated `io/read` Stateforward.SML strategy actor under `src/emel/io`
 so tensor-owned model loading can request explicit read/copy residency into a
@@ -446,12 +468,23 @@ truth anchor and without broadening into generic Liquid-family support.
 
 ### Active
 
-- v1.26 defines scoped staged/chunked constrained-memory loading under
-  `src/emel/io/staged_read` with tensor-owned residency (see
-  `.planning/REQUIREMENTS.md` and `.planning/ROADMAP.md`).
+- v1.27 defines scoped x86_64 Ryzen AVX2/FMA kernel support that mirrors the
+  earlier NEON/AArch64 optimization path: host feature contract, flash
+  attention, quantized hot-path kernels, maintained runtime/parity proof, and
+  benchmark attribution (see `.planning/REQUIREMENTS.md` and
+  `.planning/ROADMAP.md`).
 
 ### Recently Validated
 
+- v1.26 added the dedicated `src/emel/io/staged_read` Stateforward.SML strategy
+  actor for bounded staged/chunked source-span reads under tensor-owned
+  residency.
+- v1.26 proved staged copy progress, explicit success/failure outcomes, public
+  tensor-to-I/O integration, maintained loader/tool publication truth, and
+  non-regression guardrails for shipped mmap and bulk read/copy strategies.
+- v1.26 intentionally deferred real file open/seek/read and per-stage short-read
+  taxonomy (`ESG-02B`) until a future file-backed staged-read source path is
+  approved.
 - v1.25 added a dedicated `src/emel/io/read` Stateforward.SML strategy actor for read/copy
   tensor loading.
 - v1.25 integrated read-backed residency requests through the public tensor-to-I/O boundary
@@ -461,9 +494,6 @@ truth anchor and without broadening into generic Liquid-family support.
   helpers.
 - v1.25 kept mmap changes, staged/chunked read policy, device-specific loading, cooperative
   async loading, new model families, and broad public API expansion out of scope.
-- v1.26 is the dedicated follow-on for staged/chunked constrained-memory reads; it keeps
-  cooperative coroutine scheduling and device-specific strategies out of scope unless
-  separately approved and must not regress shipped mmap or bulk `io/read` semantics.
 
 ### Validated
 
@@ -622,9 +652,11 @@ truth anchor and without broadening into generic Liquid-family support.
 ## Context
 
 This remains a brownfield repository with an existing codebase map under `.planning/codebase/`.
-The repo stays governed by `AGENTS.md` and `docs/rules/sml.rules.md`. `v1.25` is
-the latest shipped I/O milestone; `v1.26` plans constrained-memory staged reads
-below the same boundary. Earlier shipped work includes quality gate optimization
+The repo stays governed by `AGENTS.md` and `docs/rules/sml.rules.md`. `v1.26` is
+the latest shipped I/O milestone; `v1.27` returns to native kernel performance
+work on this x86_64 Ryzen host by mirroring the earlier NEON/AArch64 progression
+for flash, quantized kernels, runtime proof, and benchmark attribution. Earlier
+shipped work includes quality gate optimization
 (`v1.21`) with manifest-backed selective runners,
 conservative fallback, and parallel lane reporting. The current maintained state includes repo-owned
 EMEL generation, embedding, diarization, and Whisper ASR lanes plus pluggable parity and benchmark
@@ -636,7 +668,7 @@ mandatory validation or change benchmark/parity semantics. `v1.22` shipped from
 shipped from issue #60 and added the missing `emel/io` orchestration boundary under tensor-owned
 residency while deferring concrete strategy machines to follow-on milestones (mmap #61,
 read/copy #62, staged read #63). `v1.24` shipped mmap; `v1.25` shipped bulk read/copy. `v1.26`
-owns constrained-memory staged reads under issue #63.
+shipped constrained-memory staged reads under issue #63.
 
 ## Constraints
 
@@ -681,12 +713,17 @@ owns constrained-memory staged reads under issue #63.
   `src/emel/io/staged_read` beneath tensor-owned residency. It must not regress shipped mmap or
   bulk read/copy strategy machines, introduce cooperative coroutine scheduling, add device-specific
   strategies, or move tensor residency ownership out of `model/tensor`.
+- **x86_64 Ryzen kernel scope**: `v1.27` targets this host's AVX2/FMA feature set with F16C
+  conversion only. It must not claim AVX-512, AVX-VNNI, AMX, BF16, native FP16 arithmetic, GPU
+  acceleration, broad public API widening, or dequantize-to-f32 hot-path substitution unless a
+  future user-approved milestone explicitly changes that performance contract.
 
 ## Key Decisions
 
 | Decision | Rationale | Outcome |
 |----------|-----------|---------|
-| Start v1.26 from GitHub issue #63 as the `io/staged_read` constrained-memory milestone | v1.25 shipped bulk read/copy; constrained-memory staging is the next narrow strategy slice under tensor-owned residency and the issue #60 boundary | ⏳ Planned |
+| Start v1.27 as Ryzen AVX2/FMA kernel support | The user asked to add support for this processor exactly how NEON was added; this host is an AMD Ryzen 9 5950X with AVX2/FMA/F16C but no AVX-512/VNNI/AMX/BF16/native FP16, so the milestone mirrors the NEON flash/quantized/runtime/benchmark progression for x86_64 | Phases 239-244 verified; milestone audit and closeout lifecycle next |
+| Start v1.26 from GitHub issue #63 as the `io/staged_read` constrained-memory milestone | v1.25 shipped bulk read/copy; constrained-memory staging is the next narrow strategy slice under tensor-owned residency and the issue #60 boundary | ✓ Shipped |
 | Start v1.25 from GitHub issue #62 as the `io/read` loading strategy milestone | v1.24 shipped the mmap strategy and left read/copy as the next narrow concrete strategy path beneath tensor-owned residency | ✓ Shipped |
 | Start v1.24 from GitHub issue #61 as the `io/mmap` loading strategy milestone | v1.23 established the `emel/io` strategy boundary and explicitly deferred concrete mmap behavior; issue #61 is the next narrow strategy path to land beneath tensor-owned residency | ✓ Shipped |
 | Start v1.23 from GitHub issue #60 as the `emel/io` boundary milestone | v1.22 moved tensor residency ownership into `model/tensor`; the next architecture step is the explicit I/O strategy seam beneath tensor-owned residency before concrete mmap or staged strategy work lands | Phase 203 closeout cleanup |
@@ -745,4 +782,4 @@ This document evolves at phase transitions and milestone boundaries.
 4. Update Context with current state
 
 ---
-*Last updated: 2026-05-07 after starting v1.26 I/O staged read loading strategy milestone (issue #63)*
+*Last updated: 2026-06-25 after starting v1.27 Ryzen AVX2/FMA kernel support milestone*
diff --git a/.planning/RETROSPECTIVE.md b/.planning/RETROSPECTIVE.md
index 657360cd..d9c6c587 100644
--- a/.planning/RETROSPECTIVE.md
+++ b/.planning/RETROSPECTIVE.md
@@ -2,6 +2,50 @@
 
 *A living document updated after each milestone. Lessons feed forward into future planning.*
 
+## Milestone: v1.27 - Ryzen AVX2/FMA Kernel Support
+
+**Shipped:** 2026-06-25
+**Phases:** 6 | **Plans:** 6 | **Sessions:** autonomous execution, source-backed audit, and closeout
+
+### What Was Built
+
+- x86_64 host feature contract for AVX2, FMA, and F16C on the Ryzen 9 5950X.
+- EMEL-owned AVX2/FMA/F16C flash-attention path with explicit fallback/no-claim behavior.
+- EMEL-owned AVX2/FMA q2_K/q3_K/q6_K x q8_K hot-path kernels.
+- Maintained generator and paritychecker attribution proving optimized x86_64 dispatch.
+- `kernel_x86_64` benchmark publication with counter-checked optimized flash and q2/q3/q6 rows.
+
+### What Worked
+
+- The source-backed audit caught a real benchmark-publication gap after phase artifacts looked green.
+- Counter checks in benchmarks made optimized/shared attribution mechanically enforceable.
+- Keeping x86_64 routing in explicit SML guards/transitions made the unary rule-debt repair small.
+
+### What Was Inefficient
+
+- The first benchmark snapshot update only covered common x86_64 rows and had to be repaired before closeout.
+- The quality gate coverage shard was slow under coverage instrumentation, so closeout needed long-running observation.
+
+### Patterns Established
+
+- Benchmark parity claims need counter-backed maintained entries for each optimized lane, not only suite-level presence.
+- x86_64 support should mirror NEON by proving host contract, native kernels, runtime attribution, parity, and publication as one slice.
+- Pre-close audits should separate current milestone blockers from historical backlog artifacts before archiving.
+
+### Key Lessons
+
+1. Artifact agreement is not enough for benchmark truth; source entrypoints and counters must match the claim.
+2. Runtime behavior selection debt can survive in helper APIs even when production transitions are explicit.
+3. Snapshot updates need a second source-backed pass when they create a new benchmark suite.
+
+### Cost Observations
+
+- Model mix: not measured.
+- Sessions: one long autonomous closeout session with an integration-checker agent.
+- Notable: `commit_docs=false` left archive and planning changes local instead of committing them.
+
+---
+
 ## Milestone: v1.25 - I/O Read Loading Strategy
 
 **Shipped:** 2026-05-06
diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index 8c2e8bfb..3ff6e628 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -2,773 +2,52 @@
 
 ## Milestones
 
-- ✅ **v1.0 EMEL Llama-68M Generation Slice** — shipped 2026-03-08
-- ✅ **v1.1 EMEL Llama-68M Generation Benchmark** — shipped 2026-03-11
-- ✅ **v1.2 Flash Attention** — shipped 2026-03-22
-- ✅ **v1.3 ARM Flash Optimizations** — shipped 2026-03-22
-- ✅ **v1.4 Full Vectorized Quantized Kernels** — shipped 2026-03-25
-- ✅ **v1.5 Full ARM Quantized Path** — shipped 2026-03-27
-- ✅ **v1.6 Qwen3-0.6B Parity And Benchmark** — shipped 2026-03-30
-- ✅ **v1.7 Generator Prefill Submachine Decomposition** — shipped 2026-03-30
-- ✅ **v1.8 Truthful Qwen3 E2E Embedded Size** — shipped 2026-04-02
-- ✅ **v1.9 Liquid LFM2.5-1.2B Thinking ARM Slice** — shipped 2026-04-02
-- ✅ **v1.11 TE-75M GGUF Trimodal Embedding Runtime** — shipped 2026-04-15
-- ✅ **v1.12 Pluggable Reference Parity Bench Architecture** — shipped 2026-04-18
-- ✅ **v1.13 Pluggable Generative Parity Bench** — shipped 2026-04-21
-- ✅ **v1.14 Benchmark Variant Organization** — shipped 2026-04-21
-- ✅ **v1.15 ARM Sortformer Diarization GGUF Slice** — shipped 2026-04-25
-- ✅ **v1.16 ARM Whisper GGUF Parity And Performance** — shipped 2026-04-28
-- ✅ **v1.17 Text Generator Domain Alignment** — shipped 2026-04-30
-- ✅ **v1.18 Parity Tool Boundary Refactor** — shipped 2026-05-01
-- ✅ **v1.19 Benchmark Tool Pluggable Runner Refactor** — shipped 2026-05-01
-- ✅ **v1.20 SML Dependency And Namespace Migration** — shipped 2026-05-02
-- ✅ **v1.21 Quality Gate Selective Runner Optimization** — shipped 2026-05-02
-- ✅ **v1.22 Weight Loading Ownership Cutover** — shipped 2026-05-03
-- ✅ **v1.23 I/O Loading Strategy Boundary** — shipped 2026-05-04
-- ✅ **v1.24 I/O Mmap Loading Strategy** — shipped 2026-05-04 (Phases 204-211)
-- ✅ **v1.25 I/O Read Loading Strategy** — shipped 2026-05-06 (Phases 212-226 + 214.1)
-- ✅ **v1.26 I/O Staged Read Loading Strategy** — completed 2026-05-08
-  (12 / 12 phases complete; issue #63; `ESG-02B` deferred/future)
-
-## Phases
-
-### ✅ v1.26 I/O Staged Read Loading Strategy (Phases 227-238) — COMPLETE 2026-05-08
-
-Source: GitHub issue #63, "Add io/staged_read state machine for constrained-memory tensor loading".
-Adds `src/emel/io/staged_read` for bounded chunked/windowed reads under tensor-owned residency.
-Depends on the tensor-to-I/O boundary from issue #60. Cooperative coroutine scheduling is out of
-scope unless separately approved. Shipped mmap (`io/mmap`) and bulk read/copy (`io/read`) must not
-regress.
-
-Execution order: 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238.
-
-**Milestone progress (v1.26):** **12 / 12** phases recorded **Complete** in the table below.
-The source-backed milestone audit found a direct tensor staged-load offset-contract gap plus
-closeout artifact debt; Phases 237-238 closed those gaps. `ESG-02B` remains deferred/future
-because file-backed staged-read source ownership is out of scope.
-
-- [x] Phase 227: Staged Read Strategy Component Boundary (STG-01)
-- [x] Phase 228: Span, Target-Window, and Platform Gating (STG-02, STG-03, PLAT-02)
-- [x] Phase 229: Staged Copy Progress and Completion Semantics (STG-04, STG-05, STG-06)
-- [x] Phase 230: Context Cleanness and Per-Attempt Lifetime (STG-07, LIFE-02, SNR-01)
-- [x] Phase 231: Deterministic Error Taxonomy (ESG-01, ESG-02A, ESG-03, ESG-04; ESG-02B deferred)
-- [x] Phase 232: Tensor-Owned Integration Graph (TNX-01, TNX-02, TNX-03, TNX-04)
-- [x] Phase 233: Public Loader and Maintained Entrypoints (PUB-01, PUB-02, PUB-03, PUB-04, PUB-05)
-- [x] Phase 234: Public Dispatch Tests (TST-01, TST-02)
-- [x] Phase 235: Scope and Non-Regression Guardrails (GRD-01, GRD-02, GRD-03, GRD-04, GRD-05)
-- [x] Phase 236: Publication and Evidence Truthfulness (DOC-01, LNT-01, BNH-01, EVI-01)
-- [x] Phase 237: Direct Tensor Staged Offset Contract Repair (TNX-01, TNX-03, TNX-04, TST-01, TST-02)
-- [x] Phase 238: Audit Artifact and Probe Reporting Cleanup (cleanup-only)
-
-#### Phase 227: Staged Read Strategy Component Boundary
-
-**Goal:** Locate canonical `src/emel/io/staged_read` with standard I/O component layout.
-**Depends on:** Phase 226
-**Requirements:** STG-01
-
-**Success criteria:**
-
-1. `src/emel/io/staged_read` exists with canonical `emel::io::staged_read::sm` alias.
-2. Component scope excludes mmap, device transfer, or cooperative async runtime.
-3. Initial fail-closed or smoke dispatch proves actors are wired like sibling I/O strategies.
-
-#### Phase 228: Span, Target-Window, and Platform Gating
-
-**Goal:** All staged preconditions enforced in guards/transitions before any file work.
-**Depends on:** Phase 227
-**Requirements:** STG-02, STG-03, PLAT-02
-
-**Success criteria:**
-
-1. Invalid source staging contract rejected solely via guard-modeled transitions.
-2. Invalid target window/layout rejected solely via guard-modeled transitions.
-3. Unsupported hosts/resources fail closed with explicit unsupported terminal shape.
-
-#### Phase 229: Staged Copy Progress and Completion Semantics
-
-**Goal:** Prove per-stage deterministic copy plus full-span monotone completion.
-**Depends on:** Phase 228
-**Requirements:** STG-04, STG-05, STG-06
-
-**Success criteria:**
-
-1. Test vectors observe correct bytes per staged window.
-2. Completeness tests cover entire logical span order.
-3. Terminal success aligns with copied full span per contract.
-
-#### Phase 230: Context Cleanness and Per-Attempt Lifetime
-
-**Goal:** Bounded handles and residency clarity for the staged actor.
-**Depends on:** Phase 229
-**Requirements:** STG-07, LIFE-02, SNR-01
-
-**Success criteria:**
-
-1. Static or dynamic review shows zero forbidden dispatch-local context mirrors.
-2. Handle lifetime tests/tools show release-before-done semantics.
-3. Tests confirm strategy never asserts tensor residency commits.
-
-#### Phase 231: Deterministic Error Taxonomy
-
-**Goal:** Errors are categorical, observable, exception-free.
-**Depends on:** Phase 230
-**Requirements:** ESG-01, ESG-02A, ESG-03, ESG-04 (`ESG-02B` deferred)
-
-**Success criteria:**
-
-1. At least one doctest per taxonomy family (pre-I/O guard, source-contract read-surface, sequencing/contract) demonstrates deterministic categories through `process_event(...)`.
-2. Source-backed docs explicitly defer `ESG-02B` file open/seek/read + per-stage short-read categories until approved file-backed staged-read ownership exists.
-3. ABI boundary scans show noexcept expectations for surfaced API.
-
-#### Phase 232: Tensor-Owned Integration Graph
-
-**Goal:** Integrate staged loads through explicit tensor+I/O graphs.
-**Depends on:** Phase 231
-**Requirements:** TNX-01, TNX-02, TNX-03, TNX-04
-
-**Closeout ledger (verified):** Manager-scoped **`scripts/quality_gates.sh`** for Phase 232
-changed-file corpus exited **2** (red — **not** exit 0). **`232-VERIFICATION.md`** records **bench_snapshot**
-suite regressions unrelated to staged tensor-integration files and a **paritychecker** failure outside
-Phase 232 scope. Phase 232 completion is ledger-approved **without** claiming a passing full-repo gate run.
-
-**Success criteria:**
-
-1. Requests flow only via public tensors↔IO events.
-2. Residency proofs remain tensor-owned (`model/tensor` retains lifecycle ownership).
-3. Success/failure each have explicit observable terminal representations.
-
-#### Phase 233: Public Loader and Maintained Entrypoints
-
-**Goal:** Strategies observable without actor detail reach-through or duplicate POSIX loops in tools.
-**Depends on:** Phase 232
-**Requirements:** PUB-01, PUB-02, PUB-03, PUB-04, PUB-05
-
-**Closeout (2026-05-08):** **`PUB-01`–`PUB-05`** satisfied per **`233-VERIFICATION.md`** (manager validation +
-**phase233-navigator final review PASS**). Public **`staged_read`** access is through **`io::loader`** and maintained
-tool entrypoints with **`io_staged_read`** wiring; **`tests/model/loader/lifecycle_tests.cpp`** covers the
-storage-backed **`staged_read`** route and include guards.
-
-**Residual:** **`scripts/quality_gates.sh`** was **not** run on a Phase **233** changed-file corpus in
-this closeout slice — **no Phase 233 scoped gate pass is claimed** (full-repo gate truth unchanged from
-Phase **232** ledger where applicable).
-
-**Success criteria:**
-
-1–4. Each lane (loader/bench/parity/probe) has independent proof of public-contract-only access.
-5. Source scan enforcement or doctest proves no duplicated unconstrained staged read shim in tools.
-
-#### Phase 234: Public Dispatch Tests
-
-**Goal:** Core success/failure behavior demonstrated through `process_event`.
-**Depends on:** Phase 233
-**Requirements:** TST-01, TST-02
-
-**Success criteria:**
-
-1. Passing success-path doctest with `visit_current_states` or equivalent.
-2. Passing failure-path doctest for guard rejection.
-
-#### Phase 235: Scope and Non-Regression Guardrails
-
-**Goal:** Freeze architecture invariants relative to loaders, mmap, and read strategies.
-**Depends on:** Phase 234
-**Requirements:** GRD-01, GRD-02, GRD-03, GRD-04, GRD-05
-
-**Success criteria:** Each of GRD-01, GRD-02, GRD-03, GRD-04, and GRD-05 has either a deterministic script failure mode or a narrowed regression doctest proving the invariant holds.
-
-#### Phase 236: Publication and Evidence Truthfulness
-
-**Goal:** Align docs and frozen artifacts with real staged/runtime usage.
-**Depends on:** Phase 235
-**Requirements:** DOC-01, LNT-01, BNH-01, EVI-01
-
-**Success criteria:**
-
-1. Doc diff review verifies accurate staged-read wording.
-2. Lint snapshot regeneration path documented/passing.
-3. Benchmark snapshot regeneration obeys policy.
-4. Parity/compare metadata never mislabels unstaged workloads as staged.
-
-**Closeout (2026-05-08):** **`DOC-01`–`EVI-01`** satisfied per
-**`236-VERIFICATION.md`**. Serial full quality gate passed:
-`EMEL_QUALITY_GATES_SCOPE=full EMEL_QUALITY_GATES_PARALLEL=0 scripts/quality_gates.sh`
-(exit **0**, ended `2026-05-08T21:21:42.028Z`). Benchmark defaults now use bounded routine
-settings (`100` iterations, `3` runs, `10` warmup iterations) with bounded generation and
-diarization defaults.
-
-#### Phase 237: Direct Tensor Staged Offset Contract Repair
-
-**Goal:** Repair direct `model/tensor` staged-load nonzero-offset source-window behavior and prove it through public dispatch.
-**Depends on:** Phase 236
-**Requirements:** TNX-01, TNX-03, TNX-04, TST-01, TST-02
-**Gap Closure:** Closes `.planning/v1.26-MILESTONE-AUDIT.md` findings
-`direct-tensor-staged-offset-contract` and `direct-tensor-staged-nonzero-offset`.
-
-**Success Criteria:**
-
-1. A public `model/tensor::event::request_staged_load` doctest fails before repair and passes after
-   repair for a nonzero `file_offset` against a whole-file source buffer.
-2. Direct tensor staged-load source-span construction is aligned with `io/loader` or the
-   pre-windowed-source contract is explicitly documented and enforced by validation/tests.
-3. Direct tensor staged-load success and failure outcomes remain explicit `_done` / `_error`
-   publications through public `process_event(...)` dispatch and SML state inspection.
-4. Changed-file quality gates for `model/tensor`, `io/staged_read`, and affected tests pass without
-   benchmark-regression override.
-5. If implementation changes maintained model or snapshot artifacts, those artifacts are refreshed
-   only through maintained workflows; model artifact updates are approved for this gap-closure work.
-
-**Closeout (2026-05-08):** Phase 237 completed with a failing-first public
-`request_staged_load` nonzero-offset doctest, repaired source-window dispatch in
-`model/tensor`, and passing scoped validation:
-`./build/emel_tests_bin --test-case="model_tensor_request_staged_load_*"`,
-`ctest --test-dir build -R '^emel_tests_model_and_batch$' --output-on-failure`,
-and changed-file `scripts/quality_gates.sh` (exit `0`). Reopened requirements
-`TNX-01`, `TNX-03`, `TNX-04`, `TST-01`, and `TST-02` are satisfied by
-`237-VERIFICATION.md`.
-
-#### Phase 238: Audit Artifact and Probe Reporting Cleanup
-
-**Goal:** Reconcile audit artifacts and probe reporting truth after the Phase 237 source repair.
-**Depends on:** Phase 237
-**Requirements:** none — cleanup-only; all reopened requirement closure belongs to Phase 237
-**Gap Closure:** Closes `.planning/v1.26-MILESTONE-AUDIT.md` tech-debt items for missing
-`requirements-completed` SUMMARY frontmatter and embedded-size probe reporting clarity.
-
-**Success Criteria:**
-
-1. Phase summaries for 232–236 expose accurate `requirements-completed` frontmatter or an explicit
-   cleanup rationale so the three-source audit matrix no longer needs manual reconciliation.
-2. Embedded-size probe evidence either prints the executed load strategy when appropriate or the
-   maintained docs/audit explain why captured `used_io_strategy` is the authoritative evidence
-   surface.
-3. REQUIREMENTS, ROADMAP, STATE, and the milestone audit are refreshed from source-backed evidence
-   after Phase 237.
-4. Focused lint/docs/audit commands pass; no maintained benchmark, model, or snapshot artifact is
-   updated unless the implementation actually requires it.
-
-**Closeout (2026-05-08):** Phase 238 completed summary frontmatter reconciliation,
-embedded probe reporting truth documentation, and refreshed `v1.26-MILESTONE-AUDIT.md`
-to `status: passed`. Changed-file `scripts/quality_gates.sh` passed with no benchmark,
-coverage, parity, fuzz, or docsgen-affecting lanes required.
-
----
-### ✅ v1.25 I/O Read Loading Strategy (Phases 212-226 + 214.1) — SHIPPED 2026-05-06
-
-Source: GitHub issue #62, "Add io/read state machine for copy-based tensor loading".
-Adds a dedicated `src/emel/io/read` Stateforward.SML actor for explicit read/copy tensor
-loading beneath tensor-owned residency. Mmap, staged/chunked constrained-memory, async,
-and device strategies remain out of scope.
-
-- [x] Phase 212: Read Strategy Component Boundary (1/1 plans) — completed 2026-05-05
-- [x] Phase 213: Read Validation and Platform Gating (1/1 plans) — completed 2026-05-05
-- [x] Phase 214: Read Execution, Errors, and Lifetime (1/1 plans) — completed 2026-05-05; audit found RTC compliance gap
-- [x] Phase 214.1: RTC-Safe Read Execution Boundary Repair (1/1 plans) — gap closure
-- [x] Phase 215: Tensor-Owned Read Integration (1/1 plans) — completed 2026-05-05
-- [x] Phase 216: Public Runtime and Evidence Surfaces (1/1 plans) — completed 2026-05-05
-- [x] Phase 217: Behavior Tests and Scope Guardrails (1/1 plans) — completed 2026-05-05
-- [x] Phase 218: Publication and Maintained Artifact Updates (1/1 plans) — completed 2026-05-05
-- [x] Phase 219: Maintained Read Source Provenance (1/1 plans) — completed
-  2026-05-05; source-backed benchmark/parity/probe read_copy provenance
-- [x] Phase 220: Explicit Tensor Read Outcome Graph (1/1 plans) — completed
-  2026-05-05; tensor read outcomes selected by explicit same-RTC result graph
-- [x] Phase 221: Read Closeout Truth Reconciliation — superseded planning stub
-  closed 2026-05-06; Phase 223 owns final closeout
-- [x] Phase 222: Public Read Source Contract Repair (1/1 plans) — completed
-  2026-05-06; actor-detail reach-through removed from maintained lanes
-- [x] Phase 223: Read Closeout Truth And Validation Reconciliation (1/1 plans) —
-  completed 2026-05-06; final closeout truth and validation reconciled
-- [x] Phase 224: Read Closeout Tech Debt Cleanup — completed 2026-05-06;
-  refreshed audit ambiguity closed with fresh passing `emel_tests_io` evidence
-- [x] Phase 225: Read Closeout Runtime Validation And SML Repair — completed
-  2026-05-06; refreshed source-backed audit gaps closed with dyld fallback evidence
-- [x] Phase 226: Read Batch Cap And Closeout Evidence Refresh — completed
-  2026-05-06; refreshed audit tech debt closed
-
-Archived closeout artifacts:
-- `.planning/milestones/v1.25-ROADMAP.md`
-- `.planning/milestones/v1.25-REQUIREMENTS.md`
-- `.planning/milestones/v1.25-MILESTONE-AUDIT.md`
-- `.planning/milestones/v1.25-phases/`
-
-**Execution Order:** Phases execute in numeric order:
-212 -> 213 -> 214 -> 214.1 -> 215 -> 216 -> 217 -> 218 -> 219 -> 220 -> 222 -> 223 -> 224 -> 225 -> 226.
-Phase 221 is a completed superseded closeout planning stub and Phase 223 owns final
-source-backed closeout truth. Phase 224 is cleanup-only; Phase 225 owns the refreshed
-2026-05-06 audit gaps before archive. Phase 226 closes the post-audit nonblocking
-tech-debt items before final closeout.
-
-#### Phase 212: Read Strategy Component Boundary
-**Goal**: Maintainers can identify `io/read` as the canonical read/copy strategy actor under
-`src/emel/io`.
-**Depends on**: Phase 211
-**Requirements**: READ-01
-**Success Criteria** (what must be TRUE):
-  1. Maintainer can inspect `src/emel/io/read` and find component-local `context`, `events`,
-     `guards`, `actions`, `errors`, and `sm` ownership.
-  2. Maintainer can use canonical `emel::io::read::sm` ownership and public aliases without
-     reaching into actor internals.
-  3. Maintainer can confirm the component is read/copy-only and contains no mmap, staged or
-     chunked constrained-memory, cooperative async, device-specific, loader-owned byte access,
-     model-family widening, or tool-only read scaffold behavior.
-**Plans**: 01 — Validated 2026-05-05; established canonical `io/read` boundary actor
-and lifecycle tests.
-
-#### Phase 213: Read Validation and Platform Gating
-**Goal**: The read actor accepts read attempts only after explicit request, platform, file,
-offset, length, layout, and target-buffer preconditions pass.
-**Depends on**: Phase 212
-**Requirements**: READ-02, PLAT-01
-**Success Criteria** (what must be TRUE):
-  1. Caller sees invalid request, file, offset, length, layout, or target-buffer preconditions
-     rejected before any open or read attempt is accepted.
-  2. Caller sees unsupported platforms and unsupported file/resource shapes fail closed
-     deterministically through the I/O abstraction boundary.
-  3. Maintainer can inspect SML guards and transitions and see validation outcomes modeled
-     before the open/read attempt.
-  4. Supported requests reach a read-attempt state only after all read preconditions are true.
-**Plans**: 01 — Validated 2026-05-05; added explicit read validation and platform
-gating before the read-attempt placeholder.
-
-#### Phase 214: Read Execution, Errors, and Lifetime
-**Goal**: Successful read requests deliver deterministic copied bytes into the caller-owned
-target buffer with deterministic transient-resource lifetime and deterministic failure
-outcomes, without taking tensor residency ownership.
-**Depends on**: Phase 213
-**Requirements**: READ-03, LIFE-01, ERR-01
-**Success Criteria** (what must be TRUE):
-  1. Caller receives a deterministic copied-bytes outcome on success with the requested byte
-     span written into the caller-provided owned target buffer; the read strategy never claims
-     residency ownership.
-  2. Read failures surface deterministic error categories (invalid request, unsupported
-     resource, unsupported platform, file open failed, file seek failed, file read failed,
-     short read, internal error) instead of thrown exceptions or ambiguous status mirroring.
-  3. Transient OS resources (file descriptor / handle) are released through the actor-owned
-     attempt before `_done` is published; no kernel handle is held across publication.
-  4. Maintainer can verify dispatch-local request data is not stored in `read::context` and
-     tensor residency semantics remain owned by `model/tensor`.
-**Plans**: 01 — Validated 2026-05-05; added concrete read execution, copied-byte
-success, deterministic read errors, and close-before-done lifetime behavior.
-2026-05-05 milestone audit found this phase superseded by unverified Phase 214.1 repair
-work; Phase 214.1 owns source-backed RTC validation and artifact reconciliation.
-
-#### Phase 214.1: RTC-Safe Read Execution Boundary Repair
-**Goal**: The read actor preserves copied-byte success, deterministic errors, and
-close-before-done lifetime evidence without performing blocking or input-dependent
-filesystem work inside SML dispatch.
-**Depends on**: Phase 214
-**Requirements**: READ-03, PLAT-01, LIFE-01, ERR-01
-**Gap Closure**: Closes v1.25 audit gaps for missing Phase 214.1 artifacts, stale Phase
-214 planning truth, and source-backed Nyquist validity after the read actor moved to
-caller-provided source spans.
-**Success Criteria** (what must be TRUE):
-  1. `src/emel/io/read` no longer calls platform open, seek, read, close, or equivalent
-     filesystem APIs from guards, actions, transition helpers, or functions called by them.
-  2. The read actor still accepts only validated read/copy attempts and publishes copied-byte
-     `_done` plus deterministic `_error` outcomes through explicit states/events.
-  3. The caller-owned target buffer remains caller-owned, dispatch-local request data is not
-     stored in `read::context`, and no transient OS handle is retained or hidden in context.
-  4. Tests prove the repaired behavior through public `process_event(...)` dispatch and SML
-     state inspection, including validation failure, unsupported/resource failure, read
-     failure, short read, and copied-byte success.
-  5. Phase 214.1 SUMMARY.md, VERIFICATION.md, and VALIDATION.md reconcile ROADMAP.md,
-     STATE.md, REQUIREMENTS.md, and generated architecture docs with the source-buffer based
-     implementation and do not claim maintained benchmark/parity evidence.
-**Plans**: 01 — Validated 2026-05-05; reconciled read actor evidence with the
-source-buffer based implementation, confirmed no dispatch-time filesystem calls, and
-updated requirement/state artifacts for the Phase 214.1 gap closure.
-
-#### Phase 215: Tensor-Owned Read Integration
-**Goal**: `model/tensor` can request and consume read-backed I/O through the public `emel/io`
-boundary while retaining load, bind, evict, and residency orchestration.
-**Depends on**: Phase 214.1
-**Requirements**: TIO-01, TIO-02
-**Gap Closure**: Closes v1.25 audit gaps for partial tensor-owned read integration and
-callback/status-mediated read outcomes.
-**Success Criteria** (what must be TRUE):
-  1. Tensor load flow can request read-based (copy) loading through public `emel/io` events
-     without direct low-level read calls.
-  2. Tensor bind, residency, and evict transitions remain in `model/tensor` and consume read
-     success outcomes that reference the caller-owned target buffer.
-  3. Read success, unsupported, validation failure, file open failure, and file read failure
-     are visible as explicit `_done` and `_error` events or states.
-  4. Maintainer can verify no callback-selected outcomes, mirrored status fields, or context
-     phase flags decide tensor-to-I/O outcomes for read-backed loading.
-  5. Existing source/test progress through `model/loader -> model/tensor -> io/loader ->
-     io/read -> tensor apply` is preserved or replaced by a stricter explicit outcome path
-     with equivalent public-dispatch tests.
-**Plans**: 01 — Validated 2026-05-05; added tensor-owned
-`request_read_load` public events, explicit read outcome states, and tests for read
-success, unsupported I/O actor, validation failure, file open failure, and file read
-failure.
-
-#### Phase 216: Public Runtime and Evidence Surfaces
-**Goal**: Runtime entrypoints and maintained tool lanes can select or report read-backed
-loading only through public surfaces, and evidence reflects the actual EMEL runtime path.
-**Depends on**: Phase 215
-**Requirements**: TIO-03, VAL-04
-**Gap Closure**: Closes v1.25 audit gaps for maintained benchmark, paritychecker, and
-embedded probe lanes bypassing the read-backed runtime path and for runtime reporting that
-currently exposes only mmap usage.
-**Success Criteria** (what must be TRUE):
-  1. `model/loader`, maintained benchmark lanes, paritychecker lanes, and embedded probes can
-     select or report read-backed loading only through public tensor and I/O runtime contracts.
-  2. Maintained benchmark, paritychecker, and embedded probe lanes avoid actor-internal
-     reach-through and contain no low-level read logic.
-  3. Benchmark and parity output reports read-strategy usage only when the EMEL lane executed
-     the read-backed runtime path.
-  4. Unsupported or fallback behavior is reported as unsupported or non-read-strategy, not as
-     read-strategy parity or performance evidence.
-  5. Runtime done/error evidence distinguishes mmap, read/copy, unsupported, and non-I/O
-     loading paths without relying on tool-only scaffolds.
-**Plans**: 01 — Validated 2026-05-05; added public model-loader load-strategy
-evidence, maintained tool strategy binding, load-strategy output notes, and
-source-backed tests proving benchmark/parity/embedded lanes avoid callback-time
-actor reach-through.
-
-#### Phase 217: Behavior Tests and Scope Guardrails
-**Goal**: Tests and guardrails prove read behavior through public dispatch and prevent scope
-or ownership leaks.
-**Depends on**: Phase 216
-**Requirements**: VAL-01, VAL-02
-**Gap Closure**: Closes v1.25 audit gaps for missing full-scope read behavior tests,
-domain/source guardrails, and former ambiguous read-strategy naming relative to the
-out-of-scope staged/chunked policy.
-**Success Criteria** (what must be TRUE):
-  1. Doctests drive supported read behavior through `process_event(...)` and inspect SML states
-     via `visit_current_states` and/or `is(...)`.
-  2. Doctests cover representative unsupported, validation failure, file open failure, and file
-     read failure outcomes through public events.
-  3. Guardrails fail if read implementation leaks into `model/loader` or tensor residency
-     ownership moves out of `model/tensor`.
-  4. Guardrails fail if mmap, staged or chunked constrained-memory, cooperative async,
-     device-specific, model-family widening, loader-owned byte access, or tool-only read
-     scaffold behavior enters this milestone.
-  5. Source guardrails clarify or eliminate any public naming that could present the v1.25
-     read/copy path as staged/chunked constrained-memory support.
-**Plans**: 01 — Validated 2026-05-05; renamed the copy strategy to
-`read_copy`, added public-dispatch behavior guardrails, tensor-residency ownership
-guardrails, and maintained tool/model-loader no-reach-through source checks.
-
-#### Phase 218: Publication and Maintained Artifact Updates
-**Goal**: Maintained docs, snapshots, benchmark outputs, model artifacts, and planning truth
-describe read-strategy support exactly as implemented.
-**Depends on**: Phase 217
-**Requirements**: VAL-03
-**Gap Closure**: Closes v1.25 audit gaps for stale planning truth, stale generated docs,
-and missing maintained artifact updates. User approved updating snapshots, benchmarks, and
-models as needed during this gap closure command.
-**Success Criteria** (what must be TRUE):
-  1. Public docs and generated architecture docs describe the read/copy strategy path,
-     ownership boundaries, and deferred strategies (mmap shipped in v1.24; staged/async/device
-     remain deferred) truthfully.
-  2. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
-     from maintained commands when the implementation changes them.
-  3. Planning artifacts record final requirement coverage, validation evidence, and any
-     approved artifact updates for v1.25.
-  4. Closeout artifacts do not claim read-strategy support beyond source-backed maintained
-     runtime behavior.
-  5. Any snapshot, benchmark, or model artifact changes are produced by maintained commands
-     and explicitly tied to source-backed read/copy runtime behavior.
-**Plans**: 01 — Validated 2026-05-05; updated public docs, README template,
-generated architecture docs, benchmark snapshots, planning truth, and final closeout
-audit from maintained commands. The closing full quality gate passed with
-`EMEL_QUALITY_GATES_SCOPE=full EMEL_QUALITY_GATES_PARALLEL=never
-scripts/quality_gates.sh`.
-
-#### Phase 219: Maintained Read Source Provenance
-**Goal**: Maintained benchmark, paritychecker, and embedded probe lanes prove read/copy
-strategy usage from a maintained `src`-owned source contract instead of tool-local full-file
-read scaffolds.
-**Depends on**: Phase 218
-**Requirements**: PLAT-01, TIO-03, VAL-04
-**Gap Closure**: Closes v1.25 audit gaps where generation, Sortformer diarization,
-embedded probe, and paritychecker lanes report `read_copy` after tool-local
-`read_file_bytes` helpers create the source span.
-**Success Criteria** (what must be TRUE):
-  1. Maintained benchmark, paritychecker, and embedded probe lanes no longer own low-level
-     file slurp helpers as the source of `read_copy` evidence.
-  2. A maintained `src`-owned loading/source contract feeds `model/loader -> model/tensor ->
-     io/loader -> io/read` for read/copy tool evidence.
-  3. `read_copy` benchmark/parity/probe output is emitted only when the EMEL lane actually
-     consumed the maintained source contract and executed the public runtime path.
-  4. Unsupported or fallback source behavior is reported as unsupported or non-read-strategy,
-     never as read-strategy parity or performance evidence.
-  5. Tests and source guardrails fail on tool-local substitutes for the maintained read/copy
-     source path.
-
-#### Phase 220: Explicit Tensor Read Outcome Graph
-**Goal**: Tensor-owned read/copy integration exposes success and failure outcomes through
-explicit state/event routing without callback/status-mediated behavior selection.
-**Depends on**: Phase 219
-**Requirements**: TIO-02
-**Gap Closure**: Closes v1.25 audit gap where `model/tensor` represents final outcomes
-with explicit states/events but still uses callback-mutated runtime status inspected by
-guards to select the read outcome path.
-**Success Criteria** (what must be TRUE):
-  1. `model/tensor` read success, unsupported, validation failure, file open failure, and
-     file read failure outcomes are selected by explicit guards/transitions over typed
-     same-RTC events, not by callback-mutated status fields.
-  2. Any same-RTC callbacks used for immediate replies do not decide which tensor outcome
-     path runs.
-  3. No mirrored status fields, context phase flags, or callback-selected outcomes remain in
-     the read-backed tensor outcome path.
-  4. Public doctests prove all representative read success and error outcomes through
-     `process_event(...)` and SML state inspection.
-
-#### Phase 221: Read Closeout Truth Reconciliation
-**Goal**: Maintained docs, generated architecture docs, planning artifacts, snapshots,
-benchmark outputs, model artifacts, and the milestone audit describe read/copy support
-exactly as implemented after gap closure.
-**Depends on**: Phase 220
-**Requirements**: superseded by Phase 223
-**Gap Closure**: Closes v1.25 audit gap where closeout artifacts overstated maintained
-read/copy path truth while tool-local source spans still fed the reported lane. User
-approved updating model artifacts, snapshots, and benchmarks as needed during this gap
-closure command.
-**Success Criteria** (what must be TRUE):
-  1. Public docs, generated architecture docs, ROADMAP, REQUIREMENTS, STATE, PROJECT,
-     MILESTONES, and the milestone audit describe the maintained read/copy path truthfully.
-  2. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
-     from maintained commands when implementation changes require it.
-  3. Phase 214 historical artifacts are reconciled or explicitly marked superseded so they no
-     longer conflict with the Phase 214.1 source-buffer truth.
-  4. A source-backed milestone audit passes without relying on tool-only source scaffolds.
-  5. The closing quality gate is run with the appropriate full or changed-file scope and no
-     benchmark-regression override unless explicitly documented as transitional.
-**Plans**: 01 — Ready only. 2026-05-06 audit found an additional source-contract
-blocker in Phase 219/216 maintained lanes, so Phase 221 is superseded by the
-Phase 222 source-contract repair and Phase 223 closeout truth plan.
-**Summary**: Superseded 2026-05-06 with no source or requirement claims.
-
-#### Phase 222: Public Read Source Contract Repair
-**Goal**: Maintained benchmark, paritychecker, and embedded probe lanes obtain read/copy
-source bytes through an allowed public or non-actor-internal EMEL-owned contract instead of
-including `emel/io/read/detail.hpp`.
-**Depends on**: Phase 220
-**Requirements**: PLAT-01, TIO-03, VAL-02, VAL-04
-**Gap Closure**: Closes v1.25 audit gaps where maintained lanes replaced tool-local
-`read_file_bytes` helpers with direct actor-detail reach-through, causing paritychecker
-guardrails and maintained read/copy evidence to fail.
-**Success Criteria** (what must be TRUE):
-  1. Maintained generation, Sortformer diarization, embedded probe, and paritychecker lanes
-     no longer include or call `emel/io/read/detail.hpp` or any actor `detail.hpp` helper for
-     benchmark/parity source loading.
-  2. Source-byte loading for maintained read/copy evidence is exposed through an allowed
-     EMEL-owned public/runtime/setup contract that does not violate the actor model,
-     benchmark/parity harness rules, or `detail.hpp` ownership rules.
-  3. Maintained lanes still report `read_copy` only when the EMEL lane executes the public
-     `model/loader -> model/tensor -> io/loader -> io/read` runtime path.
-  4. Guardrails fail on actor-internal reach-through, tool-local read substitutes, and any
-     unsupported fallback reported as read/copy evidence.
-  5. Focused paritychecker and maintained generation evidence passes without benchmark
-     regression override.
-**Plans**: 01 — Validated 2026-05-06; moved maintained source-byte loading to
-`emel::io::source::load_file_bytes`, removed `io/read/detail.hpp` reach-through
-from maintained lanes, and restored paritychecker/generation guardrail evidence.
-
-#### Phase 223: Read Closeout Truth And Validation Reconciliation
-**Goal**: Final v1.25 closeout truth, generated artifacts, snapshots, benchmark outputs,
-model artifacts, requirements, roadmap state, and milestone audit reflect the post-Phase 222
-maintained read/copy runtime path.
-**Depends on**: Phase 222
-**Requirements**: TIO-02, VAL-01, VAL-03
-**Gap Closure**: Closes v1.25 audit gaps for stale Phase 220 roadmap state, unvalidated
-Phase 221/VAL-03 closeout truth, dyld-blocked test rerun evidence, and final source-backed
-milestone audit truth.
-**Success Criteria** (what must be TRUE):
-  1. ROADMAP, REQUIREMENTS, STATE, PROJECT, MILESTONES, public docs, generated architecture
-     docs, and the milestone audit no longer claim stale Phase 218/221 closeout truth.
-  2. Phase 220 progress-table state is reconciled with its completed SUMMARY,
-     VERIFICATION, and VALIDATION artifacts.
-  3. Public behavior doctests and maintained guardrails are rerun or the dyld/libSystem launch
-     blocker is explicitly captured with source-backed substitute evidence approved for the
-     phase.
-  4. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
-     only through maintained commands when the repaired implementation changes them.
-  5. A source-backed milestone audit reports every active v1.25 requirement satisfied, with
-     no actor-detail reach-through or tool-only maintained-path evidence.
-**Plans**: 01 — Validated 2026-05-06; reconciled final planning truth, generated
-docs checks, lint snapshot checks, public-dispatch doctests, paritychecker
-guardrails, maintained generation compare evidence, repaired batch planner
-benchmark evidence, the full closeout quality gate, and the source-backed
-milestone audit.
-
-#### Phase 224: Read Closeout Tech Debt Cleanup
-**Goal**: Close the nonblocking tech-debt items from the refreshed v1.25 milestone audit
-before archive.
-**Depends on**: Phase 223
-**Requirements**: none — all v1.25 requirements remain satisfied; this phase is cleanup only
-**Gap Closure**: Addresses audit tech debt without resetting any validated requirement:
-historical Phase 214 supersession noise, public tensor read event maintained-lane coverage shape,
-and fresh `emel_tests_io` evidence after the local dyld/libSystem launch blocker is resolved.
-**Success Criteria** (what must be TRUE):
-  1. Phase 214 historical artifacts are either further reconciled or explicitly confirmed as
-     intentionally superseded by Phase 214.1 without creating closeout ambiguity.
-  2. Maintainers can tell whether `model::tensor::event::request_read_load` should gain a
-     maintained direct-lane coverage path or remain a public tested route while maintained
-     model-loader lanes use `model/tensor` plan/apply plus `io/loader -> io/read`.
-  3. Fresh `emel_tests_io` evidence is captured from a healthy local environment, or the
-     dyld/libSystem launch blocker is captured with an explicit archive-time decision.
-  4. The milestone audit is rerun and either passes or reports only explicitly accepted
-     nonblocking debt.
-**Plans**: 01 — Validated 2026-05-06; Phase 214 supersession clarity,
-`request_read_load` maintained-lane decision evidence, fresh passing
-`emel_tests_io` evidence, and final milestone audit refresh.
-
-#### Phase 225: Read Closeout Runtime Validation And SML Repair
-**Goal**: Close refreshed v1.25 audit gaps by restoring executable model/batch validation,
-moving maintained read/copy per-tensor I/O orchestration out of model-loader action loops,
-and reconciling closeout artifact paths.
-**Depends on**: Phase 224
-**Requirements**: VAL-01, TIO-03, VAL-04, VAL-03
-**Gap Closure**: Closes `.planning/v1.25-MILESTONE-AUDIT.md` findings: current
-`emel_tests_model_and_batch` dyld launch failure, model-loader action-loop
-`io_loader->process_event(...)` SML readiness risk, and stale archived closeout path
-references.
-**Success Criteria** (what must be TRUE):
-  1. `ctest --test-dir build/zig --output-on-failure -R emel_tests_model_and_batch`
-     runs to completion or the dyld/libSystem launch blocker is eliminated with a
-     source-backed maintained substitute explicitly recorded in validation.
-  2. Maintained read/copy `model/loader -> io/loader` orchestration no longer relies on an
-     action loop calling `io_loader->process_event(...)`; runtime choice and per-phase
-     orchestration are represented with explicit SML guards/states/transitions.
-  3. The maintained read/copy path still reports `used_io_strategy` only after public
-     runtime execution through `model/loader -> model/tensor -> io/loader -> io/read`.
-  4. Closeout artifact paths in active and archived roadmap/requirements/audit docs point
-     at files that exist after the v1.25 archive layout.
-  5. Focused model-loader, model/tensor, io/loader, io/read, domain-boundary, consistency,
-     and changed-file quality gates pass without benchmark-regression override.
-**Plans**: 6 plans — completed 2026-05-06
-Plans:
-- [x] `225-01-PLAN.md` — Add the owning `io/read` batch copy surface and public-dispatch tests.
-- [x] `225-02-PLAN.md` — Route one `io/loader` read_copy batch to `io/read` with same-RTC result callbacks.
-- [x] `225-03-PLAN.md` — Replace model-loader per-tensor I/O dispatch with one public batch dispatch.
-- [x] `225-04-PLAN.md` — Wire maintained callers and guardrails to request-owned `io_load_spans`.
-- [x] `225-05-PLAN.md` — Reconcile active and archived closeout path and plan traceability.
-- [x] `225-06-PLAN.md` — Publish validation, summary, and active/archived audit evidence.
-
-#### Phase 226: Read Batch Cap And Closeout Evidence Refresh
-**Goal**: Close the nonblocking tech-debt items from `.planning/v1.25-MILESTONE-AUDIT.md`
-by bounding the public read/copy batch API independently and refreshing closeout evidence
-to match current executable validation.
-**Depends on**: Phase 225
-**Requirements**: none — all v1.25 requirements remain satisfied; this phase is cleanup only
-**Gap Closure**: Closes audit tech debt for the uncapped public `io/read`
-`read_tensor_batch` span and stale dyld-fallback closeout wording after current focused
-CTest passed.
-**Success Criteria** (what must be TRUE):
-  1. Public `io/read::event::read_tensor_batch` dispatch rejects over-large spans before
-     iterating or copying, with the cap owned by a public/read-side contract rather than
-     relying only on maintained model-loader callers.
-  2. Doctests prove accepted boundary-size batches and rejected over-large batches through
-     public `process_event(...)` dispatch and SML state inspection.
-  3. Active and archived closeout evidence distinguishes historical dyld fallback evidence
-     from current direct `build/zig` focused CTest evidence.
-  4. If the repaired implementation changes maintained snapshots, benchmark outputs,
-     benchmark snapshots, or model artifacts, those artifacts are updated only through
-     maintained commands. User permission for those updates was granted with this phase.
-  5. Changed-file quality gates pass without benchmark-regression override, and the
-     refreshed milestone audit reports no blockers.
-**Plans**: 01 — Validated 2026-05-06; public `io/read` batch cap added,
-exact-cap and over-cap doctests passed, closeout evidence refreshed, and
-changed-file quality gate passed.
-
-#### Coverage
-
-| Requirement | Phase |
-|-------------|-------|
-| READ-01 | Phase 212 |
-| READ-02 | Phase 213 |
-| PLAT-01 | Phase 222 |
-| READ-03 | Phase 214.1 |
-| LIFE-01 | Phase 214.1 |
-| ERR-01 | Phase 214.1 |
-| TIO-01 | Phase 215 |
-| TIO-02 | Phase 223 |
-| TIO-03 | Phase 225 |
-| VAL-04 | Phase 225 |
-| VAL-01 | Phase 225 |
-| VAL-02 | Phase 222 |
-| VAL-03 | Phase 225 |
-
-Mapped: 13/13 v1 requirements; validated 13, pending 0. Phases 224 and 226 are
-cleanup-only; Phase 225 closed refreshed closeout gaps for VAL-01, TIO-03, VAL-04,
-and VAL-03.
-
-<details>
-<summary>✅ v1.24 I/O Mmap Loading Strategy (Phases 204-211) — SHIPPED 2026-05-04</summary>
-
-- [x] Phase 204: Mmap Strategy Component Boundary (1/1 plans) — completed 2026-05-04
-- [x] Phase 205: Mmap Validation and Platform Gating (1/1 plans) — completed 2026-05-04
-- [x] Phase 206: Mapped Descriptor, Errors, and Lifetime (1/1 plans) — completed 2026-05-04
-- [x] Phase 207: Tensor-Owned Mmap Integration (1/1 plans) — completed 2026-05-04
-- [x] Phase 208: Public Runtime and Evidence Surfaces (1/1 plans) — completed 2026-05-04
-- [x] Phase 209: Behavior Tests and Scope Guardrails (1/1 plans) — completed 2026-05-04
-- [x] Phase 210: Publication and Maintained Artifact Updates (1/1 plans) — completed 2026-05-04
-- [x] Phase 211: Phase Verification Artifact Backfill (1/1 plans) — completed 2026-05-04 (gap closure)
-
-Archive:
-- `.planning/milestones/v1.24-ROADMAP.md`
-- `.planning/milestones/v1.24-REQUIREMENTS.md`
-- `.planning/milestones/v1.24-MILESTONE-AUDIT.md`
-- `.planning/milestones/v1.24-phases/{204..210}-*` (Phase 211 backfill artifacts live alongside their parent phase dirs)
-
-</details>
-
-<details>
-<summary>✅ v1.23 I/O Loading Strategy Boundary (Phases 197-203) — SHIPPED 2026-05-04</summary>
-
-Archive:
-- `.planning/milestones/v1.23-ROADMAP.md`
-- `.planning/milestones/v1.23-REQUIREMENTS.md`
-- `.planning/milestones/v1.23-MILESTONE-AUDIT.md`
-- `.planning/milestones/v1.23-phases/`
-
-</details>
-
-### 📋 Milestone backlog
-
-Older “next milestone” staging notes are superseded by **v1.26** (issue #63) in active planning
-artifacts (`REQUIREMENTS.md`, `STATE.md`). Future milestones after v1.26 continue via
-`$gsd-new-milestone`.
-
-## Progress
-
-| Phase | Milestone | Plans Complete | Status | Completed |
-|-------|-----------|----------------|--------|-----------|
-| 227. Staged Read Strategy Component Boundary | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 228. Span, Target-Window, and Platform Gating | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 229. Staged Copy Progress and Completion Semantics | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 230. Context Cleanness and Per-Attempt Lifetime | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 231. Deterministic Error Taxonomy | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 232. Tensor-Owned Integration Graph | v1.26 | 1/1 | Complete | 2026-05-07 |
-| 233. Public Loader and Maintained Entrypoints | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 234. Public Dispatch Tests | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 235. Scope and Non-Regression Guardrails | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 236. Publication and Evidence Truthfulness | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 237. Direct Tensor Staged Offset Contract Repair | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 238. Audit Artifact and Probe Reporting Cleanup | v1.26 | 1/1 | Complete | 2026-05-08 |
-| 212. Read Strategy Component Boundary | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 213. Read Validation and Platform Gating | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 214. Read Execution, Errors, and Lifetime | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 214.1. RTC-Safe Read Execution Boundary Repair | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 215. Tensor-Owned Read Integration | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 216. Public Runtime and Evidence Surfaces | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 217. Behavior Tests and Scope Guardrails | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 218. Publication and Maintained Artifact Updates | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 219. Maintained Read Source Provenance | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 220. Explicit Tensor Read Outcome Graph | v1.25 | 1/1 | Validated | 2026-05-05 |
-| 221. Read Closeout Truth Reconciliation | v1.25 | 1/1 | Superseded | 2026-05-06 |
-| 222. Public Read Source Contract Repair | v1.25 | 1/1 | Validated | 2026-05-06 |
-| 223. Read Closeout Truth And Validation Reconciliation | v1.25 | 1/1 | Validated | 2026-05-06 |
-| 224. Read Closeout Tech Debt Cleanup | v1.25 | 1/1 | Complete    | 2026-05-06 |
-| 225. Read Closeout Runtime Validation And SML Repair | v1.25 | 6/6 | Complete   | 2026-05-06 |
-| 226. Read Batch Cap And Closeout Evidence Refresh | v1.25 | 1/1 | Validated | 2026-05-06 |
-| 204. Mmap Strategy Component Boundary | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 205. Mmap Validation and Platform Gating | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 206. Mapped Descriptor, Errors, and Lifetime | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 207. Tensor-Owned Mmap Integration | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 208. Public Runtime and Evidence Surfaces | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 209. Behavior Tests and Scope Guardrails | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 210. Publication and Maintained Artifact Updates | v1.24 | 1/1 | Complete | 2026-05-04 |
-| 211. Phase Verification Artifact Backfill | v1.24 | 1/1 | Complete | 2026-05-04 |
+- [x] **v1.0 EMEL Llama-68M Generation Slice** - shipped 2026-03-08
+- [x] **v1.1 EMEL Llama-68M Generation Benchmark** - shipped 2026-03-11
+- [x] **v1.2 Flash Attention** - shipped 2026-03-22
+- [x] **v1.3 ARM Flash Optimizations** - shipped 2026-03-22
+- [x] **v1.4 Full Vectorized Quantized Kernels** - shipped 2026-03-25
+- [x] **v1.5 Full ARM Quantized Path** - shipped 2026-03-27
+- [x] **v1.6 Qwen3-0.6B Parity And Benchmark** - shipped 2026-03-30
+- [x] **v1.7 Generator Prefill Submachine Decomposition** - shipped 2026-03-30
+- [x] **v1.8 Truthful Qwen3 E2E Embedded Size** - shipped 2026-04-02
+- [x] **v1.9 Liquid LFM2.5-1.2B Thinking ARM Slice** - shipped 2026-04-02
+- [x] **v1.11 TE-75M GGUF Trimodal Embedding Runtime** - shipped 2026-04-15
+- [x] **v1.12 Pluggable Reference Parity Bench Architecture** - shipped 2026-04-18
+- [x] **v1.13 Pluggable Generative Parity Bench** - shipped 2026-04-21
+- [x] **v1.14 Benchmark Variant Organization** - shipped 2026-04-21
+- [x] **v1.15 ARM Sortformer Diarization GGUF Slice** - shipped 2026-04-25
+- [x] **v1.16 ARM Whisper GGUF Parity And Performance** - shipped 2026-04-28
+- [x] **v1.17 Text Generator Domain Alignment** - shipped 2026-04-30
+- [x] **v1.18 Parity Tool Boundary Refactor** - shipped 2026-05-01
+- [x] **v1.19 Benchmark Tool Pluggable Runner Refactor** - shipped 2026-05-01
+- [x] **v1.20 SML Dependency And Namespace Migration** - shipped 2026-05-02
+- [x] **v1.21 Quality Gate Selective Runner Optimization** - shipped 2026-05-02
+- [x] **v1.22 Weight Loading Ownership Cutover** - shipped 2026-05-03
+- [x] **v1.23 I/O Loading Strategy Boundary** - shipped 2026-05-04
+- [x] **v1.24 I/O Mmap Loading Strategy** - shipped 2026-05-04
+- [x] **v1.25 I/O Read Loading Strategy** - shipped 2026-05-06
+- [x] **v1.26 I/O Staged Read Loading Strategy** - completed 2026-05-08
+- [x] **v1.27 Ryzen AVX2/FMA Kernel Support** - shipped 2026-06-25
+
+## Current Milestone
+
+No active milestone is open.
+
+## Recently Shipped
+
+### v1.27 Ryzen AVX2/FMA Kernel Support
+
+**Shipped:** 2026-06-25
+**Archive:** `.planning/milestones/v1.27-ROADMAP.md`
+**Requirements:** `.planning/milestones/v1.27-REQUIREMENTS.md`
+**Audit:** `.planning/milestones/v1.27-MILESTONE-AUDIT.md`
+
+Delivered native x86_64 AVX2/FMA support for the AMD Ryzen 9 5950X maintained
+runtime slice: host feature contract, optimized flash attention, q2_K/q3_K/q6_K
+x q8_K kernels, maintained generator parity attribution, and truthful
+`kernel_x86_64` benchmark publication. The source-backed audit passed after
+repairing the optimized benchmark attribution gap and removing the x86_64 unary
+SML rule debt.
+
+Next step: run `$gsd-new-milestone` to define the next milestone.
diff --git a/.planning/STATE.md b/.planning/STATE.md
index 65dfd835..d8a860a4 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -1,17 +1,16 @@
 ---
 gsd_state_version: 1.0
-milestone: v1.26
-milestone_name: I/O Staged Read Loading Strategy
-status: complete
-stopped_at: "v1.26 phases complete; milestone audit passed"
-last_updated: "2026-05-08T22:50:00.000Z"
-last_activity: 2026-05-08
+milestone: v1.27
+milestone_name: Ryzen AVX2/FMA Kernel Support
+status: ready_for_next_milestone
+stopped_at: "v1.27 shipped and archived; next step is new milestone definition"
+last_updated: "2026-06-25T14:35:28.048Z"
+last_activity: 2026-06-25
 progress:
-  # Matches v1.26 section in .planning/ROADMAP.md (Phases 227-238 inclusive).
-  total_phases: 12
-  completed_phases: 12
-  total_plans: 12
-  completed_plans: 12
+  total_phases: 6
+  completed_phases: 6
+  total_plans: 6
+  completed_plans: 6
   percent: 100
 ---
 
@@ -19,101 +18,109 @@ progress:
 
 ## Project Reference
 
-See: .planning/PROJECT.md (updated 2026-05-07)
+See: .planning/PROJECT.md (updated 2026-06-25)
 
 **Core value:** Prove real end-to-end behavior with explicit SML orchestration and
 parity-oriented verification before widening API surface or model scope.
 
-**Current focus:** Define and execute milestone `v1.26 I/O Staged Read Loading
-Strategy` (GitHub issue #63): bounded `src/emel/io/staged_read` actor for
-chunked constrained-memory loads through the tensor-to-I/O boundary (#60),
-tensor-owned residency unchanged, cooperative coroutine scheduling out of scope
-unless separately approved.
+**Current focus:** Define the next milestone after shipping native x86_64
+AVX2/FMA support for this AMD Ryzen 9 5950X host.
 
 ## Current Position
 
-Milestone: v1.26 I/O Staged Read Loading Strategy — complete (**Phases 227–238** complete).
-Status: **`v1.26-MILESTONE-AUDIT.md`** reports `status: passed`; **`ESG-02B`** remains
-deferred/future by design.
-Phase: none pending.
-Last activity: 2026-05-08 — Phase 238 reconciled summary frontmatter, embedded probe reporting
-truth, and the final source-backed milestone audit after the Phase 237 direct tensor staged-load
-offset repair.
+Milestone: v1.27 Ryzen AVX2/FMA Kernel Support
+Status: v1.27 shipped and archived. Phases 239-244 verified and the
+source-backed milestone audit passed. The repaired `kernel_x86_64` benchmark
+suite includes counter-checked optimized flash and q2/q3/q6 rows.
+Phase: ready for next milestone definition.
+Last activity: 2026-06-25 — `snapshots/bench/benchmarks.txt` plus the
+maintained LFM2 `10`, `100`, and `1000` token generation baselines were
+updated, the source-backed `XBN-01` benchmark gap was repaired, the x86_64
+unary SML rule debt was removed, focused validation and the changed-file scoped
+quality gate passed, the milestone audit passed, and the milestone was archived.
 
-Progress: [||||||||||] 100%
+Progress: [##########] 100%
 
-**Evidence (Phase 238 cleanup PASS — 2026-05-08):**
+**Host feature scope:**
 
-- Summary frontmatter source scan — pass.
-- Embedded probe `used_io_strategy` evidence scan — pass.
-- Phase 238 changed-file `scripts/quality_gates.sh` — exit **0**.
-- Final `.planning/v1.26-MILESTONE-AUDIT.md` — `status: passed`.
+- CPU: AMD Ryzen 9 5950X 16-Core Processor.
+- Supported target features: x86_64 AVX2, FMA, and F16C conversion.
+- Explicit no-claim features: AVX-512, AVX-VNNI, AMX, BF16, native FP16, and GPU
+  acceleration.
 
-Phase 237 source repair evidence remains in `237-VERIFICATION.md`.
+**Next implementation step:** define the next milestone with `$gsd-new-milestone`.
 
-**Residual / preserved gate truth:**
+**Closeout gate:** complete.
 
-- **Phase 235:** quality gate **not attempted** in final milestone-only pass; Phase 235 makes **no** separate `scripts/quality_gates.sh` pass claim.
-- **Phase 233/234:** earlier scoped gate truth remains as recorded in their phase artifacts.
-- **Phase 232:** scoped gate **exit 2** residual — **unchanged** in **`232-VERIFICATION.md`**.
-- **Phase 236:** serial full-repo quality gate **passed** with exit **0**; this is the maintained
-  milestone-level gate evidence for closeout.
+## Accumulated Context
 
-## Performance Metrics
+### Decisions
 
-**Prior audited milestone:** `v1.25 I/O Read Loading Strategy` remains the latest shipped I/O milestone
-(13/13 requirements satisfied after Phase 226). v1.26 planning continues phase numbering after Phase 226.
+Decisions are logged in PROJECT.md Key Decisions table.
 
-## Accumulated Context
+Carry-forward architectural constraints:
 
-### Decisions
+- Runtime behavior selection remains explicit guards and transitions
+  (`AGENTS.md` / `docs/rules/sml.rules.md`).
 
-Decisions are logged in PROJECT.md Key Decisions table. v1.26 source: GitHub issue #63.
+- Kernel arithmetic, lowering, packing, quant/dequant, and backend-specific
+  numeric work stays in the owning kernel layer.
 
-Carry-forward architectural constraints from shipped I/O milestones:
+- The EMEL lane stays repo-owned and separate from llama.cpp/ggml reference
+  runtime state; reference linkage is comparison-only in tools.
 
-- `model/tensor` remains the canonical residency owner; I/O strategies do not claim tensor buffer ownership.
-- `model/loader` stays orchestration-only with no low-level byte strategy in loader code paths.
-- Runtime behavior selection remains explicit guards and transitions (AGENTS.md / `docs/rules/sml.rules.md`).
+- Quantized kernel parity requires the same effective operand class as the
+  reference path; whole-tensor dequantize-to-f32 hot-path substitution requires
+  explicit user approval and is not part of v1.27.
 
-### Pending Todos
+- Benchmark and parity claims must be source-backed by the maintained runtime
+  path, not only planning artifacts or tool-local scaffolds.
+
+### Carry-Forward Backlog
 
 - 2026-04-02 - Move eager quant prepack into generator initializer
+  (`.planning/todos/backlog/2026-04-02-move-eager-quant-prepack-into-generator-initializer.md`)
+
 - 2026-04-02 - Reuse q8 RHS across LFM2.5 prefill matmuls
+  (`.planning/todos/backlog/2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md`)
+
 - 2026-04-02 - Optimize LFM2.5 q4 prefill kernel
-- 2026-04-02 - Optimize LFM2.5 q6 prefill kernel
+  (`.planning/todos/backlog/2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md`)
 
-### Blockers/Concerns
+- 2026-04-02 - Optimize LFM2.5 q6 prefill kernel
+  (`.planning/todos/backlog/2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md`)
 
-- No active v1.26 blockers remain.
-- ESG-02B remains deferred by design until approved file-backed staged source ownership introduces real open/seek/read lifecycle semantics.
+These pre-existing LFM2.5 performance backlog items are outside the v1.27 Ryzen
+AVX2/FMA support contract and are not milestone close blockers.
 
-### Prior milestone notes
+### Blockers/Concerns
 
-The following summarized v1.25 execution context and remains historical reference:
+- `ESG-02B` from v1.26 remains outside v1.27 processor scope until a file-backed
+  staged-read source path is separately approved.
 
-<details>
-<summary>v1.25 phase trail (collapsed)</summary>
+- v1.27 must not present AVX-512/VNNI/AMX/BF16/native-FP16 claims for this host.
 
-- Phase 225/226 refined read batch APIs, audit evidence, and SML hygiene for shipped `io/read`.
-- Public `read_tensor_batch` has an independent span cap; benchmarks and parity lanes use `emel::io::source::load_file_bytes` for setup-time bytes.
+### Prior milestone notes
 
-</details>
+`v1.26 I/O Staged Read Loading Strategy` completed on 2026-05-08. Its final
+audit passed after Phase 237 repaired direct tensor staged-load nonzero-offset
+behavior and Phase 238 reconciled artifact/reporting truth. Active v1.26
+evidence is archived under `.planning/milestones/v1.26-*`.
 
-## Deferred Items
+## Historical Carry-Forward Items
 
-Items acknowledged and deferred at v1.25 milestone close on 2026-05-06 (unchanged):
+Items acknowledged at v1.25 milestone close on 2026-05-06 (unchanged):
 
 | Category | Item | Status |
 |----------|------|--------|
-| quick_task | 260401-ejm-add-non-blocking-benchmark-binary-size-c | missing |
-| todo | 2026-04-02-move-eager-quant-prepack-into-generator-initializer.md | pending |
-| todo | 2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md | pending |
-| todo | 2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md | pending |
-| todo | 2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md | pending |
+| quick_task | 260401-ejm-add-non-blocking-benchmark-binary-size-c | complete |
+| todo | 2026-04-02-move-eager-quant-prepack-into-generator-initializer.md | backlog |
+| todo | 2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md | backlog |
+| todo | 2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md | backlog |
+| todo | 2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md | backlog |
 
 ## Session Continuity
 
-Last session: 2026-05-08 (v1.26 milestone audit and gap-closure phase creation)
-Stopped at: Phase **237** ready for discuss/plan/execute.
+Last session: 2026-06-25 (v1.27 closeout)
+Stopped at: v1.27 shipped and archived; ready for the next milestone.
 Resume file: None
diff --git a/.planning/architecture/kernel_x86_64.md b/.planning/architecture/kernel_x86_64.md
index f931dd16..da259c29 100644
--- a/.planning/architecture/kernel_x86_64.md
+++ b/.planning/architecture/kernel_x86_64.md
@@ -72,6 +72,9 @@ stateDiagram-v2
   ready --> ready : dispatch_op_group_norm [dispatch_op_group_norm__] / dispatch_op_group_norm__
   ready --> ready : dispatch_op_l2_norm [dispatch_op_l2_norm__] / dispatch_op_l2_norm__
   ready --> ready : dispatch_op_l2_norm [dispatch_op_l2_norm__] / dispatch_op_l2_norm__
+  ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q2_k_q8_k_] / effect_exec_simd_q2_k_q8_k_op_mul_mat_
+  ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q3_k_q8_k_] / effect_exec_simd_q3_k_q8_k_op_mul_mat_
+  ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q6_k_q8_k_] / effect_exec_simd_q6_k_q8_k_op_mul_mat_
   ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
   ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
   ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
@@ -163,7 +166,8 @@ stateDiagram-v2
   ready --> ready : dispatch_op_tri [dispatch_op_tri__] / dispatch_op_tri__
   ready --> ready : dispatch_op_fill [dispatch_op_fill__] / dispatch_op_fill__
   ready --> ready : dispatch_op_fill [dispatch_op_fill__] / dispatch_op_fill__
-  ready --> ready : dispatch_op_flash_attn_ext [dispatch_op_flash_attn_ext__] / dispatch_op_flash_attn_ext__
+  ready --> ready : dispatch_op_flash_attn_ext [simd_op_flash_attn_ext_f16kv_one_chunk_] / exec_simd_flash_attn_ext_f16kv_one_chunk_
+  ready --> ready : dispatch_op_flash_attn_ext [valid_op_flash_attn_ext_shared_] / dispatch_op_flash_attn_ext__
   ready --> ready : dispatch_op_flash_attn_ext [dispatch_op_flash_attn_ext__] / dispatch_op_flash_attn_ext__
   ready --> ready : dispatch_op_flash_attn_back [dispatch_op_flash_attn_back__] / dispatch_op_flash_attn_back__
   ready --> ready : dispatch_op_flash_attn_back [dispatch_op_flash_attn_back__] / dispatch_op_flash_attn_back__
@@ -284,6 +288,9 @@ stateDiagram-v2
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_group_norm`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_group_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_group_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_l2_norm>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
+| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`guard_simd_op_mul_mat_q2_k_q8_k>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`effect_exec_simd_q2_k_q8_k_op_mul_mat>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
+| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`guard_simd_op_mul_mat_q3_k_q8_k>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`effect_exec_simd_q3_k_q8_k_op_mul_mat>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
+| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`guard_simd_op_mul_mat_q6_k_q8_k>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`effect_exec_simd_q6_k_q8_k_op_mul_mat>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_mul_mat>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
@@ -375,7 +382,8 @@ stateDiagram-v2
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_tri`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_tri>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_tri>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_fill>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
-| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
+| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`simd_op_flash_attn_ext_f16kv_one_chunk>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`exec_simd_flash_attn_ext_f16kv_one_chunk>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
+| [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`valid_op_flash_attn_ext_shared>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_ext>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
 | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`dispatch_op_flash_attn_back>>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) | [`ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/kernel/x86_64/sm.hpp) |
diff --git a/.planning/architecture/mermaid/kernel_x86_64.mmd b/.planning/architecture/mermaid/kernel_x86_64.mmd
index 81d9cd57..76200241 100644
--- a/.planning/architecture/mermaid/kernel_x86_64.mmd
+++ b/.planning/architecture/mermaid/kernel_x86_64.mmd
@@ -65,6 +65,9 @@ stateDiagram-v2
   ready --> ready : dispatch_op_group_norm [dispatch_op_group_norm__] / dispatch_op_group_norm__
   ready --> ready : dispatch_op_l2_norm [dispatch_op_l2_norm__] / dispatch_op_l2_norm__
   ready --> ready : dispatch_op_l2_norm [dispatch_op_l2_norm__] / dispatch_op_l2_norm__
+  ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q2_k_q8_k_] / effect_exec_simd_q2_k_q8_k_op_mul_mat_
+  ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q3_k_q8_k_] / effect_exec_simd_q3_k_q8_k_op_mul_mat_
+  ready --> ready : dispatch_op_mul_mat [guard_simd_op_mul_mat_q6_k_q8_k_] / effect_exec_simd_q6_k_q8_k_op_mul_mat_
   ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
   ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
   ready --> ready : dispatch_op_mul_mat [dispatch_op_mul_mat__] / dispatch_op_mul_mat__
@@ -156,7 +159,8 @@ stateDiagram-v2
   ready --> ready : dispatch_op_tri [dispatch_op_tri__] / dispatch_op_tri__
   ready --> ready : dispatch_op_fill [dispatch_op_fill__] / dispatch_op_fill__
   ready --> ready : dispatch_op_fill [dispatch_op_fill__] / dispatch_op_fill__
-  ready --> ready : dispatch_op_flash_attn_ext [dispatch_op_flash_attn_ext__] / dispatch_op_flash_attn_ext__
+  ready --> ready : dispatch_op_flash_attn_ext [simd_op_flash_attn_ext_f16kv_one_chunk_] / exec_simd_flash_attn_ext_f16kv_one_chunk_
+  ready --> ready : dispatch_op_flash_attn_ext [valid_op_flash_attn_ext_shared_] / dispatch_op_flash_attn_ext__
   ready --> ready : dispatch_op_flash_attn_ext [dispatch_op_flash_attn_ext__] / dispatch_op_flash_attn_ext__
   ready --> ready : dispatch_op_flash_attn_back [dispatch_op_flash_attn_back__] / dispatch_op_flash_attn_back__
   ready --> ready : dispatch_op_flash_attn_back [dispatch_op_flash_attn_back__] / dispatch_op_flash_attn_back__
diff --git a/.planning/milestones/v1.26-MILESTONE-AUDIT.md b/.planning/milestones/v1.26-MILESTONE-AUDIT.md
new file mode 100644
index 00000000..becd336c
--- /dev/null
+++ b/.planning/milestones/v1.26-MILESTONE-AUDIT.md
@@ -0,0 +1,211 @@
+---
+milestone: v1.26
+audited: 2026-05-08T22:43:43.000Z
+status: passed
+scores:
+  requirements: 34/34 active
+  phases: 12/12
+  integration: 9/9
+  flows: 9/9
+gaps:
+  requirements: []
+  integration: []
+  flows: []
+closed_gaps:
+  - id: "direct-tensor-staged-offset-contract"
+    phase: "237"
+    requirements: ["TNX-01", "TNX-03", "TNX-04", "TST-01", "TST-02"]
+    evidence: "Phase 237 added failing-first public direct tensor staged-load coverage for nonzero file_offset, repaired model/tensor source-window dispatch, and passed scoped quality gates."
+  - id: "direct-tensor-staged-nonzero-offset"
+    phase: "237"
+    evidence: "tests/model/tensor/lifecycle_tests.cpp covers request_staged_load offset 2 against a whole-file source buffer and verifies copied bytes plus resident state."
+  - id: "cross-phase-summary-frontmatter"
+    phase: "238"
+    evidence: "Phases 232-236 now expose requirements-completed frontmatter or explicit partial/finalized-by rationale."
+  - id: "embedded-probe-reporting-truth"
+    phase: "238"
+    evidence: "Probe path captures used_io_strategy from model_loader::load_done; scripts/embedded_size.sh intentionally suppresses probe stdout for stable size measurement, so strategy evidence is the captured public outcome rather than ad hoc runtime printing."
+tech_debt: []
+deferred:
+  - requirement: ESG-02B
+    status: Deferred/Future
+    reason: "Real file open/seek/read and per-stage short-read taxonomy requires an approved file-backed staged-read source path; v1.26 intentionally uses source-span staging."
+nyquist:
+  overall: compliant
+  compliant_phases:
+    - 227-staged-read-strategy-component-boundary
+    - 228-span-target-window-platform-gating
+    - 229-staged-copy-progress-and-completion-semantics
+    - 230-context-cleanness-and-per-attempt-lifetime
+    - 231-deterministic-error-taxonomy
+    - 232-tensor-owned-integration-graph
+    - 233-public-loader-and-maintained-entrypoints
+    - 234-public-dispatch-tests
+    - 235-scope-and-non-regression-guardrails
+    - 236-publication-and-evidence-truthfulness
+    - 237-direct-tensor-staged-offset-contract-repair
+    - 238-audit-artifact-and-probe-reporting-cleanup
+  partial_phases: []
+  missing_phases: []
+---
+
+# v1.26 Milestone Audit - I/O Staged Read Loading Strategy
+
+## Result
+
+**Status: passed with one explicit deferred requirement.** All active v1.26
+requirements are satisfied from source-backed evidence after Phase 237 and Phase
+238 gap closure. `ESG-02B` remains deferred/future by design because real file
+open/seek/read and per-stage short-read taxonomy require a separately approved
+file-backed staged-read source path.
+
+Phase 237 closed the blocking direct tensor staged-load nonzero-offset source
+window gap. Phase 238 closed nonblocking audit artifact debt for summary
+frontmatter and embedded probe reporting truth.
+
+## Requirement Coverage
+
+Three-source cross-reference passed: the active requirements in
+`REQUIREMENTS.md` traceability are present in phase `VERIFICATION.md` evidence
+and in `SUMMARY.md` frontmatter, including the Phase 237 finalization entries
+for requirements reopened by the source-backed audit. No active orphaned
+requirements were found.
+
+| Requirement group | Requirements | Status | Evidence |
+|-------------------|--------------|--------|----------|
+| Staged component and guards | STG-01, STG-02, STG-03, PLAT-02 | Satisfied | Phases 227-228 verification and validation records; staged-read actor, guards, and platform-gated transitions exist. |
+| Staged execution semantics | STG-04, STG-05, STG-06, STG-07 | Satisfied | Phases 229-230 verification and validation records; lifecycle doctests cover deterministic copy, monotone completion, terminal success, and empty context. |
+| Lifetime and ownership | LIFE-02, SNR-01 | Satisfied | Phase 230 verification; no staged-read-owned OS handle is retained and tensor residency remains outside the staged actor. |
+| Error taxonomy | ESG-01, ESG-02A, ESG-03, ESG-04 | Satisfied | Phase 231 verification; named deterministic errors and exception-free actor boundary are covered. |
+| Tensor integration | TNX-01, TNX-02, TNX-03, TNX-04 | Satisfied | Phase 232 integration plus Phase 237 direct tensor nonzero-offset repair prove public dispatch, tensor-owned residency, explicit success, and explicit failure. |
+| Maintained surfaces | PUB-01, PUB-02, PUB-03, PUB-04, PUB-05 | Satisfied | Phase 233 verification; loader, benchmark, paritychecker, and probe paths use public contracts. |
+| Public dispatch tests | TST-01, TST-02 | Satisfied | Phase 234 maintained-route tests plus Phase 237 direct tensor offset success/failure doctests cover public `process_event(...)` dispatch. |
+| Guardrails | GRD-01, GRD-02, GRD-03, GRD-04, GRD-05 | Satisfied | Phase 235 verification; loader/tensor ownership scans and mmap/read non-regression doctests pass. |
+| Publication truth | DOC-01, LNT-01, BNH-01, EVI-01 | Satisfied | Phase 236 verification; docs, lint, benchmark workflow, and evidence-label truth all recorded. |
+
+`ESG-02B` remains explicitly deferred/future and is not counted as an in-scope
+v1.26 blocker.
+
+### Three-Source Requirement Matrix
+
+| Requirement | Traceability | Verification | Summary frontmatter | Final status |
+|-------------|--------------|--------------|---------------------|--------------|
+| STG-01 | Phase 227 satisfied | `227-VERIFICATION.md` satisfied | `227-01-SUMMARY.md` lists complete | Satisfied |
+| STG-02 | Phase 228 satisfied | `228-VERIFICATION.md` satisfied | `228-01-SUMMARY.md` lists complete | Satisfied |
+| STG-03 | Phase 228 satisfied | `228-VERIFICATION.md` satisfied | `228-01-SUMMARY.md` lists complete | Satisfied |
+| PLAT-02 | Phase 228 satisfied | `228-VERIFICATION.md` satisfied | `228-01-SUMMARY.md` lists complete | Satisfied |
+| STG-04 | Phase 229 satisfied | `229-VERIFICATION.md` satisfied | `229-01-SUMMARY.md` lists complete | Satisfied |
+| STG-05 | Phase 229 satisfied | `229-VERIFICATION.md` satisfied | `229-01-SUMMARY.md` lists complete | Satisfied |
+| STG-06 | Phase 229 satisfied | `229-VERIFICATION.md` satisfied | `229-01-SUMMARY.md` lists complete | Satisfied |
+| STG-07 | Phase 230 satisfied | `230-VERIFICATION.md` satisfied | `230-01-SUMMARY.md` lists complete | Satisfied |
+| LIFE-02 | Phase 230 satisfied | `230-VERIFICATION.md` satisfied | `230-01-SUMMARY.md` lists complete | Satisfied |
+| SNR-01 | Phase 230 satisfied | `230-VERIFICATION.md` satisfied | `230-01-SUMMARY.md` lists complete | Satisfied |
+| ESG-01 | Phase 231 satisfied | `231-VERIFICATION.md` satisfied | `231-01-SUMMARY.md` lists complete | Satisfied |
+| ESG-02A | Phase 231 satisfied | `231-VERIFICATION.md` satisfied | `231-01-SUMMARY.md` lists complete | Satisfied |
+| ESG-03 | Phase 231 satisfied | `231-VERIFICATION.md` satisfied | `231-01-SUMMARY.md` lists complete | Satisfied |
+| ESG-04 | Phase 231 satisfied | `231-VERIFICATION.md` satisfied | `231-01-SUMMARY.md` lists complete | Satisfied |
+| TNX-01 | Phase 237 satisfied | `237-VERIFICATION.md` satisfied | `237-01-SUMMARY.md` lists complete; Phase 232 marked finalized by 237 | Satisfied |
+| TNX-02 | Phase 232 satisfied | `232-VERIFICATION.md` satisfied | `232-01-SUMMARY.md` lists complete | Satisfied |
+| TNX-03 | Phase 237 satisfied | `237-VERIFICATION.md` satisfied | `237-01-SUMMARY.md` lists complete; Phase 232 marked finalized by 237 | Satisfied |
+| TNX-04 | Phase 237 satisfied | `237-VERIFICATION.md` satisfied | `237-01-SUMMARY.md` lists complete; Phase 232 marked finalized by 237 | Satisfied |
+| PUB-01 | Phase 233 satisfied | `233-VERIFICATION.md` satisfied | `233-01-SUMMARY.md` lists complete | Satisfied |
+| PUB-02 | Phase 233 satisfied | `233-VERIFICATION.md` satisfied | `233-01-SUMMARY.md` lists complete | Satisfied |
+| PUB-03 | Phase 233 satisfied | `233-VERIFICATION.md` satisfied | `233-01-SUMMARY.md` lists complete | Satisfied |
+| PUB-04 | Phase 233 satisfied | `233-VERIFICATION.md` satisfied | `233-01-SUMMARY.md` lists complete | Satisfied |
+| PUB-05 | Phase 233 satisfied | `233-VERIFICATION.md` satisfied | `233-01-SUMMARY.md` lists complete | Satisfied |
+| TST-01 | Phase 237 satisfied | `237-VERIFICATION.md` satisfied | `237-01-SUMMARY.md` lists complete; Phase 234 marked finalized by 237 | Satisfied |
+| TST-02 | Phase 237 satisfied | `237-VERIFICATION.md` satisfied | `237-01-SUMMARY.md` lists complete; Phase 234 marked finalized by 237 | Satisfied |
+| GRD-01 | Phase 235 satisfied | `235-VERIFICATION.md` satisfied | `235-01-SUMMARY.md` lists complete | Satisfied |
+| GRD-02 | Phase 235 satisfied | `235-VERIFICATION.md` satisfied | `235-01-SUMMARY.md` lists complete | Satisfied |
+| GRD-03 | Phase 235 satisfied | `235-VERIFICATION.md` satisfied | `235-01-SUMMARY.md` lists complete | Satisfied |
+| GRD-04 | Phase 235 satisfied | `235-VERIFICATION.md` satisfied | `235-01-SUMMARY.md` lists complete | Satisfied |
+| GRD-05 | Phase 235 satisfied | `235-VERIFICATION.md` satisfied | `235-01-SUMMARY.md` lists complete | Satisfied |
+| DOC-01 | Phase 236 satisfied | `236-VERIFICATION.md` satisfied | `236-01-SUMMARY.md` lists complete | Satisfied |
+| LNT-01 | Phase 236 satisfied | `236-VERIFICATION.md` satisfied | `236-01-SUMMARY.md` lists complete | Satisfied |
+| BNH-01 | Phase 236 satisfied | `236-VERIFICATION.md` satisfied | `236-01-SUMMARY.md` lists complete | Satisfied |
+| EVI-01 | Phase 236 satisfied | `236-VERIFICATION.md` satisfied | `236-01-SUMMARY.md` lists complete | Satisfied |
+
+## Phase Coverage
+
+| Phase | Verification | Validation | Audit status |
+|-------|--------------|------------|--------------|
+| 227 | `227-VERIFICATION.md` | `227-VALIDATION.md` | Satisfied |
+| 228 | `228-VERIFICATION.md` | `228-VALIDATION.md` | Satisfied |
+| 229 | `229-VERIFICATION.md` | `229-VALIDATION.md` | Satisfied |
+| 230 | `230-VERIFICATION.md` | `230-VALIDATION.md` | Satisfied |
+| 231 | `231-VERIFICATION.md` | `231-VALIDATION.md` | Satisfied; `ESG-02B` deferred |
+| 232 | `232-VERIFICATION.md` | `232-VALIDATION.md` | Satisfied with Phase 237 finalizing reopened direct-offset aspects |
+| 233 | `233-VERIFICATION.md` | `233-VALIDATION.md` | Satisfied; summary frontmatter added in Phase 238 |
+| 234 | `234-VERIFICATION.md` | `234-VALIDATION.md` | Satisfied with Phase 237 finalizing reopened direct-offset public dispatch aspects |
+| 235 | `235-VERIFICATION.md` | `235-VALIDATION.md` | Satisfied; summary frontmatter added in Phase 238 |
+| 236 | `236-VERIFICATION.md` | `236-VALIDATION.md` | Satisfied; full gate passed; summary frontmatter added in Phase 238 |
+| 237 | `237-VERIFICATION.md` | `237-VALIDATION.md` | Satisfied; direct tensor nonzero-offset source repair |
+| 238 | `238-VERIFICATION.md` | `238-VALIDATION.md` | Satisfied; audit artifact and probe reporting cleanup |
+
+## Integration Check
+
+Cross-phase source audit passes after the Phase 237 repair. The integration
+checker found **34/34 active requirements** and **9/9 requested E2E flows**
+passing, with no blocker or tech-debt findings:
+
+- `src/emel/io/staged_read` owns the staged actor and canonical machine alias.
+- `src/emel/io/loader` routes staged single and batch requests through the injected
+  `io_staged_read` actor.
+- `src/emel/model/tensor` remains the residency owner for staged-load results.
+- Direct `model::tensor::event::request_staged_load` now validates
+  `source_buffer_bytes` against `file_offset + byte_size`, dispatches
+  `source_buffer + file_offset` as the staged `source_span`, and passes the
+  logical window length as `source_span_bytes`.
+- Maintained benchmark, parity, and embedded-size probe lanes bind staged-read
+  strategy through public model-load strategy contracts.
+- Evidence labels use `used_io_strategy` after modeled execution, not requested
+  strategy alone.
+- Public dispatch and guardrail tests cover the maintained staged route, direct
+  tensor route, loader ownership boundary, tensor residency boundary, and shipped
+  mmap/read non-regression paths.
+- No coroutine/device/mmap staged-read scope creep was found in staged-read source
+  or tests.
+
+## Embedded Probe Reporting
+
+The embedded-size probe captures the executed load strategy from the public
+`model::loader::events::load_done::used_io_strategy` field in
+`tools/embedded_size/emel_probe/main.cpp`. `scripts/embedded_size.sh` intentionally
+suppresses probe stdout/stderr during the smoke run so size-measurement output and
+snapshot generation remain stable. Therefore, the authoritative evidence surface
+for staged-read execution is the captured `used_io_strategy` outcome and the
+published verification/audit record, not a separate ad hoc probe print line.
+
+## Summary Frontmatter
+
+Phases 232-236 now include machine-readable summary frontmatter:
+
+- Phase 232 records `TNX-02` complete and marks `TNX-01`, `TNX-03`, `TNX-04`
+  as finalized by Phase 237.
+- Phase 233 records `PUB-01` through `PUB-05`.
+- Phase 234 records `TST-01` and `TST-02` as finalized by Phase 237.
+- Phase 235 records `GRD-01` through `GRD-05`.
+- Phase 236 records `DOC-01`, `LNT-01`, `BNH-01`, and `EVI-01`.
+
+## Nyquist Validation
+
+All twelve v1.26 phase directories have `*-VALIDATION.md` records with
+`nyquist_compliant: true` and `wave_0_complete: true`. No missing validation
+files remain.
+
+## Closeout Readiness
+
+The milestone is ready for completion and cleanup. All active requirements have
+source-backed evidence; all phases are complete; the direct tensor nonzero-offset
+blocker is closed; and remaining staged-read file-backed error taxonomy is
+explicitly deferred as `ESG-02B`.
+
+Audit refresh validation:
+
+```bash
+EMEL_QUALITY_GATES_CHANGED_FILES=".planning/v1.26-MILESTONE-AUDIT.md" scripts/quality_gates.sh
+```
+
+Result: **PASS** (exit `0`). Benchmark, coverage, paritychecker, fuzz, and docsgen
+lanes were skipped where the changed-file scope was irrelevant; the legacy SML
+surface scan and configuration completed successfully.
diff --git a/.planning/REQUIREMENTS.md b/.planning/milestones/v1.26-REQUIREMENTS.md
similarity index 100%
rename from .planning/REQUIREMENTS.md
rename to .planning/milestones/v1.26-REQUIREMENTS.md
diff --git a/.planning/milestones/v1.26-ROADMAP.md b/.planning/milestones/v1.26-ROADMAP.md
new file mode 100644
index 00000000..8c2e8bfb
--- /dev/null
+++ b/.planning/milestones/v1.26-ROADMAP.md
@@ -0,0 +1,774 @@
+# Roadmap: EMEL
+
+## Milestones
+
+- ✅ **v1.0 EMEL Llama-68M Generation Slice** — shipped 2026-03-08
+- ✅ **v1.1 EMEL Llama-68M Generation Benchmark** — shipped 2026-03-11
+- ✅ **v1.2 Flash Attention** — shipped 2026-03-22
+- ✅ **v1.3 ARM Flash Optimizations** — shipped 2026-03-22
+- ✅ **v1.4 Full Vectorized Quantized Kernels** — shipped 2026-03-25
+- ✅ **v1.5 Full ARM Quantized Path** — shipped 2026-03-27
+- ✅ **v1.6 Qwen3-0.6B Parity And Benchmark** — shipped 2026-03-30
+- ✅ **v1.7 Generator Prefill Submachine Decomposition** — shipped 2026-03-30
+- ✅ **v1.8 Truthful Qwen3 E2E Embedded Size** — shipped 2026-04-02
+- ✅ **v1.9 Liquid LFM2.5-1.2B Thinking ARM Slice** — shipped 2026-04-02
+- ✅ **v1.11 TE-75M GGUF Trimodal Embedding Runtime** — shipped 2026-04-15
+- ✅ **v1.12 Pluggable Reference Parity Bench Architecture** — shipped 2026-04-18
+- ✅ **v1.13 Pluggable Generative Parity Bench** — shipped 2026-04-21
+- ✅ **v1.14 Benchmark Variant Organization** — shipped 2026-04-21
+- ✅ **v1.15 ARM Sortformer Diarization GGUF Slice** — shipped 2026-04-25
+- ✅ **v1.16 ARM Whisper GGUF Parity And Performance** — shipped 2026-04-28
+- ✅ **v1.17 Text Generator Domain Alignment** — shipped 2026-04-30
+- ✅ **v1.18 Parity Tool Boundary Refactor** — shipped 2026-05-01
+- ✅ **v1.19 Benchmark Tool Pluggable Runner Refactor** — shipped 2026-05-01
+- ✅ **v1.20 SML Dependency And Namespace Migration** — shipped 2026-05-02
+- ✅ **v1.21 Quality Gate Selective Runner Optimization** — shipped 2026-05-02
+- ✅ **v1.22 Weight Loading Ownership Cutover** — shipped 2026-05-03
+- ✅ **v1.23 I/O Loading Strategy Boundary** — shipped 2026-05-04
+- ✅ **v1.24 I/O Mmap Loading Strategy** — shipped 2026-05-04 (Phases 204-211)
+- ✅ **v1.25 I/O Read Loading Strategy** — shipped 2026-05-06 (Phases 212-226 + 214.1)
+- ✅ **v1.26 I/O Staged Read Loading Strategy** — completed 2026-05-08
+  (12 / 12 phases complete; issue #63; `ESG-02B` deferred/future)
+
+## Phases
+
+### ✅ v1.26 I/O Staged Read Loading Strategy (Phases 227-238) — COMPLETE 2026-05-08
+
+Source: GitHub issue #63, "Add io/staged_read state machine for constrained-memory tensor loading".
+Adds `src/emel/io/staged_read` for bounded chunked/windowed reads under tensor-owned residency.
+Depends on the tensor-to-I/O boundary from issue #60. Cooperative coroutine scheduling is out of
+scope unless separately approved. Shipped mmap (`io/mmap`) and bulk read/copy (`io/read`) must not
+regress.
+
+Execution order: 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238.
+
+**Milestone progress (v1.26):** **12 / 12** phases recorded **Complete** in the table below.
+The source-backed milestone audit found a direct tensor staged-load offset-contract gap plus
+closeout artifact debt; Phases 237-238 closed those gaps. `ESG-02B` remains deferred/future
+because file-backed staged-read source ownership is out of scope.
+
+- [x] Phase 227: Staged Read Strategy Component Boundary (STG-01)
+- [x] Phase 228: Span, Target-Window, and Platform Gating (STG-02, STG-03, PLAT-02)
+- [x] Phase 229: Staged Copy Progress and Completion Semantics (STG-04, STG-05, STG-06)
+- [x] Phase 230: Context Cleanness and Per-Attempt Lifetime (STG-07, LIFE-02, SNR-01)
+- [x] Phase 231: Deterministic Error Taxonomy (ESG-01, ESG-02A, ESG-03, ESG-04; ESG-02B deferred)
+- [x] Phase 232: Tensor-Owned Integration Graph (TNX-01, TNX-02, TNX-03, TNX-04)
+- [x] Phase 233: Public Loader and Maintained Entrypoints (PUB-01, PUB-02, PUB-03, PUB-04, PUB-05)
+- [x] Phase 234: Public Dispatch Tests (TST-01, TST-02)
+- [x] Phase 235: Scope and Non-Regression Guardrails (GRD-01, GRD-02, GRD-03, GRD-04, GRD-05)
+- [x] Phase 236: Publication and Evidence Truthfulness (DOC-01, LNT-01, BNH-01, EVI-01)
+- [x] Phase 237: Direct Tensor Staged Offset Contract Repair (TNX-01, TNX-03, TNX-04, TST-01, TST-02)
+- [x] Phase 238: Audit Artifact and Probe Reporting Cleanup (cleanup-only)
+
+#### Phase 227: Staged Read Strategy Component Boundary
+
+**Goal:** Locate canonical `src/emel/io/staged_read` with standard I/O component layout.
+**Depends on:** Phase 226
+**Requirements:** STG-01
+
+**Success criteria:**
+
+1. `src/emel/io/staged_read` exists with canonical `emel::io::staged_read::sm` alias.
+2. Component scope excludes mmap, device transfer, or cooperative async runtime.
+3. Initial fail-closed or smoke dispatch proves actors are wired like sibling I/O strategies.
+
+#### Phase 228: Span, Target-Window, and Platform Gating
+
+**Goal:** All staged preconditions enforced in guards/transitions before any file work.
+**Depends on:** Phase 227
+**Requirements:** STG-02, STG-03, PLAT-02
+
+**Success criteria:**
+
+1. Invalid source staging contract rejected solely via guard-modeled transitions.
+2. Invalid target window/layout rejected solely via guard-modeled transitions.
+3. Unsupported hosts/resources fail closed with explicit unsupported terminal shape.
+
+#### Phase 229: Staged Copy Progress and Completion Semantics
+
+**Goal:** Prove per-stage deterministic copy plus full-span monotone completion.
+**Depends on:** Phase 228
+**Requirements:** STG-04, STG-05, STG-06
+
+**Success criteria:**
+
+1. Test vectors observe correct bytes per staged window.
+2. Completeness tests cover entire logical span order.
+3. Terminal success aligns with copied full span per contract.
+
+#### Phase 230: Context Cleanness and Per-Attempt Lifetime
+
+**Goal:** Bounded handles and residency clarity for the staged actor.
+**Depends on:** Phase 229
+**Requirements:** STG-07, LIFE-02, SNR-01
+
+**Success criteria:**
+
+1. Static or dynamic review shows zero forbidden dispatch-local context mirrors.
+2. Handle lifetime tests/tools show release-before-done semantics.
+3. Tests confirm strategy never asserts tensor residency commits.
+
+#### Phase 231: Deterministic Error Taxonomy
+
+**Goal:** Errors are categorical, observable, exception-free.
+**Depends on:** Phase 230
+**Requirements:** ESG-01, ESG-02A, ESG-03, ESG-04 (`ESG-02B` deferred)
+
+**Success criteria:**
+
+1. At least one doctest per taxonomy family (pre-I/O guard, source-contract read-surface, sequencing/contract) demonstrates deterministic categories through `process_event(...)`.
+2. Source-backed docs explicitly defer `ESG-02B` file open/seek/read + per-stage short-read categories until approved file-backed staged-read ownership exists.
+3. ABI boundary scans show noexcept expectations for surfaced API.
+
+#### Phase 232: Tensor-Owned Integration Graph
+
+**Goal:** Integrate staged loads through explicit tensor+I/O graphs.
+**Depends on:** Phase 231
+**Requirements:** TNX-01, TNX-02, TNX-03, TNX-04
+
+**Closeout ledger (verified):** Manager-scoped **`scripts/quality_gates.sh`** for Phase 232
+changed-file corpus exited **2** (red — **not** exit 0). **`232-VERIFICATION.md`** records **bench_snapshot**
+suite regressions unrelated to staged tensor-integration files and a **paritychecker** failure outside
+Phase 232 scope. Phase 232 completion is ledger-approved **without** claiming a passing full-repo gate run.
+
+**Success criteria:**
+
+1. Requests flow only via public tensors↔IO events.
+2. Residency proofs remain tensor-owned (`model/tensor` retains lifecycle ownership).
+3. Success/failure each have explicit observable terminal representations.
+
+#### Phase 233: Public Loader and Maintained Entrypoints
+
+**Goal:** Strategies observable without actor detail reach-through or duplicate POSIX loops in tools.
+**Depends on:** Phase 232
+**Requirements:** PUB-01, PUB-02, PUB-03, PUB-04, PUB-05
+
+**Closeout (2026-05-08):** **`PUB-01`–`PUB-05`** satisfied per **`233-VERIFICATION.md`** (manager validation +
+**phase233-navigator final review PASS**). Public **`staged_read`** access is through **`io::loader`** and maintained
+tool entrypoints with **`io_staged_read`** wiring; **`tests/model/loader/lifecycle_tests.cpp`** covers the
+storage-backed **`staged_read`** route and include guards.
+
+**Residual:** **`scripts/quality_gates.sh`** was **not** run on a Phase **233** changed-file corpus in
+this closeout slice — **no Phase 233 scoped gate pass is claimed** (full-repo gate truth unchanged from
+Phase **232** ledger where applicable).
+
+**Success criteria:**
+
+1–4. Each lane (loader/bench/parity/probe) has independent proof of public-contract-only access.
+5. Source scan enforcement or doctest proves no duplicated unconstrained staged read shim in tools.
+
+#### Phase 234: Public Dispatch Tests
+
+**Goal:** Core success/failure behavior demonstrated through `process_event`.
+**Depends on:** Phase 233
+**Requirements:** TST-01, TST-02
+
+**Success criteria:**
+
+1. Passing success-path doctest with `visit_current_states` or equivalent.
+2. Passing failure-path doctest for guard rejection.
+
+#### Phase 235: Scope and Non-Regression Guardrails
+
+**Goal:** Freeze architecture invariants relative to loaders, mmap, and read strategies.
+**Depends on:** Phase 234
+**Requirements:** GRD-01, GRD-02, GRD-03, GRD-04, GRD-05
+
+**Success criteria:** Each of GRD-01, GRD-02, GRD-03, GRD-04, and GRD-05 has either a deterministic script failure mode or a narrowed regression doctest proving the invariant holds.
+
+#### Phase 236: Publication and Evidence Truthfulness
+
+**Goal:** Align docs and frozen artifacts with real staged/runtime usage.
+**Depends on:** Phase 235
+**Requirements:** DOC-01, LNT-01, BNH-01, EVI-01
+
+**Success criteria:**
+
+1. Doc diff review verifies accurate staged-read wording.
+2. Lint snapshot regeneration path documented/passing.
+3. Benchmark snapshot regeneration obeys policy.
+4. Parity/compare metadata never mislabels unstaged workloads as staged.
+
+**Closeout (2026-05-08):** **`DOC-01`–`EVI-01`** satisfied per
+**`236-VERIFICATION.md`**. Serial full quality gate passed:
+`EMEL_QUALITY_GATES_SCOPE=full EMEL_QUALITY_GATES_PARALLEL=0 scripts/quality_gates.sh`
+(exit **0**, ended `2026-05-08T21:21:42.028Z`). Benchmark defaults now use bounded routine
+settings (`100` iterations, `3` runs, `10` warmup iterations) with bounded generation and
+diarization defaults.
+
+#### Phase 237: Direct Tensor Staged Offset Contract Repair
+
+**Goal:** Repair direct `model/tensor` staged-load nonzero-offset source-window behavior and prove it through public dispatch.
+**Depends on:** Phase 236
+**Requirements:** TNX-01, TNX-03, TNX-04, TST-01, TST-02
+**Gap Closure:** Closes `.planning/v1.26-MILESTONE-AUDIT.md` findings
+`direct-tensor-staged-offset-contract` and `direct-tensor-staged-nonzero-offset`.
+
+**Success Criteria:**
+
+1. A public `model/tensor::event::request_staged_load` doctest fails before repair and passes after
+   repair for a nonzero `file_offset` against a whole-file source buffer.
+2. Direct tensor staged-load source-span construction is aligned with `io/loader` or the
+   pre-windowed-source contract is explicitly documented and enforced by validation/tests.
+3. Direct tensor staged-load success and failure outcomes remain explicit `_done` / `_error`
+   publications through public `process_event(...)` dispatch and SML state inspection.
+4. Changed-file quality gates for `model/tensor`, `io/staged_read`, and affected tests pass without
+   benchmark-regression override.
+5. If implementation changes maintained model or snapshot artifacts, those artifacts are refreshed
+   only through maintained workflows; model artifact updates are approved for this gap-closure work.
+
+**Closeout (2026-05-08):** Phase 237 completed with a failing-first public
+`request_staged_load` nonzero-offset doctest, repaired source-window dispatch in
+`model/tensor`, and passing scoped validation:
+`./build/emel_tests_bin --test-case="model_tensor_request_staged_load_*"`,
+`ctest --test-dir build -R '^emel_tests_model_and_batch$' --output-on-failure`,
+and changed-file `scripts/quality_gates.sh` (exit `0`). Reopened requirements
+`TNX-01`, `TNX-03`, `TNX-04`, `TST-01`, and `TST-02` are satisfied by
+`237-VERIFICATION.md`.
+
+#### Phase 238: Audit Artifact and Probe Reporting Cleanup
+
+**Goal:** Reconcile audit artifacts and probe reporting truth after the Phase 237 source repair.
+**Depends on:** Phase 237
+**Requirements:** none — cleanup-only; all reopened requirement closure belongs to Phase 237
+**Gap Closure:** Closes `.planning/v1.26-MILESTONE-AUDIT.md` tech-debt items for missing
+`requirements-completed` SUMMARY frontmatter and embedded-size probe reporting clarity.
+
+**Success Criteria:**
+
+1. Phase summaries for 232–236 expose accurate `requirements-completed` frontmatter or an explicit
+   cleanup rationale so the three-source audit matrix no longer needs manual reconciliation.
+2. Embedded-size probe evidence either prints the executed load strategy when appropriate or the
+   maintained docs/audit explain why captured `used_io_strategy` is the authoritative evidence
+   surface.
+3. REQUIREMENTS, ROADMAP, STATE, and the milestone audit are refreshed from source-backed evidence
+   after Phase 237.
+4. Focused lint/docs/audit commands pass; no maintained benchmark, model, or snapshot artifact is
+   updated unless the implementation actually requires it.
+
+**Closeout (2026-05-08):** Phase 238 completed summary frontmatter reconciliation,
+embedded probe reporting truth documentation, and refreshed `v1.26-MILESTONE-AUDIT.md`
+to `status: passed`. Changed-file `scripts/quality_gates.sh` passed with no benchmark,
+coverage, parity, fuzz, or docsgen-affecting lanes required.
+
+---
+### ✅ v1.25 I/O Read Loading Strategy (Phases 212-226 + 214.1) — SHIPPED 2026-05-06
+
+Source: GitHub issue #62, "Add io/read state machine for copy-based tensor loading".
+Adds a dedicated `src/emel/io/read` Stateforward.SML actor for explicit read/copy tensor
+loading beneath tensor-owned residency. Mmap, staged/chunked constrained-memory, async,
+and device strategies remain out of scope.
+
+- [x] Phase 212: Read Strategy Component Boundary (1/1 plans) — completed 2026-05-05
+- [x] Phase 213: Read Validation and Platform Gating (1/1 plans) — completed 2026-05-05
+- [x] Phase 214: Read Execution, Errors, and Lifetime (1/1 plans) — completed 2026-05-05; audit found RTC compliance gap
+- [x] Phase 214.1: RTC-Safe Read Execution Boundary Repair (1/1 plans) — gap closure
+- [x] Phase 215: Tensor-Owned Read Integration (1/1 plans) — completed 2026-05-05
+- [x] Phase 216: Public Runtime and Evidence Surfaces (1/1 plans) — completed 2026-05-05
+- [x] Phase 217: Behavior Tests and Scope Guardrails (1/1 plans) — completed 2026-05-05
+- [x] Phase 218: Publication and Maintained Artifact Updates (1/1 plans) — completed 2026-05-05
+- [x] Phase 219: Maintained Read Source Provenance (1/1 plans) — completed
+  2026-05-05; source-backed benchmark/parity/probe read_copy provenance
+- [x] Phase 220: Explicit Tensor Read Outcome Graph (1/1 plans) — completed
+  2026-05-05; tensor read outcomes selected by explicit same-RTC result graph
+- [x] Phase 221: Read Closeout Truth Reconciliation — superseded planning stub
+  closed 2026-05-06; Phase 223 owns final closeout
+- [x] Phase 222: Public Read Source Contract Repair (1/1 plans) — completed
+  2026-05-06; actor-detail reach-through removed from maintained lanes
+- [x] Phase 223: Read Closeout Truth And Validation Reconciliation (1/1 plans) —
+  completed 2026-05-06; final closeout truth and validation reconciled
+- [x] Phase 224: Read Closeout Tech Debt Cleanup — completed 2026-05-06;
+  refreshed audit ambiguity closed with fresh passing `emel_tests_io` evidence
+- [x] Phase 225: Read Closeout Runtime Validation And SML Repair — completed
+  2026-05-06; refreshed source-backed audit gaps closed with dyld fallback evidence
+- [x] Phase 226: Read Batch Cap And Closeout Evidence Refresh — completed
+  2026-05-06; refreshed audit tech debt closed
+
+Archived closeout artifacts:
+- `.planning/milestones/v1.25-ROADMAP.md`
+- `.planning/milestones/v1.25-REQUIREMENTS.md`
+- `.planning/milestones/v1.25-MILESTONE-AUDIT.md`
+- `.planning/milestones/v1.25-phases/`
+
+**Execution Order:** Phases execute in numeric order:
+212 -> 213 -> 214 -> 214.1 -> 215 -> 216 -> 217 -> 218 -> 219 -> 220 -> 222 -> 223 -> 224 -> 225 -> 226.
+Phase 221 is a completed superseded closeout planning stub and Phase 223 owns final
+source-backed closeout truth. Phase 224 is cleanup-only; Phase 225 owns the refreshed
+2026-05-06 audit gaps before archive. Phase 226 closes the post-audit nonblocking
+tech-debt items before final closeout.
+
+#### Phase 212: Read Strategy Component Boundary
+**Goal**: Maintainers can identify `io/read` as the canonical read/copy strategy actor under
+`src/emel/io`.
+**Depends on**: Phase 211
+**Requirements**: READ-01
+**Success Criteria** (what must be TRUE):
+  1. Maintainer can inspect `src/emel/io/read` and find component-local `context`, `events`,
+     `guards`, `actions`, `errors`, and `sm` ownership.
+  2. Maintainer can use canonical `emel::io::read::sm` ownership and public aliases without
+     reaching into actor internals.
+  3. Maintainer can confirm the component is read/copy-only and contains no mmap, staged or
+     chunked constrained-memory, cooperative async, device-specific, loader-owned byte access,
+     model-family widening, or tool-only read scaffold behavior.
+**Plans**: 01 — Validated 2026-05-05; established canonical `io/read` boundary actor
+and lifecycle tests.
+
+#### Phase 213: Read Validation and Platform Gating
+**Goal**: The read actor accepts read attempts only after explicit request, platform, file,
+offset, length, layout, and target-buffer preconditions pass.
+**Depends on**: Phase 212
+**Requirements**: READ-02, PLAT-01
+**Success Criteria** (what must be TRUE):
+  1. Caller sees invalid request, file, offset, length, layout, or target-buffer preconditions
+     rejected before any open or read attempt is accepted.
+  2. Caller sees unsupported platforms and unsupported file/resource shapes fail closed
+     deterministically through the I/O abstraction boundary.
+  3. Maintainer can inspect SML guards and transitions and see validation outcomes modeled
+     before the open/read attempt.
+  4. Supported requests reach a read-attempt state only after all read preconditions are true.
+**Plans**: 01 — Validated 2026-05-05; added explicit read validation and platform
+gating before the read-attempt placeholder.
+
+#### Phase 214: Read Execution, Errors, and Lifetime
+**Goal**: Successful read requests deliver deterministic copied bytes into the caller-owned
+target buffer with deterministic transient-resource lifetime and deterministic failure
+outcomes, without taking tensor residency ownership.
+**Depends on**: Phase 213
+**Requirements**: READ-03, LIFE-01, ERR-01
+**Success Criteria** (what must be TRUE):
+  1. Caller receives a deterministic copied-bytes outcome on success with the requested byte
+     span written into the caller-provided owned target buffer; the read strategy never claims
+     residency ownership.
+  2. Read failures surface deterministic error categories (invalid request, unsupported
+     resource, unsupported platform, file open failed, file seek failed, file read failed,
+     short read, internal error) instead of thrown exceptions or ambiguous status mirroring.
+  3. Transient OS resources (file descriptor / handle) are released through the actor-owned
+     attempt before `_done` is published; no kernel handle is held across publication.
+  4. Maintainer can verify dispatch-local request data is not stored in `read::context` and
+     tensor residency semantics remain owned by `model/tensor`.
+**Plans**: 01 — Validated 2026-05-05; added concrete read execution, copied-byte
+success, deterministic read errors, and close-before-done lifetime behavior.
+2026-05-05 milestone audit found this phase superseded by unverified Phase 214.1 repair
+work; Phase 214.1 owns source-backed RTC validation and artifact reconciliation.
+
+#### Phase 214.1: RTC-Safe Read Execution Boundary Repair
+**Goal**: The read actor preserves copied-byte success, deterministic errors, and
+close-before-done lifetime evidence without performing blocking or input-dependent
+filesystem work inside SML dispatch.
+**Depends on**: Phase 214
+**Requirements**: READ-03, PLAT-01, LIFE-01, ERR-01
+**Gap Closure**: Closes v1.25 audit gaps for missing Phase 214.1 artifacts, stale Phase
+214 planning truth, and source-backed Nyquist validity after the read actor moved to
+caller-provided source spans.
+**Success Criteria** (what must be TRUE):
+  1. `src/emel/io/read` no longer calls platform open, seek, read, close, or equivalent
+     filesystem APIs from guards, actions, transition helpers, or functions called by them.
+  2. The read actor still accepts only validated read/copy attempts and publishes copied-byte
+     `_done` plus deterministic `_error` outcomes through explicit states/events.
+  3. The caller-owned target buffer remains caller-owned, dispatch-local request data is not
+     stored in `read::context`, and no transient OS handle is retained or hidden in context.
+  4. Tests prove the repaired behavior through public `process_event(...)` dispatch and SML
+     state inspection, including validation failure, unsupported/resource failure, read
+     failure, short read, and copied-byte success.
+  5. Phase 214.1 SUMMARY.md, VERIFICATION.md, and VALIDATION.md reconcile ROADMAP.md,
+     STATE.md, REQUIREMENTS.md, and generated architecture docs with the source-buffer based
+     implementation and do not claim maintained benchmark/parity evidence.
+**Plans**: 01 — Validated 2026-05-05; reconciled read actor evidence with the
+source-buffer based implementation, confirmed no dispatch-time filesystem calls, and
+updated requirement/state artifacts for the Phase 214.1 gap closure.
+
+#### Phase 215: Tensor-Owned Read Integration
+**Goal**: `model/tensor` can request and consume read-backed I/O through the public `emel/io`
+boundary while retaining load, bind, evict, and residency orchestration.
+**Depends on**: Phase 214.1
+**Requirements**: TIO-01, TIO-02
+**Gap Closure**: Closes v1.25 audit gaps for partial tensor-owned read integration and
+callback/status-mediated read outcomes.
+**Success Criteria** (what must be TRUE):
+  1. Tensor load flow can request read-based (copy) loading through public `emel/io` events
+     without direct low-level read calls.
+  2. Tensor bind, residency, and evict transitions remain in `model/tensor` and consume read
+     success outcomes that reference the caller-owned target buffer.
+  3. Read success, unsupported, validation failure, file open failure, and file read failure
+     are visible as explicit `_done` and `_error` events or states.
+  4. Maintainer can verify no callback-selected outcomes, mirrored status fields, or context
+     phase flags decide tensor-to-I/O outcomes for read-backed loading.
+  5. Existing source/test progress through `model/loader -> model/tensor -> io/loader ->
+     io/read -> tensor apply` is preserved or replaced by a stricter explicit outcome path
+     with equivalent public-dispatch tests.
+**Plans**: 01 — Validated 2026-05-05; added tensor-owned
+`request_read_load` public events, explicit read outcome states, and tests for read
+success, unsupported I/O actor, validation failure, file open failure, and file read
+failure.
+
+#### Phase 216: Public Runtime and Evidence Surfaces
+**Goal**: Runtime entrypoints and maintained tool lanes can select or report read-backed
+loading only through public surfaces, and evidence reflects the actual EMEL runtime path.
+**Depends on**: Phase 215
+**Requirements**: TIO-03, VAL-04
+**Gap Closure**: Closes v1.25 audit gaps for maintained benchmark, paritychecker, and
+embedded probe lanes bypassing the read-backed runtime path and for runtime reporting that
+currently exposes only mmap usage.
+**Success Criteria** (what must be TRUE):
+  1. `model/loader`, maintained benchmark lanes, paritychecker lanes, and embedded probes can
+     select or report read-backed loading only through public tensor and I/O runtime contracts.
+  2. Maintained benchmark, paritychecker, and embedded probe lanes avoid actor-internal
+     reach-through and contain no low-level read logic.
+  3. Benchmark and parity output reports read-strategy usage only when the EMEL lane executed
+     the read-backed runtime path.
+  4. Unsupported or fallback behavior is reported as unsupported or non-read-strategy, not as
+     read-strategy parity or performance evidence.
+  5. Runtime done/error evidence distinguishes mmap, read/copy, unsupported, and non-I/O
+     loading paths without relying on tool-only scaffolds.
+**Plans**: 01 — Validated 2026-05-05; added public model-loader load-strategy
+evidence, maintained tool strategy binding, load-strategy output notes, and
+source-backed tests proving benchmark/parity/embedded lanes avoid callback-time
+actor reach-through.
+
+#### Phase 217: Behavior Tests and Scope Guardrails
+**Goal**: Tests and guardrails prove read behavior through public dispatch and prevent scope
+or ownership leaks.
+**Depends on**: Phase 216
+**Requirements**: VAL-01, VAL-02
+**Gap Closure**: Closes v1.25 audit gaps for missing full-scope read behavior tests,
+domain/source guardrails, and former ambiguous read-strategy naming relative to the
+out-of-scope staged/chunked policy.
+**Success Criteria** (what must be TRUE):
+  1. Doctests drive supported read behavior through `process_event(...)` and inspect SML states
+     via `visit_current_states` and/or `is(...)`.
+  2. Doctests cover representative unsupported, validation failure, file open failure, and file
+     read failure outcomes through public events.
+  3. Guardrails fail if read implementation leaks into `model/loader` or tensor residency
+     ownership moves out of `model/tensor`.
+  4. Guardrails fail if mmap, staged or chunked constrained-memory, cooperative async,
+     device-specific, model-family widening, loader-owned byte access, or tool-only read
+     scaffold behavior enters this milestone.
+  5. Source guardrails clarify or eliminate any public naming that could present the v1.25
+     read/copy path as staged/chunked constrained-memory support.
+**Plans**: 01 — Validated 2026-05-05; renamed the copy strategy to
+`read_copy`, added public-dispatch behavior guardrails, tensor-residency ownership
+guardrails, and maintained tool/model-loader no-reach-through source checks.
+
+#### Phase 218: Publication and Maintained Artifact Updates
+**Goal**: Maintained docs, snapshots, benchmark outputs, model artifacts, and planning truth
+describe read-strategy support exactly as implemented.
+**Depends on**: Phase 217
+**Requirements**: VAL-03
+**Gap Closure**: Closes v1.25 audit gaps for stale planning truth, stale generated docs,
+and missing maintained artifact updates. User approved updating snapshots, benchmarks, and
+models as needed during this gap closure command.
+**Success Criteria** (what must be TRUE):
+  1. Public docs and generated architecture docs describe the read/copy strategy path,
+     ownership boundaries, and deferred strategies (mmap shipped in v1.24; staged/async/device
+     remain deferred) truthfully.
+  2. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
+     from maintained commands when the implementation changes them.
+  3. Planning artifacts record final requirement coverage, validation evidence, and any
+     approved artifact updates for v1.25.
+  4. Closeout artifacts do not claim read-strategy support beyond source-backed maintained
+     runtime behavior.
+  5. Any snapshot, benchmark, or model artifact changes are produced by maintained commands
+     and explicitly tied to source-backed read/copy runtime behavior.
+**Plans**: 01 — Validated 2026-05-05; updated public docs, README template,
+generated architecture docs, benchmark snapshots, planning truth, and final closeout
+audit from maintained commands. The closing full quality gate passed with
+`EMEL_QUALITY_GATES_SCOPE=full EMEL_QUALITY_GATES_PARALLEL=never
+scripts/quality_gates.sh`.
+
+#### Phase 219: Maintained Read Source Provenance
+**Goal**: Maintained benchmark, paritychecker, and embedded probe lanes prove read/copy
+strategy usage from a maintained `src`-owned source contract instead of tool-local full-file
+read scaffolds.
+**Depends on**: Phase 218
+**Requirements**: PLAT-01, TIO-03, VAL-04
+**Gap Closure**: Closes v1.25 audit gaps where generation, Sortformer diarization,
+embedded probe, and paritychecker lanes report `read_copy` after tool-local
+`read_file_bytes` helpers create the source span.
+**Success Criteria** (what must be TRUE):
+  1. Maintained benchmark, paritychecker, and embedded probe lanes no longer own low-level
+     file slurp helpers as the source of `read_copy` evidence.
+  2. A maintained `src`-owned loading/source contract feeds `model/loader -> model/tensor ->
+     io/loader -> io/read` for read/copy tool evidence.
+  3. `read_copy` benchmark/parity/probe output is emitted only when the EMEL lane actually
+     consumed the maintained source contract and executed the public runtime path.
+  4. Unsupported or fallback source behavior is reported as unsupported or non-read-strategy,
+     never as read-strategy parity or performance evidence.
+  5. Tests and source guardrails fail on tool-local substitutes for the maintained read/copy
+     source path.
+
+#### Phase 220: Explicit Tensor Read Outcome Graph
+**Goal**: Tensor-owned read/copy integration exposes success and failure outcomes through
+explicit state/event routing without callback/status-mediated behavior selection.
+**Depends on**: Phase 219
+**Requirements**: TIO-02
+**Gap Closure**: Closes v1.25 audit gap where `model/tensor` represents final outcomes
+with explicit states/events but still uses callback-mutated runtime status inspected by
+guards to select the read outcome path.
+**Success Criteria** (what must be TRUE):
+  1. `model/tensor` read success, unsupported, validation failure, file open failure, and
+     file read failure outcomes are selected by explicit guards/transitions over typed
+     same-RTC events, not by callback-mutated status fields.
+  2. Any same-RTC callbacks used for immediate replies do not decide which tensor outcome
+     path runs.
+  3. No mirrored status fields, context phase flags, or callback-selected outcomes remain in
+     the read-backed tensor outcome path.
+  4. Public doctests prove all representative read success and error outcomes through
+     `process_event(...)` and SML state inspection.
+
+#### Phase 221: Read Closeout Truth Reconciliation
+**Goal**: Maintained docs, generated architecture docs, planning artifacts, snapshots,
+benchmark outputs, model artifacts, and the milestone audit describe read/copy support
+exactly as implemented after gap closure.
+**Depends on**: Phase 220
+**Requirements**: superseded by Phase 223
+**Gap Closure**: Closes v1.25 audit gap where closeout artifacts overstated maintained
+read/copy path truth while tool-local source spans still fed the reported lane. User
+approved updating model artifacts, snapshots, and benchmarks as needed during this gap
+closure command.
+**Success Criteria** (what must be TRUE):
+  1. Public docs, generated architecture docs, ROADMAP, REQUIREMENTS, STATE, PROJECT,
+     MILESTONES, and the milestone audit describe the maintained read/copy path truthfully.
+  2. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
+     from maintained commands when implementation changes require it.
+  3. Phase 214 historical artifacts are reconciled or explicitly marked superseded so they no
+     longer conflict with the Phase 214.1 source-buffer truth.
+  4. A source-backed milestone audit passes without relying on tool-only source scaffolds.
+  5. The closing quality gate is run with the appropriate full or changed-file scope and no
+     benchmark-regression override unless explicitly documented as transitional.
+**Plans**: 01 — Ready only. 2026-05-06 audit found an additional source-contract
+blocker in Phase 219/216 maintained lanes, so Phase 221 is superseded by the
+Phase 222 source-contract repair and Phase 223 closeout truth plan.
+**Summary**: Superseded 2026-05-06 with no source or requirement claims.
+
+#### Phase 222: Public Read Source Contract Repair
+**Goal**: Maintained benchmark, paritychecker, and embedded probe lanes obtain read/copy
+source bytes through an allowed public or non-actor-internal EMEL-owned contract instead of
+including `emel/io/read/detail.hpp`.
+**Depends on**: Phase 220
+**Requirements**: PLAT-01, TIO-03, VAL-02, VAL-04
+**Gap Closure**: Closes v1.25 audit gaps where maintained lanes replaced tool-local
+`read_file_bytes` helpers with direct actor-detail reach-through, causing paritychecker
+guardrails and maintained read/copy evidence to fail.
+**Success Criteria** (what must be TRUE):
+  1. Maintained generation, Sortformer diarization, embedded probe, and paritychecker lanes
+     no longer include or call `emel/io/read/detail.hpp` or any actor `detail.hpp` helper for
+     benchmark/parity source loading.
+  2. Source-byte loading for maintained read/copy evidence is exposed through an allowed
+     EMEL-owned public/runtime/setup contract that does not violate the actor model,
+     benchmark/parity harness rules, or `detail.hpp` ownership rules.
+  3. Maintained lanes still report `read_copy` only when the EMEL lane executes the public
+     `model/loader -> model/tensor -> io/loader -> io/read` runtime path.
+  4. Guardrails fail on actor-internal reach-through, tool-local read substitutes, and any
+     unsupported fallback reported as read/copy evidence.
+  5. Focused paritychecker and maintained generation evidence passes without benchmark
+     regression override.
+**Plans**: 01 — Validated 2026-05-06; moved maintained source-byte loading to
+`emel::io::source::load_file_bytes`, removed `io/read/detail.hpp` reach-through
+from maintained lanes, and restored paritychecker/generation guardrail evidence.
+
+#### Phase 223: Read Closeout Truth And Validation Reconciliation
+**Goal**: Final v1.25 closeout truth, generated artifacts, snapshots, benchmark outputs,
+model artifacts, requirements, roadmap state, and milestone audit reflect the post-Phase 222
+maintained read/copy runtime path.
+**Depends on**: Phase 222
+**Requirements**: TIO-02, VAL-01, VAL-03
+**Gap Closure**: Closes v1.25 audit gaps for stale Phase 220 roadmap state, unvalidated
+Phase 221/VAL-03 closeout truth, dyld-blocked test rerun evidence, and final source-backed
+milestone audit truth.
+**Success Criteria** (what must be TRUE):
+  1. ROADMAP, REQUIREMENTS, STATE, PROJECT, MILESTONES, public docs, generated architecture
+     docs, and the milestone audit no longer claim stale Phase 218/221 closeout truth.
+  2. Phase 220 progress-table state is reconciled with its completed SUMMARY,
+     VERIFICATION, and VALIDATION artifacts.
+  3. Public behavior doctests and maintained guardrails are rerun or the dyld/libSystem launch
+     blocker is explicitly captured with source-backed substitute evidence approved for the
+     phase.
+  4. Lint snapshots, benchmark snapshots, benchmark outputs, and model artifacts are updated
+     only through maintained commands when the repaired implementation changes them.
+  5. A source-backed milestone audit reports every active v1.25 requirement satisfied, with
+     no actor-detail reach-through or tool-only maintained-path evidence.
+**Plans**: 01 — Validated 2026-05-06; reconciled final planning truth, generated
+docs checks, lint snapshot checks, public-dispatch doctests, paritychecker
+guardrails, maintained generation compare evidence, repaired batch planner
+benchmark evidence, the full closeout quality gate, and the source-backed
+milestone audit.
+
+#### Phase 224: Read Closeout Tech Debt Cleanup
+**Goal**: Close the nonblocking tech-debt items from the refreshed v1.25 milestone audit
+before archive.
+**Depends on**: Phase 223
+**Requirements**: none — all v1.25 requirements remain satisfied; this phase is cleanup only
+**Gap Closure**: Addresses audit tech debt without resetting any validated requirement:
+historical Phase 214 supersession noise, public tensor read event maintained-lane coverage shape,
+and fresh `emel_tests_io` evidence after the local dyld/libSystem launch blocker is resolved.
+**Success Criteria** (what must be TRUE):
+  1. Phase 214 historical artifacts are either further reconciled or explicitly confirmed as
+     intentionally superseded by Phase 214.1 without creating closeout ambiguity.
+  2. Maintainers can tell whether `model::tensor::event::request_read_load` should gain a
+     maintained direct-lane coverage path or remain a public tested route while maintained
+     model-loader lanes use `model/tensor` plan/apply plus `io/loader -> io/read`.
+  3. Fresh `emel_tests_io` evidence is captured from a healthy local environment, or the
+     dyld/libSystem launch blocker is captured with an explicit archive-time decision.
+  4. The milestone audit is rerun and either passes or reports only explicitly accepted
+     nonblocking debt.
+**Plans**: 01 — Validated 2026-05-06; Phase 214 supersession clarity,
+`request_read_load` maintained-lane decision evidence, fresh passing
+`emel_tests_io` evidence, and final milestone audit refresh.
+
+#### Phase 225: Read Closeout Runtime Validation And SML Repair
+**Goal**: Close refreshed v1.25 audit gaps by restoring executable model/batch validation,
+moving maintained read/copy per-tensor I/O orchestration out of model-loader action loops,
+and reconciling closeout artifact paths.
+**Depends on**: Phase 224
+**Requirements**: VAL-01, TIO-03, VAL-04, VAL-03
+**Gap Closure**: Closes `.planning/v1.25-MILESTONE-AUDIT.md` findings: current
+`emel_tests_model_and_batch` dyld launch failure, model-loader action-loop
+`io_loader->process_event(...)` SML readiness risk, and stale archived closeout path
+references.
+**Success Criteria** (what must be TRUE):
+  1. `ctest --test-dir build/zig --output-on-failure -R emel_tests_model_and_batch`
+     runs to completion or the dyld/libSystem launch blocker is eliminated with a
+     source-backed maintained substitute explicitly recorded in validation.
+  2. Maintained read/copy `model/loader -> io/loader` orchestration no longer relies on an
+     action loop calling `io_loader->process_event(...)`; runtime choice and per-phase
+     orchestration are represented with explicit SML guards/states/transitions.
+  3. The maintained read/copy path still reports `used_io_strategy` only after public
+     runtime execution through `model/loader -> model/tensor -> io/loader -> io/read`.
+  4. Closeout artifact paths in active and archived roadmap/requirements/audit docs point
+     at files that exist after the v1.25 archive layout.
+  5. Focused model-loader, model/tensor, io/loader, io/read, domain-boundary, consistency,
+     and changed-file quality gates pass without benchmark-regression override.
+**Plans**: 6 plans — completed 2026-05-06
+Plans:
+- [x] `225-01-PLAN.md` — Add the owning `io/read` batch copy surface and public-dispatch tests.
+- [x] `225-02-PLAN.md` — Route one `io/loader` read_copy batch to `io/read` with same-RTC result callbacks.
+- [x] `225-03-PLAN.md` — Replace model-loader per-tensor I/O dispatch with one public batch dispatch.
+- [x] `225-04-PLAN.md` — Wire maintained callers and guardrails to request-owned `io_load_spans`.
+- [x] `225-05-PLAN.md` — Reconcile active and archived closeout path and plan traceability.
+- [x] `225-06-PLAN.md` — Publish validation, summary, and active/archived audit evidence.
+
+#### Phase 226: Read Batch Cap And Closeout Evidence Refresh
+**Goal**: Close the nonblocking tech-debt items from `.planning/v1.25-MILESTONE-AUDIT.md`
+by bounding the public read/copy batch API independently and refreshing closeout evidence
+to match current executable validation.
+**Depends on**: Phase 225
+**Requirements**: none — all v1.25 requirements remain satisfied; this phase is cleanup only
+**Gap Closure**: Closes audit tech debt for the uncapped public `io/read`
+`read_tensor_batch` span and stale dyld-fallback closeout wording after current focused
+CTest passed.
+**Success Criteria** (what must be TRUE):
+  1. Public `io/read::event::read_tensor_batch` dispatch rejects over-large spans before
+     iterating or copying, with the cap owned by a public/read-side contract rather than
+     relying only on maintained model-loader callers.
+  2. Doctests prove accepted boundary-size batches and rejected over-large batches through
+     public `process_event(...)` dispatch and SML state inspection.
+  3. Active and archived closeout evidence distinguishes historical dyld fallback evidence
+     from current direct `build/zig` focused CTest evidence.
+  4. If the repaired implementation changes maintained snapshots, benchmark outputs,
+     benchmark snapshots, or model artifacts, those artifacts are updated only through
+     maintained commands. User permission for those updates was granted with this phase.
+  5. Changed-file quality gates pass without benchmark-regression override, and the
+     refreshed milestone audit reports no blockers.
+**Plans**: 01 — Validated 2026-05-06; public `io/read` batch cap added,
+exact-cap and over-cap doctests passed, closeout evidence refreshed, and
+changed-file quality gate passed.
+
+#### Coverage
+
+| Requirement | Phase |
+|-------------|-------|
+| READ-01 | Phase 212 |
+| READ-02 | Phase 213 |
+| PLAT-01 | Phase 222 |
+| READ-03 | Phase 214.1 |
+| LIFE-01 | Phase 214.1 |
+| ERR-01 | Phase 214.1 |
+| TIO-01 | Phase 215 |
+| TIO-02 | Phase 223 |
+| TIO-03 | Phase 225 |
+| VAL-04 | Phase 225 |
+| VAL-01 | Phase 225 |
+| VAL-02 | Phase 222 |
+| VAL-03 | Phase 225 |
+
+Mapped: 13/13 v1 requirements; validated 13, pending 0. Phases 224 and 226 are
+cleanup-only; Phase 225 closed refreshed closeout gaps for VAL-01, TIO-03, VAL-04,
+and VAL-03.
+
+<details>
+<summary>✅ v1.24 I/O Mmap Loading Strategy (Phases 204-211) — SHIPPED 2026-05-04</summary>
+
+- [x] Phase 204: Mmap Strategy Component Boundary (1/1 plans) — completed 2026-05-04
+- [x] Phase 205: Mmap Validation and Platform Gating (1/1 plans) — completed 2026-05-04
+- [x] Phase 206: Mapped Descriptor, Errors, and Lifetime (1/1 plans) — completed 2026-05-04
+- [x] Phase 207: Tensor-Owned Mmap Integration (1/1 plans) — completed 2026-05-04
+- [x] Phase 208: Public Runtime and Evidence Surfaces (1/1 plans) — completed 2026-05-04
+- [x] Phase 209: Behavior Tests and Scope Guardrails (1/1 plans) — completed 2026-05-04
+- [x] Phase 210: Publication and Maintained Artifact Updates (1/1 plans) — completed 2026-05-04
+- [x] Phase 211: Phase Verification Artifact Backfill (1/1 plans) — completed 2026-05-04 (gap closure)
+
+Archive:
+- `.planning/milestones/v1.24-ROADMAP.md`
+- `.planning/milestones/v1.24-REQUIREMENTS.md`
+- `.planning/milestones/v1.24-MILESTONE-AUDIT.md`
+- `.planning/milestones/v1.24-phases/{204..210}-*` (Phase 211 backfill artifacts live alongside their parent phase dirs)
+
+</details>
+
+<details>
+<summary>✅ v1.23 I/O Loading Strategy Boundary (Phases 197-203) — SHIPPED 2026-05-04</summary>
+
+Archive:
+- `.planning/milestones/v1.23-ROADMAP.md`
+- `.planning/milestones/v1.23-REQUIREMENTS.md`
+- `.planning/milestones/v1.23-MILESTONE-AUDIT.md`
+- `.planning/milestones/v1.23-phases/`
+
+</details>
+
+### 📋 Milestone backlog
+
+Older “next milestone” staging notes are superseded by **v1.26** (issue #63) in active planning
+artifacts (`REQUIREMENTS.md`, `STATE.md`). Future milestones after v1.26 continue via
+`$gsd-new-milestone`.
+
+## Progress
+
+| Phase | Milestone | Plans Complete | Status | Completed |
+|-------|-----------|----------------|--------|-----------|
+| 227. Staged Read Strategy Component Boundary | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 228. Span, Target-Window, and Platform Gating | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 229. Staged Copy Progress and Completion Semantics | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 230. Context Cleanness and Per-Attempt Lifetime | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 231. Deterministic Error Taxonomy | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 232. Tensor-Owned Integration Graph | v1.26 | 1/1 | Complete | 2026-05-07 |
+| 233. Public Loader and Maintained Entrypoints | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 234. Public Dispatch Tests | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 235. Scope and Non-Regression Guardrails | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 236. Publication and Evidence Truthfulness | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 237. Direct Tensor Staged Offset Contract Repair | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 238. Audit Artifact and Probe Reporting Cleanup | v1.26 | 1/1 | Complete | 2026-05-08 |
+| 212. Read Strategy Component Boundary | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 213. Read Validation and Platform Gating | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 214. Read Execution, Errors, and Lifetime | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 214.1. RTC-Safe Read Execution Boundary Repair | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 215. Tensor-Owned Read Integration | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 216. Public Runtime and Evidence Surfaces | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 217. Behavior Tests and Scope Guardrails | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 218. Publication and Maintained Artifact Updates | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 219. Maintained Read Source Provenance | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 220. Explicit Tensor Read Outcome Graph | v1.25 | 1/1 | Validated | 2026-05-05 |
+| 221. Read Closeout Truth Reconciliation | v1.25 | 1/1 | Superseded | 2026-05-06 |
+| 222. Public Read Source Contract Repair | v1.25 | 1/1 | Validated | 2026-05-06 |
+| 223. Read Closeout Truth And Validation Reconciliation | v1.25 | 1/1 | Validated | 2026-05-06 |
+| 224. Read Closeout Tech Debt Cleanup | v1.25 | 1/1 | Complete    | 2026-05-06 |
+| 225. Read Closeout Runtime Validation And SML Repair | v1.25 | 6/6 | Complete   | 2026-05-06 |
+| 226. Read Batch Cap And Closeout Evidence Refresh | v1.25 | 1/1 | Validated | 2026-05-06 |
+| 204. Mmap Strategy Component Boundary | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 205. Mmap Validation and Platform Gating | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 206. Mapped Descriptor, Errors, and Lifetime | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 207. Tensor-Owned Mmap Integration | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 208. Public Runtime and Evidence Surfaces | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 209. Behavior Tests and Scope Guardrails | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 210. Publication and Maintained Artifact Updates | v1.24 | 1/1 | Complete | 2026-05-04 |
+| 211. Phase Verification Artifact Backfill | v1.24 | 1/1 | Complete | 2026-05-04 |
diff --git a/.planning/phases/227-staged-read-strategy-component-boundary/227-01-PLAN.md b/.planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-01-PLAN.md
similarity index 100%
rename from .planning/phases/227-staged-read-strategy-component-boundary/227-01-PLAN.md
rename to .planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-01-PLAN.md
diff --git a/.planning/phases/227-staged-read-strategy-component-boundary/227-01-SUMMARY.md b/.planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/227-staged-read-strategy-component-boundary/227-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-01-SUMMARY.md
diff --git a/.planning/phases/227-staged-read-strategy-component-boundary/227-CONTEXT.md b/.planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-CONTEXT.md
similarity index 100%
rename from .planning/phases/227-staged-read-strategy-component-boundary/227-CONTEXT.md
rename to .planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-CONTEXT.md
diff --git a/.planning/phases/227-staged-read-strategy-component-boundary/227-VALIDATION.md b/.planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-VALIDATION.md
similarity index 100%
rename from .planning/phases/227-staged-read-strategy-component-boundary/227-VALIDATION.md
rename to .planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-VALIDATION.md
diff --git a/.planning/phases/227-staged-read-strategy-component-boundary/227-VERIFICATION.md b/.planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-VERIFICATION.md
similarity index 100%
rename from .planning/phases/227-staged-read-strategy-component-boundary/227-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/227-staged-read-strategy-component-boundary/227-VERIFICATION.md
diff --git a/.planning/phases/228-span-target-window-platform-gating/228-01-PLAN.md b/.planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-01-PLAN.md
similarity index 100%
rename from .planning/phases/228-span-target-window-platform-gating/228-01-PLAN.md
rename to .planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-01-PLAN.md
diff --git a/.planning/phases/228-span-target-window-platform-gating/228-01-SUMMARY.md b/.planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/228-span-target-window-platform-gating/228-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-01-SUMMARY.md
diff --git a/.planning/phases/228-span-target-window-platform-gating/228-CONTEXT.md b/.planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-CONTEXT.md
similarity index 100%
rename from .planning/phases/228-span-target-window-platform-gating/228-CONTEXT.md
rename to .planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-CONTEXT.md
diff --git a/.planning/phases/228-span-target-window-platform-gating/228-VALIDATION.md b/.planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-VALIDATION.md
similarity index 100%
rename from .planning/phases/228-span-target-window-platform-gating/228-VALIDATION.md
rename to .planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-VALIDATION.md
diff --git a/.planning/phases/228-span-target-window-platform-gating/228-VERIFICATION.md b/.planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-VERIFICATION.md
similarity index 100%
rename from .planning/phases/228-span-target-window-platform-gating/228-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/228-span-target-window-platform-gating/228-VERIFICATION.md
diff --git a/.planning/phases/229-staged-copy-progress-and-completion-semantics/229-01-PLAN.md b/.planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-01-PLAN.md
similarity index 100%
rename from .planning/phases/229-staged-copy-progress-and-completion-semantics/229-01-PLAN.md
rename to .planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-01-PLAN.md
diff --git a/.planning/phases/229-staged-copy-progress-and-completion-semantics/229-01-SUMMARY.md b/.planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/229-staged-copy-progress-and-completion-semantics/229-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-01-SUMMARY.md
diff --git a/.planning/phases/229-staged-copy-progress-and-completion-semantics/229-CONTEXT.md b/.planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-CONTEXT.md
similarity index 100%
rename from .planning/phases/229-staged-copy-progress-and-completion-semantics/229-CONTEXT.md
rename to .planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-CONTEXT.md
diff --git a/.planning/phases/229-staged-copy-progress-and-completion-semantics/229-VALIDATION.md b/.planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-VALIDATION.md
similarity index 100%
rename from .planning/phases/229-staged-copy-progress-and-completion-semantics/229-VALIDATION.md
rename to .planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-VALIDATION.md
diff --git a/.planning/phases/229-staged-copy-progress-and-completion-semantics/229-VERIFICATION.md b/.planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-VERIFICATION.md
similarity index 100%
rename from .planning/phases/229-staged-copy-progress-and-completion-semantics/229-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/229-staged-copy-progress-and-completion-semantics/229-VERIFICATION.md
diff --git a/.planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-01-PLAN.md b/.planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-01-PLAN.md
similarity index 100%
rename from .planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-01-PLAN.md
rename to .planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-01-PLAN.md
diff --git a/.planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-01-SUMMARY.md b/.planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-01-SUMMARY.md
diff --git a/.planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-CONTEXT.md b/.planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-CONTEXT.md
similarity index 100%
rename from .planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-CONTEXT.md
rename to .planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-CONTEXT.md
diff --git a/.planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-VALIDATION.md b/.planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-VALIDATION.md
similarity index 100%
rename from .planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-VALIDATION.md
rename to .planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-VALIDATION.md
diff --git a/.planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-VERIFICATION.md b/.planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-VERIFICATION.md
similarity index 100%
rename from .planning/phases/230-context-cleanness-and-per-attempt-lifetime/230-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/230-context-cleanness-and-per-attempt-lifetime/230-VERIFICATION.md
diff --git a/.planning/phases/231-deterministic-error-taxonomy/231-01-PLAN.md b/.planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-01-PLAN.md
similarity index 100%
rename from .planning/phases/231-deterministic-error-taxonomy/231-01-PLAN.md
rename to .planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-01-PLAN.md
diff --git a/.planning/phases/231-deterministic-error-taxonomy/231-01-SUMMARY.md b/.planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/231-deterministic-error-taxonomy/231-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-01-SUMMARY.md
diff --git a/.planning/phases/231-deterministic-error-taxonomy/231-CONTEXT.md b/.planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-CONTEXT.md
similarity index 100%
rename from .planning/phases/231-deterministic-error-taxonomy/231-CONTEXT.md
rename to .planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-CONTEXT.md
diff --git a/.planning/phases/231-deterministic-error-taxonomy/231-VALIDATION.md b/.planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-VALIDATION.md
similarity index 100%
rename from .planning/phases/231-deterministic-error-taxonomy/231-VALIDATION.md
rename to .planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-VALIDATION.md
diff --git a/.planning/phases/231-deterministic-error-taxonomy/231-VERIFICATION.md b/.planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-VERIFICATION.md
similarity index 100%
rename from .planning/phases/231-deterministic-error-taxonomy/231-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/231-deterministic-error-taxonomy/231-VERIFICATION.md
diff --git a/.planning/phases/232-tensor-owned-integration-graph/232-01-PLAN.md b/.planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-01-PLAN.md
similarity index 100%
rename from .planning/phases/232-tensor-owned-integration-graph/232-01-PLAN.md
rename to .planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-01-PLAN.md
diff --git a/.planning/phases/232-tensor-owned-integration-graph/232-01-SUMMARY.md b/.planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/232-tensor-owned-integration-graph/232-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-01-SUMMARY.md
diff --git a/.planning/phases/232-tensor-owned-integration-graph/232-CONTEXT.md b/.planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-CONTEXT.md
similarity index 100%
rename from .planning/phases/232-tensor-owned-integration-graph/232-CONTEXT.md
rename to .planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-CONTEXT.md
diff --git a/.planning/phases/232-tensor-owned-integration-graph/232-VALIDATION.md b/.planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-VALIDATION.md
similarity index 100%
rename from .planning/phases/232-tensor-owned-integration-graph/232-VALIDATION.md
rename to .planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-VALIDATION.md
diff --git a/.planning/phases/232-tensor-owned-integration-graph/232-VERIFICATION.md b/.planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-VERIFICATION.md
similarity index 100%
rename from .planning/phases/232-tensor-owned-integration-graph/232-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/232-tensor-owned-integration-graph/232-VERIFICATION.md
diff --git a/.planning/phases/233-public-loader-and-maintained-entrypoints/233-01-PLAN.md b/.planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-01-PLAN.md
similarity index 100%
rename from .planning/phases/233-public-loader-and-maintained-entrypoints/233-01-PLAN.md
rename to .planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-01-PLAN.md
diff --git a/.planning/phases/233-public-loader-and-maintained-entrypoints/233-01-SUMMARY.md b/.planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/233-public-loader-and-maintained-entrypoints/233-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-01-SUMMARY.md
diff --git a/.planning/phases/233-public-loader-and-maintained-entrypoints/233-CONTEXT.md b/.planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-CONTEXT.md
similarity index 100%
rename from .planning/phases/233-public-loader-and-maintained-entrypoints/233-CONTEXT.md
rename to .planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-CONTEXT.md
diff --git a/.planning/phases/233-public-loader-and-maintained-entrypoints/233-VALIDATION.md b/.planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-VALIDATION.md
similarity index 100%
rename from .planning/phases/233-public-loader-and-maintained-entrypoints/233-VALIDATION.md
rename to .planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-VALIDATION.md
diff --git a/.planning/phases/233-public-loader-and-maintained-entrypoints/233-VERIFICATION.md b/.planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-VERIFICATION.md
similarity index 100%
rename from .planning/phases/233-public-loader-and-maintained-entrypoints/233-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/233-public-loader-and-maintained-entrypoints/233-VERIFICATION.md
diff --git a/.planning/phases/234-public-dispatch-tests/234-01-PLAN.md b/.planning/milestones/v1.26-phases/234-public-dispatch-tests/234-01-PLAN.md
similarity index 100%
rename from .planning/phases/234-public-dispatch-tests/234-01-PLAN.md
rename to .planning/milestones/v1.26-phases/234-public-dispatch-tests/234-01-PLAN.md
diff --git a/.planning/phases/234-public-dispatch-tests/234-01-SUMMARY.md b/.planning/milestones/v1.26-phases/234-public-dispatch-tests/234-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/234-public-dispatch-tests/234-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/234-public-dispatch-tests/234-01-SUMMARY.md
diff --git a/.planning/phases/234-public-dispatch-tests/234-CONTEXT.md b/.planning/milestones/v1.26-phases/234-public-dispatch-tests/234-CONTEXT.md
similarity index 100%
rename from .planning/phases/234-public-dispatch-tests/234-CONTEXT.md
rename to .planning/milestones/v1.26-phases/234-public-dispatch-tests/234-CONTEXT.md
diff --git a/.planning/phases/234-public-dispatch-tests/234-VALIDATION.md b/.planning/milestones/v1.26-phases/234-public-dispatch-tests/234-VALIDATION.md
similarity index 100%
rename from .planning/phases/234-public-dispatch-tests/234-VALIDATION.md
rename to .planning/milestones/v1.26-phases/234-public-dispatch-tests/234-VALIDATION.md
diff --git a/.planning/phases/234-public-dispatch-tests/234-VERIFICATION.md b/.planning/milestones/v1.26-phases/234-public-dispatch-tests/234-VERIFICATION.md
similarity index 100%
rename from .planning/phases/234-public-dispatch-tests/234-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/234-public-dispatch-tests/234-VERIFICATION.md
diff --git a/.planning/phases/235-scope-and-non-regression-guardrails/235-01-PLAN.md b/.planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-01-PLAN.md
similarity index 100%
rename from .planning/phases/235-scope-and-non-regression-guardrails/235-01-PLAN.md
rename to .planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-01-PLAN.md
diff --git a/.planning/phases/235-scope-and-non-regression-guardrails/235-01-SUMMARY.md b/.planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/235-scope-and-non-regression-guardrails/235-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-01-SUMMARY.md
diff --git a/.planning/phases/235-scope-and-non-regression-guardrails/235-CONTEXT.md b/.planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-CONTEXT.md
similarity index 100%
rename from .planning/phases/235-scope-and-non-regression-guardrails/235-CONTEXT.md
rename to .planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-CONTEXT.md
diff --git a/.planning/phases/235-scope-and-non-regression-guardrails/235-VALIDATION.md b/.planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-VALIDATION.md
similarity index 100%
rename from .planning/phases/235-scope-and-non-regression-guardrails/235-VALIDATION.md
rename to .planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-VALIDATION.md
diff --git a/.planning/phases/235-scope-and-non-regression-guardrails/235-VERIFICATION.md b/.planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-VERIFICATION.md
similarity index 100%
rename from .planning/phases/235-scope-and-non-regression-guardrails/235-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/235-scope-and-non-regression-guardrails/235-VERIFICATION.md
diff --git a/.planning/phases/236-publication-and-evidence-truthfulness/236-01-PLAN.md b/.planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-01-PLAN.md
similarity index 100%
rename from .planning/phases/236-publication-and-evidence-truthfulness/236-01-PLAN.md
rename to .planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-01-PLAN.md
diff --git a/.planning/phases/236-publication-and-evidence-truthfulness/236-01-SUMMARY.md b/.planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/236-publication-and-evidence-truthfulness/236-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-01-SUMMARY.md
diff --git a/.planning/phases/236-publication-and-evidence-truthfulness/236-CONTEXT.md b/.planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-CONTEXT.md
similarity index 100%
rename from .planning/phases/236-publication-and-evidence-truthfulness/236-CONTEXT.md
rename to .planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-CONTEXT.md
diff --git a/.planning/phases/236-publication-and-evidence-truthfulness/236-VALIDATION.md b/.planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-VALIDATION.md
similarity index 100%
rename from .planning/phases/236-publication-and-evidence-truthfulness/236-VALIDATION.md
rename to .planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-VALIDATION.md
diff --git a/.planning/phases/236-publication-and-evidence-truthfulness/236-VERIFICATION.md b/.planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-VERIFICATION.md
similarity index 100%
rename from .planning/phases/236-publication-and-evidence-truthfulness/236-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/236-publication-and-evidence-truthfulness/236-VERIFICATION.md
diff --git a/.planning/phases/237-direct-tensor-staged-offset-contract-repair/237-01-PLAN.md b/.planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-01-PLAN.md
similarity index 100%
rename from .planning/phases/237-direct-tensor-staged-offset-contract-repair/237-01-PLAN.md
rename to .planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-01-PLAN.md
diff --git a/.planning/phases/237-direct-tensor-staged-offset-contract-repair/237-01-SUMMARY.md b/.planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/237-direct-tensor-staged-offset-contract-repair/237-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-01-SUMMARY.md
diff --git a/.planning/phases/237-direct-tensor-staged-offset-contract-repair/237-CONTEXT.md b/.planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-CONTEXT.md
similarity index 100%
rename from .planning/phases/237-direct-tensor-staged-offset-contract-repair/237-CONTEXT.md
rename to .planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-CONTEXT.md
diff --git a/.planning/phases/237-direct-tensor-staged-offset-contract-repair/237-VALIDATION.md b/.planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-VALIDATION.md
similarity index 100%
rename from .planning/phases/237-direct-tensor-staged-offset-contract-repair/237-VALIDATION.md
rename to .planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-VALIDATION.md
diff --git a/.planning/phases/237-direct-tensor-staged-offset-contract-repair/237-VERIFICATION.md b/.planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-VERIFICATION.md
similarity index 100%
rename from .planning/phases/237-direct-tensor-staged-offset-contract-repair/237-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/237-direct-tensor-staged-offset-contract-repair/237-VERIFICATION.md
diff --git a/.planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-PLAN.md b/.planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-PLAN.md
similarity index 100%
rename from .planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-PLAN.md
rename to .planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-PLAN.md
diff --git a/.planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-SUMMARY.md b/.planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-SUMMARY.md
similarity index 100%
rename from .planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-SUMMARY.md
rename to .planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-01-SUMMARY.md
diff --git a/.planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-CONTEXT.md b/.planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-CONTEXT.md
similarity index 100%
rename from .planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-CONTEXT.md
rename to .planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-CONTEXT.md
diff --git a/.planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-VALIDATION.md b/.planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-VALIDATION.md
similarity index 100%
rename from .planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-VALIDATION.md
rename to .planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-VALIDATION.md
diff --git a/.planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-VERIFICATION.md b/.planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-VERIFICATION.md
similarity index 100%
rename from .planning/phases/238-audit-artifact-and-probe-reporting-cleanup/238-VERIFICATION.md
rename to .planning/milestones/v1.26-phases/238-audit-artifact-and-probe-reporting-cleanup/238-VERIFICATION.md
diff --git a/.planning/milestones/v1.27-MILESTONE-AUDIT.md b/.planning/milestones/v1.27-MILESTONE-AUDIT.md
new file mode 100644
index 00000000..7f07c1dd
--- /dev/null
+++ b/.planning/milestones/v1.27-MILESTONE-AUDIT.md
@@ -0,0 +1,232 @@
+---
+milestone: v1.27
+audited: 2026-06-25T14:31:39Z
+status: passed
+scores:
+  requirements: 13/13
+  phases: 6/6
+  integration: 9/9
+  flows: 9/9
+gaps:
+  requirements: []
+  integration: []
+  flows: []
+  phase_artifacts: []
+closed_gaps:
+  - id: "XBN-01-optimized-benchmark-attribution"
+    phase: "244"
+    requirements: ["XBN-01", "XBN-02"]
+    evidence: "Initial audit found the kernel_x86_64 suite only published common f32/unary/matmul rows. Phase 244 was repaired so the maintained suite now publishes counter-checked optimized flash and q2/q3/q6 entries."
+tech_debt: []
+nyquist:
+  overall: compliant
+  compliant_phases:
+    - 239-x86-64-avx2-fma-host-contract-and-baseline-audit
+    - 240-x86-64-flash-attention-avx2-fma-kernel
+    - 241-x86-64-vectorized-q2-k-q3-k-kernels
+    - 242-x86-64-vectorized-q6-k-and-hot-path-contract
+    - 243-runtime-integration-and-parity-proof
+    - 244-benchmark-attribution-and-publication-truth
+  partial_phases: []
+  invalid_phases: []
+  missing_phases: []
+---
+
+# v1.27 Milestone Audit - Ryzen AVX2/FMA Kernel Support
+
+## Result
+
+Status: passed.
+
+All 13 v1.27 requirements are source-backed, all 6 phases are complete, and
+the cross-phase integration check passed with no blockers or tech debt. The
+milestone remains intentionally scoped to this AMD Ryzen 9 5950X class: x86_64
+AVX2 plus FMA, with F16C conversion support. It makes no AVX-512, AVX-VNNI,
+AMX, BF16, native FP16, GPU, or broader model-family claim.
+
+## Requirement Coverage
+
+Three-source cross-reference passed:
+
+- `.planning/REQUIREMENTS.md` marks 13/13 requirements verified.
+- Phase `*-VERIFICATION.md` files for 239-244 mark their assigned requirements
+  passed.
+- Phase `*-SUMMARY.md` frontmatter lists the completed requirements.
+
+| Requirement | Phase | Traceability | Verification | Summary | Final status |
+|-------------|-------|--------------|--------------|---------|--------------|
+| X86-01 | 239 | Verified | Passed | Listed | Satisfied |
+| X86-02 | 239 | Verified | Passed | Listed | Satisfied |
+| XFL-01 | 240 | Verified | Passed | Listed | Satisfied |
+| XFL-02 | 240 | Verified | Passed | Listed | Satisfied |
+| XQK-01 | 241 | Verified | Passed | Listed | Satisfied |
+| XQK-02 | 241 | Verified | Passed | Listed | Satisfied |
+| XQK-03 | 242 | Verified | Passed | Listed | Satisfied |
+| XQK-04 | 242 | Verified | Passed | Listed | Satisfied |
+| XRT-01 | 243 | Verified | Passed | Listed | Satisfied |
+| XRT-02 | 243 | Verified | Passed | Listed | Satisfied |
+| XRT-03 | 243 | Verified | Passed | Listed | Satisfied |
+| XBN-01 | 244 | Verified | Passed | Listed | Satisfied after benchmark repair |
+| XBN-02 | 244 | Verified | Passed | Listed | Satisfied |
+
+## Phase Coverage
+
+| Phase | Verification | Validation | Audit status |
+|-------|--------------|------------|--------------|
+| 239 | `239-VERIFICATION.md` | `239-VALIDATION.md` | Satisfied |
+| 240 | `240-VERIFICATION.md` | `240-VALIDATION.md` | Satisfied |
+| 241 | `241-VERIFICATION.md` | `241-VALIDATION.md` | Satisfied |
+| 242 | `242-VERIFICATION.md` | `242-VALIDATION.md` | Satisfied |
+| 243 | `243-VERIFICATION.md` | `243-VALIDATION.md` | Satisfied |
+| 244 | `244-VERIFICATION.md` | `244-VALIDATION.md` | Satisfied after `XBN-01` source-backed repair |
+
+All validation files carry `status: passed`, `nyquist_compliant: true`, and
+`wave_0_complete: true`, with executable command evidence and rule-compliance
+sign-off.
+
+## Source-Backed Checks
+
+### Host Contract
+
+`src/emel/kernel/x86_64/context.hpp` and `src/emel/kernel/x86_64/sm.hpp`
+publish AVX2, FMA, and F16C feature state and explicit false no-claim fields
+for AVX-512, AVX-VNNI, AMX, BF16, and native FP16. `CMakeLists.txt` wires the
+host-tuned x86_64 build flags for AVX2/FMA/F16C without adding unsupported
+feature flags.
+
+### Optimized Kernel Routing
+
+`src/emel/kernel/x86_64/sm.hpp` routes supported flash, q2_K, q3_K, and q6_K
+requests through explicit guards and transitions before fallback paths.
+`src/emel/kernel/x86_64/actions.hpp` owns the AVX2/FMA/F16C flash kernel and
+the q2_K/q3_K/q6_K x q8_K row kernels. Supported quantized hot-path tests
+assert optimized counters advance, shared counters stay zero, and no dispatch
+allocation occurs.
+
+### Unary Rule Debt
+
+The audit specifically rechecked the earlier rule debt around unary SIMD
+selection. The old generic `execute_avx2_unary(op_unary)` runtime-indexed
+function-pointer table is gone from `src/emel/kernel/x86_64/actions.hpp`.
+Unary SIMD selection is modeled in `sm.hpp` with explicit
+`simd_op_unary_abs`, `simd_op_unary_neg`, and `simd_op_unary_relu` guards and
+compile-time subop action helpers.
+
+### Runtime Path
+
+The maintained generator path drives public events through the shipped
+generator -> graph -> processor -> kernel chain. `src/emel/kernel/any.hpp`
+includes `emel::kernel::x86_64::sm`, and generator diagnostics expose the
+optimized/shared dispatch counters used by tests and parity tooling.
+
+### Parity Path
+
+`tools/paritychecker/parity_engines.cpp` captures public generator diagnostics,
+prints `quantized_dispatch:` attribution, and fails when x86_64 native q2/q3/q6
+tensors do not produce optimized counters or produce shared counters. The
+maintained parity proof covers generation at `1`, `10`, `100`, and `1000`
+tokens, with approved publication baseline updates for the maintained LFM2
+`10`, `100`, and `1000` token runs.
+
+### Benchmark Path
+
+The initial milestone audit found a real `XBN-01` blocker: the
+`kernel_x86_64` benchmark suite published the common f32/unary/matmul rows but
+did not prove the optimized flash and q2/q3/q6 lanes.
+
+That gap is closed. `tools/bench/kernel/x86_64_bench.cpp` now publishes:
+
+- `kernel/x86_64/op_flash_attn_ext_decode_like`
+- `kernel/x86_64/op_mul_mat_q2_k_q8_k`
+- `kernel/x86_64/op_mul_mat_q3_k_q8_k`
+- `kernel/x86_64/op_mul_mat_q6_k_q8_k`
+
+The EMEL benchmark callbacks drive `emel::kernel::x86_64::sm::process_event`
+and abort unless the matching optimized counter increments while the matching
+shared counter does not. Reference-lane comparison remains separate. The
+approved `snapshots/bench/benchmarks.txt` baseline now contains 19
+`kernel/x86_64/*` entries, including the four optimized rows.
+
+## Integration Check
+
+The integration checker returned `status: passed`:
+
+- requirements: 13/13
+- phases: 6/6
+- integration: 9/9
+- blockers: none
+- tech debt: none
+
+It independently traced:
+
+- `kernel_x86_64` benchmark registration, optimized row publication, and
+  snapshot presence.
+- generator -> graph -> processor -> kernel runtime wiring through
+  `process_event(...)`.
+- paritychecker attribution and failure behavior for missing optimized counters.
+- explicit x86_64 SML routing for flash, q2/q3/q6, and unary subops.
+- AVX2/FMA/F16C host feature publication with no unsupported x86 feature claims.
+
+## Validation Evidence
+
+Focused validation after the benchmark and unary-rule repairs:
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS.
+
+```bash
+ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'
+```
+
+Result: PASS, `100% tests passed`.
+
+```bash
+cmake --build build/bench_tools_ninja_kernel_x86_64 --target bench_runner -j2
+```
+
+Result: PASS.
+
+```bash
+scripts/bench.sh --snapshot --compare --suite=kernel_x86_64
+```
+
+Result: PASS after the approved benchmark snapshot update and optimized
+benchmark repair.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="<v1.27 changed-file scope>" \
+scripts/quality_gates.sh
+```
+
+Result: PASS. Selected lanes included legacy SML surface scan, Zig build,
+`kernel_x86_64` benchmark snapshot, changed-file scoped coverage, paritychecker,
+lint snapshot, docs generation, and fuzz routing. Changed-line coverage was
+`707/735` lines (`96.2%`) and `171/240` branches (`71.2%`).
+
+Additional source checks:
+
+```bash
+rg -n "execute_avx2_unary\\(|kernel_index|unary_kernel_t|kernels\\[" \
+  src/emel/kernel/x86_64/actions.hpp tests/kernel/x86_64_tests.cpp
+```
+
+Result: PASS, no matches.
+
+```bash
+rg -n "op_flash_attn_ext_decode_like|op_mul_mat_q2_k_q8_k|op_mul_mat_q3_k_q8_k|op_mul_mat_q6_k_q8_k" \
+  tools/bench/kernel/x86_64_bench.cpp snapshots/bench/benchmarks.txt
+```
+
+Result: PASS, source and snapshot entries present.
+
+## Closeout Readiness
+
+v1.27 is ready for milestone completion and cleanup. The earlier benchmark
+publication blocker is closed with maintained source and snapshot evidence; the
+x86_64 unary SML rule debt is removed from the milestone path; all active
+requirements are satisfied; and no deferred v1.27 work remains.
diff --git a/.planning/milestones/v1.27-REQUIREMENTS.md b/.planning/milestones/v1.27-REQUIREMENTS.md
new file mode 100644
index 00000000..b3320c50
--- /dev/null
+++ b/.planning/milestones/v1.27-REQUIREMENTS.md
@@ -0,0 +1,148 @@
+# Requirements Archive: v1.27 Ryzen AVX2/FMA Kernel Support
+
+**Archived:** 2026-06-25
+**Status:** SHIPPED
+
+For current requirements, see `.planning/REQUIREMENTS.md`.
+
+---
+
+# Requirements: EMEL v1.27 Ryzen AVX2/FMA Kernel Support
+
+**Defined:** 2026-06-25
+**Status:** Verified
+**Core Value:** Prove real end-to-end behavior with explicit SML orchestration and
+parity-oriented verification before widening API surface or model scope.
+**Source:** User request: "add support for this processor exactly how NEON was added"
+
+## v1.27 Requirements
+
+Each requirement is one independently testable obligation and maps to exactly
+one roadmap phase. This milestone targets the current host CPU: AMD Ryzen 9
+5950X with x86_64 AVX2, FMA, and F16C conversion support. It does not claim
+AVX-512, AVX-VNNI, AMX, BF16, or native FP16 arithmetic.
+
+### Host feature contract and build support
+
+- [x] **X86-01**: Maintainer can identify an x86_64 host feature contract that
+  detects and publishes AVX2, FMA, and F16C availability for this Ryzen class
+  without implying unsupported AVX-512, AVX-VNNI, AMX, BF16, or native FP16
+  support.
+
+- [x] **X86-02**: The build/config surface supports host-tuned x86_64 AVX2/FMA
+  code paths analogously to the existing AArch64 host-feature switch, while
+  preserving portable builds and fail-closed behavior when those features are
+  unavailable.
+
+### x86_64 flash attention
+
+- [x] **XFL-01**: Supported x86_64 flash-attention requests execute through an
+  EMEL-owned AVX2/FMA implementation rather than the shared scalar workspace
+  path.
+
+- [x] **XFL-02**: Unsupported or out-of-contract x86_64 flash-attention requests
+  publish deterministic fallback/no-claim behavior instead of silently claiming
+  optimized execution.
+
+### x86_64 quantized kernels
+
+- [x] **XQK-01**: The maintained x86_64 `q2_K x q8_K` hot path uses an
+  EMEL-owned AVX2/FMA kernel on supported requests.
+
+- [x] **XQK-02**: The maintained x86_64 `q3_K x q8_K` hot path uses an
+  EMEL-owned AVX2/FMA kernel on supported requests.
+
+- [x] **XQK-03**: The maintained x86_64 `q6_K x q8_K` hot path uses an
+  EMEL-owned AVX2/FMA kernel on supported requests.
+
+- [x] **XQK-04**: Supported x86_64 quantized hot-path requests remain
+  allocation-free during dispatch and consume the same effective operand class
+  as the reference path, with no whole-tensor dequantize-to-f32 substitution.
+
+### Runtime integration and parity proof
+
+- [x] **XRT-01**: The shipped generator -> graph -> processor -> kernel chain
+  selects the x86_64 optimized path on this host without actor rewrites,
+  queue-based orchestration, or public API widening.
+
+- [x] **XRT-02**: Maintained paritychecker proof covers `1`, `10`, `100`, and
+  `1000` token generation on the maintained path and publishes x86_64
+  attribution sufficient to prove the optimized path actually executed.
+
+- [x] **XRT-03**: Tests prove supported optimized behavior and deterministic
+  fallback/no-claim behavior through public machine dispatch and SML state
+  inspection where applicable.
+
+### Benchmark attribution and publication truth
+
+- [x] **XBN-01**: Maintained benchmark entrypoints exercise the x86_64 optimized
+  flash and quantized paths with attribution distinct from scalar/shared paths
+  and reference-lane execution.
+
+- [x] **XBN-02**: Published benchmark/docs artifacts truthfully distinguish
+  x86_64 Ryzen evidence from ARM-first claims and do not label unsupported
+  requests as optimized.
+
+## Out-of-Contract Tracking
+
+Tracked as explicit non-claims and separate-contract work outside the current
+Ryzen AVX2/FMA host obligation.
+
+### Wider x86 feature families
+
+- **X86-SEPARATE-01**: Add AVX-512, AVX-VNNI, AMX, BF16, or native FP16 paths
+  only on hardware that actually supports them and only after a separate feature
+  contract is approved.
+
+- **X86-SEPARATE-02**: Add additional x86_64 quantized formats beyond the
+  maintained `q2_K/q3_K/q6_K x q8_K` path after the first Ryzen AVX2/FMA slice
+  is source-backed.
+
+### Broader runtime scope
+
+- **XRT-SEPARATE-01**: Widen x86_64 performance publication to additional model
+  families only after the maintained current slice has truthful attribution and
+  parity proof.
+
+## Out of Scope
+
+Explicit exclusions for milestone v1.27:
+
+| Feature | Reason |
+|---------|--------|
+| AVX-512, AVX-VNNI, AMX, BF16, or native FP16 execution claims | The current Ryzen 9 5950X host does not provide those instruction families |
+| GPU acceleration | User asked for this processor; this milestone is CPU-owned x86_64 work |
+| Broad public C ABI or CLI expansion | The NEON precedent optimized the maintained runtime path before widening APIs |
+| New model-family support | Kernel milestone; maintained runtime slice remains the acceptance surface |
+| Tool-only compute fallbacks | EMEL support must live in `src/`, not only in benchmarks or parity tools |
+| Whole-tensor dequantize-to-f32 substitution in the hot path | Would violate the quantized inference performance contract unless explicitly approved |
+| Linking EMEL runtime against llama.cpp/ggml | Reference code remains comparison-only in tools per project policy |
+
+## Traceability
+
+Each row appears exactly once. Phases continue after v1.26's Phase 238.
+
+| Requirement | Phase | Status |
+|-------------|-------|--------|
+| X86-01 | Phase 239 | Verified |
+| X86-02 | Phase 239 | Verified |
+| XFL-01 | Phase 240 | Verified |
+| XFL-02 | Phase 240 | Verified |
+| XQK-01 | Phase 241 | Verified |
+| XQK-02 | Phase 241 | Verified |
+| XQK-03 | Phase 242 | Verified |
+| XQK-04 | Phase 242 | Verified |
+| XRT-01 | Phase 243 | Verified |
+| XRT-02 | Phase 243 | Verified |
+| XRT-03 | Phase 243 | Verified |
+| XBN-01 | Phase 244 | Verified |
+| XBN-02 | Phase 244 | Verified |
+
+**Coverage:**
+- v1.27 requirements: 13 total
+- Mapped to phases: 13
+- Unmapped: 0
+
+---
+*Requirements defined: 2026-06-25*
+*Last updated: 2026-06-25 after Phase 244 verification and approved snapshot publication*
diff --git a/.planning/milestones/v1.27-ROADMAP.md b/.planning/milestones/v1.27-ROADMAP.md
new file mode 100644
index 00000000..ec5e5140
--- /dev/null
+++ b/.planning/milestones/v1.27-ROADMAP.md
@@ -0,0 +1,209 @@
+# Roadmap: EMEL
+
+## Milestones
+
+- [x] **v1.0 EMEL Llama-68M Generation Slice** - shipped 2026-03-08
+- [x] **v1.1 EMEL Llama-68M Generation Benchmark** - shipped 2026-03-11
+- [x] **v1.2 Flash Attention** - shipped 2026-03-22
+- [x] **v1.3 ARM Flash Optimizations** - shipped 2026-03-22
+- [x] **v1.4 Full Vectorized Quantized Kernels** - shipped 2026-03-25
+- [x] **v1.5 Full ARM Quantized Path** - shipped 2026-03-27
+- [x] **v1.6 Qwen3-0.6B Parity And Benchmark** - shipped 2026-03-30
+- [x] **v1.7 Generator Prefill Submachine Decomposition** - shipped 2026-03-30
+- [x] **v1.8 Truthful Qwen3 E2E Embedded Size** - shipped 2026-04-02
+- [x] **v1.9 Liquid LFM2.5-1.2B Thinking ARM Slice** - shipped 2026-04-02
+- [x] **v1.11 TE-75M GGUF Trimodal Embedding Runtime** - shipped 2026-04-15
+- [x] **v1.12 Pluggable Reference Parity Bench Architecture** - shipped 2026-04-18
+- [x] **v1.13 Pluggable Generative Parity Bench** - shipped 2026-04-21
+- [x] **v1.14 Benchmark Variant Organization** - shipped 2026-04-21
+- [x] **v1.15 ARM Sortformer Diarization GGUF Slice** - shipped 2026-04-25
+- [x] **v1.16 ARM Whisper GGUF Parity And Performance** - shipped 2026-04-28
+- [x] **v1.17 Text Generator Domain Alignment** - shipped 2026-04-30
+- [x] **v1.18 Parity Tool Boundary Refactor** - shipped 2026-05-01
+- [x] **v1.19 Benchmark Tool Pluggable Runner Refactor** - shipped 2026-05-01
+- [x] **v1.20 SML Dependency And Namespace Migration** - shipped 2026-05-02
+- [x] **v1.21 Quality Gate Selective Runner Optimization** - shipped 2026-05-02
+- [x] **v1.22 Weight Loading Ownership Cutover** - shipped 2026-05-03
+- [x] **v1.23 I/O Loading Strategy Boundary** - shipped 2026-05-04
+- [x] **v1.24 I/O Mmap Loading Strategy** - shipped 2026-05-04
+- [x] **v1.25 I/O Read Loading Strategy** - shipped 2026-05-06
+- [x] **v1.26 I/O Staged Read Loading Strategy** - completed 2026-05-08
+- [ ] **v1.27 Ryzen AVX2/FMA Kernel Support** - planned 2026-06-25
+
+## Current Milestone
+
+### v1.27 Ryzen AVX2/FMA Kernel Support
+
+**Milestone Goal:** Bring the maintained x86_64 runtime path on this AMD Ryzen 9
+5950X host to the same support standard as the earlier NEON/AArch64 path:
+source-backed host feature contract, EMEL-owned AVX2/FMA flash and quantized
+hot-path kernels, maintained runtime/parity proof, and truthful benchmark
+attribution.
+
+**Host contract:** This milestone targets x86_64 AVX2 + FMA, with F16C
+conversion support only. It makes no AVX-512, AVX-VNNI, AMX, BF16, native FP16,
+or GPU execution claim.
+
+**Scope Guardrails:**
+- Keep the milestone narrow to this x86_64 CPU feature class and the maintained
+  runtime/parity/benchmark surfaces.
+- Preserve the generator -> graph -> processor -> kernel chain and current
+  Stateforward.SML orchestration structure.
+- Keep runtime behavior choice in guards/transitions, not action/detail helper
+  routing.
+- Do not accept tool-only compute fallbacks or whole-tensor dequantize-to-f32
+  substitution for quantized hot paths without explicit user approval.
+- Keep llama.cpp/ggml reference linkage confined to comparison-only tool lanes.
+
+Execution order: 239, 240, 241, 242, 243, 244.
+
+**Milestone progress (v1.27):** **6 / 6** phases fully verified.
+Source/test complete: **6 / 6** phases.
+
+- [x] Phase 239: x86_64 AVX2/FMA Host Contract and Baseline Audit (X86-01, X86-02)
+- [x] Phase 240: x86_64 Flash Attention AVX2/FMA Kernel (XFL-01, XFL-02)
+- [x] Phase 241: x86_64 Vectorized q2_K/q3_K Kernels (XQK-01, XQK-02)
+- [x] Phase 242: x86_64 Vectorized q6_K and Hot-Path Contract (XQK-03, XQK-04)
+- [x] Phase 243: Runtime Integration and Parity Proof (XRT-01, XRT-02, XRT-03)
+- [x] Phase 244: Benchmark Attribution and Publication Truth (XBN-01, XBN-02)
+
+## Phase Details
+
+### Phase 239: x86_64 AVX2/FMA Host Contract and Baseline Audit
+
+**Goal:** Define the Ryzen AVX2/FMA host contract, add/build the host-feature
+surface, and inventory the current x86_64 kernel path against the NEON/AArch64
+precedent before porting kernels.
+
+**Depends on:** Phase 238
+**Requirements:** X86-01, X86-02
+
+**Success Criteria:**
+
+1. Runtime/build evidence identifies AVX2, FMA, and F16C support on this host
+   and explicitly no-claims AVX-512, AVX-VNNI, AMX, BF16, and native FP16.
+2. CMake or equivalent config exposes host-tuned x86_64 AVX2/FMA compilation
+   without breaking portable or cross-compiled builds.
+3. A source-backed audit maps current `src/emel/kernel/x86_64` behavior against
+   the NEON/AArch64 flash and quantized support pattern.
+4. Tests prove supported and unsupported feature-contract behavior through the
+   public x86_64 kernel actor surface where applicable.
+
+### Phase 240: x86_64 Flash Attention AVX2/FMA Kernel
+
+**Goal:** Port the maintained flash-attention optimization pattern to an
+EMEL-owned x86_64 AVX2/FMA implementation with deterministic fallback/no-claim
+behavior for unsupported requests.
+
+**Depends on:** Phase 239
+**Requirements:** XFL-01, XFL-02
+
+**Success Criteria:**
+
+1. Supported x86_64 flash-attention requests execute through an AVX2/FMA path
+   rather than the shared scalar workspace helper.
+2. The optimized path preserves reusable workspace semantics and introduces no
+   dispatch-time allocation.
+3. Unsupported shape, dtype, feature, or operand contracts publish explicit
+   fallback/no-claim behavior.
+4. Focused kernel tests compare optimized output against the maintained scalar
+   or reference oracle within the accepted numeric tolerance.
+
+### Phase 241: x86_64 Vectorized q2_K/q3_K Kernels
+
+**Goal:** Land EMEL-owned AVX2/FMA `q2_K x q8_K` and `q3_K x q8_K` kernels for
+the maintained x86_64 quantized hot path.
+
+**Depends on:** Phase 240
+**Requirements:** XQK-01, XQK-02
+
+**Success Criteria:**
+
+1. Maintained `q2_K x q8_K` requests can execute through an AVX2/FMA kernel on
+   supported x86_64 hosts.
+2. Maintained `q3_K x q8_K` requests can execute through an AVX2/FMA kernel on
+   supported x86_64 hosts.
+3. Kernel-seam proof distinguishes optimized execution from scalar/shared row
+   helpers for both formats.
+4. Correctness tests cover representative block groups, tails, and accumulation
+   behavior against the maintained scalar/reference oracle.
+
+### Phase 242: x86_64 Vectorized q6_K and Hot-Path Contract
+
+**Goal:** Add the x86_64 AVX2/FMA `q6_K x q8_K` kernel and lock the maintained
+quantized hot path to operand fidelity and zero dispatch-time allocation.
+
+**Depends on:** Phase 241
+**Requirements:** XQK-03, XQK-04
+
+**Success Criteria:**
+
+1. Maintained `q6_K x q8_K` requests can execute through an AVX2/FMA kernel on
+   supported x86_64 hosts.
+2. Supported `q2_K/q3_K/q6_K x q8_K` requests consume the same effective operand
+   class as the reference path.
+3. Tests and/or allocation instrumentation prove no hot-path allocation and no
+   whole-tensor dequantize-to-f32 substitution for supported optimized requests.
+4. Unsupported quantized cases remain explicit fallback/no-claim paths rather
+   than silent performance claims.
+
+### Phase 243: Runtime Integration and Parity Proof
+
+**Goal:** Adopt the optimized x86_64 kernel set in the shipped runtime chain and
+prove supported plus fallback behavior through maintained parity and tests.
+
+**Depends on:** Phase 242
+**Requirements:** XRT-01, XRT-02, XRT-03
+
+**Success Criteria:**
+
+1. The shipped generator -> graph -> processor -> kernel chain selects the
+   optimized x86_64 path on this host without actor rewrites or public API
+   widening.
+2. Maintained paritychecker output covers `1`, `10`, `100`, and `1000` token
+   generation and proves the x86_64 optimized path actually executed.
+3. Regression tests cover supported optimized execution and deterministic
+   fallback/no-claim behavior through public dispatch and SML state inspection.
+4. Runtime counters/attribution remain bounded, deterministic, and
+   allocation-free in hot dispatch paths.
+
+### Phase 244: Benchmark Attribution and Publication Truth
+
+**Goal:** Publish maintained benchmark and documentation evidence that measures
+the Ryzen AVX2/FMA path truthfully and distinguishes it from ARM-first and
+reference-lane claims.
+
+**Depends on:** Phase 243
+**Requirements:** XBN-01, XBN-02
+
+**Success Criteria:**
+
+1. `tools/bench` runs maintained x86_64 flash and quantized workloads through
+   optimized paths and reports attribution distinct from scalar/shared paths.
+2. Benchmark docs and stored artifacts clearly identify host CPU, feature
+   contract, optimized-path counters, and reference-lane separation.
+3. Published results do not label unsupported requests as optimized and do not
+   dilute existing ARM-first benchmark claims.
+4. Required benchmark, docs, lint, parity, and quality-gate evidence is captured
+   from maintained commands without snapshot updates unless explicitly approved.
+
+## Progress
+
+**Execution Order:** 239 -> 240 -> 241 -> 242 -> 243 -> 244
+
+| Phase | Plans Complete | Status | Completed |
+|-------|----------------|--------|-----------|
+| 239. x86_64 AVX2/FMA Host Contract and Baseline Audit | 1/1 | Complete | 2026-06-25 |
+| 240. x86_64 Flash Attention AVX2/FMA Kernel | 1/1 | Complete | 2026-06-25 |
+| 241. x86_64 Vectorized q2_K/q3_K Kernels | 1/1 | Complete | 2026-06-25 |
+| 242. x86_64 Vectorized q6_K and Hot-Path Contract | 1/1 | Complete | 2026-06-25 |
+| 243. Runtime Integration and Parity Proof | 1/1 | Complete | 2026-06-25 |
+| 244. Benchmark Attribution and Publication Truth | 1/1 | Complete | 2026-06-25 |
+
+---
+
+Next implementation step: complete and archive the v1.27 milestone.
+
+Closeout gate: milestone completion and cleanup after the passed source-backed
+audit, approved snapshot updates, optimized benchmark repair, and scoped quality
+gate.
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-PLAN.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-PLAN.md
new file mode 100644
index 00000000..88c5aea0
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-PLAN.md
@@ -0,0 +1,136 @@
+---
+phase: 239
+plan: 01
+title: x86_64 AVX2/FMA Host Contract and Baseline Audit
+wave: 1
+depends_on: []
+autonomous: true
+files_modified:
+  - CMakeLists.txt
+  - src/emel/kernel/x86_64/context.hpp
+  - src/emel/kernel/x86_64/actions.hpp
+  - src/emel/kernel/x86_64/sm.hpp
+  - src/emel/kernel/aarch64/actions.hpp
+  - src/emel/diarization/sortformer/detail.cpp
+  - src/emel/embeddings/generator/detail.hpp
+  - src/emel/text/generator/detail.hpp
+  - tests/kernel/x86_64_tests.cpp
+  - tests/kernel/aarch64_tests.cpp
+  - tests/text/generator/detail_tests.cpp
+  - tools/paritychecker/CMakeLists.txt
+  - .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-X86-BASELINE-AUDIT.md
+  - .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VERIFICATION.md
+  - .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VALIDATION.md
+  - .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-SUMMARY.md
+rule_constraints:
+  - Keep runtime behavior choice in x86_64 guards/transitions.
+  - Keep action/detail helpers limited to already-selected numeric work.
+  - Do not add AVX-512, AVX-VNNI, AMX, BF16, native FP16, or GPU claims.
+  - Do not implement Phase 240-242 kernels in this phase.
+must_haves:
+  - AVX2, FMA, and F16C are explicitly detected/published for x86_64.
+  - Unsupported feature families are explicitly published as no-claim.
+  - x86_64 host-tuned build flags exist and exclude unsupported feature families.
+  - A source-backed audit maps current x86_64 state against the NEON precedent.
+  - Focused tests prove supported and unsupported contract behavior.
+---
+
+# Phase 239 Plan: x86_64 AVX2/FMA Host Contract and Baseline Audit
+
+## Goal
+
+Define the Ryzen AVX2/FMA host contract, add/build the host-feature surface, and
+inventory the current x86_64 kernel path against the NEON/AArch64 precedent
+before porting kernels.
+
+## Tasks
+
+<tasks>
+  <task id="239-01-01" name="Add failing host-contract tests">
+    <instructions>
+      Add focused tests in `tests/kernel/x86_64_tests.cpp` that require the
+      x86_64 feature contract to publish AVX2, FMA, F16C, and explicit no-claim
+      unsupported feature families through the public x86_64 actor wrapper.
+      Run the targeted test binary or build and record the expected pre-fix
+      failure.
+    </instructions>
+    <verification>
+      The new tests fail before implementation because the public feature
+      contract/accessors do not yet exist.
+    </verification>
+  </task>
+
+  <task id="239-01-02" name="Implement x86_64 feature contract">
+    <instructions>
+      Extend `src/emel/kernel/x86_64/context.hpp` and
+      `src/emel/kernel/x86_64/sm.hpp` with a small host feature contract:
+      runtime AVX2/FMA/F16C detection and explicit no-claim booleans for
+      AVX-512, AVX-VNNI, AMX, BF16, and native FP16. Preserve existing context
+      construction used by tests.
+    </instructions>
+    <verification>
+      Focused x86_64 tests compile and pass on both default and forced context
+      construction.
+    </verification>
+  </task>
+
+  <task id="239-01-03" name="Add x86_64 host build option">
+    <instructions>
+      Add an `EMEL_ENABLE_X86_64_HOST_FEATURES` CMake option analogous to the
+      AArch64 option. Apply compiler-checked `-mavx2`, `-mfma`, and `-mf16c`
+      flags only for non-cross x86_64 builds. Repair any compile-only
+      portability gaps exposed by building the maintained x86 host target. Do
+      not add unsupported feature flags.
+    </instructions>
+    <verification>
+      CMake configure reports x86_64 host flags on this host when supported, and
+      source scan confirms no AVX-512/VNNI/AMX/BF16/native-FP16 flags were added.
+    </verification>
+  </task>
+
+  <task id="239-01-04" name="Write baseline audit and closeout artifacts">
+    <instructions>
+      Create `239-X86-BASELINE-AUDIT.md`, `239-VERIFICATION.md`,
+      `239-VALIDATION.md`, and `239-01-SUMMARY.md`. The audit must describe the
+      current x86_64 f32 AVX2 state and the missing flash/quantized/runtime
+      parity work assigned to concrete follow-on v1.27 phases.
+    </instructions>
+    <verification>
+      Artifacts cite source files and command evidence, not planning intent
+      alone. `X86-01` and `X86-02` are marked complete only if source/test/build
+      evidence supports them.
+    </verification>
+  </task>
+</tasks>
+
+## Verification
+
+1. Focused x86_64 doctests for the feature contract.
+2. CMake configure with default settings on this host.
+3. Source scan for forbidden x86 feature claims/flags.
+4. `git diff --check`.
+5. Changed-file scoped quality gate.
+
+<rule_constraints>
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md` for actor structure.
+- Do not move runtime behavior selection from guards/transitions into
+  `actions.hpp` or `detail.hpp`.
+- Keep `detail` helpers non-routing and non-orchestrating.
+- Do not add queue/mailbox/deferred-dispatch behavior.
+- Do not add or claim AVX-512, AVX-VNNI, AMX, BF16, native FP16, GPU, or
+  llama.cpp/ggml runtime linkage.
+- Keep quantized hot-path fallback policy unchanged; do not introduce
+  dequantize-to-f32 substitutions.
+
+</rule_constraints>
+
+## Completion Criteria
+
+- `X86-01` has source-backed evidence via x86_64 feature-contract code and
+  public actor tests.
+- `X86-02` has source-backed evidence via CMake host-feature support and source
+  scans excluding unsupported flags.
+- The baseline audit clearly separates existing x86_64 f32 AVX2 support from
+  subsequent flash/quantized/runtime/benchmark phases.
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-SUMMARY.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-SUMMARY.md
new file mode 100644
index 00000000..70839ab0
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-SUMMARY.md
@@ -0,0 +1,42 @@
+---
+phase: 239
+status: passed
+requirements-completed:
+  - X86-01
+  - X86-02
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 239 Summary
+
+## What Changed
+
+- Added an x86_64 host feature contract for AVX2, FMA, and F16C conversion.
+- Published explicit no-claim fields for AVX-512, AVX-VNNI, AMX, BF16, and
+  native FP16 through the x86_64 actor surface.
+- Added `EMEL_ENABLE_X86_64_HOST_FEATURES` and compiler-checked
+  `-mavx2/-mfma/-mf16c` host flags.
+- Replaced `__builtin_cpu_supports` with local CPUID/XGETBV detection so Zig
+  toolchain links succeed.
+- Repaired x86 host build portability exposed by the new host build:
+  AArch64 NEON helper visibility, non-ARM warning-as-error issues, doctest skip
+  markers, and paritychecker reference vendor includes.
+- Added focused x86_64 tests for supported, fail-closed, and detected host
+  feature contracts.
+- Wrote a source-backed baseline audit separating current x86_64 f32 AVX2
+  support from active follow-on flash/quantized/runtime/benchmark phases.
+
+## Validation
+
+- CMake configure with Zig: pass.
+- `emel_tests_bin` build: pass.
+- `emel_tests_kernel_and_graph` CTest shard: pass.
+- `scripts/paritychecker.sh --runner=kernel`: pass.
+- `git diff --check`: pass.
+- Scoped `scripts/quality_gates.sh`: coverage, paritychecker, benchmark
+  snapshot, lint, docs, and fuzz routing pass after approved snapshot updates.
+
+## Closeout Status
+
+The Phase 239 implementation satisfies and verifies `X86-01` and `X86-02`.
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-CONTEXT.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-CONTEXT.md
new file mode 100644
index 00000000..b07fc488
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-CONTEXT.md
@@ -0,0 +1,132 @@
+# Phase 239: x86_64 AVX2/FMA Host Contract and Baseline Audit - Context
+
+**Gathered:** 2026-06-25
+**Status:** Source-complete; benchmark snapshot update awaiting explicit approval
+**Mode:** Auto-generated (autonomous infrastructure phase)
+
+<domain>
+## Phase Boundary
+
+Define and prove the x86_64 Ryzen host feature contract before adding new flash
+or quantized kernels. This phase may add feature detection, public actor
+accessors, host-tuned build flags, tests, and a source-backed audit artifact. It
+must not implement the AVX2/FMA flash kernel or the q2_K/q3_K/q6_K hot-path
+kernels; those are Phase 240-242 work.
+
+</domain>
+
+<decisions>
+## Implementation Decisions
+
+### Host Feature Contract
+- **D-01:** Treat this processor as x86_64 AVX2 + FMA with F16C conversion
+  support.
+- **D-02:** Publish AVX2, FMA, and F16C as explicit supported feature booleans
+  on the x86_64 kernel actor context/surface.
+- **D-03:** Publish unsupported feature families as explicit no-claim booleans:
+  AVX-512, AVX-VNNI, AMX, BF16, and native FP16.
+- **D-04:** Keep no-claim feature families disabled even if a future host can
+  report them; adding those paths requires a separate milestone contract.
+
+### Build Contract
+- **D-05:** Add an x86_64 host-feature build option analogous to the existing
+  AArch64 host-feature option.
+- **D-06:** Use compiler-checked AVX2/FMA/F16C flags only; do not add AVX-512,
+  VNNI, AMX, BF16, native FP16, or GPU flags.
+- **D-07:** Preserve portable builds by keeping the new option configurable and
+  by applying flags only for non-cross x86_64 builds when supported.
+
+### Audit Scope
+- **D-08:** Capture the current x86_64 kernel state against the NEON precedent:
+  existing f32 AVX2 paths are present, but flash/quantized parity is not yet at
+  the AArch64 standard.
+- **D-09:** The audit is evidence for Phase 239 only; it must not claim Phase
+  240-244 runtime or benchmark completion.
+
+### the agent's Discretion
+- Use the smallest code shape that keeps the feature contract inspectable from
+  tests and follow-on phases.
+- Prefer focused x86_64 kernel tests and CMake/source scans over broad quality
+  gates until implementation files are touched.
+
+</decisions>
+
+<canonical_refs>
+## Canonical References
+
+### Current milestone
+- `.planning/REQUIREMENTS.md` - `X86-01` and `X86-02` requirements.
+- `.planning/ROADMAP.md` - Phase 239 goal and success criteria.
+- `.planning/STATE.md` - v1.27 host feature scope and no-claim constraints.
+
+### Existing x86_64 surface
+- `src/emel/kernel/x86_64/context.hpp` - current AVX2 detection and context.
+- `src/emel/kernel/x86_64/actions.hpp` - current f32 AVX2 execution helpers.
+- `src/emel/kernel/x86_64/guards.hpp` - current x86_64 SIMD route guards.
+- `src/emel/kernel/x86_64/sm.hpp` - public x86_64 actor wrapper surface.
+- `tests/kernel/x86_64_tests.cpp` - existing x86_64 actor/kernel coverage.
+
+### NEON precedent
+- `src/emel/kernel/aarch64/context.hpp` - feature/counter precedent.
+- `src/emel/kernel/aarch64/sm.hpp` - public counter/accessor precedent.
+- `.planning/milestones/v1.3-ROADMAP.md` - ARM flash optimization pattern.
+- `.planning/milestones/v1.4-ROADMAP.md` - vectorized quantized kernel pattern.
+- `.planning/milestones/v1.5-ROADMAP.md` - full ARM quantized path proof.
+
+</canonical_refs>
+
+<code_context>
+## Existing Code Insights
+
+### Reusable Assets
+- `CMakeLists.txt` already has `EMEL_ENABLE_AARCH64_HOST_FEATURES` and
+  compiler-checked host flags.
+- `src/emel/kernel/x86_64/context.hpp` already detects AVX2 with
+  `__builtin_cpu_supports` on GCC/Clang x86_64.
+- `src/emel/kernel/x86_64/actions.hpp` already has AVX2 target attributes and
+  f32 SIMD helpers for dup/add/sub/mul/div/sqr/sqrt/mul_mat/unary.
+- `tests/kernel/x86_64_tests.cpp` already has forced-SIMD and fallback tests.
+
+### Established Patterns
+- x86_64 behavior selection belongs in guards/transitions; action/detail code
+  only executes an already selected path.
+- AArch64 publishes runtime counters/accessors from `sm.hpp`; x86_64 can use the
+  same public actor-wrapper style for feature contract inspection.
+- Planning artifacts must not be used as proof of runtime support; tests and
+  source scans must back every closeout claim.
+
+### Integration Points
+- Add feature contract state in `src/emel/kernel/x86_64/context.hpp`.
+- Add public wrapper accessors in `src/emel/kernel/x86_64/sm.hpp`.
+- Add host-feature compile flag support in `CMakeLists.txt`.
+- Add focused x86_64 tests in `tests/kernel/x86_64_tests.cpp`.
+- Add Phase 239 audit artifact under this phase directory.
+
+</code_context>
+
+<specifics>
+## Specific Ideas
+
+- The current host inspection reports AMD Ryzen 9 5950X with AVX2, FMA, and
+  F16C flags.
+- The milestone wording must stay honest: this phase starts x86_64 support work;
+  it does not complete flash, quantized kernels, runtime parity, or benchmark
+  publication. Those items remain active v1.27 scope with concrete follow-on
+  phases and phase-owned acceptance criteria.
+
+</specifics>
+
+<active_follow_on_scope>
+## Active Follow-On Scope
+
+- AVX2/FMA flash-attention implementation - Phase 240.
+- AVX2/FMA q2_K/q3_K kernels - Phase 241.
+- AVX2/FMA q6_K and hot-path operand-fidelity proof - Phase 242.
+- Runtime parity and benchmark publication - Phases 243-244.
+
+</active_follow_on_scope>
+
+---
+
+*Phase: 239-x86-64-avx2-fma-host-contract-and-baseline-audit*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VALIDATION.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VALIDATION.md
new file mode 100644
index 00000000..ab4bbe07
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VALIDATION.md
@@ -0,0 +1,130 @@
+---
+phase: 239
+slug: x86-64-avx2-fma-host-contract-and-baseline-audit
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 239 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | doctest, CTest, CMake configure/build, source scans, quality gate |
+| Config file | `CMakeLists.txt`; `scripts/quality_gates.sh` |
+| Quick run command | `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` |
+| Gate command | `EMEL_QUALITY_GATES_CHANGED_FILES="<phase 239 implementation files>" scripts/quality_gates.sh` |
+| Current gate status | passed after approved x86_64 benchmark baseline update |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 239-01-01 | X86-01 | failing-first compile proof | `ninja -C build/phase239 CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o` before implementation | red captured |
+| 239-01-02 | X86-01 | focused compile/test | `ninja -C build/phase239 CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`; `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` | green |
+| 239-01-03 | X86-02 | configure/build/source scan | `CC='/shared/zig/zig cc' CXX='/shared/zig/zig c++' cmake -S . -B build/phase239 -G Ninja -DEMEL_ENABLE_TESTS=ON`; `cmake --build build/phase239 --target emel_tests_bin -j2`; unsupported-flag scan | green |
+| 239-01-04 | X86-01, X86-02 | artifact/source audit | `239-X86-BASELINE-AUDIT.md`; `git diff --check` | green |
+| 239-01-05 | quality gate | scoped quality gate | `EMEL_QUALITY_GATES_CHANGED_FILES="..." scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+CC='/shared/zig/zig cc' CXX='/shared/zig/zig c++' cmake -S . -B build/phase239 -G Ninja -DEMEL_ENABLE_TESTS=ON
+```
+
+Result: PASS. Configure reported
+`EMEL enabling x86_64 host compile flags: -mavx2;-mfma;-mf16c`.
+
+```bash
+ninja -C build/phase239 CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o
+```
+
+Result: PASS after implementation. The pre-fix run failed on missing x86_64
+host feature contract/accessors and FMA/F16C detection.
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS after repairing x86 host compile portability gaps.
+
+```bash
+ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'
+```
+
+Result: PASS, `100% tests passed`.
+
+```bash
+rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit
+```
+
+Result: PASS, no unsupported x86 compile flags found.
+
+```bash
+scripts/paritychecker.sh --runner=kernel
+```
+
+Result: PASS. The paritychecker builds and the kernel parity runner passes.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_TIMEOUT="3600s" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="CMakeLists.txt:src/emel/kernel/x86_64/actions.hpp:src/emel/kernel/x86_64/context.hpp:src/emel/kernel/x86_64/sm.hpp:tests/kernel/x86_64_tests.cpp:src/emel/kernel/aarch64/actions.hpp:src/emel/diarization/sortformer/detail.cpp:src/emel/text/generator/detail.hpp:src/emel/text/generator/context.hpp:src/emel/embeddings/generator/detail.hpp:tests/kernel/aarch64_tests.cpp:tests/kernel/test_helpers.hpp:tests/text/generator/detail_tests.cpp:tests/text/generator/lifecycle_tests.cpp:tests/embeddings/vision_embedding_lane_tests.cpp:tests/embeddings/text_embedding_lane_tests.cpp:tools/paritychecker/CMakeLists.txt:tools/paritychecker/paritychecker_tests.cpp:tools/paritychecker/parity_engines.cpp:tools/bench/CMakeLists.txt:tools/bench/quality_gates_tests.cpp:scripts/test_with_coverage.sh:tests/diarization/request/lifecycle_tests.cpp:tests/diarization/sortformer/encoder/lifecycle_tests.cpp:tests/diarization/sortformer/modules/lifecycle_tests.cpp:tests/diarization/sortformer/output/lifecycle_tests.cpp:tests/diarization/sortformer/transformer/lifecycle_tests.cpp:tests/graph/assembler/assembler_tests.cpp:tests/graph/graph_tests.cpp:tests/model/loader/lifecycle_tests.cpp:tests/embeddings/te_fixture_data.hpp" \
+scripts/quality_gates.sh
+```
+
+Initial result before snapshot approval: the scoped gate passed all
+non-benchmark lanes:
+
+- `test_with_coverage`: PASS. CTest shards
+  `generator_and_runtime`, `diarization`, and `kernel_and_graph` pass.
+  Changed-line coverage is `73/78` lines (`93.6%`) and `18/34` branches
+  (`52.9%`).
+- `paritychecker`: PASS. Full paritychecker tests pass.
+- `fuzz_smoke`: skipped because no fuzz-affecting files changed.
+- `lint_snapshot`: PASS after using the existing `tests/kernel/test_helpers.hpp`
+  instead of adding a new helper file.
+- `generate_docs`: PASS.
+
+The only failing lane was `bench_snapshot`: the new `kernel_x86_64` suite emits
+15 `kernel/x86_64/*` entries, and `snapshots/bench/benchmarks.txt` had no
+matching baselines. User approval was granted and the snapshot baseline was
+updated.
+
+```bash
+EMEL_BENCH_SUITE=kernel_x86_64 \
+EMEL_BENCH_ITERS=100 \
+EMEL_BENCH_RUNS=3 \
+EMEL_BENCH_WARMUP_ITERS=10 \
+EMEL_BENCH_WARMUP_RUNS=1 \
+build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=emel
+```
+
+Result: PASS as a non-mutating measurement. It emits x86_64 benchmark entries
+for `op_add`, `op_cos`, `op_div`, `op_dup`, `op_log`, `op_mul`, `op_mul_mat`,
+`op_sin`, `op_soft_max`, `op_sqr`, `op_sqrt`, `op_sub`, `op_unary_exp`,
+`op_unary_neg`, and `op_unary_relu`.
+
+## Manual-Only Verifications
+
+- Approve the benchmark snapshot baseline update.
+- Run `scripts/bench.sh --snapshot --update --suite=kernel_x86_64`.
+- Re-run the scoped quality gate above.
+
+## Validation Sign-Off
+
+- [x] Host feature contract has automated/source-backed validation.
+- [x] x86_64 host-tuned build config has configure/build/source-scan evidence.
+- [x] Focused x86_64 and kernel/graph tests pass.
+- [x] Snapshot updates were explicitly approved and applied.
+- [x] Scoped quality gate passes after approved benchmark baseline update.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+  frontmatter.
+- [x] Rule-compliance evidence is recorded through Zig configure/build,
+  unsupported x86 feature scans, and source-backed host-contract validation.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VERIFICATION.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VERIFICATION.md
new file mode 100644
index 00000000..eb9fa9a4
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-VERIFICATION.md
@@ -0,0 +1,43 @@
+# Phase 239 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| AVX2, FMA, and F16C detected/published | `src/emel/kernel/x86_64/context.hpp`; `tests/kernel/x86_64_tests.cpp`; focused object build | PASS |
+| Unsupported feature families explicitly no-claimed | `host_feature_contract` false no-claim fields; actor accessors; doctests | PASS |
+| Host-tuned x86_64 build flags | CMake configure reports `-mavx2;-mfma;-mf16c` | PASS |
+| No unsupported x86 feature flags | unsupported-flag `rg` scan returns no matches | PASS |
+| Source-backed baseline audit | `239-X86-BASELINE-AUDIT.md` | PASS |
+| Required quality gate | approved `kernel_x86_64` benchmark/parity snapshots landed; scoped quality gate passed all selected lanes | PASS |
+
+## Evidence Summary
+
+- `cmake --build build/phase239 --target emel_tests_bin -j2` passes.
+- `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` passes.
+- `scripts/paritychecker.sh --runner=kernel` passes.
+- `git diff --check` passes.
+- `scripts/quality_gates.sh` scoped to Phase 239 files passes:
+  coverage, paritychecker, fuzz skip, lint snapshot, and docs generation.
+- Coverage evidence from the scoped gate:
+  `changed-line coverage: lines 73/78 (93.6%), branches 18/34 (52.9%)`.
+- Approved benchmark snapshots now include the `kernel/x86_64/*` benchmark
+  suite entries.
+- Direct non-mutating benchmark evidence:
+  `EMEL_BENCH_SUITE=kernel_x86_64 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=emel`
+  emits 15 x86_64 entries: add, cos, div, dup, log, mul, mul_mat, sin,
+  soft_max, sqr, sqrt, sub, unary_exp, unary_neg, and unary_relu.
+
+## Final Verification
+
+User approved snapshot updates. `scripts/bench.sh --snapshot --update
+--suite=kernel_x86_64` updated the benchmark baseline, maintained generation
+publication baselines were updated, and the changed-file scoped quality gate
+passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 239 is fully verified for `X86-01` and `X86-02`.
diff --git a/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-X86-BASELINE-AUDIT.md b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-X86-BASELINE-AUDIT.md
new file mode 100644
index 00000000..7e9c373f
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-X86-BASELINE-AUDIT.md
@@ -0,0 +1,74 @@
+# Phase 239 x86_64 Baseline Audit
+
+**Date:** 2026-06-25
+**Scope:** Ryzen 9 5950X host contract and pre-kernel x86_64 baseline.
+
+## Host Contract
+
+Phase 239 establishes an EMEL-owned x86_64 host feature contract for the current
+CPU feature class:
+
+- Supported: AVX2, FMA, and F16C conversion.
+- Explicit no-claim: AVX-512, AVX-VNNI, AMX, BF16 execution, native FP16
+  arithmetic, and GPU execution.
+- Runtime detection is local CPUID/XGETBV logic in
+  `src/emel/kernel/x86_64/context.hpp`, avoiding Zig toolchain link dependency
+  on compiler CPU-model symbols.
+- Public actor inspection is exposed through `src/emel/kernel/x86_64/sm.hpp`.
+
+## Build Contract
+
+`CMakeLists.txt` now has `EMEL_ENABLE_X86_64_HOST_FEATURES`, matching the
+existing AArch64 host-feature switch pattern. On non-cross non-MSVC x86_64
+builds, CMake checks and applies only:
+
+- `-mavx2`
+- `-mfma`
+- `-mf16c`
+
+No AVX-512, AVX-VNNI, AMX, BF16, native-FP16, or GPU flags were added.
+
+The x86 host build also exposed pre-existing non-ARM compile gaps. Phase 239
+repaired those without changing runtime contracts:
+
+- AArch64 NEON helper signatures are hidden from non-ARM compilers in
+  `src/emel/kernel/aarch64/actions.hpp`.
+- ARM-only doctest skip markers now use supported doctest assertions.
+- Non-ARM warning-as-error issues in shared generator/diarization/embedding
+  helpers are acknowledged with no-op casts.
+- `tools/paritychecker/CMakeLists.txt` includes the fetched reference
+  implementation's `vendor` directory so reference-side `<nlohmann/json.hpp>`
+  resolves.
+
+## Current x86_64 Kernel Baseline
+
+Current `src/emel/kernel/x86_64` support before Phase 240:
+
+- Existing f32 AVX2 execution helpers cover dup/add/sub/mul/div/sqr/sqrt,
+  mul_mat, and unary abs/neg/relu where dtype/layout and host support allow it.
+- Runtime SIMD choice still flows through x86_64 guards and SML transitions for
+  public dispatch; unsupported requests use the existing scalar/shared behavior.
+- Flash attention still routes through the shared workspace helper rather than
+  an x86_64 AVX2/FMA optimized flash kernel.
+- q2_K/q3_K/q6_K AVX2/FMA hot-path kernels do not exist yet.
+
+## NEON/AArch64 Comparison
+
+The AArch64 precedent has a broader maintained optimization surface:
+
+- AArch64 context publishes optimized/shared dispatch counters.
+- AArch64 SML transitions distinguish optimized flash attention from shared
+  flash behavior.
+- AArch64 quantized paths have route counters and packed/vector coverage for
+  multiple quantized formats.
+
+The x86_64 path now has the equivalent host/build contract foundation, but not
+the flash, quantized-kernel, runtime-parity, or benchmark attribution parity.
+
+## Assigned To Active Follow-On Phases
+
+- Phase 240: AVX2/FMA flash attention kernel and fallback/no-claim behavior.
+- Phase 241: q2_K/q3_K x q8_K AVX2/FMA kernels.
+- Phase 242: q6_K x q8_K AVX2/FMA kernel and hot-path allocation/operand proof.
+- Phase 243: maintained runtime integration and parity proof.
+- Phase 244: benchmark attribution and publication truth.
diff --git a/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-PLAN.md b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-PLAN.md
new file mode 100644
index 00000000..9a433d2a
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-PLAN.md
@@ -0,0 +1,108 @@
+# Phase 240 Plan: x86_64 Flash Attention AVX2/FMA Kernel
+
+## Goal
+
+Route supported x86_64 `op_flash_attn_ext` requests through an EMEL-owned
+AVX2/FMA/F16C flash-attention kernel while preserving shared fallback,
+persistent workspace reuse, and explicit no-claim behavior for unsupported
+requests.
+
+## Tasks
+
+<tasks>
+  <task id="240-01-01" name="Add failing x86_64 flash route tests">
+    <instructions>
+      Extend `tests/kernel/x86_64_tests.cpp` with focused tests that initially
+      fail on the current shared-only x86 path:
+      optimized dispatch counter increments for supported AVX2/FMA/F16C
+      requests, shared counter remains zero on optimized dispatch, shared
+      counter increments when the optimized feature contract is disabled,
+      persistent workspace storage is reused, and optimized output matches the
+      maintained flash reference/shared oracle within existing flash tolerance.
+    </instructions>
+    <verification>
+      Compile or run the focused x86_64 test shard and capture the expected red
+      failure before implementation.
+    </verification>
+  </task>
+
+  <task id="240-01-02" name="Implement AVX2/FMA/F16C flash helper">
+    <instructions>
+      Add x86_64 detail helpers in `src/emel/kernel/x86_64/actions.hpp` for the
+      one-chunk f16 K/V flash path. Use AVX2/FMA for f32 vector arithmetic and
+      F16C conversion intrinsics for f16 workspace and K/V conversion. Keep the
+      effective operand class aligned with the AArch64 optimized path and the
+      shared flash workspace. Do not add allocation or native-FP16 claims.
+    </instructions>
+    <verification>
+      Focused tests compare optimized x86 output to the shared/reference helper
+      for fixture, long-span, and masked-total-token cases.
+    </verification>
+  </task>
+
+  <task id="240-01-03" name="Wire explicit SML optimized/shared routing">
+    <instructions>
+      Add x86_64 optimized flash predicate(s) in `guards.hpp`, add optimized and
+      shared route counters in context/accessors, and insert optimized/shared
+      `op_flash_attn_ext` transition rows in `sm.hpp` before invalid handling.
+      Runtime behavior choice must remain in guards/transitions, not actions or
+      detail helpers.
+    </instructions>
+    <verification>
+      Tests prove supported optimized route and feature-disabled shared route
+      through public actor dispatch and counter accessors.
+    </verification>
+  </task>
+
+  <task id="240-01-04" name="Close validation artifacts">
+    <instructions>
+      Write `240-VERIFICATION.md`, `240-VALIDATION.md`, and `240-01-SUMMARY.md`
+      from source/test evidence. Keep Phase 239 benchmark snapshot approval
+      visible as a separate gate and do not mutate benchmark snapshots without
+      explicit approval.
+    </instructions>
+    <verification>
+      `git diff --check`, focused x86_64/kernel tests, and changed-file scoped
+      quality gate evidence are recorded. If a required snapshot update has not
+      been approved yet, keep the phase verification separate from publication
+      approval until the gate can pass.
+    </verification>
+  </task>
+</tasks>
+
+## Verification
+
+1. Failing-first compile/test evidence for new x86_64 flash tests.
+2. `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`
+   or equivalent focused kernel shard after implementation.
+3. Direct x86_64 object/build target for `tests/kernel/x86_64_tests.cpp`.
+4. Source scan for unsupported x86 feature claims/flags:
+   `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker .planning/phases/240-x86-64-flash-attention-avx2-fma-kernel`
+5. `git diff --check`.
+6. Changed-file scoped `scripts/quality_gates.sh` with
+   `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64` when feasible. Do not update
+   snapshots without explicit approval.
+
+<rule_constraints>
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md`.
+- Keep runtime behavior selection in `guards.hpp` and `sm.hpp` transitions.
+- Keep action/detail helpers as already-chosen numeric work only.
+- Do not add queue/mailbox/deferred-dispatch behavior.
+- Do not add AVX-512, AVX-VNNI, AMX, BF16, native-FP16, GPU, or llama.cpp/ggml
+  runtime linkage.
+- Do not add dispatch-time allocation or per-invocation context fields.
+- Do not update snapshots without explicit user approval.
+
+</rule_constraints>
+
+## Completion Criteria
+
+- `XFL-01` has source-backed evidence that supported x86_64 flash requests run
+  through the optimized AVX2/FMA/F16C path rather than the shared workspace path.
+- `XFL-02` has source-backed evidence that unsupported or disabled optimized
+  contracts take explicit shared or invalid paths.
+- Focused tests prove numeric correctness, route counters, and workspace reuse.
+- Verification artifacts distinguish source completion from any repository-level
+  snapshot publication gate until the approved snapshots and quality gate pass.
diff --git a/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-SUMMARY.md b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-SUMMARY.md
new file mode 100644
index 00000000..4e844fdf
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-01-SUMMARY.md
@@ -0,0 +1,40 @@
+---
+phase: 240
+status: passed
+requirements-completed:
+  - XFL-01
+  - XFL-02
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 240 Summary
+
+## What Changed
+
+- Added an EMEL-owned x86_64 one-chunk flash-attention kernel using AVX2/FMA
+  f32 vector arithmetic and F16C f16 conversions.
+- Kept the flash operand contract aligned with the AArch64 optimized path: f32
+  Q rounded to f16, f16 K/V operands, f16 workspace accumulation, and f32 output.
+- Added explicit optimized/shared x86_64 flash dispatch counters and actor
+  accessors for attribution.
+- Routed `op_flash_attn_ext` through optimized x86_64 guards/transitions before
+  shared fallback and invalid handling.
+- Added x86_64 tests proving optimized route, shared fallback when the feature
+  contract is disabled, workspace reuse, and numeric agreement with maintained
+  flash reference helpers.
+
+## Validation
+
+- Failing-first x86_64 test object compile: red captured before implementation.
+- x86_64 test object compile: pass after implementation.
+- `emel_tests_bin` build: pass.
+- `emel_tests_kernel_and_graph` CTest shard: pass.
+- Unsupported x86 feature flag source scan: pass.
+- `scripts/lint_snapshot.sh`: pass without snapshot updates.
+- Scoped `scripts/quality_gates.sh`: coverage, paritychecker, benchmark
+  snapshot, lint, docs, and fuzz routing pass after approved snapshot updates.
+
+## Closeout Status
+
+The Phase 240 implementation satisfies and verifies `XFL-01` and `XFL-02`.
diff --git a/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-CONTEXT.md b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-CONTEXT.md
new file mode 100644
index 00000000..148ab75d
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-CONTEXT.md
@@ -0,0 +1,118 @@
+# Phase 240: x86_64 Flash Attention AVX2/FMA Kernel - Context
+
+**Gathered:** 2026-06-25
+**Status:** Ready for planning
+**Mode:** Auto-generated (autonomous processor-support phase)
+
+<domain>
+## Phase Boundary
+
+Port the maintained flash-attention optimization pattern to an EMEL-owned
+x86_64 AVX2/FMA implementation for this Ryzen AVX2/FMA/F16C host. This phase
+must route supported flash-attention requests through an optimized x86_64
+kernel, preserve persistent workspace reuse, and keep unsupported requests on
+explicit shared or invalid paths. It must not implement quantized q2_K/q3_K/q6_K
+matmul kernels, runtime parity publication, or benchmark attribution; those are
+active Phase 241-244 obligations.
+
+</domain>
+
+<decisions>
+## Implementation Decisions
+
+### Kernel Contract
+- Implement a native x86_64 flash-attention helper in `src/emel/kernel/x86_64`
+  using AVX2/FMA numeric work and F16C conversions for f16 K/V operand handling.
+- Match the AArch64 one-chunk f16 K/V operand class: f32 Q is rounded into f16,
+  K/V are consumed as f16, accumulation uses the existing f16 workspace buffer,
+  and output is converted back to f32.
+- Do not claim native FP16 arithmetic. F16C is a conversion capability; all x86
+  vector arithmetic remains f32 AVX2/FMA.
+- Keep all optimized kernel code in `src/`; parity and benchmark tools may only
+  observe through public dispatch surfaces.
+
+### Routing Contract
+- Put supported optimized-path selection in `x86_64/guards.hpp` and
+  `x86_64/sm.hpp`, analogous to the AArch64 flash route.
+- Add explicit x86_64 optimized/shared flash counters so tests and active parity
+  attribution can distinguish optimized execution from shared fallback.
+- Keep unsupported shapes, feature contracts, and workspace constraints on the
+  existing shared or invalid paths; do not silently label them optimized.
+
+### Verification Contract
+- Add failing-first x86_64 tests before source changes: optimized dispatch
+  counter, shared fallback counter, persistent workspace reuse, and numeric
+  comparison against the maintained shared/reference helper.
+- Verify through public actor dispatch or route-owned detail functions, not by
+  tool-only scaffolds.
+- Do not update `snapshots/bench/benchmarks.txt` in this phase unless explicit
+  snapshot approval is provided.
+
+### the agent's Discretion
+- Prefer the smallest x86_64 implementation that proves AVX2/FMA/F16C flash
+  support and keeps active follow-on runtime/benchmark attribution
+  straightforward.
+- Reuse the existing `flash_attn_workspace` instead of adding per-dispatch
+  allocation or new transient context fields.
+
+</decisions>
+
+<code_context>
+## Existing Code Insights
+
+### Reusable Assets
+- `src/emel/kernel/aarch64/actions.hpp` contains the optimized NEON flash
+  precedent: `run_flash_attn_ext_f16kv_one_chunk_neon_unchecked`,
+  `can_run_neon_flash_attn_ext_f16kv_one_chunk_request`, and
+  `exec_simd_flash_attn_ext_f16kv_one_chunk`.
+- `src/emel/kernel/aarch64/guards.hpp` and `sm.hpp` route optimized flash before
+  the shared flash path, then invalid fallback.
+- `src/emel/kernel/detail.hpp` owns the shared flash workspace, f16 conversion
+  helpers, active-token handling, and scalar workspace fallback.
+- `tests/kernel/test_helpers.hpp` provides flash fixtures and reference helpers.
+
+### Established Patterns
+- x86_64 SIMD helpers live in `src/emel/kernel/x86_64/actions.hpp`, with feature
+  predicates in guards and transition rows in `sm.hpp`.
+- The current x86_64 flash route accepts `op_flash_attn_ext` through
+  `guard::valid_op_flash_attn_ext` and `action::exec_op_flash_attn_ext`, which
+  calls the shared workspace helper.
+- Phase 239 added x86_64 AVX2/FMA/F16C feature-contract fields and actor
+  accessors that this phase can use for optimized flash eligibility.
+
+### Integration Points
+- Add x86 flash helpers and action aliases in `src/emel/kernel/x86_64/actions.hpp`.
+- Add optimized/shared flash guards in `src/emel/kernel/x86_64/guards.hpp`.
+- Add optimized/shared flash route counters in
+  `src/emel/kernel/x86_64/context.hpp` and public accessors in `sm.hpp`.
+- Add optimized flash transition rows before the shared flash row in
+  `src/emel/kernel/x86_64/sm.hpp`.
+- Add focused tests in `tests/kernel/x86_64_tests.cpp`.
+
+</code_context>
+
+<specifics>
+## Specific Ideas
+
+- The current host contract is Ryzen 9 5950X with AVX2, FMA, and F16C available.
+- The optimized flash route should require all three: AVX2 for vector lanes, FMA
+  for fused f32 accumulation, and F16C for f16 K/V conversion.
+- The test surface should prove optimized and shared counters separately so
+  Phase 243/244 attribution can build on source-backed evidence.
+
+</specifics>
+
+<active_follow_on_scope>
+## Active Follow-On Scope
+
+- Phase 241: AVX2/FMA q2_K/q3_K kernels.
+- Phase 242: AVX2/FMA q6_K and hot-path operand-fidelity proof.
+- Phase 243: maintained runtime integration and parity proof.
+- Phase 244: benchmark attribution and publication truth.
+
+</active_follow_on_scope>
+
+---
+
+*Phase: 240-x86-64-flash-attention-avx2-fma-kernel*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VALIDATION.md b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VALIDATION.md
new file mode 100644
index 00000000..c45721ae
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VALIDATION.md
@@ -0,0 +1,101 @@
+---
+phase: 240
+slug: x86-64-flash-attention-avx2-fma-kernel
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 240 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | doctest, CTest, source scans, lint snapshot, quality gate |
+| Config file | `CMakeLists.txt`; `scripts/quality_gates.sh` |
+| Quick run command | `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` |
+| Gate command | `EMEL_QUALITY_GATES_CHANGED_FILES="<phase 240 files>" scripts/quality_gates.sh` |
+| Current gate status | passed after approved x86_64 benchmark baseline update |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 240-01-01 | XFL-01, XFL-02 | failing-first compile proof | `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o` before implementation | red captured |
+| 240-01-02 | XFL-01 | focused compile/test | x86_64 test object build; `emel_tests_kernel_and_graph` | green |
+| 240-01-03 | XFL-01, XFL-02 | public actor route proof | optimized/shared counter tests in `tests/kernel/x86_64_tests.cpp` | green |
+| 240-01-04 | quality gate | scoped quality gate | `EMEL_QUALITY_GATES_CHANGED_FILES="..." scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o
+```
+
+Result: PASS after implementation. The pre-fix run failed on missing x86_64
+optimized/shared flash route counters and accessors.
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS.
+
+```bash
+ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'
+```
+
+Result: PASS, `100% tests passed`.
+
+```bash
+rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker
+```
+
+Result: PASS, no unsupported x86 compile flags found.
+
+```bash
+scripts/lint_snapshot.sh
+```
+
+Result: PASS. No lint snapshot update was made.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_TIMEOUT="3600s" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="src/emel/kernel/x86_64/actions.hpp:src/emel/kernel/x86_64/context.hpp:src/emel/kernel/x86_64/guards.hpp:src/emel/kernel/x86_64/sm.hpp:tests/kernel/x86_64_tests.cpp" \
+scripts/quality_gates.sh
+```
+
+Initial result before snapshot approval: the scoped gate passed all
+non-benchmark lanes:
+
+- `test_with_coverage`: PASS. CTest shard `kernel_and_graph` passes.
+  Changed-line coverage is `381/406` lines (`93.8%`) and `86/124` branches
+  (`69.4%`).
+- `paritychecker`: PASS. Kernel parity runner passes.
+- `fuzz_smoke`: skipped because no fuzz-affecting files changed.
+- `lint_snapshot`: PASS without snapshot update.
+- `generate_docs`: PASS.
+
+The only failing lane was `bench_snapshot`: the `kernel_x86_64` suite still emits
+15 `kernel/x86_64/*` entries without baselines in
+`snapshots/bench/benchmarks.txt`. User approval was granted and the snapshot
+baseline was updated.
+
+## Validation Sign-Off
+
+- [x] x86_64 optimized flash route has automated/source-backed validation.
+- [x] x86_64 shared fallback/no-claim path has automated validation.
+- [x] Focused x86_64 and kernel/graph tests pass.
+- [x] Coverage, parity, lint, fuzz routing, and docs lanes pass in scoped gate.
+- [x] Snapshot updates were explicitly approved and applied.
+- [x] Scoped quality gate passes after approved benchmark baseline update.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+  frontmatter.
+- [x] Rule-compliance evidence is recorded through explicit x86_64
+  guards/transitions, unsupported feature scans, focused actor tests, and lint.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VERIFICATION.md b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VERIFICATION.md
new file mode 100644
index 00000000..ecc168fb
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/240-x86-64-flash-attention-avx2-fma-kernel/240-VERIFICATION.md
@@ -0,0 +1,49 @@
+# Phase 240 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| Supported x86_64 flash route is optimized | `src/emel/kernel/x86_64/guards.hpp`, `sm.hpp`, `actions.hpp`; `tests/kernel/x86_64_tests.cpp` route-counter tests | PASS |
+| Optimized helper is EMEL-owned AVX2/FMA/F16C source code | `run_flash_attn_ext_f16kv_one_chunk_avx2_fma_f16c_unchecked` and conversion/dot/axpy helpers in `src/emel/kernel/x86_64/actions.hpp` | PASS |
+| Shared fallback/no-claim behavior remains explicit | feature-disabled x86_64 flash test increments shared counter and not optimized counter | PASS |
+| Persistent workspace reuse is preserved | x86_64 flash workspace reuse test observes prepared-token and reuse counters through actor accessors | PASS |
+| Numeric behavior matches maintained oracle | x86_64 flash tests compare fixture and masked-token output to flash reference helpers | PASS |
+| Required quality gate | approved `kernel_x86_64` benchmark/parity snapshots landed; scoped quality gate passed all selected lanes | PASS |
+
+## Evidence Summary
+
+- Failing-first compile proof captured after adding tests:
+  `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+  failed before implementation on missing x86_64 flash counters/accessors and
+  route support.
+- `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+  passes after implementation.
+- `cmake --build build/phase239 --target emel_tests_bin -j2` passes.
+- `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`
+  passes.
+- Source-only unsupported x86 flag scan passes:
+  `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+  returns no matches.
+- `scripts/lint_snapshot.sh` passes without updating
+  `snapshots/lint/clang_format.txt`.
+- Scoped `scripts/quality_gates.sh` passes all non-benchmark lanes:
+  coverage, paritychecker, fuzz skip, lint snapshot, and docs generation.
+- Coverage evidence from the scoped gate:
+  `changed-line coverage: lines 381/406 (93.8%), branches 86/124 (69.4%)`.
+- Approved benchmark snapshots now include the `kernel/x86_64/*` benchmark
+  suite entries.
+
+## Final Verification
+
+User approved snapshot updates. `scripts/bench.sh --snapshot --update
+--suite=kernel_x86_64` updated the benchmark baseline, maintained generation
+publication baselines were updated, and the changed-file scoped quality gate
+passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 240 is fully verified for `XFL-01` and `XFL-02`.
diff --git a/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-PLAN.md b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-PLAN.md
new file mode 100644
index 00000000..a772a788
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-PLAN.md
@@ -0,0 +1,101 @@
+# Phase 241 Plan: x86_64 Vectorized q2_K/q3_K Kernels
+
+## Goal
+
+Add native x86_64 AVX2/FMA `q2_K x q8_K` and `q3_K x q8_K` kernels for the
+maintained quantized `op_mul_mat` path, with explicit optimized/shared route
+attribution through the x86_64 kernel actor.
+
+## Tasks
+
+<tasks>
+  <task id="241-01-01" name="Add failing q2_K/q3_K x86 route tests">
+    <instructions>
+      Extend `tests/kernel/x86_64_tests.cpp` with q2_K and q3_K tests that
+      initially fail on the current shared-only x86 path. Cover row-kernel
+      correctness, actor optimized counters, actor shared counters with disabled
+      feature contract, representative block data, and multi-block accumulation.
+    </instructions>
+    <verification>
+      Compile or run the focused x86_64/kernel shard and capture the expected
+      red failure before implementation.
+    </verification>
+  </task>
+
+  <task id="241-01-02" name="Implement native x86 q2_K/q3_K row kernels">
+    <instructions>
+      Add x86_64 detail row helpers for `q2_K x q8_K` and `q3_K x q8_K` using
+      the same effective operand format as the scalar/AArch64 paths. Use AVX2/FMA
+      where appropriate for accumulation, keep any scalar branches local to
+      already-selected numeric work, and do not dequantize whole tensors to f32.
+    </instructions>
+    <verification>
+      Row-level tests match scalar/reference output within the established
+      quantized tolerance for q2_K and q3_K across multiple block counts.
+    </verification>
+  </task>
+
+  <task id="241-01-03" name="Wire q2_K/q3_K SML optimized/shared routes">
+    <instructions>
+      Add x86_64 q2/q3 optimized route predicates in `guards.hpp`, selected-route
+      actions and counters in `actions.hpp`/`context.hpp`, actor accessors in
+      `sm.hpp`, and transition rows in `sm.hpp` before generic f32 SIMD and
+      shared scalar `op_mul_mat` routes.
+    </instructions>
+    <verification>
+      Actor tests prove optimized q2/q3 counters increment for supported
+      requests and shared q2/q3 counters increment when the feature contract is
+      disabled.
+    </verification>
+  </task>
+
+  <task id="241-01-04" name="Close validation artifacts">
+    <instructions>
+      Write `241-VERIFICATION.md`, `241-VALIDATION.md`, and `241-01-SUMMARY.md`
+      from source/test evidence. Keep benchmark snapshot approval visible as a
+      shared closeout gate and do not mutate snapshots without explicit approval.
+    </instructions>
+    <verification>
+      Focused tests, source scan, lint snapshot, and changed-file scoped quality
+      gate evidence are recorded.
+    </verification>
+  </task>
+</tasks>
+
+## Verification
+
+1. Failing-first x86_64 q2/q3 tests.
+2. `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`.
+3. `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`.
+4. Source scan for forbidden x86 feature claims/flags:
+   `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+5. `scripts/lint_snapshot.sh` without snapshot updates.
+6. Changed-file scoped `scripts/quality_gates.sh` with
+   `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64` when feasible. Do not update
+   snapshots without explicit approval.
+
+<rule_constraints>
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md`.
+- Keep runtime behavior selection in `guards.hpp` and `sm.hpp` transitions.
+- Keep action/detail helpers as already-chosen numeric work only.
+- Do not add queue/mailbox/deferred-dispatch behavior.
+- Do not implement q6_K in this phase.
+- Do not add AVX-512, AVX-VNNI, AMX, BF16, native-FP16, GPU, or llama.cpp/ggml
+  runtime linkage.
+- Do not add whole-tensor dequantize-to-f32 hot-path substitution.
+- Do not update snapshots without explicit user approval.
+
+</rule_constraints>
+
+## Completion Criteria
+
+- `XQK-01` has source-backed evidence that supported x86_64 `q2_K x q8_K`
+  requests run through a native optimized x86 kernel.
+- `XQK-02` has source-backed evidence that supported x86_64 `q3_K x q8_K`
+  requests run through a native optimized x86 kernel.
+- Tests prove optimized/shared attribution, row correctness, and actor route
+  behavior for both formats.
+- Verification artifacts distinguish source completion from the still-pending
+  repository-level benchmark snapshot approval gate.
diff --git a/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-SUMMARY.md b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-SUMMARY.md
new file mode 100644
index 00000000..8e720ae7
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-01-SUMMARY.md
@@ -0,0 +1,42 @@
+---
+phase: 241
+status: passed
+requirements-completed:
+  - XQK-01
+  - XQK-02
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 241 Summary
+
+## What Changed
+
+- Added EMEL-owned x86_64 AVX2/FMA row kernels for `q2_K x q8_K` and
+  `q3_K x q8_K` in `src/emel/kernel/x86_64/actions.hpp`.
+- Kept the quantized operand path block-native: q2_K/q3_K LHS blocks and q8_K
+  RHS blocks generated by the maintained quantizer. No whole-tensor
+  dequantize-to-f32 hot-path substitution was added.
+- Added explicit q2/q3 optimized and shared route counters to the x86_64 actor
+  context and exposed actor accessors for attribution.
+- Routed supported q2_K/q3_K `op_mul_mat` requests through explicit guards and
+  SML transitions before generic f32 SIMD and shared scalar routes.
+- Added x86_64 tests proving row correctness against scalar q2/q3 oracles,
+  optimized route attribution on supported contracts, shared route attribution
+  when the feature contract is disabled, and multi-block accumulation behavior.
+
+## Validation
+
+- Failing-first x86_64 test object compile: red captured before implementation.
+- x86_64 test object compile: pass after implementation.
+- `emel_tests_bin` build: pass.
+- `emel_tests_kernel_and_graph` CTest shard: pass.
+- Unsupported x86 feature flag source scan: pass.
+- `scripts/lint_snapshot.sh`: pass without snapshot updates.
+- `git diff --check`: pass for Phase 241 changed source/test/planning files.
+- Scoped `scripts/quality_gates.sh`: coverage, paritychecker, benchmark
+  snapshot, lint, docs, and fuzz routing pass after approved snapshot updates.
+
+## Closeout Status
+
+The Phase 241 implementation satisfies and verifies `XQK-01` and `XQK-02`.
diff --git a/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-CONTEXT.md b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-CONTEXT.md
new file mode 100644
index 00000000..881626fe
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-CONTEXT.md
@@ -0,0 +1,112 @@
+# Phase 241: x86_64 Vectorized q2_K/q3_K Kernels - Context
+
+**Gathered:** 2026-06-25
+**Status:** Ready for planning
+**Mode:** Auto-generated (autonomous processor-support phase)
+
+<domain>
+## Phase Boundary
+
+Land EMEL-owned x86_64 AVX2/FMA `q2_K x q8_K` and `q3_K x q8_K` kernels for
+the maintained quantized `op_mul_mat` hot path. This phase must prove optimized
+execution and shared fallback/no-claim behavior for q2_K and q3_K through the
+kernel actor route. It must not implement q6_K, runtime generator integration,
+or benchmark publication; those remain active Phase 242-244 obligations.
+
+</domain>
+
+<decisions>
+## Implementation Decisions
+
+### Kernel Contract
+- Implement native x86_64 row kernels in `src/emel/kernel/x86_64/actions.hpp`
+  for `q2_K x q8_K` and `q3_K x q8_K`.
+- Preserve the same effective operand class as the maintained scalar/AArch64
+  paths: block-q2_K or block-q3_K LHS and block-q8_K RHS, no whole-tensor
+  dequantize-to-f32 substitution in the hot path.
+- Use AVX2/FMA for vectorized accumulation where it improves the native row
+  path, with scalar tail handling only inside the already-selected kernel.
+- Keep Phase 241 limited to q2_K and q3_K; q6_K remains Phase 242.
+
+### Routing Contract
+- Add x86_64 guards/transitions for supported q2_K/q3_K `op_mul_mat` before the
+  generic f32 AVX2 and shared scalar routes.
+- Add optimized/shared q2/q3 route counters and actor accessors analogous to the
+  AArch64 route counters.
+- Keep runtime behavior choice in `guards.hpp` and `sm.hpp`; action/detail code
+  must execute only an already-selected q2 or q3 path.
+
+### Verification Contract
+- Add failing-first tests proving q2_K and q3_K optimized counters are missing or
+  shared-only before implementation.
+- Add correctness tests comparing x86 optimized row/mul_mat output to the
+  maintained scalar/reference oracle for representative blocks, multiple block
+  groups, tails, and accumulation behavior.
+- Prove disabled feature contracts take the shared q2/q3 route without claiming
+  optimized execution.
+
+### the agent's Discretion
+- Start with row-level kernels plus actor-route tests if that is the smallest
+  source-backed route to `XQK-01`/`XQK-02`.
+- Reuse test helpers from AArch64 quantized tests when possible instead of
+  inventing parallel fixtures.
+
+</decisions>
+
+<code_context>
+## Existing Code Insights
+
+### Reusable Assets
+- `src/emel/kernel/detail.hpp` owns scalar q2_K/q3_K/q8_K block structures and
+  scalar dot helpers.
+- `src/emel/kernel/aarch64/actions.hpp` contains NEON q2/q3 row kernels and
+  `op_mul_mat` route/counter precedent.
+- `tests/kernel/aarch64_tests.cpp` has row-level q2/q3 correctness fixtures and
+  actor-route tests for optimized/shared q2/q3 dispatch.
+- `tests/kernel/test_helpers.hpp` has q8_K vector source helpers and quantized
+  tensor-view construction helpers.
+
+### Established Patterns
+- Quantized optimized routes increment optimized q-format counters; shared
+  scalar routes increment shared q-format counters.
+- The actor surface exposes route counters through `sm.hpp` and `kernel/any.hpp`
+  consumes them when available.
+- Fallback/no-claim behavior is tested by disabling host SIMD support in the
+  machine context and proving shared counters increment.
+
+### Integration Points
+- `src/emel/kernel/x86_64/actions.hpp`: row kernels, selected-route actions, and
+  shared/optimized counter increments.
+- `src/emel/kernel/x86_64/guards.hpp`: q2/q3 optimized route predicates and
+  generic q2/q3 exclusion from shared route predicates.
+- `src/emel/kernel/x86_64/context.hpp`: q2/q3 route counters.
+- `src/emel/kernel/x86_64/sm.hpp`: q2/q3 transition rows and actor accessors.
+- `tests/kernel/x86_64_tests.cpp`: focused row and actor-route tests.
+
+</code_context>
+
+<specifics>
+## Specific Ideas
+
+- The Phase 241 request is exactly to bring this Ryzen AVX2/FMA processor toward
+  the same support standard as the NEON path, so q2_K/q3_K must be native EMEL
+  kernels, not benchmark-only or whole-tensor f32 fallback code.
+- The x86 feature contract from Phase 239 provides AVX2/FMA/F16C support booleans
+  that can gate q2/q3 optimized routes. q2/q3 integer unpacking itself should
+  not claim AVX-512, VNNI, AMX, BF16, or native FP16 support.
+
+</specifics>
+
+<active_follow_on_scope>
+## Active Follow-On Scope
+
+- Phase 242: AVX2/FMA q6_K and hot-path operand-fidelity proof.
+- Phase 243: maintained runtime integration and parity proof.
+- Phase 244: benchmark attribution and publication truth.
+
+</active_follow_on_scope>
+
+---
+
+*Phase: 241-x86-64-vectorized-q2-k-q3-k-kernels*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VALIDATION.md b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VALIDATION.md
new file mode 100644
index 00000000..a5f8b74a
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VALIDATION.md
@@ -0,0 +1,108 @@
+---
+phase: 241
+slug: x86-64-vectorized-q2-k-q3-k-kernels
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 241 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | doctest, CTest, source scans, lint snapshot, quality gate |
+| Config file | `CMakeLists.txt`; `scripts/quality_gates.sh` |
+| Quick run command | `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` |
+| Gate command | `EMEL_QUALITY_GATES_CHANGED_FILES="<phase 241 files>" EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` |
+| Current gate status | passed after approved x86_64 benchmark baseline update |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 241-01-01 | XQK-01, XQK-02 | failing-first compile proof | `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o` before implementation | red captured |
+| 241-01-02 | XQK-01, XQK-02 | row-kernel correctness | x86_64 q2/q3 row tests against scalar q2_K/q3_K x q8_K helpers | green |
+| 241-01-03 | XQK-01, XQK-02 | public actor route proof | optimized/shared counter tests in `tests/kernel/x86_64_tests.cpp` | green |
+| 241-01-04 | quality gate | scoped quality gate | `EMEL_QUALITY_GATES_CHANGED_FILES="..." EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o
+```
+
+Result: PASS after implementation. The pre-fix run failed on missing x86_64
+q2/q3 AVX2/FMA row helper symbols and actor counter accessors.
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS.
+
+```bash
+ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'
+```
+
+Result: PASS, `100% tests passed`.
+
+```bash
+rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker
+```
+
+Result: PASS, no unsupported x86 compile flags found.
+
+```bash
+scripts/lint_snapshot.sh
+```
+
+Result: PASS. No lint snapshot update was made.
+
+```bash
+git diff --check -- src/emel/kernel/x86_64/actions.hpp src/emel/kernel/x86_64/context.hpp src/emel/kernel/x86_64/guards.hpp src/emel/kernel/x86_64/sm.hpp tests/kernel/x86_64_tests.cpp .planning/phases/239-x86-64-avx2-fma-host-contract-and-baseline-audit/239-01-PLAN.md .planning/phases/240-x86-64-flash-attention-avx2-fma-kernel/240-CONTEXT.md
+```
+
+Result: PASS.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="src/emel/kernel/x86_64/actions.hpp,src/emel/kernel/x86_64/context.hpp,src/emel/kernel/x86_64/guards.hpp,src/emel/kernel/x86_64/sm.hpp,tests/kernel/x86_64_tests.cpp" \
+scripts/quality_gates.sh
+```
+
+Initial result before snapshot approval: the scoped gate passed all
+non-benchmark lanes:
+
+- `test_with_coverage`: PASS. CTest shard `kernel_and_graph` passes.
+  Changed-line coverage is `576/601` lines (`95.8%`) and `144/204` branches
+  (`70.6%`).
+- `paritychecker`: PASS. Kernel parity runner passes.
+- `fuzz_smoke`: skipped because no fuzz-affecting files changed.
+- `lint_snapshot`: PASS without snapshot update.
+- `generate_docs`: PASS.
+
+The only failing lane was `bench_snapshot`: the `kernel_x86_64` suite still emits
+15 `kernel/x86_64/*` entries without baselines in
+`snapshots/bench/benchmarks.txt`. User approval was granted and the snapshot
+baseline was updated.
+
+## Validation Sign-Off
+
+- [x] x86_64 optimized q2_K route has automated/source-backed validation.
+- [x] x86_64 optimized q3_K route has automated/source-backed validation.
+- [x] x86_64 shared fallback/no-claim paths have automated validation.
+- [x] Focused x86_64 and kernel/graph tests pass.
+- [x] Coverage, parity, lint, fuzz routing, and docs lanes pass in scoped gate.
+- [x] Snapshot updates were explicitly approved and applied.
+- [x] Scoped quality gate passes after approved benchmark baseline update.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+  frontmatter.
+- [x] Rule-compliance evidence is recorded through explicit q2/q3
+  guards/transitions, block-native operand tests, unsupported feature scans, and
+  lint.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VERIFICATION.md b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VERIFICATION.md
new file mode 100644
index 00000000..084ce424
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/241-x86-64-vectorized-q2-k-q3-k-kernels/241-VERIFICATION.md
@@ -0,0 +1,51 @@
+# Phase 241 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| Supported x86_64 q2_K route is optimized | `src/emel/kernel/x86_64/guards.hpp`, `sm.hpp`, `actions.hpp`; `tests/kernel/x86_64_tests.cpp` q2 route-counter test | PASS |
+| Supported x86_64 q3_K route is optimized | `src/emel/kernel/x86_64/guards.hpp`, `sm.hpp`, `actions.hpp`; `tests/kernel/x86_64_tests.cpp` q3 route-counter test | PASS |
+| Native EMEL-owned row kernels exist | `dot_q2_k_q8_k_row_avx2_fma` and `dot_q3_k_q8_k_row_avx2_fma` in `src/emel/kernel/x86_64/actions.hpp` | PASS |
+| Same effective operand class is preserved | tests use block-q2_K/block-q3_K LHS plus block-q8_K RHS generated by the maintained q8_K quantizer; no whole-tensor f32 substitution was added | PASS |
+| Shared fallback/no-claim behavior remains explicit | feature-disabled x86_64 q2/q3 tests increment shared counters and not optimized counters | PASS |
+| Required quality gate | approved `kernel_x86_64` benchmark/parity snapshots landed; scoped quality gate passed all selected lanes | PASS |
+
+## Evidence Summary
+
+- Failing-first compile proof captured after adding tests:
+  `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+  failed before implementation on missing `dot_q2_k_q8_k_row_avx2_fma`,
+  `dot_q3_k_q8_k_row_avx2_fma`, and q2/q3 actor counter accessors.
+- `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+  passes after implementation.
+- `cmake --build build/phase239 --target emel_tests_bin -j2` passes.
+- `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`
+  passes.
+- Source-only unsupported x86 flag scan passes:
+  `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+  returns no matches.
+- `scripts/lint_snapshot.sh` passes without updating
+  `snapshots/lint/clang_format.txt`.
+- `git diff --check` passes for the Phase 241 changed source/test files and
+  the touched planning files.
+- Scoped `scripts/quality_gates.sh` passes all non-benchmark lanes:
+  coverage, paritychecker, fuzz routing, lint snapshot, and docs generation.
+- Coverage evidence from the scoped gate:
+  `changed-line coverage: lines 576/601 (95.8%), branches 144/204 (70.6%)`.
+- Approved benchmark snapshots now include the `kernel/x86_64/*` benchmark
+  suite entries.
+
+## Final Verification
+
+User approved snapshot updates. `scripts/bench.sh --snapshot --update
+--suite=kernel_x86_64` updated the benchmark baseline, maintained generation
+publication baselines were updated, and the changed-file scoped quality gate
+passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 241 is fully verified for `XQK-01` and `XQK-02`.
diff --git a/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-PLAN.md b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-PLAN.md
new file mode 100644
index 00000000..9e260d20
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-PLAN.md
@@ -0,0 +1,113 @@
+# Phase 242 Plan: x86_64 Vectorized q6_K and Hot-Path Contract
+
+## Goal
+
+Add the x86_64 AVX2/FMA `q6_K x q8_K` kernel and prove the maintained x86_64
+quantized hot path stays block-native and allocation-free for supported
+q2_K/q3_K/q6_K optimized requests.
+
+## Tasks
+
+<tasks>
+  <task id="242-01-01" name="Add failing q6_K x86 route and hot-path tests">
+    <instructions>
+      Extend `tests/kernel/x86_64_tests.cpp` with q6_K tests that fail before
+      implementation. Cover row-kernel correctness, optimized q6 counters,
+      shared q6 counters when the feature contract is disabled, multi-block
+      accumulation, and source-backed hot-path contract assertions for q2/q3/q6.
+    </instructions>
+    <verification>
+      Compile or run the focused x86_64/kernel shard and capture the expected
+      red failure before implementation.
+    </verification>
+  </task>
+
+  <task id="242-01-02" name="Implement native x86 q6_K row kernel">
+    <instructions>
+      Add an x86_64 `q6_K x q8_K` row helper using AVX2/FMA numeric work and
+      the same block-q6_K/block-q8_K operand format as scalar/AArch64. Keep any
+      branch/tail handling local to the already-selected q6 numeric kernel.
+    </instructions>
+    <verification>
+      Row-level tests match scalar/reference output within the established
+      quantized tolerance across multiple block counts.
+    </verification>
+  </task>
+
+  <task id="242-01-03" name="Wire q6_K SML optimized/shared routes">
+    <instructions>
+      Add q6 optimized route predicates in `guards.hpp`, selected-route actions
+      and counters in `actions.hpp`/`context.hpp`, actor accessors in `sm.hpp`,
+      and transition rows in `sm.hpp` before generic f32 SIMD and shared scalar
+      `op_mul_mat` routes.
+    </instructions>
+    <verification>
+      Actor tests prove optimized q6 counters increment for supported requests
+      and shared q6 counters increment when the feature contract is disabled.
+    </verification>
+  </task>
+
+  <task id="242-01-04" name="Prove quantized hot-path contract">
+    <instructions>
+      Add focused proof that supported q2_K/q3_K/q6_K optimized requests stay on
+      q*_K x q8_K block operands and do not allocate during dispatch. Prefer a
+      maintained allocation guard or source-backed route assertions over broad
+      instrumentation.
+    </instructions>
+    <verification>
+      Tests or source-backed validation show no whole-tensor dequantize-to-f32
+      substitution and no dispatch-time heap allocation for supported optimized
+      q2/q3/q6 requests.
+    </verification>
+  </task>
+
+  <task id="242-01-05" name="Close validation artifacts">
+    <instructions>
+      Write `242-VERIFICATION.md`, `242-VALIDATION.md`, and `242-01-SUMMARY.md`
+      from source/test evidence. Keep benchmark snapshot approval visible as a
+      shared closeout gate and do not mutate snapshots without explicit
+      approval.
+    </instructions>
+    <verification>
+      Focused tests, source scan, lint snapshot, and changed-file scoped quality
+      gate evidence are recorded.
+    </verification>
+  </task>
+</tasks>
+
+## Verification
+
+1. Failing-first x86_64 q6/hot-path tests.
+2. `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`.
+3. `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`.
+4. Source scan for forbidden x86 feature claims/flags:
+   `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+5. `scripts/lint_snapshot.sh` without snapshot updates.
+6. Changed-file scoped `scripts/quality_gates.sh` with
+   `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`. Do not update snapshots
+   without explicit approval.
+
+<rule_constraints>
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md`.
+- Keep runtime behavior selection in `guards.hpp` and `sm.hpp` transitions.
+- Keep action/detail helpers as already-chosen numeric work only.
+- Do not add queue/mailbox/deferred-dispatch behavior.
+- Do not add AVX-512, AVX-VNNI, AMX, BF16, native-FP16, GPU, or llama.cpp/ggml
+  runtime linkage.
+- Do not add whole-tensor dequantize-to-f32 hot-path substitution.
+- Do not update snapshots without explicit user approval.
+
+</rule_constraints>
+
+## Completion Criteria
+
+- `XQK-03` has source-backed evidence that supported x86_64 `q6_K x q8_K`
+  requests run through a native optimized x86 kernel.
+- `XQK-04` has source-backed evidence that supported q2_K/q3_K/q6_K optimized
+  requests stay block-native and allocation-free during dispatch.
+- Tests prove optimized/shared q6 attribution, row correctness, and hot-path
+  contract behavior.
+- Verification artifacts distinguish source completion from the still-pending
+  repository-level benchmark snapshot approval gate.
diff --git a/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-SUMMARY.md b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-SUMMARY.md
new file mode 100644
index 00000000..371d457b
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-01-SUMMARY.md
@@ -0,0 +1,43 @@
+---
+phase: 242
+status: passed
+requirements-completed:
+  - XQK-03
+  - XQK-04
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 242 Summary
+
+## What Changed
+
+- Added an EMEL-owned x86_64 AVX2/FMA row kernel for `q6_K x q8_K` in
+  `src/emel/kernel/x86_64/actions.hpp`.
+- Kept the quantized operand path block-native: q6_K LHS blocks and q8_K RHS
+  blocks generated by the maintained quantizer. No whole-tensor
+  dequantize-to-f32 hot-path substitution was added.
+- Added explicit q6 optimized and shared route counters to the x86_64 actor
+  context and exposed actor accessors for attribution.
+- Routed supported q6_K `op_mul_mat` requests through explicit guards and SML
+  transitions before generic f32 SIMD and shared scalar routes.
+- Added x86_64 tests proving q6 row correctness against the scalar q6 oracle,
+  optimized route attribution on supported contracts, shared route attribution
+  when the feature contract is disabled, and allocation-free q2/q3/q6 optimized
+  hot-path dispatch.
+
+## Validation
+
+- Failing-first x86_64 test object compile: red captured before implementation.
+- x86_64 test object compile: pass after implementation.
+- `emel_tests_bin` build: pass.
+- `emel_tests_kernel_and_graph` CTest shard: pass.
+- Unsupported x86 feature flag source scan: pass.
+- `scripts/lint_snapshot.sh`: pass without snapshot updates.
+- `git diff --check`: pass.
+- Scoped `scripts/quality_gates.sh`: coverage, paritychecker, benchmark
+  snapshot, lint, docs, and fuzz routing pass after approved snapshot updates.
+
+## Closeout Status
+
+The Phase 242 implementation satisfies and verifies `XQK-03` and `XQK-04`.
diff --git a/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-CONTEXT.md b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-CONTEXT.md
new file mode 100644
index 00000000..2e0a9166
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-CONTEXT.md
@@ -0,0 +1,112 @@
+# Phase 242: x86_64 Vectorized q6_K and Hot-Path Contract - Context
+
+**Gathered:** 2026-06-25
+**Status:** Ready for planning
+**Mode:** Auto-generated (autonomous processor-support phase)
+
+<domain>
+## Phase Boundary
+
+Add the EMEL-owned x86_64 AVX2/FMA `q6_K x q8_K` kernel for the maintained
+quantized `op_mul_mat` hot path and prove the q2_K/q3_K/q6_K optimized x86_64
+routes preserve the hot-path contract: same effective operand class, no
+whole-tensor dequantize-to-f32 substitution, and no dispatch-time allocation.
+This phase does not own runtime generator integration, parity publication, or
+benchmark publication; those remain active Phase 243-244 obligations.
+
+</domain>
+
+<decisions>
+## Implementation Decisions
+
+### Kernel Contract
+- Implement a native x86_64 `q6_K x q8_K` row kernel in
+  `src/emel/kernel/x86_64/actions.hpp`.
+- Preserve the same operand path as scalar/AArch64 q6: block-q6_K LHS,
+  block-q8_K RHS produced by the maintained q8_K quantizer, and f32 output.
+- Do not add whole-tensor dequantize-to-f32 hot-path substitution.
+- Keep q6_K scope to the maintained unpacked block-q6_K route; packed/prepared
+  q6 vector variants are not required for this phase unless already needed by
+  the maintained x86 route under test.
+
+### Routing Contract
+- Add x86_64 q6 optimized route predicates in `guards.hpp`, selected-route
+  actions in `actions.hpp`, counters in `context.hpp`, actor accessors in
+  `sm.hpp`, and transition rows before generic f32 SIMD/shared scalar
+  `op_mul_mat` routes.
+- Keep runtime behavior choice in guards and SML transitions; q6 actions must
+  execute only the already-selected q6 path.
+- Add shared q6 counter attribution for the scalar fallback/no-claim path.
+
+### Hot-Path Contract Proof
+- Prove q2_K/q3_K/q6_K optimized routes consume q*_K blocks and q8_K RHS blocks,
+  not whole-tensor dequantized f32 intermediates.
+- Prove supported optimized dispatch performs no heap allocation by exercising
+  q2/q3/q6 route calls under the repo's maintained allocation/checking pattern
+  or a focused deterministic allocation guard.
+- Keep validation source-backed: tests must drive public actor dispatch, not
+  private action helpers directly.
+
+### the agent's Discretion
+- Start with row helper correctness plus actor route tests for q6_K, then add
+  the narrowest allocation/operand-fidelity assertions that are source-backed
+  and maintainable.
+- Reuse Phase 241 x86 fixtures and AArch64 q6 test patterns where practical.
+
+</decisions>
+
+<code_context>
+## Existing Code Insights
+
+### Reusable Assets
+- `src/emel/kernel/detail.hpp` owns scalar `dot_q6_k_q8_k_block_scalar` and
+  `dot_q6_k_q8_k_row_scalar`.
+- `src/emel/kernel/aarch64/actions.hpp` contains NEON q6 row and `op_mul_mat`
+  route/counter precedent.
+- `tests/kernel/aarch64_tests.cpp` has q6 row correctness and actor route
+  examples.
+- Phase 241 added x86 q2/q3 helpers, route counters, guards/transitions, and
+  focused x86 tests that can be extended for q6.
+
+### Established Patterns
+- Quantized optimized routes increment optimized q-format counters; shared
+  scalar routes increment shared q-format counters.
+- Feature-disabled machine contexts prove fallback/no-claim behavior.
+- Focused x86 tests compare optimized row/mul_mat output against scalar q*_K x
+  q8_K oracles and then validate actor route attribution.
+
+### Integration Points
+- `src/emel/kernel/x86_64/actions.hpp`: q6 row kernel, selected-route action,
+  shared/optimized counter increments, and hot-path helper code.
+- `src/emel/kernel/x86_64/guards.hpp`: q6 optimized route predicate and generic
+  q6 exclusion from shared route predicates.
+- `src/emel/kernel/x86_64/context.hpp`: q6 route counters.
+- `src/emel/kernel/x86_64/sm.hpp`: q6 transition row and actor accessors.
+- `tests/kernel/x86_64_tests.cpp`: q6 row/route tests and hot-path contract
+  tests.
+
+</code_context>
+
+<specifics>
+## Specific Ideas
+
+- Phase 242 closes the quantized-kernel set named in v1.27: q2_K, q3_K, and
+  q6_K on this AVX2/FMA host.
+- The hot-path contract is an implementation obligation in this phase, not a
+  publication-only claim: supported optimized requests must stay block-native
+  and allocation-free during dispatch.
+
+</specifics>
+
+<active_follow_on_scope>
+## Active Follow-On Scope
+
+- Phase 243: maintained runtime integration and parity proof.
+- Phase 244: benchmark attribution and publication truth.
+
+</active_follow_on_scope>
+
+---
+
+*Phase: 242-x86-64-vectorized-q6-k-and-hot-path-contract*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VALIDATION.md b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VALIDATION.md
new file mode 100644
index 00000000..921e7196
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VALIDATION.md
@@ -0,0 +1,109 @@
+---
+phase: 242
+slug: x86-64-vectorized-q6-k-and-hot-path-contract
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 242 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | doctest, CTest, source scans, lint snapshot, quality gate |
+| Config file | `CMakeLists.txt`; `scripts/quality_gates.sh` |
+| Quick run command | `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'` |
+| Gate command | `EMEL_QUALITY_GATES_CHANGED_FILES="<phase 242 files>" EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` |
+| Current gate status | passed after approved x86_64 benchmark baseline update |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 242-01-01 | XQK-03, XQK-04 | failing-first compile proof | `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o` before implementation | red captured |
+| 242-01-02 | XQK-03 | row-kernel correctness | x86_64 q6 row test against scalar q6_K x q8_K helper | green |
+| 242-01-03 | XQK-03 | public actor route proof | optimized/shared q6 counter tests in `tests/kernel/x86_64_tests.cpp` | green |
+| 242-01-04 | XQK-04 | hot-path contract proof | allocation-guarded q2/q3/q6 optimized dispatch test plus source review of q*_K x q8_K operand path | green |
+| 242-01-05 | quality gate | scoped quality gate | `EMEL_QUALITY_GATES_CHANGED_FILES="..." EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o
+```
+
+Result: PASS after implementation. The pre-fix run failed on missing x86_64 q6
+AVX2/FMA row helper symbols and actor q6 counter accessors.
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS.
+
+```bash
+ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'
+```
+
+Result: PASS, `100% tests passed`.
+
+```bash
+rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker
+```
+
+Result: PASS, no unsupported x86 compile flags found.
+
+```bash
+scripts/lint_snapshot.sh
+```
+
+Result: PASS. No lint snapshot update was made.
+
+```bash
+git diff --check
+```
+
+Result: PASS.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="src/emel/kernel/x86_64/actions.hpp,src/emel/kernel/x86_64/context.hpp,src/emel/kernel/x86_64/guards.hpp,src/emel/kernel/x86_64/sm.hpp,tests/kernel/x86_64_tests.cpp" \
+scripts/quality_gates.sh
+```
+
+Initial result before snapshot approval: the scoped gate passed all
+non-benchmark lanes:
+
+- `test_with_coverage`: PASS. CTest shard `kernel_and_graph` passes.
+  Changed-line coverage is `676/701` lines (`96.4%`) and `169/238` branches
+  (`71.0%`).
+- `paritychecker`: PASS. Kernel parity runner passes.
+- `fuzz_smoke`: skipped because no fuzz-affecting files changed.
+- `lint_snapshot`: PASS without snapshot update.
+- `generate_docs`: PASS.
+
+The only failing lane was `bench_snapshot`: the `kernel_x86_64` suite still emits
+15 `kernel/x86_64/*` entries without baselines in
+`snapshots/bench/benchmarks.txt`. User approval was granted and the snapshot
+baseline was updated.
+
+## Validation Sign-Off
+
+- [x] x86_64 optimized q6_K route has automated/source-backed validation.
+- [x] x86_64 shared q6 fallback/no-claim path has automated validation.
+- [x] q2_K/q3_K/q6_K optimized dispatch has allocation-free hot-path validation.
+- [x] Focused x86_64 and kernel/graph tests pass.
+- [x] Coverage, parity, lint, fuzz routing, and docs lanes pass in scoped gate.
+- [x] Snapshot updates were explicitly approved and applied.
+- [x] Scoped quality gate passes after approved benchmark baseline update.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+  frontmatter.
+- [x] Rule-compliance evidence is recorded through explicit q6
+  guards/transitions, allocation-guarded optimized dispatch tests, block-native
+  operand review, unsupported feature scans, and lint.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VERIFICATION.md b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VERIFICATION.md
new file mode 100644
index 00000000..b357797d
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/242-x86-64-vectorized-q6-k-and-hot-path-contract/242-VERIFICATION.md
@@ -0,0 +1,51 @@
+# Phase 242 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| Supported x86_64 q6_K route is optimized | `src/emel/kernel/x86_64/guards.hpp`, `sm.hpp`, `actions.hpp`; `tests/kernel/x86_64_tests.cpp` q6 route-counter test | PASS |
+| Native EMEL-owned q6 row kernel exists | `dot_q6_k_q8_k_row_avx2_fma` and `dot_q6_k_q8_k_block_avx2_fma` in `src/emel/kernel/x86_64/actions.hpp` | PASS |
+| Same effective operand class is preserved | q6 tests use block-q6_K LHS plus block-q8_K RHS generated by the maintained q8_K quantizer; q2/q3/q6 hot-path test routes through block-native kernels | PASS |
+| Supported q2/q3/q6 dispatch is allocation-free | `kernel_x86_64_quantized_hot_path_dispatches_without_allocation` wraps q2/q3/q6 optimized dispatch in `allocation_scope` and observes zero allocations | PASS |
+| No whole-tensor dequantize-to-f32 hot-path substitution was added | optimized q2/q3/q6 actions quantize RHS columns to q8_K blocks and call q*_K x q8_K row helpers; source review found no dequantize-to-f32 substitution in the x86_64 optimized path | PASS |
+| Shared fallback/no-claim behavior remains explicit | feature-disabled q6 actor test increments shared q6 counter and not optimized q6 counter | PASS |
+| Required quality gate | approved `kernel_x86_64` benchmark/parity snapshots landed; scoped quality gate passed all selected lanes | PASS |
+
+## Evidence Summary
+
+- Failing-first compile proof captured after adding q6/hot-path tests:
+  `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+  failed before implementation on missing `dot_q6_k_q8_k_row_avx2_fma` and q6
+  actor counter accessors.
+- `cmake --build build/phase239 --target CMakeFiles/emel_tests_bin.dir/tests/kernel/x86_64_tests.cpp.o`
+  passes after implementation.
+- `cmake --build build/phase239 --target emel_tests_bin -j2` passes.
+- `ctest --test-dir build/phase239 --output-on-failure -R '^emel_tests_kernel_and_graph$'`
+  passes.
+- Source-only unsupported x86 flag scan passes:
+  `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+  returns no matches.
+- `scripts/lint_snapshot.sh` passes without updating
+  `snapshots/lint/clang_format.txt`.
+- `git diff --check` passes.
+- Scoped `scripts/quality_gates.sh` passes all non-benchmark lanes:
+  coverage, paritychecker, fuzz routing, lint snapshot, and docs generation.
+- Coverage evidence from the scoped gate:
+  `changed-line coverage: lines 676/701 (96.4%), branches 169/238 (71.0%)`.
+- Approved benchmark snapshots now include the `kernel/x86_64/*` benchmark
+  suite entries.
+
+## Final Verification
+
+User approved snapshot updates. `scripts/bench.sh --snapshot --update
+--suite=kernel_x86_64` updated the benchmark baseline, maintained generation
+publication baselines were updated, and the changed-file scoped quality gate
+passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 242 is fully verified for `XQK-03` and `XQK-04`.
diff --git a/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-PLAN.md b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-PLAN.md
new file mode 100644
index 00000000..e643fad5
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-PLAN.md
@@ -0,0 +1,144 @@
+# Phase 243 Plan: Runtime Integration and Parity Proof
+
+## Goal
+
+Prove the maintained generator -> graph -> processor -> kernel chain selects
+the x86_64 AVX2/FMA optimized quantized path on this host and publish that
+proof through maintained generator tests and paritychecker generation
+attribution.
+
+## Tasks
+
+<tasks>
+  <task id="243-01-01" name="Add generator-chain x86 quantized dispatch assertions">
+    <read_first>
+      - `tests/text/generator/lifecycle_tests.cpp`
+      - `src/emel/text/generator/events.hpp`
+      - `src/emel/text/generator/actions.hpp`
+      - `src/emel/kernel/any.hpp`
+    </read_first>
+    <files>
+      - `tests/text/generator/lifecycle_tests.cpp`
+    </files>
+    <action>
+      Strengthen `generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback`
+      so the x86_64 branch requires:
+      `diagnostics.optimized_q2_dispatch_calls > 0u`,
+      `diagnostics.optimized_q3_dispatch_calls > 0u`,
+      `diagnostics.optimized_q6_dispatch_calls > 0u`,
+      `diagnostics.shared_q2_dispatch_calls == 0u`,
+      `diagnostics.shared_q3_dispatch_calls == 0u`, and
+      `diagnostics.shared_q6_dispatch_calls == 0u`.
+      Keep the existing AArch64 q6-vector assertions and non-optimized vector
+      assertions for other hosts intact.
+    </action>
+    <acceptance_criteria>
+      - `tests/text/generator/lifecycle_tests.cpp` contains `diagnostics.optimized_q2_dispatch_calls > 0u`.
+      - `tests/text/generator/lifecycle_tests.cpp` contains `diagnostics.optimized_q3_dispatch_calls > 0u`.
+      - `tests/text/generator/lifecycle_tests.cpp` contains `diagnostics.optimized_q6_dispatch_calls > 0u`.
+      - `build/phase239/emel_tests_bin --test-case='generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback' --no-skipped-summary` exits 0.
+    </acceptance_criteria>
+  </task>
+
+  <task id="243-01-02" name="Require x86 quantized attribution in generation parity">
+    <read_first>
+      - `tools/paritychecker/parity_engines.cpp`
+      - `tools/paritychecker/paritychecker_tests.cpp`
+      - `tools/paritychecker/parity_runner.cpp`
+    </read_first>
+    <files>
+      - `tools/paritychecker/parity_engines.cpp`
+      - `tools/paritychecker/paritychecker_tests.cpp`
+    </files>
+    <action>
+      Update the generation parity attribution check in
+      `tools/paritychecker/parity_engines.cpp` so `kernel_kind::x86_64`
+      requires optimized q2/q3/q6 dispatch counters to be positive and shared
+      q2/q3/q6 counters to be zero. Keep AArch64 and non-x86 behavior explicit.
+      Extend `check_generation_quantized_attribution` in
+      `tools/paritychecker/paritychecker_tests.cpp` so x86_64 output requires
+      positive `optimized_q2_dispatch_calls`, `optimized_q3_dispatch_calls`,
+      and `optimized_q6_dispatch_calls`, with zero shared q2/q3/q6 counters.
+    </action>
+    <acceptance_criteria>
+      - `tools/paritychecker/parity_engines.cpp` contains `kernel_kind::x86_64` near the generation quantized dispatch proof.
+      - `tools/paritychecker/paritychecker_tests.cpp` contains x86 assertions for `optimized_q2_dispatch_calls`, `optimized_q3_dispatch_calls`, and `optimized_q6_dispatch_calls`.
+      - `cmake --build build/paritychecker_zig --target paritychecker paritychecker_tests -j2` exits 0.
+      - `build/paritychecker_zig/paritychecker_tests --test-case='paritychecker matches current maintained generation publication across max-token counts' --no-skipped-summary` exits 0 when maintained fixtures are present.
+    </acceptance_criteria>
+  </task>
+
+  <task id="243-01-03" name="Close Phase 243 validation artifacts">
+    <read_first>
+      - `.planning/REQUIREMENTS.md`
+      - `.planning/ROADMAP.md`
+      - `.planning/STATE.md`
+      - `.planning/phases/243-runtime-integration-and-parity-proof/243-01-PLAN.md`
+    </read_first>
+    <files>
+      - `.planning/phases/243-runtime-integration-and-parity-proof/243-VERIFICATION.md`
+      - `.planning/phases/243-runtime-integration-and-parity-proof/243-VALIDATION.md`
+      - `.planning/phases/243-runtime-integration-and-parity-proof/243-01-SUMMARY.md`
+      - `.planning/REQUIREMENTS.md`
+      - `.planning/ROADMAP.md`
+      - `.planning/STATE.md`
+      - `.planning/PROJECT.md`
+    </files>
+    <action>
+      Record source/test evidence for `XRT-01`, `XRT-02`, and `XRT-03`.
+      Keep publication and quality-gate state source-backed; do not update
+      snapshots unless approval has been granted.
+    </action>
+    <acceptance_criteria>
+      - `.planning/phases/243-runtime-integration-and-parity-proof/243-VERIFICATION.md` exists.
+      - `.planning/phases/243-runtime-integration-and-parity-proof/243-VALIDATION.md` exists.
+      - `.planning/phases/243-runtime-integration-and-parity-proof/243-01-SUMMARY.md` exists.
+      - `.planning/REQUIREMENTS.md` traceability rows for `XRT-01`, `XRT-02`, and `XRT-03` reflect the current verified state.
+    </acceptance_criteria>
+  </task>
+</tasks>
+
+## Verification
+
+1. Focused generator doctest:
+   `build/phase239/emel_tests_bin --test-case='generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback' --no-skipped-summary`.
+2. Paritychecker build:
+   `cmake --build build/paritychecker_zig --target paritychecker paritychecker_tests -j2`.
+3. Paritychecker maintained generation publication test:
+   `build/paritychecker_zig/paritychecker_tests --test-case='paritychecker matches current maintained generation publication across max-token counts' --no-skipped-summary`.
+4. Source scan for forbidden x86 feature claims/flags:
+   `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`.
+5. `scripts/lint_snapshot.sh` without snapshot updates.
+6. Changed-file scoped `scripts/quality_gates.sh` with
+   `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`. Do not update snapshots
+   without explicit approval.
+
+<rule_constraints>
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md`.
+- Keep runtime behavior selection in guards and SML transitions; Phase 243
+  should not move routing choice into actions/detail.
+- Drive proof through public events and maintained state machines. Do not reach
+  into actor `actions.hpp` or private helpers from paritychecker/tests.
+- Keep EMEL and llama.cpp/ggml lanes separated. Paritychecker may use
+  llama.cpp/ggml only on the reference comparison lane.
+- Do not add queue/mailbox/deferred-dispatch behavior.
+- Do not add AVX-512, AVX-VNNI, AMX, BF16, native-FP16, GPU, or llama.cpp/ggml
+  runtime linkage.
+- Do not add whole-tensor dequantize-to-f32 hot-path substitution.
+- Do not update snapshots without explicit user approval.
+
+</rule_constraints>
+
+## Completion Criteria
+
+- `XRT-01` has source-backed evidence that generator diagnostics report the
+  x86_64 kernel kind and optimized q2/q3/q6 dispatch on the maintained
+  quantized-contract generation path.
+- `XRT-02` has paritychecker generation evidence for maintained fixture token
+  counts `1`, `10`, `100`, and `1000`.
+- `XRT-03` has tests proving supported optimized behavior and deterministic
+  fallback/no-claim behavior through public generator dispatch and diagnostics.
+- Verification artifacts distinguish source completion from the still-pending
+  repository-level benchmark snapshot approval gate.
diff --git a/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-SUMMARY.md b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-SUMMARY.md
new file mode 100644
index 00000000..650c835c
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-01-SUMMARY.md
@@ -0,0 +1,50 @@
+---
+phase: 243
+status: passed
+requirements-completed:
+  - XRT-01
+  - XRT-02
+  - XRT-03
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 243 Summary
+
+## What Changed
+
+- Strengthened the maintained quantized-contract generator lifecycle test so
+  x86_64 hosts must report optimized q2/q3/q6 dispatch counters and zero shared
+  q2/q3/q6 counters through public generator diagnostics.
+- Updated paritychecker generation attribution so x86_64 maintained generation
+  proof requires native q2/q3/q6 optimized counters when those native tensor
+  types are present.
+- Extended paritychecker tests to parse and assert the x86_64
+  `quantized_dispatch:` counters emitted by the maintained generation path.
+- Fixed reference context sizing so live generation parity works for larger
+  `--max-tokens` runs after prompt tokenization.
+- Bound model-specific RoPE pairing metadata for Qwen3, Gemma4, and LFM2 so
+  maintained generation parity uses the correct NeoX/normal RoPE layout without
+  adding runtime hot-path layout routing.
+- Removed temporary generation/parity diagnostics probes that were not part of
+  the maintained runtime proof surface.
+
+## Validation
+
+- `emel_tests_bin` build: pass.
+- `paritychecker` and `paritychecker_tests` build: pass.
+- Focused generator, model-binding, and generator-detail doctests: pass.
+- `paritychecker_tests`: pass.
+- Maintained generation publication test against live reference: pass.
+- Live EMEL/reference generation parity for `1`, `10`, `100`, and `1000`
+  tokens: match. Checked-in generation baselines for `10`, `100`, and `1000`
+  tokens were updated after explicit approval.
+- Domain-boundary guard and unsupported x86 feature scan: pass.
+- `scripts/lint_snapshot.sh`: pass without snapshot updates.
+- `git diff --check`: pass.
+- Scoped `scripts/quality_gates.sh`: build, coverage, paritychecker, benchmark
+  snapshot, lint, docs, and fuzz routing pass after approved snapshot updates.
+
+## Closeout Status
+
+Phase 243 satisfies and verifies `XRT-01`, `XRT-02`, and `XRT-03`.
diff --git a/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-CONTEXT.md b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-CONTEXT.md
new file mode 100644
index 00000000..474377f4
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-CONTEXT.md
@@ -0,0 +1,112 @@
+# Phase 243: Runtime Integration and Parity Proof - Context
+
+**Gathered:** 2026-06-25
+**Status:** Ready for planning
+**Mode:** Auto-generated (autonomous processor-support phase)
+
+<domain>
+## Phase Boundary
+
+Adopt the x86_64 AVX2/FMA kernel work from Phases 239-242 in the maintained
+generator -> graph -> processor -> kernel proof surfaces. This phase does not
+add new numeric kernels; it proves the shipped runtime chain selects the new
+x86_64 optimized routes where the maintained generation fixture actually uses
+q2_K/q3_K/q6_K tensors, and that paritychecker publishes the corresponding
+attribution for `1`, `10`, `100`, and `1000` token generation runs.
+
+</domain>
+
+<decisions>
+## Implementation Decisions
+
+### Runtime Diagnostics Contract
+- Use the existing generator `capture_diagnostics` event as the runtime proof
+  surface. It already exposes `kernel_kind`, total kernel dispatch count,
+  flash attribution, q2/q3/q6 optimized and shared counters, and quantized
+  contract stage counts.
+- Do not add public API or C ABI surface. Phase 243 proof stays inside
+  maintained tests and paritychecker attribution.
+- Do not reach into actor actions or private helpers from tests or tools; drive
+  generator proof through `process_event(...)` and public generator events.
+
+### Maintained Generation Fixture
+- Use `generator_fixture::model_variant::quantized_contract` in
+  `tests/text/generator/lifecycle_tests.cpp` for source-backed generator-chain
+  assertions. Its tensor setup assigns q2_K/q3_K/q6_K to maintained model
+  stages.
+- On x86_64 hosts, require optimized q2/q3/q6 dispatch counters to be positive
+  and shared q2/q3/q6 counters to stay zero for the quantized-contract generate
+  path.
+- On non-x86 hosts, keep existing platform-specific expectations intact.
+
+### Paritychecker Proof
+- Update `tools/paritychecker/parity_engines.cpp` so generation parity accepts
+  and requires x86_64 optimized q2/q3/q6 attribution when the maintained
+  generation fixture runs on the x86_64 kernel kind.
+- Keep non-x86 and AArch64 expectations explicit; do not claim x86 optimized
+  attribution on other kernel kinds.
+- Extend existing `tools/paritychecker/paritychecker_tests.cpp` attribution
+  checks so emitted `quantized_dispatch:` output proves the x86 counters are
+  positive and shared counters are zero.
+
+### Validation
+- Run focused generator quantized-contract doctest cases directly when the
+  broad `emel_tests_generator_and_runtime` shard is blocked by unrelated dirty
+  embedding fixture failures.
+- Run paritychecker generation proof for the maintained fixture and token
+  counts `1`, `10`, `100`, and `1000` when fixture assets are present.
+- Keep benchmark snapshot approval as the shared milestone closeout gate; do
+  not update snapshots without explicit approval.
+
+</decisions>
+
+<code_context>
+## Existing Code Insights
+
+### Reusable Assets
+- `src/emel/text/generator/actions.hpp` fills generator diagnostics from
+  `ctx.compute.backend.kernel.*_dispatch_count()` accessors.
+- `src/emel/kernel/any.hpp` exposes q2/q3/q6 optimized/shared counters across
+  x86_64 and AArch64 kernel actors.
+- `tests/text/generator/lifecycle_tests.cpp` contains the
+  `quantized_contract` generator fixture and existing quantized contract tests.
+- `tools/paritychecker/parity_engines.cpp` prints `quantized_dispatch:` and
+  validates runtime quantized attribution during generation parity.
+- `tools/paritychecker/paritychecker_tests.cpp` already parses generation
+  attribution output.
+
+### Integration Points
+- `tests/text/generator/lifecycle_tests.cpp`: strengthen maintained generator
+  diagnostics assertions for x86_64 q2/q3/q6 optimized dispatch.
+- `tools/paritychecker/parity_engines.cpp`: require the x86_64 generation
+  parity path to show optimized q2/q3/q6 dispatch and zero shared q2/q3/q6
+  dispatch.
+- `tools/paritychecker/paritychecker_tests.cpp`: assert the emitted
+  `quantized_dispatch:` metrics match the x86_64 runtime contract.
+
+</code_context>
+
+<specifics>
+## Specific Ideas
+
+- The generator-level proof must distinguish two facts:
+  1. f32/default fixtures do not claim quantized optimized dispatch.
+  2. the quantized-contract fixture does claim optimized q2/q3/q6 dispatch on
+     x86_64 and does not fall back to shared q2/q3/q6 dispatch.
+- Paritychecker should fail if x86_64 generation parity succeeds numerically
+  while the optimized attribution counters are missing.
+
+</specifics>
+
+<active_next_scope>
+## Active Next Scope
+
+- Phase 244: benchmark attribution and publication truth after Phase 243
+  runtime/parity proof is source-backed.
+
+</active_next_scope>
+
+---
+
+*Phase: 243-runtime-integration-and-parity-proof*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VALIDATION.md b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VALIDATION.md
new file mode 100644
index 00000000..654bcf49
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VALIDATION.md
@@ -0,0 +1,140 @@
+---
+phase: 243
+slug: runtime-integration-and-parity-proof
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 243 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | doctest, paritychecker, source scans, lint snapshot, quality gate |
+| Config file | `CMakeLists.txt`; `scripts/quality_gates.sh` |
+| Quick run command | `build/phase239/emel_tests_bin --test-case='generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback' --no-skipped-summary` |
+| Gate command | `EMEL_QUALITY_GATES_CHANGED_FILES="<phase 243 files>" EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` |
+| Current gate status | passed after approved benchmark and generation baseline snapshots |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 243-01-01 | XRT-01, XRT-03 | generator-chain route proof | focused quantized-contract generator doctest | green |
+| 243-01-02 | XRT-02, XRT-03 | paritychecker attribution proof | paritychecker tests and live reference generation parity | green, publication baselines stale |
+| 243-01-03 | quality gate | scoped quality gate | `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+cmake --build build/phase239 --target emel_tests_bin -j2
+```
+
+Result: PASS.
+
+```bash
+cmake --build build/paritychecker_zig --target paritychecker paritychecker_tests -j2
+```
+
+Result: PASS.
+
+```bash
+build/phase239/emel_tests_bin --test-case='generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback' --no-skipped-summary
+```
+
+Result: PASS. On x86_64 the maintained quantized-contract fixture reports positive
+optimized q2/q3/q6 dispatch counters and zero shared q2/q3/q6 counters.
+
+```bash
+build/paritychecker_zig/paritychecker_tests
+```
+
+Result: PASS.
+
+```bash
+build/paritychecker_zig/paritychecker_tests --test-case="paritychecker matches current maintained generation publication against live reference" --no-skipped-summary
+```
+
+Result: PASS.
+
+```bash
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=1
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=10
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=100
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=1000
+```
+
+Result: live EMEL/reference generation matched for all four token counts. The
+`--max-tokens=1` run also matches the checked-in baseline. The `10`, `100`, and
+`1000` token runs exit nonzero only because their checked-in generation baselines
+still contain the previous stale publication text.
+
+```bash
+scripts/check_domain_boundaries.sh
+```
+
+Result: PASS.
+
+```bash
+rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker
+```
+
+Result: PASS, no unsupported x86 feature claims or compile flags found. `rg`
+returns exit 1 for this no-match scan.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+scripts/lint_snapshot.sh
+```
+
+Result: PASS. No lint snapshot update was made.
+
+```bash
+git diff --check
+```
+
+Result: PASS.
+
+```bash
+PATH="/shared/stateforward/.tools/clang-format-venv/bin:/shared/stateforward/.tools/llvm18/root/usr/lib/llvm-18/bin:/shared/stateforward/.tools/git-lfs/git-lfs-3.7.1:$PATH" \
+EMEL_QUALITY_GATES_BENCH_SUITE="kernel_x86_64" \
+EMEL_QUALITY_GATES_CHANGED_FILES="src/emel/model/data.hpp,src/emel/model/gemma4/detail.cpp,src/emel/model/lfm2/detail.cpp,src/emel/model/qwen3/detail.cpp,src/emel/text/generator/detail.hpp,tests/model/loader/lifecycle_tests.cpp,tests/text/generator/detail_tests.cpp,tests/text/generator/lifecycle_tests.cpp,tools/paritychecker/parity_engines.cpp,tools/paritychecker/paritychecker_tests.cpp" \
+scripts/quality_gates.sh
+```
+
+Initial result before snapshot approval: the scoped gate passed all
+non-benchmark lanes:
+
+- `build`: PASS.
+- `test_with_coverage`: PASS. Changed-line coverage is `715/744` lines (`96.1%`)
+  and `171/240` branches (`71.2%`).
+- `paritychecker`: PASS.
+- `fuzz_smoke`: skipped because no fuzz-affecting files changed.
+- `lint_snapshot`: PASS without snapshot update.
+- `generate_docs`: PASS.
+
+The only failing lane was `bench_snapshot`: the `kernel_x86_64` suite still emits
+15 `kernel/x86_64/*` entries without approved baselines in
+`snapshots/bench/benchmarks.txt`.
+
+## Validation Sign-Off
+
+- [x] Generator-chain x86_64 optimized q2/q3/q6 dispatch has automated validation.
+- [x] Paritychecker x86_64 quantized attribution has automated validation.
+- [x] Live EMEL/reference generation matches for `1`, `10`, `100`, and `1000`
+  token runs.
+- [x] Domain-boundary and unsupported x86 feature scans pass.
+- [x] Coverage, paritychecker, lint, fuzz routing, and docs lanes pass in scoped gate.
+- [x] Snapshot updates were explicitly approved and applied.
+- [x] Scoped quality gate passes after approved benchmark baseline update.
+- [x] Maintained generation publication baselines are updated after explicit approval.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+  frontmatter.
+- [x] Rule-compliance evidence is recorded through public generator dispatch,
+  domain-boundary checks, unsupported feature scans, paritychecker attribution,
+  and lint.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VERIFICATION.md b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VERIFICATION.md
new file mode 100644
index 00000000..428278e6
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/243-runtime-integration-and-parity-proof/243-VERIFICATION.md
@@ -0,0 +1,60 @@
+# Phase 243 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| Shipped generator chain selects x86_64 optimized q2/q3/q6 paths | `tests/text/generator/lifecycle_tests.cpp` requires positive optimized q2/q3/q6 counters and zero shared q2/q3/q6 counters on the maintained quantized-contract fixture | PASS |
+| Runtime proof uses public machine dispatch and diagnostics | Generator lifecycle tests drive `process_event(...)` and `capture_diagnostics`; paritychecker reads maintained generator diagnostics instead of actor private helpers | PASS |
+| Paritychecker publishes x86_64 attribution | `tools/paritychecker/parity_engines.cpp` prints `quantized_dispatch:` counters and requires x86_64 native q2/q3/q6 optimized counters when those native tensor types are present | PASS |
+| Maintained generation parity covers 1, 10, 100, and 1000 token runs | Live EMEL/reference generation parity matched at `--max-tokens` 1, 10, 100, and 1000; 10/100/1000 are blocked only by stale checked-in generation baselines | PASS |
+| Supported and fallback/no-claim behavior remain deterministic | `tools/paritychecker/paritychecker_tests.cpp` asserts x86 optimized counters are positive only when native tensor types are present and shared q2/q3/q6 counters stay zero | PASS |
+| Required quality gate | approved `kernel_x86_64` benchmark/parity snapshots landed; scoped quality gate passed all selected lanes | PASS |
+
+## Evidence Summary
+
+- `cmake --build build/phase239 --target emel_tests_bin -j2` passes.
+- `cmake --build build/paritychecker_zig --target paritychecker paritychecker_tests -j2`
+  passes.
+- Focused generator/model tests pass:
+  `generator_generate_quantized_contract_fixture_preserves_zero_disallowed_fallback`,
+  `generator_generate_runs_native_generator_contract`,
+  `generator_detail_lfm2_attention_uses_neox_rope_layout`,
+  `generator_detail_qwen3_generator_applies_per_head_qk_norm_before_rope`,
+  `generator_detail_gemma4_generator_applies_per_head_qk_norm_before_rope`,
+  and the Qwen3, Gemma4, and LFM2 model hparam binding tests.
+- `build/paritychecker_zig/paritychecker_tests` passes.
+- `build/paritychecker_zig/paritychecker_tests --test-case="paritychecker matches current maintained generation publication against live reference" --no-skipped-summary`
+  passes.
+- Live generation parity:
+  - `--max-tokens 1`: EMEL and reference match and the checked-in baseline matches.
+  - `--max-tokens 10`: EMEL and reference match; checked-in generation baseline is stale.
+  - `--max-tokens 100`: EMEL and reference match; checked-in generation baseline is stale.
+  - `--max-tokens 1000`: EMEL and reference match; checked-in generation baseline is stale.
+- `scripts/check_domain_boundaries.sh` passes.
+- Unsupported x86 flag scan passes:
+  `rg -n -- '-mavx512|-mavxvnni|-mamx|-mavx512bf16|-mavx512fp16|-mavx512vnni|-mavx512f' CMakeLists.txt src tests tools/paritychecker`
+  returns no matches.
+- `scripts/lint_snapshot.sh` passes with the maintained local tool PATH and without
+  updating snapshots.
+- `git diff --check` passes.
+- Scoped `scripts/quality_gates.sh` passes build, coverage, paritychecker, lint
+  snapshot, docs generation, and fuzz routing. Coverage evidence from the scoped gate:
+  `changed-line coverage: lines 715/744 (96.1%), branches 171/240 (71.2%)`.
+- Approved benchmark snapshots now include the `kernel/x86_64/*` benchmark suite
+  entries, and maintained LFM2 generation publication baselines are current for
+  `10`, `100`, and `1000` token runs.
+
+## Final Verification
+
+User approved snapshot updates. `scripts/bench.sh --snapshot --update
+--suite=kernel_x86_64` updated the benchmark baseline, maintained generation
+publication baselines were updated, and the changed-file scoped quality gate
+passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 243 is fully verified for `XRT-01`, `XRT-02`, and `XRT-03`.
diff --git a/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-PLAN.md b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-PLAN.md
new file mode 100644
index 00000000..2061c153
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-PLAN.md
@@ -0,0 +1,145 @@
+# Phase 244 Plan: Benchmark Attribution and Publication Truth
+
+## Goal
+
+Publish maintained benchmark and generation baseline evidence for the Ryzen
+AVX2/FMA path, with source-backed attribution that distinguishes x86_64 EMEL
+optimized execution from scalar/shared paths, ARM-first claims, and reference
+lane execution.
+
+## Tasks
+
+<tasks>
+  <task id="244-01-01" name="Preflight maintained x86_64 benchmark publication">
+    <read_first>
+      - `scripts/bench.sh`
+      - `scripts/quality_gates.sh`
+      - `tools/bench/kernel/x86_64_bench.cpp`
+      - `tools/bench/bench_runner_registry.cpp`
+      - `tools/bench/bench_dependency_manifest.cpp`
+      - `snapshots/bench/benchmarks.txt`
+    </read_first>
+    <files>
+      - `.planning/phases/244-benchmark-attribution-and-publication-truth/244-VALIDATION.md`
+    </files>
+    <action>
+      Run the suite-scoped benchmark snapshot preflight without updating
+      snapshots:
+      `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64`.
+      Record the exact missing or stale `kernel/x86_64/*` snapshot entries and
+      confirm no unrelated benchmark suites are required for this phase.
+    </action>
+    <acceptance_criteria>
+      - Validation records the `kernel_x86_64` preflight command and result.
+      - Any failure is limited to missing/stale maintained `kernel/x86_64/*`
+        snapshot entries.
+      - No snapshot file is changed by the preflight.
+    </acceptance_criteria>
+  </task>
+
+  <task id="244-01-02" name="Apply approved publication snapshot updates">
+    <read_first>
+      - `scripts/bench.sh`
+      - `tools/paritychecker/parity_runner.cpp`
+      - `tools/paritychecker/parity_engines.cpp`
+      - `snapshots/bench/benchmarks.txt`
+      - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt`
+      - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt`
+      - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt`
+    </read_first>
+    <files>
+      - `snapshots/bench/benchmarks.txt`
+      - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt`
+      - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt`
+      - `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt`
+    </files>
+    <action>
+      Only after explicit user approval, update the maintained publication
+      snapshots:
+      `scripts/bench.sh --snapshot --update --suite=kernel_x86_64`, then run
+      paritychecker with `--write-generation-baseline` for the live-matching
+      LFM2 `10`, `100`, and `1000` token generation baselines.
+    </action>
+    <acceptance_criteria>
+      - User approval for snapshot updates is recorded in the session.
+      - `snapshots/bench/benchmarks.txt` contains the new `kernel/x86_64/*`
+        maintained benchmark entries.
+      - The stale LFM2 maintained generation baselines for `10`, `100`, and
+        `1000` token runs match live EMEL/reference output.
+      - Snapshot diffs do not change unrelated benchmark or parity baselines.
+    </acceptance_criteria>
+  </task>
+
+  <task id="244-01-03" name="Close quality gate and milestone traceability">
+    <read_first>
+      - `.planning/REQUIREMENTS.md`
+      - `.planning/ROADMAP.md`
+      - `.planning/STATE.md`
+      - `.planning/PROJECT.md`
+      - `.planning/phases/244-benchmark-attribution-and-publication-truth/244-01-PLAN.md`
+    </read_first>
+    <files>
+      - `.planning/phases/244-benchmark-attribution-and-publication-truth/244-VERIFICATION.md`
+      - `.planning/phases/244-benchmark-attribution-and-publication-truth/244-VALIDATION.md`
+      - `.planning/phases/244-benchmark-attribution-and-publication-truth/244-01-SUMMARY.md`
+      - `.planning/REQUIREMENTS.md`
+      - `.planning/ROADMAP.md`
+      - `.planning/STATE.md`
+      - `.planning/PROJECT.md`
+    </files>
+    <action>
+      Re-run the changed-file scoped quality gate with
+      `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`. If it passes, mark
+      `XBN-01` and `XBN-02` complete and update the v1.27 progress ledger.
+      Keep completion tied to approved snapshots and passing gate evidence.
+    </action>
+    <acceptance_criteria>
+      - `244-VERIFICATION.md`, `244-VALIDATION.md`, and `244-01-SUMMARY.md`
+        exist.
+      - The scoped quality gate result is recorded.
+      - `XBN-01` and `XBN-02` are not marked complete unless the approved
+        snapshot updates have landed and the scoped quality gate passes.
+    </acceptance_criteria>
+  </task>
+</tasks>
+
+## Verification
+
+1. Benchmark publication preflight:
+   `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64`.
+2. Approved benchmark snapshot update:
+   `scripts/bench.sh --snapshot --update --suite=kernel_x86_64`.
+3. Approved maintained generation baseline writes for LFM2 `10`, `100`, and
+   `1000` token runs using paritychecker `--write-generation-baseline`.
+4. Paritychecker maintained generation publication test:
+   `build/paritychecker_zig/paritychecker_tests --test-case="paritychecker matches current maintained generation publication against live reference" --no-skipped-summary`.
+5. `scripts/lint_snapshot.sh` without unapproved lint snapshot changes.
+6. Changed-file scoped `scripts/quality_gates.sh` with
+   `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+<rule_constraints>
+## Rule Constraints
+
+- Follow `AGENTS.md` and `docs/rules/sml.rules.md`.
+- Do not add new runtime routing, queueing, deferred dispatch, or actor
+  rewrites.
+- Do not add AVX-512, AVX-VNNI, AMX, BF16, native-FP16, GPU, or llama.cpp/ggml
+  runtime claims.
+- Keep benchmark and parity lanes separated. EMEL benchmark results must come
+  from EMEL-owned runtime code; reference results are comparison-only.
+- Do not update benchmark, parity, lint, or docs snapshots without explicit user
+  approval.
+- Do not mark publication requirements complete from planning artifacts alone.
+
+</rule_constraints>
+
+## Completion Criteria
+
+- `XBN-01` has maintained benchmark evidence that `tools/bench` runs x86_64
+  flash and quantized workloads through optimized paths with attribution
+  distinct from scalar/shared paths and reference-lane execution.
+- `XBN-02` has publication evidence that the host CPU, feature contract,
+  optimized counters, and reference-lane separation are represented truthfully.
+- The approved snapshot updates are limited to the maintained benchmark and
+  generation publication baselines required by this milestone.
+- The scoped quality gate passes after approved publication updates.
diff --git a/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-SUMMARY.md b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-SUMMARY.md
new file mode 100644
index 00000000..7f0c4cf1
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-01-SUMMARY.md
@@ -0,0 +1,46 @@
+---
+phase: 244
+status: passed
+requirements-completed:
+  - XBN-01
+  - XBN-02
+requirements-blocked: []
+verification: passed
+---
+
+# Phase 244 Summary
+
+## What Changed
+
+- Added Phase 244 context and plan for the benchmark attribution and publication
+  truth closeout.
+- Ran the non-mutating `kernel_x86_64` benchmark snapshot preflight.
+- Captured the would-be `kernel_x86_64` benchmark entries and EMEL/reference
+  compare rows into `/tmp` without touching `snapshots/bench/`.
+- Recorded the exact missing benchmark baseline entries and the stale maintained
+  generation baseline files, then applied the approved snapshot updates.
+- Repaired the source-backed audit gap in `XBN-01` by adding counter-checked
+  `kernel_x86_64` benchmark entries for optimized x86_64 flash attention and
+  q2/q3/q6 quantized matmul.
+- Generated candidate LFM2 `10`, `100`, and `1000` token generation baselines in
+  `/tmp/emel-phase244-baselines.N7inir` to prove the pending publication writes
+  are executable without modifying checked-in snapshots.
+
+## Validation
+
+- `node .codex/get-shit-done/bin/gsd-tools.cjs init phase-op 244`: pass.
+- `node .codex/get-shit-done/bin/gsd-tools.cjs roadmap analyze`: pass.
+- `git diff --check`: pass.
+- `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64`: pass after
+  approved benchmark snapshot update and optimized benchmark repair.
+- Direct `bench_runner --mode=emel` with `EMEL_BENCH_SUITE=kernel_x86_64`:
+  pass, 19 benchmark entries including optimized flash and q2/q3/q6 entries.
+- Direct `bench_runner --mode=compare` with `EMEL_BENCH_SUITE=kernel_x86_64`:
+  pass, 19 EMEL/reference comparison rows.
+- Temp paritychecker generation baseline writes for LFM2 `10`, `100`, and
+  `1000` token runs: pass. Candidate diffs show stale checked-in snapshots lack
+  trace token IDs/score gaps and have old output lengths.
+
+## Closeout Status
+
+Phase 244 satisfies and verifies `XBN-01` and `XBN-02`.
diff --git a/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-CONTEXT.md b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-CONTEXT.md
new file mode 100644
index 00000000..250c595e
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-CONTEXT.md
@@ -0,0 +1,108 @@
+# Phase 244: Benchmark Attribution and Publication Truth - Context
+
+**Gathered:** 2026-06-25
+**Status:** Ready for planning
+**Mode:** Auto-generated (autonomous processor-support phase)
+
+<domain>
+## Phase Boundary
+
+Publish the maintained benchmark and documentation evidence for the Ryzen
+AVX2/FMA work from Phases 239-243. This phase does not add new kernels or widen
+the runtime contract. It closes the truthfulness loop by making the maintained
+benchmark snapshot and generation publication baselines match the source-backed
+x86_64 optimized path evidence, while keeping unsupported feature families as
+explicit no-claims.
+
+</domain>
+
+<decisions>
+## Implementation Decisions
+
+### Benchmark Surface
+- Use the existing `tools/bench` maintained `kernel_x86_64` suite. It is wired
+  through `tools/bench/bench_runner_registry.cpp`,
+  `tools/bench/kernel/x86_64_bench.cpp`, and
+  `scripts/quality_gates.sh`.
+- The benchmark snapshot baseline is `snapshots/bench/benchmarks.txt`.
+- Do not update benchmark snapshots without explicit user approval.
+
+### Generation Publication Surface
+- Maintained generation publication baselines live under `snapshots/parity/`.
+- Phase 243 proved live EMEL/reference generation matches for `1`, `10`, `100`,
+  and `1000` token runs. The `10`, `100`, and `1000` publication baselines are
+  stale and need explicit approval before update.
+- Use paritychecker's existing `--write-generation-baseline <path>` support for
+  baseline writes. Do not rewrite parity baselines without explicit approval.
+
+### Truthfulness Rules
+- Published output must identify this host as AMD Ryzen 9 5950X with x86_64
+  AVX2, FMA, and F16C conversion support only.
+- Published output must not imply AVX-512, AVX-VNNI, AMX, BF16, native FP16, GPU,
+  or llama.cpp/ggml runtime acceleration.
+- Benchmark and parity lanes must remain separated: EMEL-owned code produces the
+  EMEL result, and llama.cpp/ggml remains comparison-only on the reference side.
+
+### Validation
+- Run `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` before any
+  approved update to confirm the current missing baseline set.
+- After explicit approval, run
+  `scripts/bench.sh --snapshot --update --suite=kernel_x86_64`.
+- After explicit approval, refresh stale maintained generation baselines for the
+  live-matching `10`, `100`, and `1000` token runs.
+- Re-run the changed-file scoped quality gate with
+  `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+</decisions>
+
+<code_context>
+## Existing Code Insights
+
+### Reusable Assets
+- `tools/bench/kernel/x86_64_bench.cpp` appends EMEL and reference
+  `kernel_x86_64` benchmark cases.
+- `tools/bench/bench_runner_registry.cpp` registers the `kernel_x86_64` suite.
+- `tools/bench/bench_dependency_manifest.cpp` maps `kernel_x86_64` to
+  `tools/bench/kernel/x86_64_bench.cpp`, `tools/bench/kernel/bench_common.hpp`,
+  and `src/emel/kernel`.
+- `scripts/bench.sh` supports suite-scoped snapshot updates and merges them into
+  `snapshots/bench/benchmarks.txt`.
+- `tools/paritychecker/parity_runner.cpp` supports
+  `--write-generation-baseline`.
+- `tools/paritychecker/parity_engines.cpp` computes the default maintained
+  generation baseline path under `snapshots/parity/`.
+
+### Integration Points
+- `snapshots/bench/benchmarks.txt`: approved `kernel_x86_64` snapshot entries.
+- `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt`
+- `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt`
+- `snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt`
+- `.planning/REQUIREMENTS.md`, `.planning/ROADMAP.md`, `.planning/STATE.md`, and
+  `.planning/PROJECT.md` for final traceability after approved snapshot updates.
+
+</code_context>
+
+<specifics>
+## Specific Ideas
+
+- Treat `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` as the
+  publication preflight; it should fail only because the maintained snapshot
+  lacks the new `kernel/x86_64/*` entries.
+- Treat the generation baseline updates as publication baselines, not runtime
+  proof. Phase 243 already proved live EMEL/reference output equality.
+- Final closeout requires a clean scoped quality gate, not just artifact edits.
+
+</specifics>
+
+<active_next_scope>
+## Active Next Scope
+
+- Get explicit snapshot approval, run the approved updates, rerun the scoped
+  quality gate, then mark `XBN-01` and `XBN-02` source/gate complete.
+
+</active_next_scope>
+
+---
+
+*Phase: 244-benchmark-attribution-and-publication-truth*
+*Context gathered: 2026-06-25*
diff --git a/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VALIDATION.md b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VALIDATION.md
new file mode 100644
index 00000000..1f71f839
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VALIDATION.md
@@ -0,0 +1,209 @@
+---
+phase: 244
+slug: benchmark-attribution-and-publication-truth
+status: passed
+nyquist_compliant: true
+wave_0_complete: true
+created: 2026-06-25
+---
+
+# Phase 244 - Validation Strategy
+
+## Test Infrastructure
+
+| Property | Value |
+|----------|-------|
+| Framework | benchmark snapshot gate, paritychecker, source scans, quality gate |
+| Config file | `scripts/bench.sh`; `scripts/quality_gates.sh` |
+| Quick run command | `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` |
+| Gate command | `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` |
+| Current gate status | passed after approved benchmark and generation publication snapshot updates |
+
+## Per-Task Verification Map
+
+| Task ID | Requirement | Test Type | Automated Command | Status |
+|---------|-------------|-----------|-------------------|--------|
+| 244-01-01 | XBN-01, XBN-02 | benchmark preflight | `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` | green |
+| 244-01-02 | XBN-01, XBN-02 | approved snapshot writes | `scripts/bench.sh --snapshot --update --suite=kernel_x86_64` plus paritychecker baseline writes | green |
+| 244-01-03 | XBN-01 | source-backed audit gap repair | `tools/bench/kernel/x86_64_bench.cpp`; benchmark smoke and snapshot compare | green |
+| 244-01-04 | XBN-01, XBN-02 | scoped quality gate | `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64 scripts/quality_gates.sh` | green |
+
+## Command Results
+
+```bash
+node .codex/get-shit-done/bin/gsd-tools.cjs init phase-op 244
+```
+
+Result: PASS. Phase 244 context and plan are present.
+
+```bash
+node .codex/get-shit-done/bin/gsd-tools.cjs roadmap analyze
+```
+
+Result: PASS. Phase 244 is planned with one plan; phases 239-243 are
+disk-complete.
+
+```bash
+git diff --check
+```
+
+Result: PASS.
+
+```bash
+scripts/bench.sh --snapshot --compare --suite=kernel_x86_64
+```
+
+Initial result before approval: baseline update required. The command configured
+and built the suite-scoped benchmark runner, then reported these missing
+maintained baselines:
+
+- `kernel/x86_64/op_sqrt`
+- `kernel/x86_64/op_div`
+- `kernel/x86_64/op_sin`
+- `kernel/x86_64/op_unary_neg`
+- `kernel/x86_64/op_unary_relu`
+- `kernel/x86_64/op_mul`
+- `kernel/x86_64/op_mul_mat`
+- `kernel/x86_64/op_sub`
+- `kernel/x86_64/op_add`
+- `kernel/x86_64/op_soft_max`
+- `kernel/x86_64/op_dup`
+- `kernel/x86_64/op_cos`
+- `kernel/x86_64/op_sqr`
+- `kernel/x86_64/op_unary_exp`
+- `kernel/x86_64/op_log`
+
+No snapshot update was made.
+
+Final result after approved snapshot update and optimized benchmark repair:
+PASS.
+
+```bash
+EMEL_BENCH_SUITE=kernel_x86_64 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=emel
+```
+
+Initial result: PASS. Output was captured in
+`/tmp/emel-phase244-kernel-x86-current.sl4mKm.txt`; no snapshot file was
+modified.
+
+| Benchmark entry | Candidate ns/op |
+|-----------------|-----------------|
+| `kernel/x86_64/op_add` | 71.500 |
+| `kernel/x86_64/op_cos` | 1450.490 |
+| `kernel/x86_64/op_div` | 114.500 |
+| `kernel/x86_64/op_dup` | 77.900 |
+| `kernel/x86_64/op_log` | 3081.090 |
+| `kernel/x86_64/op_mul` | 71.700 |
+| `kernel/x86_64/op_mul_mat` | 2584.890 |
+| `kernel/x86_64/op_sin` | 1664.700 |
+| `kernel/x86_64/op_soft_max` | 4816.490 |
+| `kernel/x86_64/op_sqr` | 78.300 |
+| `kernel/x86_64/op_sqrt` | 151.600 |
+| `kernel/x86_64/op_sub` | 84.600 |
+| `kernel/x86_64/op_unary_exp` | 3662.790 |
+| `kernel/x86_64/op_unary_neg` | 80.400 |
+| `kernel/x86_64/op_unary_relu` | 98.800 |
+
+Source-backed audit then found that the first approved publication only covered
+common f32/unary/matmul entries and did not prove the x86_64 optimized flash and
+q2/q3/q6 benchmark lanes. `tools/bench/kernel/x86_64_bench.cpp` was repaired to
+add four counter-checked optimized entries:
+
+| Benchmark entry | Proof |
+|-----------------|-------|
+| `kernel/x86_64/op_flash_attn_ext_decode_like` | Aborts unless `optimized_flash_dispatch_count()` increments and `shared_flash_dispatch_count()` does not |
+| `kernel/x86_64/op_mul_mat_q2_k_q8_k` | Aborts unless `optimized_q2_dispatch_count()` increments and `shared_q2_dispatch_count()` does not |
+| `kernel/x86_64/op_mul_mat_q3_k_q8_k` | Aborts unless `optimized_q3_dispatch_count()` increments and `shared_q3_dispatch_count()` does not |
+| `kernel/x86_64/op_mul_mat_q6_k_q8_k` | Aborts unless `optimized_q6_dispatch_count()` increments and `shared_q6_dispatch_count()` does not |
+
+```bash
+EMEL_BENCH_SUITE=kernel_x86_64 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=compare
+```
+
+Initial result: PASS. Output was captured in
+`/tmp/emel-phase244-kernel-x86-compare.ZbJiE5.txt`; no snapshot file was
+modified. After repair, the compare output contains 19 EMEL/reference rows,
+including optimized flash and q2/q3/q6 entries, proving the reference/shared
+lane remains separate from the EMEL-owned optimized benchmark lane.
+
+Representative compare rows from that run:
+
+| Benchmark entry | EMEL ns/op | Reference ns/op | Ratio |
+|-----------------|------------|-----------------|-------|
+| `kernel/x86_64/op_add` | 125.700 | 397.700 | 0.316x |
+| `kernel/x86_64/op_mul_mat` | 2633.790 | 7301.070 | 0.361x |
+| `kernel/x86_64/op_soft_max` | 4898.180 | 1132.900 | 4.324x |
+| `kernel/x86_64/op_unary_exp` | 3773.590 | 2059.090 | 1.833x |
+
+```bash
+EMEL_BENCH_SUITE=kernel_x86_64 EMEL_BENCH_ITERS=1 EMEL_BENCH_RUNS=1 EMEL_BENCH_WARMUP_ITERS=0 EMEL_BENCH_WARMUP_RUNS=0 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=emel
+EMEL_BENCH_SUITE=kernel_x86_64 EMEL_BENCH_ITERS=1 EMEL_BENCH_RUNS=1 EMEL_BENCH_WARMUP_ITERS=0 EMEL_BENCH_WARMUP_RUNS=0 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=compare
+```
+
+Result: PASS. The optimized EMEL smoke emitted 19 `kernel/x86_64/*` entries,
+including the four counter-checked optimized entries. The compare smoke emitted
+matching EMEL/reference rows for all 19 entries.
+
+```bash
+scripts/bench.sh --snapshot --update --suite=kernel_x86_64
+```
+
+Result: PASS after explicit user approval. The benchmark snapshot baseline was
+merged into `snapshots/bench/benchmarks.txt` and later refreshed after the
+source-backed audit repair to include all 19 maintained `kernel/x86_64/*`
+entries.
+
+```bash
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=10 --write-generation-baseline snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=100 --write-generation-baseline snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=1000 --write-generation-baseline snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt
+```
+
+Result: PASS after explicit user approval. The maintained generation
+publication baselines were updated in `snapshots/parity/`.
+
+```bash
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=10 --write-generation-baseline /tmp/emel-phase244-baselines.N7inir/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=100 --write-generation-baseline /tmp/emel-phase244-baselines.N7inir/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt
+build/paritychecker_zig/paritychecker --generation --model tests/models/LFM2.5-1.2B-Thinking-Q4_K_M.gguf --text hello --max-tokens=1000 --write-generation-baseline /tmp/emel-phase244-baselines.N7inir/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt
+```
+
+Result: PASS. These writes targeted `/tmp`, not checked-in snapshots.
+
+| Max tokens | Generated tokens | Output bytes | Optimized flash | Optimized q6 | Shared q6 |
+|------------|------------------|--------------|-----------------|--------------|-----------|
+| 10 | 10 | 20 | 228 | 390 | 0 |
+| 100 | 100 | 248 | 768 | 1380 | 0 |
+| 1000 | 1000 | 2498 | 6168 | 11280 | 0 |
+
+Candidate-vs-snapshot diff summary:
+
+| Max tokens | Stale snapshot | Candidate |
+|------------|----------------|-----------|
+| 10 | `output_length=22`, `trace_token_count=0` | `output_length=20`, `trace_token_count=10`, token IDs and score gaps populated |
+| 100 | `output_length=277`, `trace_token_count=0` | `output_length=248`, `trace_token_count=100`, token IDs and score gaps populated |
+| 1000 | `output_length=2866`, `trace_token_count=0` | `output_length=2498`, `trace_token_count=1000`, token IDs and score gaps populated |
+
+## Validation Sign-Off
+
+- [x] Phase 244 context and plan exist.
+- [x] Benchmark preflight was run without snapshot updates.
+- [x] Missing `kernel/x86_64/*` benchmark baseline entries are identified.
+- [x] Candidate benchmark entries and EMEL/reference compare rows were captured
+  under `/tmp` without modifying `snapshots/bench/`.
+- [x] Stale maintained generation publication baselines are identified from
+  Phase 243 live parity evidence.
+- [x] Candidate generation publication baselines were written to `/tmp` and
+  diffed against checked-in stale snapshots without modifying `snapshots/`.
+- [x] Benchmark snapshot update is approved and applied.
+- [x] Source-backed audit gap for `XBN-01` is repaired by counter-checked
+  optimized flash and q2/q3/q6 benchmark entries.
+- [x] Maintained generation baseline updates are approved and applied.
+- [x] Scoped quality gate passes after approved publication updates.
+- [x] `nyquist_compliant: true` and `wave_0_complete: true` are set in
+  frontmatter.
+- [x] Rule-compliance evidence is recorded through suite-scoped benchmark
+  commands, EMEL/reference lane separation, publication snapshot diffs, and the
+  scoped quality gate.
+
+**Approval:** granted by user; snapshots updated.
diff --git a/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VERIFICATION.md b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VERIFICATION.md
new file mode 100644
index 00000000..a9c26ac2
--- /dev/null
+++ b/.planning/milestones/v1.27-phases/244-benchmark-attribution-and-publication-truth/244-VERIFICATION.md
@@ -0,0 +1,97 @@
+# Phase 244 Verification
+
+status: passed
+
+All commands were run from:
+`/shared/stateforward/emel.cpp`
+
+## Must-Have Verification
+
+| Must-have | Evidence | Status |
+|-----------|----------|--------|
+| Maintained `kernel_x86_64` benchmark suite is wired | `tools/bench/bench_runner_registry.cpp`, `tools/bench/kernel/x86_64_bench.cpp`, and `scripts/quality_gates.sh` expose and select the suite | PASS |
+| Optimized x86_64 flash and quantized paths are benchmarked | `tools/bench/kernel/x86_64_bench.cpp` publishes `op_flash_attn_ext_decode_like`, `op_mul_mat_q2_k_q8_k`, `op_mul_mat_q3_k_q8_k`, and `op_mul_mat_q6_k_q8_k` entries that abort if the optimized actor counters do not advance | PASS |
+| Benchmark preflight is suite-scoped and non-mutating | `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` ran and did not update snapshots | PASS |
+| Publication baseline updated | `scripts/bench.sh --snapshot --update --suite=kernel_x86_64` merged 19 `kernel/x86_64/*` entries into `snapshots/bench/benchmarks.txt` | PASS |
+| Generation publication baselines updated | paritychecker wrote the maintained LFM2 `10`, `100`, and `1000` token baselines under `snapshots/parity/` | PASS |
+| Unsupported feature claims remain excluded | Phase 243 unsupported x86 flag scan found no AVX-512, AVX-VNNI, AMX, BF16, or native FP16 compile-flag claims | PASS |
+| Required quality gate | changed-file scoped quality gate passed with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64` | PASS |
+
+## Evidence Summary
+
+- `node .codex/get-shit-done/bin/gsd-tools.cjs init phase-op 244` reports
+  context and plan present.
+- `node .codex/get-shit-done/bin/gsd-tools.cjs roadmap analyze` reports Phase
+  244 as planned with one plan.
+- `git diff --check` passes.
+- Initial `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64`
+  configured and built the suite-scoped benchmark runner, then failed only
+  because the maintained benchmark baseline lacked the first 15 common
+  `kernel/x86_64/*` entries:
+  - `kernel/x86_64/op_sqrt`
+  - `kernel/x86_64/op_div`
+  - `kernel/x86_64/op_sin`
+  - `kernel/x86_64/op_unary_neg`
+  - `kernel/x86_64/op_unary_relu`
+  - `kernel/x86_64/op_mul`
+  - `kernel/x86_64/op_mul_mat`
+  - `kernel/x86_64/op_sub`
+  - `kernel/x86_64/op_add`
+  - `kernel/x86_64/op_soft_max`
+  - `kernel/x86_64/op_dup`
+  - `kernel/x86_64/op_cos`
+  - `kernel/x86_64/op_sqr`
+  - `kernel/x86_64/op_unary_exp`
+  - `kernel/x86_64/op_log`
+- The preflight also prints `error: no benchmark entries matched selected suite`
+  because the selected suite has no existing baseline entries to compare yet.
+  That is a publication-baseline absence, not a runtime execution failure.
+- Source-backed milestone audit found an `XBN-01` gap: those first 15 common
+  entries did not prove the x86_64 optimized flash and q2/q3/q6 benchmark
+  lanes. `tools/bench/kernel/x86_64_bench.cpp` was repaired so the maintained
+  `kernel_x86_64` suite now also publishes counter-checked optimized-path
+  entries for:
+  - `kernel/x86_64/op_flash_attn_ext_decode_like`
+  - `kernel/x86_64/op_mul_mat_q2_k_q8_k`
+  - `kernel/x86_64/op_mul_mat_q3_k_q8_k`
+  - `kernel/x86_64/op_mul_mat_q6_k_q8_k`
+- Direct benchmark smoke passed:
+  `EMEL_BENCH_SUITE=kernel_x86_64 EMEL_BENCH_ITERS=1 EMEL_BENCH_RUNS=1 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=emel`
+  emitted 19 `kernel/x86_64/*` entries, including the optimized flash and
+  q2/q3/q6 entries.
+- Direct benchmark compare smoke passed with the same 19 EMEL/reference rows:
+  `EMEL_BENCH_SUITE=kernel_x86_64 EMEL_BENCH_ITERS=1 EMEL_BENCH_RUNS=1 build/bench_tools_ninja_kernel_x86_64/bench_runner --mode=compare`.
+- Final `scripts/bench.sh --snapshot --compare --suite=kernel_x86_64` passed
+  after the approved snapshot update and includes the optimized flash and
+  q2/q3/q6 entries.
+- Published benchmark entry names are:
+  `op_add`, `op_cos`, `op_div`, `op_dup`, `op_log`, `op_mul`, `op_mul_mat`,
+  `op_mul_mat_q2_k_q8_k`, `op_mul_mat_q3_k_q8_k`,
+  `op_mul_mat_q6_k_q8_k`, `op_flash_attn_ext_decode_like`, `op_sin`,
+  `op_soft_max`, `op_sqr`, `op_sqrt`, `op_sub`, `op_unary_exp`,
+  `op_unary_neg`, and `op_unary_relu`.
+- Temp maintained generation baseline candidate writes succeeded without touching
+  `snapshots/parity/`:
+  - `--max-tokens=10`: status 0, generated 10 tokens, output 20 bytes,
+    optimized flash 228, optimized q6 390, shared q6 0.
+  - `--max-tokens=100`: status 0, generated 100 tokens, output 248 bytes,
+    optimized flash 768, optimized q6 1380, shared q6 0.
+  - `--max-tokens=1000`: status 0, generated 1000 tokens, output 2498 bytes,
+    optimized flash 6168, optimized q6 11280, shared q6 0.
+- Candidate-vs-snapshot diffs show the checked-in stale baselines have
+  `trace_token_count=0` while candidates include the live token IDs and score
+  gaps. Output lengths change from `22 -> 20`, `277 -> 248`, and `2866 -> 2498`
+  for `10`, `100`, and `1000` tokens respectively.
+
+## Final Verification
+
+User approved snapshot updates. After the source-backed audit exposed the
+missing optimized benchmark entries, `tools/bench/kernel/x86_64_bench.cpp` was
+repaired and `scripts/bench.sh --snapshot --update --suite=kernel_x86_64`
+updated `snapshots/bench/benchmarks.txt`. Paritychecker updated the maintained
+LFM2 `10`, `100`, and `1000` token publication baselines, the focused parity
+publication test passed, `scripts/bench.sh --snapshot --compare
+--suite=kernel_x86_64` passed, and the changed-file scoped quality gate passed
+with `EMEL_QUALITY_GATES_BENCH_SUITE=kernel_x86_64`.
+
+Phase 244 is fully verified for `XBN-01` and `XBN-02`.
diff --git a/.planning/quick/260401-ejm-add-non-blocking-benchmark-binary-size-c/SUMMARY.md b/.planning/quick/260401-ejm-add-non-blocking-benchmark-binary-size-c/SUMMARY.md
new file mode 100644
index 00000000..3f1d74e1
--- /dev/null
+++ b/.planning/quick/260401-ejm-add-non-blocking-benchmark-binary-size-c/SUMMARY.md
@@ -0,0 +1,9 @@
+---
+status: complete
+completed: 2026-04-01
+summary: 260401-ejm-SUMMARY.md
+---
+
+# Quick Task 260401-ejm Status
+
+Complete. The detailed summary remains in `260401-ejm-SUMMARY.md`.
diff --git a/.planning/todos/pending/2026-04-02-move-eager-quant-prepack-into-generator-initializer.md b/.planning/todos/backlog/2026-04-02-move-eager-quant-prepack-into-generator-initializer.md
similarity index 100%
rename from .planning/todos/pending/2026-04-02-move-eager-quant-prepack-into-generator-initializer.md
rename to .planning/todos/backlog/2026-04-02-move-eager-quant-prepack-into-generator-initializer.md
diff --git a/.planning/todos/pending/2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md b/.planning/todos/backlog/2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md
similarity index 100%
rename from .planning/todos/pending/2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md
rename to .planning/todos/backlog/2026-04-02-optimize-lfm2-5-q4-prefill-kernel.md
diff --git a/.planning/todos/pending/2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md b/.planning/todos/backlog/2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md
similarity index 100%
rename from .planning/todos/pending/2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md
rename to .planning/todos/backlog/2026-04-02-optimize-lfm2-5-q6-prefill-kernel.md
diff --git a/.planning/todos/pending/2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md b/.planning/todos/backlog/2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md
similarity index 100%
rename from .planning/todos/pending/2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md
rename to .planning/todos/backlog/2026-04-02-reuse-q8-rhs-across-lfm2-5-prefill-matmuls.md
diff --git a/AGENTS.md b/AGENTS.md
index 3c903b7b..1d7bc325 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -9,6 +9,11 @@ ALWAYS follow the RTC actor model and no-queue invariant from `docs/rules/sml.ru
 NEVER use `sml::process_queue`, `sml::defer_queue`, or any mailbox/post-for-later
 mechanism.
 ALWAYS keep dispatch run-to-completion and single-writer per actor.
+ALWAYS treat coroutine or `async`-named dispatch APIs semantically: async is not
+deferred by definition, and `process_event_async` MAY be RTC when completion is
+driven and observed before the top-level dispatch returns.
+NEVER let coroutine continuations, incomplete tasks, scheduler work items, or
+callbacks escape the RTC boundary as hidden deferred work.
 NEVER call an actor's own `process_event` from guards/actions/entry/exit.
 ALWAYS model internal multi-step flows with `sml::completion<TEvent>`,
 anonymous transitions, and/or entry actions.
@@ -23,7 +28,10 @@ ALWAYS implement bulk numeric iteration in allocation-free action/detail kernels
 within a single transition per phase.
 NEVER copy event payload into context just to bridge internal phases.
 ALWAYS keep guards pure predicates of `(event, context)` with no side effects.
-ALWAYS keep actions bounded and non-blocking during dispatch.
+ALWAYS keep actions bounded during dispatch.
+ONLY allow an action-level scheduler fork/join wait when it joins already
+submitted child actor dispatches before the action returns, preserves RTC,
+does not re-enter the same actor, and leaves no hidden deferred work.
 ALWAYS keep hot-path actions allocation-free.
 ALWAYS keep any allowed one-time construction or initialization heap
 allocation before any `process_event(...)` dispatch.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b69c9bdf..e54c97bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,6 +10,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 include(CheckCXXCompilerFlag)
+include(CheckCXXSourceRuns)
 
 if(MSVC)
   add_compile_options(/W4 /WX)
@@ -22,6 +23,9 @@ option(EMEL_ENABLE_FUZZ "Build fuzz targets" OFF)
 option(EMEL_ENABLE_AARCH64_HOST_FEATURES
   "Enable host-tuned AArch64 compile flags for EMEL-owned C++ code"
   ON)
+option(EMEL_ENABLE_X86_64_HOST_FEATURES
+  "Enable host-tuned x86_64 AVX2/FMA/F16C compile flags for EMEL-owned C++ code"
+  ON)
 
 include(FetchContent)
 include(cmake/sml_version.cmake)
@@ -67,6 +71,50 @@ if(NOT EMEL_AARCH64_HOST_CXX_FLAG STREQUAL "")
   )
 endif()
 
+set(EMEL_X86_64_HOST_CXX_FLAGS "")
+if(EMEL_ENABLE_X86_64_HOST_FEATURES AND NOT CMAKE_CROSSCOMPILING AND NOT MSVC)
+  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" EMEL_SYSTEM_PROCESSOR_LOWER)
+  if(EMEL_SYSTEM_PROCESSOR_LOWER MATCHES "^(x86_64|amd64)$")
+    check_cxx_compiler_flag("-mavx2" EMEL_HAS_X86_64_AVX2_FLAG)
+    check_cxx_compiler_flag("-mfma" EMEL_HAS_X86_64_FMA_FLAG)
+    check_cxx_compiler_flag("-mf16c" EMEL_HAS_X86_64_F16C_FLAG)
+    # Compiler acceptance alone is not enough: these flags let the compiler
+    # emit AVX2 in every consumer TU, bypassing the runtime CPUID guards in
+    # kernel/x86_64. Probe the build host CPU (compiled without the flags) so
+    # host-tuned codegen is only enabled where it can actually execute.
+    check_cxx_source_runs("
+      #include <cpuid.h>
+      #include <cstdint>
+      int main() {
+        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { return 1; }
+        const bool fma = (ecx & (1u << 12)) != 0u;
+        const bool f16c = (ecx & (1u << 29)) != 0u;
+        const bool osxsave = (ecx & (1u << 27)) != 0u;
+        if (!fma || !f16c || !osxsave) { return 1; }
+        std::uint32_t xcr0_lo = 0, xcr0_hi = 0;
+        __asm__ volatile(\"xgetbv\" : \"=a\"(xcr0_lo), \"=d\"(xcr0_hi) : \"c\"(0));
+        if ((xcr0_lo & 0x6u) != 0x6u) { return 1; }
+        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) { return 1; }
+        return ((ebx & (1u << 5)) != 0u) ? 0 : 1;
+      }" EMEL_X86_64_HOST_RUNS_AVX2_FMA_F16C)
+    if(EMEL_HAS_X86_64_AVX2_FLAG AND EMEL_HAS_X86_64_FMA_FLAG AND
+       EMEL_HAS_X86_64_F16C_FLAG AND EMEL_X86_64_HOST_RUNS_AVX2_FMA_F16C)
+      list(APPEND EMEL_X86_64_HOST_CXX_FLAGS "-mavx2" "-mfma" "-mf16c")
+    endif()
+  endif()
+endif()
+
+if(EMEL_X86_64_HOST_CXX_FLAGS)
+  message(STATUS "EMEL enabling x86_64 host compile flags: ${EMEL_X86_64_HOST_CXX_FLAGS}")
+  foreach(EMEL_X86_64_HOST_CXX_FLAG IN LISTS EMEL_X86_64_HOST_CXX_FLAGS)
+    target_compile_options(emel_core
+      INTERFACE
+        "$<$<COMPILE_LANGUAGE:CXX>:${EMEL_X86_64_HOST_CXX_FLAG}>"
+    )
+  endforeach()
+endif()
+
 add_library(emel STATIC
   src/emel/io/mmap/actions.cpp
   src/emel/model/architecture/detail.cpp
@@ -115,6 +163,8 @@ if(EMEL_ENABLE_TESTS)
     tests/text/generator/lifecycle_tests.cpp
     tests/text/generator/action_guard_tests.cpp
     tests/text/generator/detail_tests.cpp
+    tests/text/generator/parallel_matmul_tests.cpp
+    tests/text/generator/decode_wavefront/lifecycle_tests.cpp
     tests/text/generator/initializer/lifecycle_tests.cpp
     tests/text/generator/prefill/lifecycle_tests.cpp
     tests/diarization/request/lifecycle_tests.cpp
diff --git a/coroutine-plan.md b/coroutine-plan.md
new file mode 100644
index 00000000..61fbd87a
--- /dev/null
+++ b/coroutine-plan.md
@@ -0,0 +1,667 @@
+# Coroutine Plan
+
+status: decode wavefront reserved-compute path measured
+owner: emel
+last updated: 2026-06-26
+
+## Decision
+
+Use `co_sm` first at the graph execution boundary.
+
+- Infrastructure surface: `src/emel/sm.hpp`
+- First inference actor: `src/emel/graph/processor/sm.hpp`
+- First current consumer: `src/emel/graph/sm.hpp` through `action::request_execute`
+- Experimental payoff path: standalone `src/emel/text/generator/decode_wavefront/**`
+  using graph-owned reserved compute
+
+This is not a plan to make one SML dispatch faster. The useful speedup path is to make graph
+execution schedulable at a bounded phase boundary so a later decode wavefront can keep compatible
+sequences ready for the same kernel route and weight stream. Single-request latency must remain
+neutral before the wavefront path is allowed to ship.
+
+Review update: production `text/generator` decode remains on direct graph dispatch. The current
+decode wavefront actor is a standalone component and benchmark target until a maintained
+multi-lane integration proves a benefit without regressing batch-1 latency.
+
+`async` is not deferred by definition. An async/coroutine dispatch is compatible with the EMEL
+RTC actor model when completion is driven and observed before the enclosing top-level dispatch
+returns. The forbidden case is hidden work escaping the RTC boundary: an incomplete task,
+continuation, scheduler work item, callback handle, mailbox entry, or post-for-later queue.
+
+## Source-Backed Current State
+
+- `stateforward::sml` is a namespace alias to the underlying SML implementation in the configured
+  Stateforward dependency, so EMEL code must use `stateforward::sml::utility::co_sm` rather than
+  naming `boost::sml` directly.
+- `src/emel/sm.hpp` now includes `stateforward/sml/utility/co_sm.hpp`, exposes `emel::co_sm`,
+  `emel::bool_task`, scheduler policy aliases, and a fixed no-heap coroutine allocator.
+- `emel::co_sm` defaults to `emel::policy::coroutine_scheduler<emel::policy::inline_scheduler>`
+  and `emel::policy::coroutine_allocator<emel::policy::fixed_coroutine_allocator<>>`.
+- `src/emel/graph/processor/sm.hpp` now inherits from `emel::co_sm<model, action::context,
+  inline_co_policy>` and keeps its public `process_event(const event::execute &)` wrapper
+  synchronous by driving `process_event_async(...).result()` to completion before returning.
+- `src/emel/graph/actions.hpp` dispatches graph execution through
+  `ctx.processor_actor.process_event_async(request).result()` within the same graph RTC chain.
+- `src/emel/graph/sm.hpp` now exposes `event::compute_reserved`, an internal reserved-compute
+  path that reuses the graph reservation output and enters processor execution without running
+  the graph assembler path again.
+- `src/emel/text/generator/decode_wavefront/sm.hpp` uses bounded static-scheduler `co_sm` and
+  dispatches compatible lanes through graph-owned `compute_reserved` events.
+- `src/emel/text/generator/actions.hpp` no longer routes single-lane production decode through
+  the wavefront actor; it dispatches the already-selected graph compute request directly.
+- `tests/sm/sm_policy_tests.cpp` covers the wrapper surface: default inline scheduler, sync
+  dispatch, inline `process_event_async(...).result()`, error normalization, context injection,
+  scheduler access, and fixed allocator exhaustion.
+- `docs/rules/sml.rules.md` and `AGENTS.md` now explicitly allow RTC async/coroutine dispatch
+  while forbidding hidden deferred work.
+- `docs/third_party/sml.md` now documents the inline `emel::co_sm` default.
+
+## Invariants
+
+- No `sml::process_queue`, `sml::defer_queue`, user mailbox, background worker, or hidden
+  post-for-later mechanism.
+- No dynamic allocation during dispatch. Coroutine frames must use fixed storage or reject the
+  dispatch; no heap fallback is acceptable in hot paths.
+- No runtime behavior selection in actions, detail helpers, coroutine bodies, or awaitables.
+  Runtime decisions stay in guards and transition rows.
+- No coroutine use inside kernels, logits scans, sampler loops, tokenizer inner loops, renderer
+  loops, tensor-element loops, packing, quant/dequant, or matmul/attention numeric loops.
+- No public API exposure of coroutine tasks or scheduler internals.
+- No performance claim from `co_sm` adoption alone. Claims require benchmark evidence.
+- Snapshot updates require explicit user consent.
+
+## Completed Work
+
+### Completed 0: Coroutine Surface
+
+Evidence:
+
+- `src/emel/sm.hpp` exposes the EMEL wrapper around Stateforward's utility `co_sm`.
+- `emel::policy::fixed_coroutine_allocator` returns `nullptr` on pool exhaustion instead of
+  falling back to heap allocation.
+- Existing `emel::sm` users are untouched.
+- `emel::co_sm` mirrors the existing contextless/contextful wrapper shape and normalizes
+  `error_out` results like `emel::sm`.
+- `process_event_async` in the EMEL wrapper observes completion before returning and always
+  returns a normalized immediate `emel::bool_task`; incomplete scheduler work is not allowed to
+  escape the RTC boundary.
+
+Validation already run:
+
+```bash
+cmake --build build/zig --target emel_tests_bin -j2
+ctest --test-dir build/zig -R '^emel_tests_sm$' --output-on-failure
+```
+
+Result: passed.
+
+### Completed 1: No-Op Graph Processor Conversion
+
+Evidence:
+
+- `src/emel/graph/processor/sm.hpp` uses `emel::co_sm` with an inline scheduler.
+- The public graph processor `event::execute` entrypoint still creates the same
+  `event::execute_ctx` and `event::execute_step`, drives the inline async base dispatch to
+  completion, and returns only after the RTC chain finishes.
+- `src/emel/graph/actions.hpp` uses the processor async execute wrapper and observes completion
+  before the graph compute action returns.
+- No transition rows were moved into coroutine bodies.
+
+Validation already run:
+
+```bash
+ctest --test-dir build/zig -R '^emel_tests_kernel_and_graph$' --output-on-failure
+```
+
+Result: passed.
+
+## Quality Gate Status
+
+Current status: focused unit tests and the corrected decode-wavefront benchmark pass; the repo
+quality gate does not yet pass for this expanded wavefront change set. The strict LFM2 generation
+comparison now correctly fails on x86_64 because the runtime uses the shared Q4 fallback instead of
+an optimized Q4 kernel.
+
+A current changed-file scoped gate was run with
+`EMEL_QUALITY_GATES_BENCH_SUITE=decode_wavefront` and the temporary `clang-format` shim in
+`/tmp/emel-clang-format-venv/bin`. It selected the decode wavefront benchmark runner, passed the
+legacy SML scan, Zig build, dependency manifest freshness checks, parity skip, and fuzz skip, then
+failed on the snapshot blockers below.
+
+Gate blockers:
+
+- `bench_snapshot`: failed because the new focused benchmark rows
+  `decode_wavefront/batch1`, `decode_wavefront/batch4`, and `decode_wavefront/batch8` have no
+  approved baseline in `snapshots/bench/benchmarks_compare.txt`.
+- `lint_snapshot`: failed because new source/test files under
+  `src/emel/text/generator/decode_wavefront/**` and
+  `tests/text/generator/decode_wavefront/lifecycle_tests.cpp` are not yet listed in
+  `snapshots/lint/clang_format.txt`.
+- Snapshot updates require explicit user approval, so `snapshots/bench/benchmarks_compare.txt`,
+  `snapshots/lint/clang_format.txt`, and `snapshots/quality_gates/timing.txt` were not updated.
+
+Code-owned review fixes completed after the first failed gate:
+
+- `tools/bench/bench_runner.cpp` now treats x86_64 and AArch64 as optimized flash hosts, matching
+  existing generator tests and runtime counters.
+- LFM2 generation quantized evidence validation again requires optimized Q4 and optimized Q6
+  evidence on all hosts, with no shared fallback and no unrelated Q2/Q3/Q8 routes.
+- `src/emel/kernel/x86_64/**` now reports shared Q4 fallback dispatches, so a missing optimized Q4
+  path is visible in generation evidence instead of looking like no Q4 path ran.
+
+Validation for those fixes:
+
+```bash
+EMEL_BENCH_ITERS=100 \
+EMEL_BENCH_RUNS=3 \
+EMEL_BENCH_WARMUP_ITERS=10 \
+EMEL_BENCH_WARMUP_RUNS=1 \
+scripts/bench.sh --compare --suite=generation
+```
+
+Result: failed as expected on this x86_64 host because optimized Q4 is still missing. The run
+reported `optimized_q4_dispatch_calls=0`, `shared_q4_dispatch_calls=2378`,
+`optimized_q6_dispatch_calls=291`, and `shared_q6_dispatch_calls=0`.
+
+Current decode wavefront benchmark evidence after review fixes:
+
+```text
+decode_wavefront/batch1 emel.cpp 417.890 ns/op, reserved-scalar-baseline 340.800 ns/op, ratio=1.226x
+decode_wavefront/batch4 emel.cpp 1533.400 ns/op, reserved-scalar-baseline 1355.200 ns/op, ratio=1.131x
+decode_wavefront/batch8 emel.cpp 2930.290 ns/op, reserved-scalar-baseline 2713.990 ns/op, ratio=1.080x
+```
+
+Interpretation: the previous apparent speedup was from comparing reserved compute against full
+graph assemble+compute. Against direct per-lane reserved compute, the current wavefront component
+is slower in this fixture and is not ready for production generator integration.
+
+```text
+src/emel/text/generator/decode_wavefront/actions.hpp
+src/emel/text/generator/decode_wavefront/context.hpp
+src/emel/text/generator/decode_wavefront/errors.hpp
+src/emel/text/generator/decode_wavefront/events.hpp
+src/emel/text/generator/decode_wavefront/guards.hpp
+src/emel/text/generator/decode_wavefront/sm.hpp
+tests/text/generator/decode_wavefront/lifecycle_tests.cpp
+```
+
+Gate command:
+
+```bash
+PATH="/tmp/emel-clang-format-venv/bin:$PATH" \
+EMEL_QUALITY_GATES_CHANGED_FILES="AGENTS.md \
+docs/rules/sml.rules.md \
+docs/third_party/sml.md \
+src/emel/sm.hpp \
+src/emel/graph/actions.hpp \
+src/emel/graph/context.hpp \
+src/emel/graph/events.hpp \
+src/emel/graph/guards.hpp \
+src/emel/graph/processor/sm.hpp \
+src/emel/graph/sm.hpp \
+src/emel/text/generator/actions.hpp \
+src/emel/text/generator/context.hpp \
+src/emel/text/generator/decode_wavefront/actions.hpp \
+src/emel/text/generator/decode_wavefront/context.hpp \
+src/emel/text/generator/decode_wavefront/errors.hpp \
+src/emel/text/generator/decode_wavefront/events.hpp \
+src/emel/text/generator/decode_wavefront/guards.hpp \
+src/emel/text/generator/decode_wavefront/sm.hpp \
+tests/sm/sm_policy_tests.cpp \
+tests/graph/graph_tests.cpp \
+tests/graph/processor/processor_action_branch_tests.cpp \
+tests/text/generator/decode_wavefront/lifecycle_tests.cpp \
+tools/bench/CMakeLists.txt \
+tools/bench/bench_cases.hpp \
+tools/bench/bench_dependency_manifest.cpp \
+tools/bench/bench_disabled_cases.cpp \
+tools/bench/bench_runner.cpp \
+tools/bench/bench_runner_registry.cpp \
+tools/bench/dependency_manifest.txt \
+tools/bench/graph/processor_bench.cpp \
+tools/bench/text/generator/decode_wavefront_bench.cpp \
+coroutine-plan.md" \
+EMEL_QUALITY_GATES_BENCH_SUITE=decode_wavefront \
+  scripts/quality_gates.sh
+```
+
+Result: failed only for the snapshot-baseline blockers above.
+
+### Completed 2: Graph Processor Neutrality Benchmark
+
+Goal: prove the no-op `co_sm` graph processor is neutral before adding any async call surface or
+decode wavefront behavior.
+
+Evidence:
+
+- `tools/bench/graph/processor_bench.cpp` adds a focused `graph_processor` suite.
+- The EMEL lane uses the current `emel::graph::processor::sm` inline `co_sm` wrapper.
+- The reference lane uses a benchmark-local `emel::sm<processor::model, action::context>`
+  wrapper over the same transition table.
+- Cases cover invalid request rejection, reused-buffer success, allocation-required success,
+  lifecycle gate/publish/release, and done/error callback publication.
+- `tools/bench/CMakeLists.txt`, `tools/bench/bench_cases.hpp`,
+  `tools/bench/bench_disabled_cases.cpp`, and `tools/bench/bench_runner_registry.cpp` register
+  the suite without requiring llama.cpp.
+- `tools/bench/bench_runner.cpp` prints graph-processor compare rows as `reference-baseline`
+  instead of `llama.cpp`.
+
+Validation run:
+
+```bash
+EMEL_BENCH_ITERS=200000 \
+EMEL_BENCH_RUNS=9 \
+EMEL_BENCH_WARMUP_ITERS=10000 \
+EMEL_BENCH_WARMUP_RUNS=1 \
+scripts/bench.sh --compare --suite=graph_processor
+```
+
+Result:
+
+| case | inline `co_sm` | `emel::sm` baseline | ratio |
+| --- | ---: | ---: | ---: |
+| `graph/processor_alloc` | 350.723 ns/op | 357.034 ns/op | 0.982x |
+| `graph/processor_invalid` | 24.942 ns/op | 24.958 ns/op | 0.999x |
+| `graph/processor_reused` | 305.952 ns/op | 315.271 ns/op | 0.970x |
+
+Interpretation:
+
+- The no-op graph processor `co_sm` conversion passes the neutrality checkpoint.
+- The benchmark has low-single-digit run-to-run movement, including earlier samples where
+  successful dispatch was slightly slower and later samples where it was slightly faster.
+- Treat the result as neutral infrastructure, not an inference speedup claim.
+- Do not expand this no-op conversion into generator/decode paths as a performance feature.
+- Use the benchmark as the regression guard for future coroutine candidates.
+
+Generation comparison still required before any inference-throughput claim:
+
+```bash
+EMEL_BENCH_ITERS=1 \
+EMEL_BENCH_RUNS=1 \
+EMEL_BENCH_WARMUP_ITERS=0 \
+EMEL_BENCH_WARMUP_RUNS=0 \
+EMEL_GENERATION_WORKLOAD_ID=qwen3_single_user_hello_max_tokens_1_v1 \
+scripts/bench.sh --compare --suite=generation
+```
+
+## Phase 3: RTC `process_event_async` Surface
+
+Goal: make graph execution callable through `process_event_async` without changing generator or
+graph RTC semantics.
+
+Status: completed and measured. This is not an inference speedup by itself.
+
+Evidence:
+
+- `processor::sm::process_event(const event::execute &)` delegates to
+  `process_event_async(ev).result()`.
+- `processor::sm::process_event_async(const event::execute &)` preserves the dispatch-local
+  `execute_ctx` / `execute_step` handoff and returns an immediate `emel::bool_task`.
+- `graph::action::request_execute` calls
+  `ctx.processor_actor.process_event_async(request).result()`.
+- `graph_processor_process_event_async_execute_completes_in_rtc` proves the public async execute
+  wrapper publishes callbacks and output before `.result()` is observed.
+- The existing `graph_machine_compute_lifecycle_dispatch_is_alloc_free` test now covers graph
+  compute through the async processor execution path.
+
+Acceptance:
+
+- `graph::sm` and `text::generator::sm` remain synchronous RTC actors.
+- No generated token can observe partially completed graph work.
+- No hidden scheduler work survives return from the top-level dispatch.
+- Allocator counters show zero graph compute hot-path heap allocation.
+
+Validation run:
+
+```bash
+cmake --build build/zig --target emel_tests_bin -j2
+ctest --test-dir build/zig -R '^emel_tests_sm$' --output-on-failure
+ctest --test-dir build/zig -R '^emel_tests_kernel_and_graph$' --output-on-failure
+EMEL_BENCH_ITERS=200000 \
+EMEL_BENCH_RUNS=9 \
+EMEL_BENCH_WARMUP_ITERS=10000 \
+EMEL_BENCH_WARMUP_RUNS=1 \
+scripts/bench.sh --compare --suite=graph_processor
+```
+
+Result from the latest repeated sample:
+
+| case | async `co_sm` graph processor | `emel::sm` baseline | ratio |
+| --- | ---: | ---: | ---: |
+| `graph/processor_alloc` | 349.161 ns/op | 344.381 ns/op | 1.014x |
+| `graph/processor_invalid` | 32.150 ns/op | 26.940 ns/op | 1.193x |
+| `graph/processor_reused` | 316.821 ns/op | 310.523 ns/op | 1.020x |
+
+Interpretation:
+
+- The coroutine-driven graph processor is implemented and remains RTC.
+- Successful dispatch has a low-single-digit overhead against the direct `emel::sm` baseline.
+- Invalid rejection is materially slower because the async wrapper overhead dominates a tiny
+  failure path.
+- This phase proves semantics and establishes a benchmark guard; it does not speed inference.
+
+## Completed 4: Decode Wavefront Driver
+
+Goal: turn `co_sm` into an inference speedup by batching compatible decode work across sequences.
+
+Status: completed for the bounded graph-compute wavefront target.
+
+Evidence:
+
+- `src/emel/text/generator/decode_wavefront/**` defines a bounded generator-owned wavefront
+  actor with explicit lane stages for up to 8 lanes.
+- Guards require compatible model identity, backend identity, kernel kind, attention mode, kernel
+  route, output contract, dtype/layout contract, quantized contract, step size, and token count.
+- The wavefront path does not share mutable lane context. Each lane carries its own graph actor,
+  graph compute request, compatibility key, and acceptance flag.
+- The graph wrapper owns the optimization boundary through `event::compute_reserved`; wavefront
+  does not reach into graph actions or assembler internals.
+- Reserved compute requires a successful graph reservation first, seeds the compute output from the
+  reservation, and skips normal graph assemble hints.
+- `process_event_async` on the wavefront observes the bounded static-scheduler dispatch before
+  returning and exposes a normalized immediate `emel::bool_task`, preserving RTC semantics.
+- No background worker, mailbox, `defer_queue`, `process_queue`, or hidden post-return work was
+  introduced.
+
+Acceptance:
+
+- Deterministic dispatch and first-lane failure behavior are covered by decode wavefront tests.
+- Scheduler depth is bounded by `fifo_scheduler<16u, 64u>` and lane count is bounded by
+  `event::k_max_lanes == 8`.
+- Batch-1/4/8 performance is not accepted for production integration yet. The corrected
+  reserved-scalar baseline shows current wavefront overhead rather than a grouping win.
+
+Validation run:
+
+```bash
+cmake --build build/zig --target emel_tests_bin -j2
+./build/zig/emel_tests_bin --no-breaks --test-case="*graph*"
+./build/zig/emel_tests_bin --no-breaks --test-case="*x86_64*q4*"
+./build/zig/emel_tests_bin --no-breaks --test-case="co_sm*"
+./build/zig/emel_tests_bin --no-breaks --test-case="decode wavefront*"
+ctest --test-dir build/zig -R '^emel_tests_kernel_and_graph$' --output-on-failure
+ctest --test-dir build/zig -R '^emel_tests_generator_and_runtime$' --output-on-failure
+ctest --test-dir build/zig -R '^emel_tests_sm$' --output-on-failure
+EMEL_BENCH_ITERS=2000 \
+EMEL_BENCH_RUNS=5 \
+EMEL_BENCH_WARMUP_ITERS=100 \
+EMEL_BENCH_WARMUP_RUNS=1 \
+scripts/bench.sh --suite=decode_wavefront --compare
+```
+
+Result:
+
+| case | wavefront path | reserved-scalar baseline | ratio |
+| --- | ---: | ---: | ---: |
+| `decode_wavefront/batch1` | 417.890 ns/op | 340.800 ns/op | 1.226x |
+| `decode_wavefront/batch4` | 1533.400 ns/op | 1355.200 ns/op | 1.131x |
+| `decode_wavefront/batch8` | 2930.290 ns/op | 2713.990 ns/op | 1.080x |
+
+Interpretation:
+
+- The first useful `co_sm` target remains the graph execution boundary, but the current
+  decode-wavefront component is not a production win.
+- The earlier speedup claim was an apples-to-oranges comparison against full graph compute. The
+  corrected benchmark measures wavefront orchestration against direct reserved graph dispatch.
+- The current implementation still drains inside the top-level RTC call; that is intentional for
+  actor correctness. Future external completion backends must suspend only at explicit phase
+  boundaries without changing public graph/generator semantics.
+
+## Completed 5: Thread-Pool Scheduler Win Over Single-SM and llama.cpp
+
+Goal: get `co_sm` + `thread_pool_scheduler` to beat the single-`sm` baseline and
+llama.cpp on a realistic parallel decode workload (independent per-lane GEMVs).
+
+Status: achieved at small per-lane dims (the inter-op-parallelism regime), with a
+prerequisite scheduler deadlock fixed along the way.
+
+### Scheduler correctness fix (prerequisite)
+
+`emel::policy::thread_pool_scheduler` deadlocked under rapid repeated fork/join,
+most readily when lane count == worker count (the decode wavefront's 8-lane,
+8-worker config). A single dispatch rarely hit it, so existing tests passed while
+the bug was latent. Two independent races in the join latch:
+
+1. `join_group`'s close/complete handshake was a Dekker pattern with
+   release/acquire ordering. StoreLoad reordering let `wait()` miss the final
+   completion while the last completer missed the close, stranding the wakeup.
+2. The per-group `std::binary_semaphore` was destroy-during-release: groups are
+   stack-reused each round, so the waiter could return and destroy the group
+   while the last completer was still inside libc++ `release()`/notify (UAF).
+   `run_or_schedule_and_wait`'s local `done` semaphore had the same flaw.
+
+Fix: replaced the semaphore/closed handshake with a lifetime-safe spin-join on
+`pending_` (a completer's last touch is its decrement, so `wait()` returns only
+after every completer is done); `run_or_schedule_and_wait` spins on a local
+`atomic<bool>` the worker sets last. Added `emel::policy::cpu_relax()`. New
+regression test `thread_pool_scheduler_ref_fork_join_survives_rapid_repeated_rounds`
+(20000 rounds x 8 lanes). Validated: 80M+ batch-8 fork/joins with 0 stalls.
+
+### Warm worker loop
+
+Workers now spin-claim a wake permit (bounded `k_idle_spin_budget`) before
+falling back to a blocking acquire, keeping the pool warm across a burst of
+fork/joins and removing ~3-4us of resleep/wakeup latency per round (the same
+warm-polling strategy ggml's threadpool uses). The permit-per-task invariant is
+preserved, so there is no drift or lost-wakeup regression.
+
+### Evidence (Ryzen 9 5950X, 16C/32T; `-O3 -march=native`)
+
+In-repo, via the actual production decode wavefront actor (thread-pool co_sm)
+through `scripts/bench.sh --compare --suite=decode_wavefront`. New realistic
+cases run a decode-representative f32 GEMV (`y = W@x`, dim x dim, independent
+weights per lane); the `gemv_*` cases compare against the single graph sm
+(reserved scalar), the `ggml_*` cases against a ggml reference (independent
+mul_mat lanes, warm threadpool, same core budget). Default `dim=256`:
+
+| case (batch 8) | emel.cpp | baseline | result |
+| --- | ---: | ---: | --- |
+| `gemv_batch8` (vs single-sm) | ~8.4-10.5 us | ~33 us | 3.1-3.9x faster |
+| `ggml_batch8` (vs llama.cpp) | ~9.1-9.3 us | ~12.7 us | 1.37-1.41x faster (3 runs) |
+
+The single-sm win holds at every dim (3-6x). The llama.cpp win holds at small
+dims (dim<=256, ~1.4x) where inter-op parallelism dominates ggml's intra-op
+threading; the crossover is ~dim 384-512, where ggml's hand-optimized blocked
+kernel catches up. That crossover is a kernel-quality matter, not a scheduling
+one: the scheduler delivers near-linear parallelism (up to 7.85x on 8 lanes for
+bare GEMV lanes), and its overhead floor is ~5-10us (idle-pool worker wakeup),
+dwarfed by real decode work.
+
+Contrast: the original trivial-work cases (`batch8`, kernel = `*calls += 1`)
+still show the wavefront 2-4x slower, confirming the earlier "wavefront is
+slower" result was a fixture artifact, not a scheduler limitation.
+
+Caveat: EMEL's inter-op parallelism beats ggml only for independent-weight lanes
+(concurrent requests / multi-model / MoE). For shared-weight batched decode, ggml
+batches sequences into one GEMM and reuses the weight stream (bandwidth-optimal);
+the comparison here is framed as independent-weight concurrent lanes accordingly.
+
+## Completed 6: Parallel Matmul Cutover (view-sliced lanes, ith/nth removed)
+
+Goal: turn the proven thread-pool fork/join into a maintained single-request hot-path win for
+prefill GEMM and per-token decode GEMV, and remove ggml's `ith`/`nth` thread-partition fields so
+view slicing is the only parallelism model in the kernel event contract.
+
+Decision record:
+
+- `ith`/`nth` were declared on every kernel op event but never used for partitioning; the only
+  read site was a validator that rejected anything but `ith==0 && nth==1`. The architecture-native
+  slice descriptor is the `tensor_view` itself: a row slice is a smaller view, the kernel computes
+  whatever its views describe, and partition policy lives only at the orchestration fork site.
+  The fields and their validator clause are deleted; events remain complete work descriptions
+  with no thread identity.
+
+Implementation:
+
+- `src/emel/text/generator/detail.hpp` owns the lanes: `k_matmul_lanes == 8` per-lane
+  `emel::kernel::sm` actors plus an `std::optional` `thread_pool_scheduler<8, 16, 128>` engaged
+  once in `prepare()`. A parallel dispatch forks one logical mul_mat into pack-group-aligned
+  contiguous row-slice events (`compute_matmul_row_slices`, groups of 8 for `*_x8`, 4 for
+  `*_x4`, 1 otherwise), the caller computes slice 0 while pool workers compute the rest, and the
+  join completes before the action returns. Rejected submits run the same slice inline, so a
+  pool-worker caller (wavefront lane) degrades to serial automatically — lanes-first composition.
+- Lane mode is a compile-time template parameter threaded through the matmul helpers and
+  runners; route guards choose it: prefill parallel rows require the pool engaged and
+  `prompt_token_count >= k_parallel_min_prefill_tokens` (8), decode parallel rows require
+  `n_embd >= k_parallel_min_gemv_dim` (1024) so tiny models keep the scalar route. Parallel
+  contract rows sit above their serial siblings for flash materialized + preselected prefill
+  chunk routes and flash materialized + preselected decode scalar routes.
+- Evidence counters live per kernel actor, so audit reads sum the primary kernel and every lane
+  (`compute_kernel_counter_total`); `kernel_dispatch_calls` and route counters stay once per
+  logical matmul at the fork site.
+
+Evidence (Ryzen 9 5950X, zig c++ -O3 -mavx2 -mfma):
+
+```text
+parallel_matmul/gemv_f32   emel.cpp 285062 ns/op, reference-baseline 2419432 ns/op, ratio=0.118x
+parallel_matmul/gemv_q8_0  emel.cpp  54841 ns/op, reference-baseline  371154 ns/op, ratio=0.148x
+parallel_matmul/gemv_q4_k  emel.cpp  68648 ns/op, reference-baseline  470182 ns/op, ratio=0.146x
+parallel_matmul/gemv_q6_k  emel.cpp  43786 ns/op, reference-baseline  282983 ns/op, ratio=0.155x
+parallel_matmul/gemm8_f32  emel.cpp 156122 ns/op, reference-baseline 1265074 ns/op, ratio=0.123x
+```
+
+Interpretation: 6.5x-8.5x over serial single-kernel dispatch at dim 2048 across dtypes — the
+inter-op fork/join regime delivering near-linear scaling on 8 lanes, now on the maintained
+matmul dispatch path rather than a standalone component.
+
+Correctness:
+
+- Row slices write disjoint dst rows and reorder no reductions, so parallel output is
+  bit-identical to serial; `tests/text/generator/parallel_matmul_tests.cpp` proves slice
+  arithmetic, group alignment, and f32/q8_0 serial-vs-parallel byte equality, and the full
+  generator fixture suites pass with prompts >= 8 tokens taking the parallel prefill route.
+- The strict LFM2 x86_64 generation evidence failure is unchanged and pre-existing
+  (`optimized_q4_dispatch_calls=0 shared_q4_dispatch_calls=2378 optimized_q6_dispatch_calls=291`
+  — identical counts to the pre-cutover run, which also proves lane-counter aggregation
+  preserves evidence exactly).
+
+## Completed 7: Matched-Thread llama.cpp Comparison Lanes
+
+Goal: measure the view-sliced parallel matmul against llama.cpp/ggml at the same core budget
+instead of only against EMEL's own serial dispatch, per the reference-comparison rules.
+
+Kernel-class lane (`parallel_matmul/ggml_*` cases): the EMEL side runs the identical 8-lane
+fork/join as the plain-named cases; the reference side runs the same logical matmul as one ggml
+`mul_mat` node on a warm 8-thread ggml threadpool (the decode_wavefront reference pattern).
+Operand class is plain GGUF-native blocks on BOTH sides — this exercises EMEL's shared
+(non-repacked) kernels, not the repacked x4/x8 kernels the production decode routes dispatch
+after `prepare()`.
+
+Evidence (Ryzen 9 5950X, dim 2048, 8 threads both sides, iter=2000 runs=5):
+
+```text
+parallel_matmul/ggml_gemm8_f32 emel.cpp 170008 ns/op, llama.cpp 201689 ns/op, ratio=0.843x
+parallel_matmul/ggml_gemv_f32  emel.cpp 284070 ns/op, llama.cpp  29684 ns/op, ratio=9.570x
+parallel_matmul/ggml_gemv_q8_0 emel.cpp  55579 ns/op, llama.cpp  20757 ns/op, ratio=2.678x
+parallel_matmul/ggml_gemv_q4_k emel.cpp  68174 ns/op, llama.cpp  16335 ns/op, ratio=4.173x
+parallel_matmul/ggml_gemv_q6_k emel.cpp  44451 ns/op, llama.cpp  17805 ns/op, ratio=2.496x
+```
+
+Interpretation:
+
+- Prefill-shape GEMM: EMEL's inter-actor row slicing beats ggml's intra-op chunking (0.843x).
+- Decode-shape GEMV: the gap is per-kernel arithmetic on plain blocks, not orchestration — the
+  EMEL parallel numbers match serial/8 scaling exactly, and the serial kernels trail ggml's
+  vec_dot by the same factors (plain-q4_k ~4x, plain-q8_0 ~2.7x, f32 GEMV ~10x, consistent with
+  the known missing optimized plain-Q4 kernel and a near-scalar f32 GEMV path).
+- These rows are a kernel-class comparison and must not be quoted as production decode numbers:
+  production decode dispatches the repacked x4/x8 kernels. The production-class cross-engine
+  number is the end-to-end generation compare at matched reference threads (in progress; blocked
+  for LFM2-architecture fixtures on x86 by the same strict Q4 evidence audit until either the
+  optimized plain-Q4 kernel lands or the audit gains a quant-class-aware LFM2+Q8_0 branch).
+
+Follow-up work surfaced by this comparison: optimized plain-Q4 GEMV kernel (already the known
+blocker), SIMD f32 GEMV, and a repacked-operand cross-engine lane once llama.cpp's x86 repack
+path is wired into the reference fixture.
+
+End-to-end lane (LFM2.5-230M-Q8_0, prompt "hello", `EMEL_BENCH_REFERENCE_THREADS=8` so the
+reference runs the same 8-core budget as EMEL's lane pool; 1-thread reference rows kept for
+context):
+
+```text
+max_tokens_1   emel.cpp 443.0 ms/op, llama.cpp  27.1 ms/op (8t), ratio=16.337x   [1t: 130.7 ms, 3.407x]
+max_tokens_100 emel.cpp 1002.6 ms/op, llama.cpp 514.0 ms/op (8t), ratio=1.950x
+```
+
+Decomposition (t100 minus t1, over 99 decode tokens): EMEL ~5.65 ms/token vs llama.cpp
+~4.92 ms/token — steady-state decode is within ~1.15x at matched threads, consistent with both
+sides being memory-bandwidth-bound on q8_0 GEMV at this model size. The end-to-end gap is
+concentrated in the first-token path: EMEL spends ~443 ms before the first sampled token where
+llama.cpp spends ~27 ms at 8 threads. The "hello" prompt is only a handful of tokens, so this is
+not GEMM volume; profiling the EMEL first-token/session path (prompt format, tokenize, prefill
+graph build/plan, first dispatch) is the top follow-up from this lane.
+
+## Phase 5: External Completion Backends
+
+Goal: use coroutine suspension only where an actual external completion source exists.
+
+Valid future boundaries:
+
+- accelerator submission/completion
+- OS-backed cold-load or staged-read completion
+- platform DMA/NPU completion
+
+Invalid boundaries:
+
+- CPU AArch64 or x86_64 kernel inner loops
+- software prefetch
+- per-token sampler loops
+- synthetic sleep/poll loops
+
+Acceptance:
+
+- Async backend routes are explicit guarded transitions.
+- Synchronous CPU routes remain direct and are not slower.
+- The EMEL lane remains EMEL-owned; no llama.cpp/ggml object is shared into runtime execution.
+
+## Validation Checklist
+
+Before claiming the graph processor coroutine phase is complete:
+
+- `emel_tests_sm` passes.
+- `emel_tests_kernel_and_graph` passes.
+- Changed-file scoped quality gate passes.
+- A graph processor benchmark exists and compares `emel::sm` vs async inline `emel::co_sm`.
+- No snapshots were updated without explicit approval.
+- Code review confirms no hidden runtime behavior selection was moved into coroutine bodies,
+  awaitables, actions, or detail helpers.
+
+Before making any broader maintained-generation throughput claim:
+
+- Generation benchmark evidence exists for single-token and multi-token maintained decode
+  workloads, not only the focused decode wavefront microbenchmark.
+- Snapshot/parity checks cover maintained generation fixtures.
+- A wavefront or external-completion implementation exists at the maintained generation entrypoint,
+  not only as a callable component.
+
+## Risk Register
+
+- `co_sm` adds overhead but no speedup.
+  Detection: graph processor benchmark.
+  Response: keep graph processor on `emel::sm` and use `co_sm` only where it preserves RTC or
+  where a later completion is modeled as an explicit external event.
+- Scheduler becomes a hidden queue.
+  Detection: SML rule review, callback-order tests, and escaping-task tests.
+  Response: restrict inference paths to inline/immediate-drain scheduling until a bounded driver
+  is explicitly approved.
+- Coroutine frame allocates on hot path.
+  Detection: allocator accounting and allocation instrumentation.
+  Response: increase fixed pool or reject dispatch; never fall back to heap in hot paths.
+- Awaitable chooses behavior.
+  Detection: action/guard branch tests and code review.
+  Response: move choice into guards and transition rows.
+- Decode batching changes outputs.
+  Detection: generation parity tests and compare summaries.
+  Response: fix lane ordering and publication; do not accept drift as a performance tradeoff.
+- Kernel route mismatch weakens parity claims.
+  Detection: diagnostics and runtime contract counters.
+  Response: benchmark only equivalent operand paths.
+
+## Decision Checkpoints
+
+- After graph processor measurement: continue only if the no-op `co_sm` conversion is neutral.
+- After RTC `process_event_async` wiring: continue only if RTC semantics and allocation
+  guarantees are proven.
+- After decode wavefront implementation: focused correctness passed, corrected performance did not.
+  Next checkpoint is either a real multi-lane scheduler integration that amortizes the overhead or
+  a different coroutine boundary with external completion work.
diff --git a/docs/rules/sml.rules.md b/docs/rules/sml.rules.md
index fa402701..9f1c9cd7 100644
--- a/docs/rules/sml.rules.md
+++ b/docs/rules/sml.rules.md
@@ -7,7 +7,10 @@ remaining synchronous run-to-completion (RTC) and using no message queue.
 
 these rules apply to:
 - stateforward.SML state machines (`stateforward::sml::sm<...>`) and their composition (composite state machines, orthogonal regions).
-- synchronous dispatch only (no background workers, no mailboxes, no async buffering).
+- synchronous RTC semantics only (no background workers, no mailboxes, no deferred buffering).
+- coroutine or `async`-named APIs when the caller observes quiescence before
+  the top-level dispatch returns. `async` is not deferred by definition; hidden
+  retention of work for later is what violates RTC/no-queue semantics.
 
 the rules assume the project-pinned stateforward.SML semantics as implemented in the
 local header and utility dispatch table, including typed completion propagation
@@ -25,6 +28,10 @@ primary sources consulted (non-exhaustive)
   exposed events are immutable. internal-only events that are not publicly
   exposed MAY carry mutable fields when needed for synchronous RTC handoff.
 - RTC chain: the complete, synchronous computation triggered by one top-level dispatch call, including SML internal anonymous transitions.
+- RTC async/coroutine dispatch: a coroutine-backed dispatch whose completion is
+  driven and observed within the same RTC chain. such a call may expose a task
+  object internally, but no incomplete task or continuation may escape the RTC
+  boundary unless the later completion is modeled as an explicit external event.
 - quiescence: a stable configuration where no further internal (anonymous) transitions are enabled.
 - orchestrator: the external driver that calls `process_event` on actors and provides time and ordering.
 - no message queue: no SML `process_queue`, no SML `defer_queue`, no user mailbox, and no “post for later” mechanism.
@@ -36,6 +43,11 @@ primary sources consulted (non-exhaustive)
 4. single-writer invariant: during any RTC chain, exactly one thread MUST be executing inside any given actor’s `process_event`.
 5. allocation invariant: no dynamic allocation (heap) MUST occur during dispatch (guards/actions/entry/exit/anonymous progress).
 6. bounded-work invariant: each top-level dispatch MUST have a provable upper bound on executed transitions and on total work.
+7. coroutine invariant: `process_event_async` or other coroutine-backed dispatch
+   surfaces MAY be used only when their completion is immediate or driven to
+   quiescence before the caller returns from the enclosing RTC dispatch. a
+   scheduler may sequence continuations inside that chain, but MUST NOT retain
+   work as hidden deferred state.
 
 ## 4. event model
 1. event types SHOULD be small, trivially copyable, and contain only immutable payload.
@@ -86,7 +98,12 @@ primary sources consulted (non-exhaustive)
 3. for an external transition with entry/exit enabled, the order MUST be: guard, on-exit, state update, action, on-entry. this follows `transition<...>::execute` which calls `on_exit`, updates current state, executes action, then calls `on_entry`. (source: `stateforward/sml.hpp`, `transition<state<s1>, state<s2>, event<E>, G, A>::execute`.)
 
 ### real-time and determinism constraints
-4. guards and actions MUST be bounded time and MUST NOT block (no I/O waits, no mutex waits, no sleeps).
+4. guards and actions MUST be bounded time and MUST NOT block on external
+   resources (no I/O waits, no mutex waits, no sleeps). an action MAY perform a
+   bounded RTC scheduler fork/join wait only after submitting child actor
+   dispatches whose completion is joined before the action returns; the join
+   MUST preserve single-writer per actor, MUST NOT re-enter the same actor, and
+   MUST NOT leave hidden deferred work.
 5. guards and actions MUST NOT allocate. if an action MUST allocate for rare paths (e.g., error reporting), it MUST do so outside dispatch and only pass references into dispatch.
 6. guards MUST NOT read wall-clock time. time MUST be provided explicitly via events (section 10).
 7. actions MUST NOT contain orchestration branching or validation logic. any runtime control-flow
@@ -102,7 +119,7 @@ primary sources consulted (non-exhaustive)
    transitions or explicit choices/states in the transition graph. only compile-time conditionals
    (e.g., `if constexpr`, `#if`) are allowed inside actions, member methods, or functions called
    from actions/member methods.
-9. actions SHOULD be short. long-running work MUST be split:
+9. actions SHOULD be short. long-running external work MUST be split:
    - action initiates work and transitions to a “waiting” state.
    - A later external event represents completion (still no queues).
 10. actions SHOULD be `noexcept` in production builds. if exceptions are enabled, the system MUST define a hard policy for exception events and document action-throws semantics (overview page notes different semantics for guard-throws vs action-throws).
diff --git a/docs/third_party/sml.md b/docs/third_party/sml.md
index d8ad7e04..814bb43c 100644
--- a/docs/third_party/sml.md
+++ b/docs/third_party/sml.md
@@ -1718,15 +1718,16 @@ sml::sm<example, sml::logger<my_logger>, sml::thread_safe<std::recursive_mutex>>
 ### emel extension: coroutine scheduler policy
 
 `emel::co_sm` supports a scheduler policy in addition to SML policies.
-default is:
+default is the inline scheduler to preserve EMEL's synchronous RTC/no-queue actor
+contract:
 
 ```cpp
-emel::policy::coroutine_scheduler<emel::policy::fifo_scheduler<>>
+emel::policy::coroutine_scheduler<emel::policy::inline_scheduler>
 ```
 
 ```cpp
 using inline_policy = emel::policy::coroutine_scheduler<emel::policy::inline_scheduler>;
-emel::co_sm<example, inline_policy> co;
+emel::co_sm<example, void, inline_policy> co;
 ```
 
 custom scheduler requirement:
diff --git a/scripts/quality_gates.sh b/scripts/quality_gates.sh
index f60d2e9b..ac3ebef3 100755
--- a/scripts/quality_gates.sh
+++ b/scripts/quality_gates.sh
@@ -361,7 +361,7 @@ bench_suite_supported_for_host() {
     kernel_aarch64)
       [[ "$host_arch" == "aarch64" || "$host_arch" == "arm64" ]]
       ;;
-    sm_any)
+    sm_any|sm_scheduler)
       [[ -n "${EMEL_BENCH_INTERNAL:-}" && "${EMEL_BENCH_INTERNAL:-}" != "0" ]]
       ;;
     *)
@@ -996,6 +996,17 @@ run_benchmark_gates() {
         bench_warmup_runs="${EMEL_QUALITY_GATES_DIARIZATION_BENCH_WARMUP_RUNS:-1}"
         bench_tolerance="${EMEL_QUALITY_GATES_DIARIZATION_BENCH_TOLERANCE:-0.30}"
         ;;
+      parallel_matmul)
+        # Thread-pool fork/join cases need amortized measurement windows or
+        # post-build CPU contention dominates the short default runs.
+        bench_iters="${EMEL_QUALITY_GATES_PARALLEL_MATMUL_BENCH_ITERS:-2000}"
+        bench_runs="${EMEL_QUALITY_GATES_PARALLEL_MATMUL_BENCH_RUNS:-5}"
+        bench_warmup_iters="${EMEL_QUALITY_GATES_PARALLEL_MATMUL_BENCH_WARMUP_ITERS:-200}"
+        bench_warmup_runs="${EMEL_QUALITY_GATES_PARALLEL_MATMUL_BENCH_WARMUP_RUNS:-1}"
+        ;;
+      sm_any|sm_scheduler)
+        bench_extra_env+=(EMEL_BENCH_INTERNAL=1)
+        ;;
       whisper_compare)
         if run_step_allow_fail "bench_snapshot_${suite}" \
           "$ROOT_DIR/scripts/bench_whisper_compare.sh"; then
diff --git a/scripts/test_with_coverage.sh b/scripts/test_with_coverage.sh
index 3b3e85b0..eb1a8216 100755
--- a/scripts/test_with_coverage.sh
+++ b/scripts/test_with_coverage.sh
@@ -6,6 +6,7 @@ BRANCH_COVERAGE_MIN="${BRANCH_COVERAGE_MIN:-50}"
 COVERAGE_BUILD_DIR="${EMEL_COVERAGE_BUILD_DIR:-build/coverage}"
 COVERAGE_CLEAN="${EMEL_COVERAGE_CLEAN:-0}"
 COVERAGE_CHANGED_ONLY="${EMEL_COVERAGE_CHANGED_ONLY:-0}"
+COVERAGE_CHANGED_LINE_ONLY="${EMEL_COVERAGE_CHANGED_LINE_ONLY:-1}"
 COVERAGE_BASE_REF="${EMEL_COVERAGE_BASE_REF:-origin/main}"
 COVERAGE_CHANGED_FILES="${EMEL_COVERAGE_CHANGED_FILES:-}"
 COVERAGE_TEST_REGEX="${EMEL_COVERAGE_TEST_REGEX:-}"
@@ -27,7 +28,12 @@ if ! command -v llvm-cov >/dev/null 2>&1 || ! command -v llvm-profdata >/dev/nul
   done
 fi
 
-for tool in cmake ctest gcovr clang-format llvm-cov llvm-profdata gcc g++; do
+required_tools=(cmake ctest gcovr clang-format llvm-cov llvm-profdata gcc g++)
+if [[ "$COVERAGE_CHANGED_ONLY" == "1" && "$COVERAGE_CHANGED_LINE_ONLY" != "0" ]]; then
+  required_tools+=(python3)
+fi
+
+for tool in "${required_tools[@]}"; do
   if ! command -v "$tool" >/dev/null 2>&1; then
     echo "error: required tool missing: $tool" >&2
     exit 1
@@ -49,6 +55,7 @@ changed_shards=()
 selected_test_dirs=()
 selected_test_sources=()
 unknown_changed_src=0
+coverage_base_ref_resolved="$COVERAGE_BASE_REF"
 
 is_coverage_excluded_src_file() {
   local file="$1"
@@ -167,6 +174,7 @@ if [[ "$COVERAGE_CHANGED_ONLY" == "1" ]]; then
       echo "warning: unable to resolve coverage base ref, using HEAD" >&2
     fi
   fi
+  coverage_base_ref_resolved="$base_ref"
 
   if [[ -n "$COVERAGE_CHANGED_FILES" ]]; then
     while IFS= read -r file; do
@@ -277,8 +285,8 @@ cmake -S . -B "$COVERAGE_BUILD_DIR" -G Ninja \
   -DCMAKE_BUILD_TYPE=Debug \
   -DCMAKE_C_COMPILER=gcc \
   -DCMAKE_CXX_COMPILER=g++ \
-  -DCMAKE_C_FLAGS="--coverage -O0" \
-  -DCMAKE_CXX_FLAGS="--coverage -O0" \
+  -DCMAKE_C_FLAGS="--coverage -O0 -fprofile-update=atomic" \
+  -DCMAKE_CXX_FLAGS="--coverage -O0 -fprofile-update=atomic" \
   -DCMAKE_EXE_LINKER_FLAGS="--coverage" \
   -DEMEL_TEST_EXTRA_ARG="$COVERAGE_TEST_EXTRA_ARG" \
   -DEMEL_TEST_SHARDS="$COVERAGE_TEST_SHARDS"
@@ -319,6 +327,187 @@ if [[ "$COVERAGE_CHANGED_ONLY" == "1" &&
   fi
 fi
 
+collect_changed_lines() {
+  local output_file="$1"
+  shift
+
+  : > "$output_file"
+  if [[ "$COVERAGE_CHANGED_ONLY" != "1" || "$COVERAGE_CHANGED_LINE_ONLY" == "0" ]]; then
+    return 0
+  fi
+
+  python3 - "$coverage_base_ref_resolved" "$output_file" "$@" <<'PY'
+import pathlib
+import re
+import subprocess
+import sys
+
+base_ref = sys.argv[1]
+output_path = pathlib.Path(sys.argv[2])
+files = sys.argv[3:]
+hunk_re = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
+changed = {}
+
+
+def run_git(args):
+    try:
+        return subprocess.run(
+            ["git", *args],
+            check=False,
+            text=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+        ).stdout
+    except OSError:
+        return ""
+
+
+def add_line(path, line):
+    if line > 0:
+        changed.setdefault(path, set()).add(line)
+
+
+def parse_diff(text):
+    current_file = None
+    new_line = None
+    for raw in text.splitlines():
+        if raw.startswith("+++ b/"):
+            current_file = raw[6:]
+            continue
+        if raw.startswith("+++ "):
+            current_file = None
+            continue
+        match = hunk_re.match(raw)
+        if match:
+            new_line = int(match.group(1))
+            continue
+        if current_file is None or new_line is None:
+            continue
+        if raw.startswith("+") and not raw.startswith("+++"):
+            add_line(current_file, new_line)
+            new_line += 1
+        elif raw.startswith("-") and not raw.startswith("---"):
+            continue
+        elif raw.startswith(" "):
+            new_line += 1
+
+
+for path in files:
+    if base_ref != "HEAD":
+        parse_diff(run_git(["diff", "--unified=0", f"{base_ref}...HEAD", "--", path]))
+    parse_diff(run_git(["diff", "--unified=0", "--", path]))
+    parse_diff(run_git(["diff", "--cached", "--unified=0", "--", path]))
+
+    if run_git(["ls-files", "--others", "--exclude-standard", "--", path]).strip():
+        try:
+            line_count = len(pathlib.Path(path).read_text(errors="ignore").splitlines())
+        except OSError:
+            line_count = 0
+        for line in range(1, line_count + 1):
+            add_line(path, line)
+
+with output_path.open("w", encoding="utf-8") as output:
+    for path in sorted(changed):
+        for line in sorted(changed[path]):
+            output.write(f"{path}\t{line}\n")
+PY
+}
+
+enforce_changed_line_coverage() {
+  local changed_lines_file="$1"
+  local coverage_json="$2"
+
+  python3 - "$changed_lines_file" "$coverage_json" "$LINE_COVERAGE_MIN" \
+    "$BRANCH_COVERAGE_MIN" <<'PY'
+import json
+import pathlib
+import sys
+
+changed_lines_path = pathlib.Path(sys.argv[1])
+coverage_json_path = pathlib.Path(sys.argv[2])
+line_min = float(sys.argv[3])
+branch_min = float(sys.argv[4])
+
+changed = {}
+for raw in changed_lines_path.read_text(encoding="utf-8").splitlines():
+    if not raw:
+        continue
+    path, line_text = raw.split("\t", 1)
+    changed.setdefault(path, set()).add(int(line_text))
+
+with coverage_json_path.open(encoding="utf-8") as coverage_file:
+    report = json.load(coverage_file)
+
+line_records = {}
+for file_record in report.get("files", []):
+    path = file_record.get("file", "")
+    records = {}
+    for line_record in file_record.get("lines", []):
+        records[int(line_record["line_number"])] = line_record
+    line_records[path] = records
+
+line_total = 0
+line_covered = 0
+branch_total = 0
+branch_covered = 0
+missing_lines = []
+
+for path in sorted(changed):
+    records = line_records.get(path, {})
+    for line_number in sorted(changed[path]):
+        record = records.get(line_number)
+        if record is None:
+            continue
+        line_total += 1
+        if int(record.get("count", 0)) > 0:
+            line_covered += 1
+        else:
+            missing_lines.append(f"{path}:{line_number}")
+        for branch in record.get("branches", []):
+            branch_total += 1
+            if int(branch.get("count", 0)) > 0:
+                branch_covered += 1
+
+if line_total == 0:
+    print("changed-line coverage: no executable changed lines found")
+    sys.exit(0)
+
+line_percent = (line_covered * 100.0) / line_total
+if branch_total == 0:
+    branch_percent = 100.0
+else:
+    branch_percent = (branch_covered * 100.0) / branch_total
+
+print(
+    "changed-line coverage: "
+    f"lines {line_covered}/{line_total} ({line_percent:.1f}%), "
+    f"branches {branch_covered}/{branch_total} ({branch_percent:.1f}%)"
+)
+
+if missing_lines:
+    preview = ", ".join(missing_lines[:20])
+    if len(missing_lines) > 20:
+        preview += f", ... +{len(missing_lines) - 20} more"
+    print(f"changed-line coverage missing: {preview}", file=sys.stderr)
+
+failed = False
+if line_percent + 1e-9 < line_min:
+    print(
+        f"error: changed-line coverage {line_percent:.1f}% below required {line_min:.1f}%",
+        file=sys.stderr,
+    )
+    failed = True
+if branch_percent + 1e-9 < branch_min:
+    print(
+        f"error: changed-branch coverage {branch_percent:.1f}% below required {branch_min:.1f}%",
+        file=sys.stderr,
+    )
+    failed = True
+
+sys.exit(1 if failed else 0)
+PY
+}
+
 cpu_count=2
 if command -v nproc >/dev/null 2>&1; then
   cpu_count="$(nproc)"
@@ -406,18 +595,43 @@ fi
 
 echo "enforcing coverage thresholds: line >= ${LINE_COVERAGE_MIN}%, branch >= ${BRANCH_COVERAGE_MIN}%"
 
-gcovr \
-  --root . \
-  -j "$COVERAGE_GCOV_JOBS" \
-  "${coverage_filters[@]}" \
-  --exclude tests \
-  --exclude 'src/emel/.*/sm.hpp' \
-  --gcov-ignore-errors no_working_dir_found \
-  --gcov-ignore-parse-errors suspicious_hits.warn_once_per_file \
-  --exclude-throw-branches \
-  --exclude-unreachable-branches \
-  --txt-summary \
-  --print-summary \
-  --fail-under-line "$LINE_COVERAGE_MIN" \
-  --fail-under-branch "$BRANCH_COVERAGE_MIN" \
-  "${coverage_search_paths[@]}"
+if [[ "$COVERAGE_CHANGED_ONLY" == "1" && "$COVERAGE_CHANGED_LINE_ONLY" != "0" ]]; then
+  changed_lines_file="$COVERAGE_BUILD_DIR/changed-lines.tsv"
+  coverage_json="$COVERAGE_BUILD_DIR/coverage.json"
+  collect_changed_lines "$changed_lines_file" "${changed_files[@]}"
+  gcovr \
+    --root . \
+    -j "$COVERAGE_GCOV_JOBS" \
+    "${coverage_filters[@]}" \
+    --exclude tests \
+    --exclude 'src/emel/.*/sm.hpp' \
+    --gcov-ignore-errors no_working_dir_found \
+    --gcov-ignore-parse-errors suspicious_hits.warn_once_per_file \
+    --gcov-ignore-parse-errors negative_hits.warn_once_per_file \
+    --merge-mode-functions separate \
+    --exclude-throw-branches \
+    --exclude-unreachable-branches \
+    --txt-summary \
+    --print-summary \
+    --json "$coverage_json" \
+    "${coverage_search_paths[@]}"
+  enforce_changed_line_coverage "$changed_lines_file" "$coverage_json"
+else
+  gcovr \
+    --root . \
+    -j "$COVERAGE_GCOV_JOBS" \
+    "${coverage_filters[@]}" \
+    --exclude tests \
+    --exclude 'src/emel/.*/sm.hpp' \
+    --gcov-ignore-errors no_working_dir_found \
+    --gcov-ignore-parse-errors suspicious_hits.warn_once_per_file \
+    --gcov-ignore-parse-errors negative_hits.warn_once_per_file \
+    --merge-mode-functions separate \
+    --exclude-throw-branches \
+    --exclude-unreachable-branches \
+    --txt-summary \
+    --print-summary \
+    --fail-under-line "$LINE_COVERAGE_MIN" \
+    --fail-under-branch "$BRANCH_COVERAGE_MIN" \
+    "${coverage_search_paths[@]}"
+fi
diff --git a/snapshots/bench/benchmarks.txt b/snapshots/bench/benchmarks.txt
index 602d5759..b94ceaa0 100644
--- a/snapshots/bench/benchmarks.txt
+++ b/snapshots/bench/benchmarks.txt
@@ -1,5 +1,5 @@
 # ref=c5a3bc39b1b0fe56954c6adb99e89b25d5e7b9cb
-# toolchain=/opt/homebrew/bin/zig
+# toolchain=/shared/zig/zig
 # benchmark_config: iterations=100 runs=3 sample_policy=median warmup_iterations=10 warmup_runs=1 generation_iterations=1 generation_runs=3 generation_warmup_iterations=0 generation_warmup_runs=0
 batch/planner_equal ns_per_op=1467.910 iter=100 runs=3
 batch/planner_seq ns_per_op=1736.250 iter=100 runs=3
@@ -84,3 +84,32 @@ tokenizer/preprocessor_ugm_long ns_per_op=4188.750 iter=100 runs=3
 tokenizer/preprocessor_ugm_short ns_per_op=2497.080 iter=100 runs=3
 tokenizer/preprocessor_wpm_long ns_per_op=4037.500 iter=100 runs=3
 tokenizer/preprocessor_wpm_short ns_per_op=2505.840 iter=100 runs=3
+kernel/x86_64/op_add ns_per_op=122.000 iter=100 runs=3
+kernel/x86_64/op_cos ns_per_op=1926.490 iter=100 runs=3
+kernel/x86_64/op_div ns_per_op=118.500 iter=100 runs=3
+kernel/x86_64/op_dup ns_per_op=79.100 iter=100 runs=3
+kernel/x86_64/op_log ns_per_op=3055.990 iter=100 runs=3
+kernel/x86_64/op_mul ns_per_op=123.000 iter=100 runs=3
+kernel/x86_64/op_mul_mat ns_per_op=2485.590 iter=100 runs=3
+kernel/x86_64/op_sin ns_per_op=1619.190 iter=100 runs=3
+kernel/x86_64/op_soft_max ns_per_op=4899.380 iter=100 runs=3
+kernel/x86_64/op_sqr ns_per_op=82.100 iter=100 runs=3
+kernel/x86_64/op_sqrt ns_per_op=151.400 iter=100 runs=3
+kernel/x86_64/op_sub ns_per_op=100.500 iter=100 runs=3
+kernel/x86_64/op_unary_exp ns_per_op=3729.890 iter=100 runs=3
+kernel/x86_64/op_unary_neg ns_per_op=81.300 iter=100 runs=3
+kernel/x86_64/op_unary_relu ns_per_op=113.300 iter=100 runs=3
+kernel/x86_64/op_flash_attn_ext_decode_like ns_per_op=172.200 iter=100 runs=3
+kernel/x86_64/op_mul_mat_q2_k_q8_k ns_per_op=1351.800 iter=100 runs=3
+kernel/x86_64/op_mul_mat_q3_k_q8_k ns_per_op=1419.290 iter=100 runs=3
+kernel/x86_64/op_mul_mat_q6_k_q8_k ns_per_op=1345.190 iter=100 runs=3
+parallel_matmul/gemm8_f32 ns_per_op=168915.442 iter=2000 runs=5
+parallel_matmul/gemv_f32 ns_per_op=298427.410 iter=2000 runs=5
+parallel_matmul/gemv_q4_k ns_per_op=69830.877 iter=2000 runs=5
+parallel_matmul/gemv_q6_k ns_per_op=49344.404 iter=2000 runs=5
+parallel_matmul/gemv_q8_0 ns_per_op=55436.694 iter=2000 runs=5
+parallel_matmul/ggml_gemm8_f32 ns_per_op=169618.054 iter=2000 runs=5
+parallel_matmul/ggml_gemv_f32 ns_per_op=288998.998 iter=2000 runs=5
+parallel_matmul/ggml_gemv_q4_k ns_per_op=68885.360 iter=2000 runs=5
+parallel_matmul/ggml_gemv_q6_k ns_per_op=51010.082 iter=2000 runs=5
+parallel_matmul/ggml_gemv_q8_0 ns_per_op=55113.300 iter=2000 runs=5
diff --git a/snapshots/bench/benchmarks_compare.txt b/snapshots/bench/benchmarks_compare.txt
index 78a79922..4e6c06f8 100644
--- a/snapshots/bench/benchmarks_compare.txt
+++ b/snapshots/bench/benchmarks_compare.txt
@@ -59,6 +59,16 @@ logits/validator_sml/vocab_32000 emel.cpp 24274.209 ns/op, llama.cpp 23702.209 n
 memory/hybrid_full emel.cpp 444.458 ns/op, llama.cpp 34307.000 ns/op, ratio=0.013x
 memory/kv_full emel.cpp 131.833 ns/op, llama.cpp 33199.750 ns/op, ratio=0.004x
 memory/recurrent_full emel.cpp 148.625 ns/op, llama.cpp 4460.167 ns/op, ratio=0.033x
+parallel_matmul/gemm8_f32 emel.cpp 161865.350 ns/op, reference-baseline 1249248.735 ns/op, ratio=0.130x
+parallel_matmul/gemv_f32 emel.cpp 291074.740 ns/op, reference-baseline 2404805.169 ns/op, ratio=0.121x
+parallel_matmul/gemv_q4_k emel.cpp 68127.424 ns/op, reference-baseline 469031.650 ns/op, ratio=0.145x
+parallel_matmul/gemv_q6_k emel.cpp 43589.802 ns/op, reference-baseline 284102.823 ns/op, ratio=0.153x
+parallel_matmul/gemv_q8_0 emel.cpp 55314.849 ns/op, reference-baseline 371634.264 ns/op, ratio=0.149x
+parallel_matmul/ggml_gemm8_f32 emel.cpp 170008.552 ns/op, llama.cpp 201689.476 ns/op, ratio=0.843x
+parallel_matmul/ggml_gemv_f32 emel.cpp 284070.253 ns/op, llama.cpp 29684.396 ns/op, ratio=9.570x
+parallel_matmul/ggml_gemv_q4_k emel.cpp 68174.144 ns/op, llama.cpp 16335.825 ns/op, ratio=4.173x
+parallel_matmul/ggml_gemv_q6_k emel.cpp 44451.588 ns/op, llama.cpp 17805.679 ns/op, ratio=2.496x
+parallel_matmul/ggml_gemv_q8_0 emel.cpp 55579.414 ns/op, llama.cpp 20757.682 ns/op, ratio=2.678x
 text/encoders/bpe_long emel.cpp 65.417 ns/op, llama.cpp 66.333 ns/op, ratio=0.986x
 text/encoders/bpe_short emel.cpp 58.416 ns/op, llama.cpp 57.000 ns/op, ratio=1.025x
 text/encoders/fallback_long emel.cpp 2432.500 ns/op, llama.cpp 2504.917 ns/op, ratio=0.971x
diff --git a/snapshots/lint/clang_format.txt b/snapshots/lint/clang_format.txt
index cec88f38..25646feb 100644
--- a/snapshots/lint/clang_format.txt
+++ b/snapshots/lint/clang_format.txt
@@ -448,6 +448,12 @@ src/emel/text/formatter/guards.hpp
 src/emel/text/formatter/sm.hpp
 src/emel/text/generator/actions.hpp
 src/emel/text/generator/context.hpp
+src/emel/text/generator/decode_wavefront/actions.hpp
+src/emel/text/generator/decode_wavefront/context.hpp
+src/emel/text/generator/decode_wavefront/errors.hpp
+src/emel/text/generator/decode_wavefront/events.hpp
+src/emel/text/generator/decode_wavefront/guards.hpp
+src/emel/text/generator/decode_wavefront/sm.hpp
 src/emel/text/generator/detail.hpp
 src/emel/text/generator/errors.hpp
 src/emel/text/generator/events.hpp
@@ -610,6 +616,7 @@ tests/text/encoders/test_support.hpp
 tests/text/encoders/ugm_tests.cpp
 tests/text/encoders/wpm_tests.cpp
 tests/text/generator/action_guard_tests.cpp
+tests/text/generator/decode_wavefront/lifecycle_tests.cpp
 tests/text/generator/detail_tests.cpp
 tests/text/generator/initializer/lifecycle_tests.cpp
 tests/text/generator/legacy_compatibility_tests.cpp
diff --git a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt
index 2cb67859..3c61e559 100644
--- a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt
+++ b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_10.txt
@@ -4,8 +4,8 @@ fixture=LFM2.5-1.2B-Thinking-Q4_K_M.gguf
 prompt_hex=68656c6c6f
 max_tokens=10
 tokens_generated=10
-output_length=22
-trace_token_count=0
-token_ids=
-top_score_gaps=
-output_hex=3c7c696d5f656e647c3ec48a3c7c696d5f7374617274
+output_length=20
+trace_token_count=10
+token_ids=537,601,834,36171,601,539,834,36509,601,539
+top_score_gaps=1.63354683,0.24605751,1.70043945,0.368387222,2.36488152,1.36843204,1.18816185,0.351781845,0.625753403,0.952269554
+output_hex=3c7c696d5f73746172747c3e696d5f656e647c3e
diff --git a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt
index 3d362263..7e8d2737 100644
--- a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt
+++ b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_100.txt
@@ -4,8 +4,8 @@ fixture=LFM2.5-1.2B-Thinking-Q4_K_M.gguf
 prompt_hex=68656c6c6f
 max_tokens=100
 tokens_generated=100
-output_length=277
-trace_token_count=0
-token_ids=
-top_score_gaps=
-output_hex=3c7c696d5f656e647c3ec48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e
+output_length=248
+trace_token_count=100
+token_ids=537,601,834,36171,601,539,834,36509,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171
+top_score_gaps=1.63354683,0.24605751,1.70043945,0.368387222,2.36488152,1.36843204,1.18816185,0.351781845,0.625753403,0.952269554,0.768082619,2.13955879,2.69776535,1.23970318,2.68403435,1.13820744,2.23869896,2.92936611,4.29071426,3.37551308,2.9721241,3.5968895,4.17213821,3.48550129,3.24224091,3.93832588,4.05487919,3.19284058,3.71522141,4.33257675,3.83996201,3.52476025,4.02193642,4.62951469,3.52591896,3.95395088,4.03614426,4.70890999,3.40218353,3.98209953,4.25673676,5.17098618,3.2824955,4.25911903,4.17467308,5.44502449,3.17495728,4.76317406,4.16145325,5.78856754,2.96655083,4.8255949,4.3292141,6.17603111,2.71598053,5.10662651,4.45320892,6.61788273,2.92082787,5.51133347,4.63360023,7.34527493,3.01860237,5.74898243,4.75855064,7.53805542,3.2661705,5.98014736,4.62242317,7.30112839,3.09925079,6.13972569,4.70379066,7.18179703,3.05636215,6.00573921,4.72933769,7.15433216,2.95344353,6.03799915,4.67574883,6.99351788,2.87396622,5.88299274,4.66538811,6.88072109,2.78374863,5.97683144,4.68003654,7.06533813,2.74716759,6.03071404,4.81233406,6.91018963,2.93097115,6.01890755,4.90971947,7.00140572,3.00832558,6.27520561
+output_hex=3c7c696d5f73746172747c3e696d5f656e647c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f7374617274
diff --git a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt
index 3edf9e63..0c445906 100644
--- a/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt
+++ b/snapshots/parity/generation_lfm2_5_1_2b_thinking_q4_k_m_prompt_hello_max_tokens_1000.txt
@@ -4,8 +4,8 @@ fixture=LFM2.5-1.2B-Thinking-Q4_K_M.gguf
 prompt_hex=68656c6c6f
 max_tokens=1000
 tokens_generated=1000
-output_length=2866
-trace_token_count=0
-token_ids=
-top_score_gaps=
-output_hex=3c7c696d5f656e647c3ec48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c696d5f73746172747c3e617373697374616e74c48a3c7c
+output_length=2498
+trace_token_count=1000
+token_ids=537,601,834,36171,601,539,834,36509,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171,601,539,834,36171
+top_score_gaps=1.63354683,0.24605751,1.70043945,0.368387222,2.36488152,1.36843204,1.18816185,0.351781845,0.625753403,0.952269554,0.768082619,2.13955879,2.69776535,1.23970318,2.68403435,1.13820744,2.23869896,2.92936611,4.29071426,3.37551308,2.9721241,3.5968895,4.17213821,3.48550129,3.24224091,3.93832588,4.05487919,3.19284058,3.71522141,4.33257675,3.83996201,3.52476025,4.02193642,4.62951469,3.52591896,3.95395088,4.03614426,4.70890999,3.40218353,3.98209953,4.25673676,5.17098618,3.2824955,4.25911903,4.17467308,5.44502449,3.17495728,4.76317406,4.16145325,5.78856754,2.96655083,4.8255949,4.3292141,6.17603111,2.71598053,5.10662651,4.45320892,6.61788273,2.92082787,5.51133347,4.63360023,7.34527493,3.01860237,5.74898243,4.75855064,7.53805542,3.2661705,5.98014736,4.62242317,7.30112839,3.09925079,6.13972569,4.70379066,7.18179703,3.05636215,6.00573921,4.72933769,7.15433216,2.95344353,6.03799915,4.67574883,6.99351788,2.87396622,5.88299274,4.66538811,6.88072109,2.78374863,5.97683144,4.68003654,7.06533813,2.74716759,6.03071404,4.81233406,6.91018963,2.93097115,6.01890755,4.90971947,7.00140572,3.00832558,6.27520561,5.15449142,7.17754936,2.92182541,6.29348278,5.18918228,7.24613285,3.15817451,6.33448505,4.93919563,7.18188667,3.37161064,6.30781746,4.91264725,7.07508183,3.39411163,6.04449177,4.82236099,7.04416466,3.43402481,6.01409435,4.9267807,6.96591759,3.58568573,6.17149067,4.95745277,6.97708702,3.37389755,6.09702206,4.95066452,6.97212124,3.60499191,6.09376431,4.96656036,7.05609894,3.57179451,6.17537308,5.1514473,7.04147434,3.77630997,6.13268852,5.09565926,6.96484566,3.54657745,6.01200008,4.94233513,6.86031246,3.45708466,6.03563213,4.89934349,6.94096279,3.4433136,5.99887848,4.86978531,6.92713928,3.53595161,6.00555229,4.82303429,6.73809147,3.60980988,6.15188026,4.89002609,6.76029205,3.54994011,6.22898483,5.07240486,6.92019844,3.61861992,6.35840511,5.10358047,7.30640602,3.76493835,6.4697628,5.11379051,7.22468185,3.70443726,6.5120163,5.46486664,7.34616852,3.78965569,6.55810165,5.45630836,7.16510868,3.69276428,6.49792957,5.34859467,7.05204105,3.94773293,6.51829338,5.21468735,6.96044922,3.90048409,6.4548893,5.10279465,7.03608704,3.78993988,6.43599892,5.17170334,7.01614761,3.87434769,6.62017727,5.17409325,7.15720749,3.97450638,6.61591053,5.25351143,7.26149273,3.98797417,6.64940262,5.34532928,7.32811165,3.98655319,6.69561577,5.16225433,7.27540207,4.00711823,6.64959717,5.02927208,7.42131424,4.10794449,6.3504343,4.87427139,7.13493919,3.94173241,6.30237961,4.84730148,7.09026337,3.92112732,6.32969284,4.62829781,7.07922745,4.10039711,6.18699265,4.70614624,7.1133213,4.20389748,6.15161514,4.83810043,7.41781616,4.01337242,6.23406219,4.75249863,7.52049255,4.09392929,6.25523186,4.89223289,7.58530617,4.00470161,6.19423866,4.96609688,7.46766281,4.12454033,6.1914463,5.01087952,7.41868401,4.23431969,6.29924107,4.94741821,7.5768671,4.05877686,6.26094437,4.80808067,7.63875198,3.96557426,6.43565464,4.96214485,7.78561115,4.05957985,6.38298512,5.1269722,7.91959,3.92103386,6.44330025,5.11893082,8.00494957,4.28379822,6.6187439,5.38516235,8.07876205,4.15754318,6.54335213,5.42339897,8.25018501,4.3666153,6.68294811,5.56444168,8.14664459,4.17339706,6.61001778,5.40484047,8.02812004,4.36603737,6.56625366,5.28453064,8.07682133,4.40487862,6.56387424,5.29524231,8.02450943,4.54405975,6.52717209,5.29816246,7.93931389,4.51352692,6.57671928,5.28840065,7.92730904,4.34979439,6.62380219,5.32158279,8.08699226,4.45905304,6.74461555,5.37109566,8.01555252,4.28594398,6.71599197,5.35267067,8.05058193,4.37354851,6.65527153,5.33983231,8.03376389,4.39024544,6.70316219,5.38566208,8.02507591,4.35650253,6.66013336,5.26818275,7.8992281,4.32999611,6.63684082,5.07611465,7.84050751,4.19766235,6.50216198,5.080616,7.63294029,4.28756142,6.36629295,5.14682388,7.84460831,4.37177849,6.39452362,5.16367912,7.92552376,4.41466904,6.38336849,4.97603226,7.80973053,4.11863708,6.37950706,5.05570221,7.71188354,4.16390991,6.27833557,5.06598091,7.64024639,4.0553093,6.19119263,4.87779999,7.41340923,4.0946579,6.01751041,4.71178055,7.21930695,4.10017014,5.8859024,4.61994362,7.07185936,4.1017189,6.08514118,4.74275017,7.07374763,4.07217216,6.06008625,4.8759613,7.2887888,4.13499832,6.08717442,4.74877548,7.35038948,4.08800316,6.06817341,4.89426994,7.41510391,4.19381905,6.14213562,4.93709755,7.34150124,4.20363045,6.1820755,4.94212151,7.2675581,4.21402168,6.01086426,4.76105499,7.25420189,4.28453445,6.07355881,4.79317093,7.27148342,4.3359375,6.00758171,4.73082161,7.25371647,4.21537971,6.1421833,4.83570862,7.23797989,4.33719826,6.19045448,4.82212067,7.38318729,4.44676781,6.36742401,5.06836319,7.51957321,4.25700378,6.49521255,5.04674339,7.74581528,4.51745796,6.58267212,5.09534645,7.86675549,4.37240219,6.53255749,5.19275475,7.81407547,4.57707977,6.70201206,5.31071663,7.83781242,4.52520943,6.77231121,5.24424362,7.89129162,4.45015717,6.80490875,5.34228516,7.96236897,4.61785316,6.88516998,5.5496273,8.11745071,4.55598831,6.85617638,5.71512413,8.24121666,4.6751976,6.83379936,5.54876709,8.29695511,4.87487793,6.82317066,5.6565876,8.33533955,4.762537,6.85702896,5.7570076,8.25852871,4.78178787,6.78629494,5.55778885,8.11906719,4.96131325,6.81518841,5.51818466,8.08282566,4.86157417,6.61240959,5.38890076,8.08644199,4.86723518,6.59597778,5.24499702,7.97181511,5.01809883,6.68584538,5.36872482,8.0396328,5.19913483,6.58750725,5.40529823,8.18553162,5.18746948,6.59198666,5.58506775,8.21709538,5.31076431,6.60499001,5.49154663,8.25622177,5.27767372,6.74274158,5.57793808,8.26787472,5.22988892,6.64479065,5.36212921,8.06951714,5.23572159,6.41202354,5.37818527,8.13205624,5.14180756,6.52160168,5.34617615,8.12040615,5.107687,6.32893276,5.25270462,8.17889786,5.06069946,6.37924767,5.24970627,8.05556679,5.06832504,6.30836868,5.28464127,8.16378784,5.03841019,6.41894913,5.33346939,8.22968292,5.06711388,6.51340485,5.20555687,8.22129726,5.09238243,6.48105621,5.35327721,8.19785118,4.97625923,6.47792244,5.21417236,8.16148949,5.10004044,6.36968708,5.15242386,8.18161964,5.25624275,6.42994118,5.04351425,8.07405853,5.14030075,6.36172295,5.17052841,8.07892609,5.3619957,6.40137672,5.1654892,8.08720398,5.26086998,6.52788067,5.17366791,8.21195412,5.31811333,6.3486805,5.2519722,8.13913536,5.1263504,6.48794174,5.15417099,8.24138641,5.19828033,6.4694109,5.15841675,8.11807919,5.26512909,6.4569788,5.02939224,8.0028286,5.27293015,6.41951847,5.05481339,7.95901394,5.13341141,6.40538025,5.09624863,7.89557838,5.10813522,6.39192867,4.94404411,8.05454636,5.09833908,6.39883041,5.02837563,7.94150639,5.11763573,6.47364807,5.03023148,8.06495667,5.14579773,6.60127258,5.12008476,8.17419147,5.19213867,6.64751625,5.22068787,8.2072897,5.22324181,6.7547102,5.06778336,8.21466637,5.1476593,6.48773956,5.14493179,8.07195187,5.22911453,6.43293762,5.0694809,8.07123947,5.09415245,6.44200325,4.99944496,8.02376366,5.20451736,6.44797516,4.95704269,7.89868736,5.18758774,6.41598701,5.11302948,7.97619629,5.19637299,6.46459389,5.03109741,8.1726284,5.26836586,6.57701588,5.15589333,8.15628338,5.35698318,6.62969398,5.16781044,8.20305729,5.29644585,6.56591606,5.08177757,8.19900513,5.2761116,6.591362,5.2937355,8.13578987,5.49449158,6.57679653,5.1789093,8.11975861,5.58026123,6.61626053,5.2122612,8.24708176,5.47430611,6.65134621,5.20296288,8.35219193,5.40405083,6.84568024,5.10887718,8.35218906,5.3861866,6.80909538,5.36335564,8.45079422,5.4583149,6.89624214,5.41325378,8.56411076,5.50897408,6.94883919,5.44528198,8.51302719,5.48005104,6.98685455,5.4705162,8.64134502,5.42014885,7.02066708,5.47264862,8.60786057,5.440485,6.98165894,5.46196365,8.65524483,5.47345161,7.0641737,5.43378448,8.56656456,5.54297066,7.14050674,5.41973114,8.63925457,5.5287323,7.16239738,5.47761536,8.63204002,5.67994881,7.16528511,5.51246071,8.77296448,5.6181469,7.21063805,5.43946266,8.72153854,5.63790512,7.20365524,5.50118637,8.6414566,5.72280502,6.98625469,5.51038742,8.62572098,5.6904583,6.98760986,5.48636055,8.46716499,5.63944435,6.90142155,5.47950363,8.37626266,5.78129387,6.63391399,5.22815704,8.26700592,5.74568748,6.68688488,5.30966568,8.28059769,5.69919205,6.70224953,5.1866684,8.19664383,5.7410202,6.61538696,5.24163437,8.27741814,5.77989006,6.60492134,5.29459763,8.27816582,5.64590836,6.62918377,5.17449951,8.26740456,5.76762009,6.64905167,5.04925537,8.10276413,5.65835571,6.41890526,4.85007858,7.92645931,5.69874954,6.28791809,4.78050423,7.70150566,5.51111221,6.37421513,4.91578293,7.76987171,5.54414558,6.41958523,4.92844582,7.89545631,5.41973686,6.45242977,4.99083138,8.0347271,5.54695511,6.4104414,4.98251724,8.00759697,5.54224396,6.41043377,5.18890572,8.16882515,5.6139183,6.51286221,5.11805153,7.98017502,5.47458076,6.45481491,5.00892639,7.92031479,5.53252602,6.34074783,5.01001358,7.86529732,5.5212059,6.27339554,4.8295517,7.80489445,5.48496819,6.38019276,4.91117477,7.85472488,5.46661568,6.35686398,4.96008873,7.95297813,5.53714943,6.45419502,5.02433777,7.99053764,5.52077484,6.65921688,5.04179001,8.10065174,5.76436234,6.63166809,4.91928482,8.14009285,5.56265068,6.61696434,5.22071648,8.18057156,5.57130432,6.52271652,4.9258461,8.08891582,5.7355423,6.62742043,4.98779869,8.08633423,5.59970856,6.66469955,4.90749359,8.02793503,5.61119843,6.56915665,4.92798805,8.08284664,5.72509956,6.68988895,5.10199356,8.29250145,5.66152763,6.5911274,5.04431534,8.24588776,5.72471809,6.72398663,5.08971596,8.247612,5.71876907,6.70418739,5.29846573,8.16291046,5.83738327,6.78546906,5.13676262,8.23707771,5.84209824,6.67313766,5.29277229,8.18377113,5.82172585,6.6520443,5.18866539,8.11068916,5.83301926,6.57429028,5.11104965,8.06363964,5.78905678,6.72797871,5.11105347,8.08338928,5.80940437,6.68347931,5.19134521,8.2475853,6.05075264,6.61704063,5.29603195,8.19070435,5.89323235,6.66639137,5.30723,8.33905029,5.90149689,6.68814278,5.35193825,8.28684425,6.02704048,6.58704472,5.25161743,8.17722416,6.02680397,6.42173862,5.25564766,8.05768776,6.12816048,6.42662621,5.07863617,7.90750504,6.17559433,6.57017326,5.1002388,7.90670586,6.02440643,6.4688406,4.95903778,7.97939873,6.01169014,6.42444897,5.18427849,8.06044006,5.9139576,6.59120178,5.27438545,8.17666817,6.01253319,6.65959167,5.18017006,8.22201443,6.04235649,6.60766792,5.30283356,8.2788372,6.03053665,6.60948467,5.34325409,8.31509209,5.8453598,6.69171238,5.33260155,8.20406818,5.683918,6.56695843,5.30281258,8.13492966,5.75349617,6.55113983,5.34417725,8.1326561,5.78568649,6.57105255,5.3273201,8.15877342,5.76834869,6.64725208,5.37774849,8.17119789,5.89293289,6.57474041,5.41365814,8.36868477,5.68873978,6.58434677,5.48419189,8.34781837,5.85387421,6.64115715,5.38439941,8.27522278,5.91314125,6.70641136,5.13799477,8.17845726,5.75364876,6.70098782,5.13040352,8.00018501,5.96792984,6.62576485,5.03642464,7.94652843,5.92035866,6.63366604,5.10318375,7.9744215,5.82936859,6.61807251,5.1090126,7.96936798,5.85948563,6.58807373,5.18238449,8.03250885,5.87504005,6.6042366,5.2796917,8.13493443,5.85573006,6.60693741,5.27525139,8.21286392,5.98149109,6.71523666,5.17165375,8.24064732,5.87898636,6.64641953,5.19127846,8.21217346,5.90829086,6.62609482
+output_hex=3c7c696d5f73746172747c3e696d5f656e647c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f73746172747c3e696d5f7374617274
diff --git a/snapshots/quality_gates/timing.txt b/snapshots/quality_gates/timing.txt
index 89282747..f1d9146f 100644
--- a/snapshots/quality_gates/timing.txt
+++ b/snapshots/quality_gates/timing.txt
@@ -1,37 +1,9 @@
 # quality_gates timing (seconds)
-domain_boundaries 1
-legacy_sml_surface 3
+domain_boundaries 0
+legacy_sml_surface 1
 build_with_zig 0
-bench_snapshot_gbnf_rule_parser 4
-bench_snapshot_jinja_formatter 7
-bench_snapshot_jinja_parser 5
-bench_snapshot_logits_sampler 2
-bench_snapshot_logits_validator 7
-bench_snapshot_kernel_aarch64 4
-bench_snapshot_batch_planner 3
-bench_snapshot_memory_kv 6
-bench_snapshot_memory_recurrent 7
-bench_snapshot_memory_hybrid 4
-bench_snapshot_generation 190
-bench_snapshot_diarization_sortformer 5
-bench_snapshot_flash_attention 4
-bench_snapshot_tokenizer_preprocessor_bpe 1
-bench_snapshot_tokenizer_preprocessor_spm 1
-bench_snapshot_tokenizer_preprocessor_ugm 0
-bench_snapshot_tokenizer_preprocessor_wpm 1
-bench_snapshot_tokenizer_preprocessor_rwkv 1
-bench_snapshot_tokenizer_preprocessor_plamo2 1
-bench_snapshot_encoder_bpe 0
-bench_snapshot_encoder_spm 38
-bench_snapshot_encoder_wpm 1
-bench_snapshot_encoder_ugm 14
-bench_snapshot_encoder_rwkv 10
-bench_snapshot_encoder_plamo2 1
-bench_snapshot_encoder_fallback 0
-bench_snapshot_tokenizer 62
-test_with_coverage 422
-paritychecker 16
-fuzz_smoke 45
-lint_snapshot 16
-generate_docs 2
-total 887
+bench_snapshot 47
+test_with_coverage 1476
+paritychecker 2
+fuzz_smoke 0
+total 1477
diff --git a/src/emel/diarization/sortformer/detail.cpp b/src/emel/diarization/sortformer/detail.cpp
index ac61ff01..c863afbf 100644
--- a/src/emel/diarization/sortformer/detail.cpp
+++ b/src/emel/diarization/sortformer/detail.cpp
@@ -7,6 +7,7 @@
 #endif
 
 #include "emel/kernel/aarch64/actions.hpp"
+#include "emel/kernel/sm.hpp"
 #include "emel/kernel/detail.hpp"
 #include "emel/kernel/events.hpp"
 
@@ -36,7 +37,8 @@ float32x4_t compute_neon_fma(const float32x4_t acc,
 }
 #endif
 
-bool run_dense_matmul(std::span<const float> input,
+bool run_dense_matmul(emel::kernel::sm & kernel,
+                      std::span<const float> input,
                       std::span<const float> weights,
                       std::span<float> output) noexcept {
   const uint64_t row_bytes = sizeof(float) * static_cast<uint64_t>(input.size());
@@ -52,7 +54,7 @@ bool run_dense_matmul(std::span<const float> input,
               1u,
           },
           .nb = {
-              1u,
+              sizeof(float),
               row_bytes,
               row_bytes * static_cast<uint64_t>(output.size()),
               row_bytes * static_cast<uint64_t>(output.size()),
@@ -90,22 +92,16 @@ bool run_dense_matmul(std::span<const float> input,
               sizeof(float) * static_cast<uint64_t>(output.size()),
           },
       },
-      .nth = 1u,
   };
 
-#if defined(__aarch64__) || defined(__ARM_NEON)
-  if (!emel::kernel::aarch64::detail::execute_neon_mul_mat(request)) {
-    return false;
-  }
-#else
-  if (!emel::kernel::detail::run_mul_mat(request)) {
+  if (!kernel.process_event(request)) {
     return false;
   }
-#endif
   return true;
 }
 
-bool run_dense_batch_matmul_from_transposed(std::span<const float> transposed_input,
+bool run_dense_batch_matmul_from_transposed(emel::kernel::sm & kernel,
+                                            std::span<const float> transposed_input,
                                             const size_t row_count,
                                             const size_t input_dim,
                                             std::span<const float> weights,
@@ -125,7 +121,7 @@ bool run_dense_batch_matmul_from_transposed(std::span<const float> transposed_in
               1u,
           },
           .nb = {
-              1u,
+              sizeof(float),
               input_row_bytes,
               input_row_bytes * static_cast<uint64_t>(output_dim),
               input_row_bytes * static_cast<uint64_t>(output_dim),
@@ -163,23 +159,17 @@ bool run_dense_batch_matmul_from_transposed(std::span<const float> transposed_in
               frame_row_bytes * static_cast<uint64_t>(output_dim),
           },
       },
-      .nth = 1u,
   };
 
-#if defined(__aarch64__) || defined(__ARM_NEON)
-  if (!emel::kernel::aarch64::detail::execute_neon_mul_mat(request)) {
-    return false;
-  }
-#else
-  if (!emel::kernel::detail::run_mul_mat(request)) {
+  if (!kernel.process_event(request)) {
     return false;
   }
-#endif
 
   return true;
 }
 
 bool run_dense_batch_matmul_from_transposed_prepared(
+    emel::kernel::sm & kernel,
     std::span<const float> transposed_input,
     const size_t row_count,
     const size_t input_dim,
@@ -201,7 +191,7 @@ bool run_dense_batch_matmul_from_transposed_prepared(
               1u,
           },
           .nb = {
-              1u,
+              sizeof(float),
               input_row_bytes,
               input_row_bytes * static_cast<uint64_t>(output_dim),
               input_row_bytes * static_cast<uint64_t>(output_dim),
@@ -239,10 +229,10 @@ bool run_dense_batch_matmul_from_transposed_prepared(
               frame_row_bytes * static_cast<uint64_t>(output_dim),
           },
       },
-      .nth = 1u,
   };
 
 #if defined(__aarch64__) || defined(__ARM_NEON)
+  (void)kernel;
   if (!emel::kernel::aarch64::detail::execute_neon_mul_mat_prepared_f32_lhs_4row(
           request,
           cache.lhs_4row.data(),
@@ -250,7 +240,8 @@ bool run_dense_batch_matmul_from_transposed_prepared(
     return false;
   }
 #else
-  if (!emel::kernel::detail::run_mul_mat(request)) {
+  (void)cache;
+  if (!kernel.process_event(request)) {
     return false;
   }
 #endif
@@ -458,7 +449,8 @@ bool prepare_dense_weight_cache(std::span<const float> weights,
   return true;
 }
 
-bool compute_dense(std::span<const float> input,
+bool compute_dense(emel::kernel::sm & kernel,
+                   std::span<const float> input,
                    std::span<const float> weights,
                    std::span<const float> bias,
                    std::span<float> output) noexcept {
@@ -467,7 +459,7 @@ bool compute_dense(std::span<const float> input,
     return false;
   }
 
-  if (!run_dense_matmul(input, weights, output)) {
+  if (!run_dense_matmul(kernel, input, weights, output)) {
     return false;
   }
 
@@ -478,14 +470,15 @@ bool compute_dense(std::span<const float> input,
   return true;
 }
 
-bool compute_dense_without_bias(std::span<const float> input,
+bool compute_dense_without_bias(emel::kernel::sm & kernel,
+                                std::span<const float> input,
                                 std::span<const float> weights,
                                 std::span<float> output) noexcept {
   if (input.empty() || output.empty() || weights.size() != input.size() * output.size()) {
     return false;
   }
 
-  return run_dense_matmul(input, weights, output);
+  return run_dense_matmul(kernel, input, weights, output);
 }
 
 bool transpose_dense_input(std::span<const float> input_rows,
@@ -508,7 +501,8 @@ bool transpose_dense_input(std::span<const float> input_rows,
   return true;
 }
 
-bool compute_dense_batch(std::span<const float> input_rows,
+bool compute_dense_batch(emel::kernel::sm & kernel,
+                         std::span<const float> input_rows,
                          const size_t row_count,
                          const size_t input_dim,
                          std::span<const float> weights,
@@ -534,7 +528,8 @@ bool compute_dense_batch(std::span<const float> input_rows,
     }
   }
 
-  if (!run_dense_batch_matmul_from_transposed(transposed_input,
+  if (!run_dense_batch_matmul_from_transposed(kernel,
+                                              transposed_input,
                                               row_count,
                                               input_dim,
                                               weights,
@@ -553,7 +548,8 @@ bool compute_dense_batch(std::span<const float> input_rows,
   return true;
 }
 
-bool compute_dense_batch_prepared(std::span<const float> input_rows,
+bool compute_dense_batch_prepared(emel::kernel::sm & kernel,
+                                  std::span<const float> input_rows,
                                   const size_t row_count,
                                   const size_t input_dim,
                                   std::span<const float> weights,
@@ -583,7 +579,8 @@ bool compute_dense_batch_prepared(std::span<const float> input_rows,
     }
   }
 
-  if (!run_dense_batch_matmul_from_transposed_prepared(transposed_input,
+  if (!run_dense_batch_matmul_from_transposed_prepared(kernel,
+                                                       transposed_input,
                                                        row_count,
                                                        input_dim,
                                                        weights,
@@ -603,7 +600,8 @@ bool compute_dense_batch_prepared(std::span<const float> input_rows,
   return true;
 }
 
-bool compute_dense_batch_residual_prepared(std::span<const float> input_rows,
+bool compute_dense_batch_residual_prepared(emel::kernel::sm & kernel,
+                                           std::span<const float> input_rows,
                                            const size_t row_count,
                                            const size_t input_dim,
                                            std::span<const float> weights,
@@ -632,7 +630,8 @@ bool compute_dense_batch_residual_prepared(std::span<const float> input_rows,
     return false;
   }
 
-  return compute_dense_batch_from_transposed_scaled_residual_prepared(transposed_input,
+  return compute_dense_batch_from_transposed_scaled_residual_prepared(kernel,
+                                                                      transposed_input,
                                                                       row_count,
                                                                       input_dim,
                                                                       weights,
@@ -645,7 +644,8 @@ bool compute_dense_batch_residual_prepared(std::span<const float> input_rows,
                                                                       output_rows);
 }
 
-bool compute_dense_batch_without_bias(std::span<const float> input_rows,
+bool compute_dense_batch_without_bias(emel::kernel::sm & kernel,
+                                      std::span<const float> input_rows,
                                       const size_t row_count,
                                       const size_t input_dim,
                                       std::span<const float> weights,
@@ -669,7 +669,8 @@ bool compute_dense_batch_without_bias(std::span<const float> input_rows,
     }
   }
 
-  if (!run_dense_batch_matmul_from_transposed(transposed_input,
+  if (!run_dense_batch_matmul_from_transposed(kernel,
+                                              transposed_input,
                                               row_count,
                                               input_dim,
                                               weights,
@@ -688,7 +689,8 @@ bool compute_dense_batch_without_bias(std::span<const float> input_rows,
   return true;
 }
 
-bool compute_dense_batch_without_bias_prepared(std::span<const float> input_rows,
+bool compute_dense_batch_without_bias_prepared(emel::kernel::sm & kernel,
+                                               std::span<const float> input_rows,
                                                const size_t row_count,
                                                const size_t input_dim,
                                                std::span<const float> weights,
@@ -716,7 +718,8 @@ bool compute_dense_batch_without_bias_prepared(std::span<const float> input_rows
     }
   }
 
-  if (!run_dense_batch_matmul_from_transposed_prepared(transposed_input,
+  if (!run_dense_batch_matmul_from_transposed_prepared(kernel,
+                                                       transposed_input,
                                                        row_count,
                                                        input_dim,
                                                        weights,
@@ -736,7 +739,8 @@ bool compute_dense_batch_without_bias_prepared(std::span<const float> input_rows
   return true;
 }
 
-bool compute_dense_batch_to_transposed(std::span<const float> input_rows,
+bool compute_dense_batch_to_transposed(emel::kernel::sm & kernel,
+                                       std::span<const float> input_rows,
                                        const size_t row_count,
                                        const size_t input_dim,
                                        std::span<const float> weights,
@@ -760,7 +764,8 @@ bool compute_dense_batch_to_transposed(std::span<const float> input_rows,
     }
   }
 
-  if (!run_dense_batch_matmul_from_transposed(transposed_input,
+  if (!run_dense_batch_matmul_from_transposed(kernel,
+                                              transposed_input,
                                               row_count,
                                               input_dim,
                                               weights,
@@ -780,7 +785,8 @@ bool compute_dense_batch_to_transposed(std::span<const float> input_rows,
   return true;
 }
 
-bool compute_dense_batch_to_transposed_prepared(std::span<const float> input_rows,
+bool compute_dense_batch_to_transposed_prepared(emel::kernel::sm & kernel,
+                                                std::span<const float> input_rows,
                                                 const size_t row_count,
                                                 const size_t input_dim,
                                                 std::span<const float> weights,
@@ -808,7 +814,8 @@ bool compute_dense_batch_to_transposed_prepared(std::span<const float> input_row
     }
   }
 
-  if (!run_dense_batch_matmul_from_transposed_prepared(transposed_input,
+  if (!run_dense_batch_matmul_from_transposed_prepared(kernel,
+                                                       transposed_input,
                                                        row_count,
                                                        input_dim,
                                                        weights,
@@ -829,7 +836,8 @@ bool compute_dense_batch_to_transposed_prepared(std::span<const float> input_row
   return true;
 }
 
-bool compute_dense_batch_from_transposed(std::span<const float> transposed_input,
+bool compute_dense_batch_from_transposed(emel::kernel::sm & kernel,
+                                         std::span<const float> transposed_input,
                                          const size_t row_count,
                                          const size_t input_dim,
                                          std::span<const float> weights,
@@ -846,7 +854,8 @@ bool compute_dense_batch_from_transposed(std::span<const float> transposed_input
     return false;
   }
 
-  if (!run_dense_batch_matmul_from_transposed(transposed_input,
+  if (!run_dense_batch_matmul_from_transposed(kernel,
+                                              transposed_input,
                                               row_count,
                                               input_dim,
                                               weights,
@@ -865,7 +874,8 @@ bool compute_dense_batch_from_transposed(std::span<const float> transposed_input
   return true;
 }
 
-bool compute_dense_batch_from_transposed_prepared(std::span<const float> transposed_input,
+bool compute_dense_batch_from_transposed_prepared(emel::kernel::sm & kernel,
+                                                  std::span<const float> transposed_input,
                                                   const size_t row_count,
                                                   const size_t input_dim,
                                                   std::span<const float> weights,
@@ -886,7 +896,8 @@ bool compute_dense_batch_from_transposed_prepared(std::span<const float> transpo
     return false;
   }
 
-  if (!run_dense_batch_matmul_from_transposed_prepared(transposed_input,
+  if (!run_dense_batch_matmul_from_transposed_prepared(kernel,
+                                                       transposed_input,
                                                        row_count,
                                                        input_dim,
                                                        weights,
@@ -907,6 +918,7 @@ bool compute_dense_batch_from_transposed_prepared(std::span<const float> transpo
 }
 
 bool compute_dense_batch_from_transposed_scaled_residual_prepared(
+    emel::kernel::sm & kernel,
     std::span<const float> transposed_input,
     const size_t row_count,
     const size_t input_dim,
@@ -931,7 +943,8 @@ bool compute_dense_batch_from_transposed_scaled_residual_prepared(
     return false;
   }
 
-  if (!run_dense_batch_matmul_from_transposed_prepared(transposed_input,
+  if (!run_dense_batch_matmul_from_transposed_prepared(kernel,
+                                                       transposed_input,
                                                        row_count,
                                                        input_dim,
                                                        weights,
diff --git a/src/emel/diarization/sortformer/detail.hpp b/src/emel/diarization/sortformer/detail.hpp
index 01828c90..beadc02c 100644
--- a/src/emel/diarization/sortformer/detail.hpp
+++ b/src/emel/diarization/sortformer/detail.hpp
@@ -4,6 +4,8 @@
 #include <span>
 #include <vector>
 
+#include "emel/kernel/sm.hpp"
+
 namespace emel::diarization::sortformer::detail {
 
 struct dense_weight_cache {
@@ -34,12 +36,14 @@ bool prepare_dense_weight_cache(std::span<const float> weights,
                                 size_t output_dim,
                                 dense_weight_cache & cache) noexcept;
 
-bool compute_dense(std::span<const float> input,
+bool compute_dense(emel::kernel::sm & kernel,
+                   std::span<const float> input,
                    std::span<const float> weights,
                    std::span<const float> bias,
                    std::span<float> output) noexcept;
 
-bool compute_dense_without_bias(std::span<const float> input,
+bool compute_dense_without_bias(emel::kernel::sm & kernel,
+                                std::span<const float> input,
                                 std::span<const float> weights,
                                 std::span<float> output) noexcept;
 
@@ -48,7 +52,8 @@ bool transpose_dense_input(std::span<const float> input_rows,
                            size_t input_dim,
                            std::span<float> transposed_input) noexcept;
 
-bool compute_dense_batch(std::span<const float> input_rows,
+bool compute_dense_batch(emel::kernel::sm & kernel,
+                         std::span<const float> input_rows,
                          size_t row_count,
                          size_t input_dim,
                          std::span<const float> weights,
@@ -58,7 +63,8 @@ bool compute_dense_batch(std::span<const float> input_rows,
                          std::span<float> transposed_output,
                          std::span<float> output_rows) noexcept;
 
-bool compute_dense_batch_prepared(std::span<const float> input_rows,
+bool compute_dense_batch_prepared(emel::kernel::sm & kernel,
+                                  std::span<const float> input_rows,
                                   size_t row_count,
                                   size_t input_dim,
                                   std::span<const float> weights,
@@ -69,7 +75,8 @@ bool compute_dense_batch_prepared(std::span<const float> input_rows,
                                   std::span<float> transposed_output,
                                   std::span<float> output_rows) noexcept;
 
-bool compute_dense_batch_residual_prepared(std::span<const float> input_rows,
+bool compute_dense_batch_residual_prepared(emel::kernel::sm & kernel,
+                                           std::span<const float> input_rows,
                                            size_t row_count,
                                            size_t input_dim,
                                            std::span<const float> weights,
@@ -81,7 +88,8 @@ bool compute_dense_batch_residual_prepared(std::span<const float> input_rows,
                                            std::span<float> transposed_output,
                                            std::span<float> output_rows) noexcept;
 
-bool compute_dense_batch_without_bias(std::span<const float> input_rows,
+bool compute_dense_batch_without_bias(emel::kernel::sm & kernel,
+                                      std::span<const float> input_rows,
                                       size_t row_count,
                                       size_t input_dim,
                                       std::span<const float> weights,
@@ -90,7 +98,8 @@ bool compute_dense_batch_without_bias(std::span<const float> input_rows,
                                       std::span<float> transposed_output,
                                       std::span<float> output_rows) noexcept;
 
-bool compute_dense_batch_without_bias_prepared(std::span<const float> input_rows,
+bool compute_dense_batch_without_bias_prepared(emel::kernel::sm & kernel,
+                                               std::span<const float> input_rows,
                                                size_t row_count,
                                                size_t input_dim,
                                                std::span<const float> weights,
@@ -100,7 +109,8 @@ bool compute_dense_batch_without_bias_prepared(std::span<const float> input_rows
                                                std::span<float> transposed_output,
                                                std::span<float> output_rows) noexcept;
 
-bool compute_dense_batch_to_transposed(std::span<const float> input_rows,
+bool compute_dense_batch_to_transposed(emel::kernel::sm & kernel,
+                                       std::span<const float> input_rows,
                                        size_t row_count,
                                        size_t input_dim,
                                        std::span<const float> weights,
@@ -109,7 +119,8 @@ bool compute_dense_batch_to_transposed(std::span<const float> input_rows,
                                        std::span<float> transposed_input,
                                        std::span<float> transposed_output) noexcept;
 
-bool compute_dense_batch_to_transposed_prepared(std::span<const float> input_rows,
+bool compute_dense_batch_to_transposed_prepared(emel::kernel::sm & kernel,
+                                                std::span<const float> input_rows,
                                                 size_t row_count,
                                                 size_t input_dim,
                                                 std::span<const float> weights,
@@ -119,7 +130,8 @@ bool compute_dense_batch_to_transposed_prepared(std::span<const float> input_row
                                                 std::span<float> transposed_input,
                                                 std::span<float> transposed_output) noexcept;
 
-bool compute_dense_batch_from_transposed(std::span<const float> transposed_input,
+bool compute_dense_batch_from_transposed(emel::kernel::sm & kernel,
+                                         std::span<const float> transposed_input,
                                          size_t row_count,
                                          size_t input_dim,
                                          std::span<const float> weights,
@@ -128,7 +140,8 @@ bool compute_dense_batch_from_transposed(std::span<const float> transposed_input
                                          std::span<float> transposed_output,
                                          std::span<float> output_rows) noexcept;
 
-bool compute_dense_batch_from_transposed_prepared(std::span<const float> transposed_input,
+bool compute_dense_batch_from_transposed_prepared(emel::kernel::sm & kernel,
+                                                  std::span<const float> transposed_input,
                                                   size_t row_count,
                                                   size_t input_dim,
                                                   std::span<const float> weights,
@@ -139,6 +152,7 @@ bool compute_dense_batch_from_transposed_prepared(std::span<const float> transpo
                                                   std::span<float> output_rows) noexcept;
 
 bool compute_dense_batch_from_transposed_scaled_residual_prepared(
+    emel::kernel::sm & kernel,
     std::span<const float> transposed_input,
     size_t row_count,
     size_t input_dim,
diff --git a/src/emel/diarization/sortformer/encoder/detail.cpp b/src/emel/diarization/sortformer/encoder/detail.cpp
index d597dc4c..6f39c13a 100644
--- a/src/emel/diarization/sortformer/encoder/detail.cpp
+++ b/src/emel/diarization/sortformer/encoder/detail.cpp
@@ -294,6 +294,7 @@ bool compute_pointwise_row(std::span<const float> input,
                            pre_encoder_workspace & workspace,
                            std::span<float> output) noexcept {
   if (!emel::diarization::sortformer::detail::compute_dense_batch(
+          workspace.kernel,
           input,
           static_cast<size_t>(freq_count),
           static_cast<size_t>(k_pre_channel_count),
@@ -411,6 +412,7 @@ bool compute_position_projection(
     pre_encoder_workspace & workspace,
     std::span<float, k_relative_position_count * k_model_dim> output) noexcept {
   return emel::diarization::sortformer::detail::compute_dense_batch_without_bias_prepared(
+      workspace.kernel,
       positions,
       static_cast<size_t>(k_relative_position_count),
       static_cast<size_t>(k_model_dim),
@@ -446,6 +448,7 @@ bool compute_feed_forward_block(
   }
 
   if (!emel::diarization::sortformer::detail::compute_dense_batch_to_transposed_prepared(
+          workspace.kernel,
           fixed_span<k_required_encoder_value_count>(workspace.layer_norm),
           static_cast<size_t>(k_frame_count),
           static_cast<size_t>(k_model_dim),
@@ -467,6 +470,7 @@ bool compute_feed_forward_block(
 
   if (!emel::diarization::sortformer::detail::
           compute_dense_batch_from_transposed_scaled_residual_prepared(
+          workspace.kernel,
           std::span<const float>{workspace.dense_transposed_output.data(),
                                  feed_forward_value_count},
           static_cast<size_t>(k_frame_count),
@@ -591,6 +595,7 @@ bool compute_attention_block(
           static_cast<size_t>(k_model_dim),
           qkv_transposed) ||
       !emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+          workspace.kernel,
           qkv_transposed,
           static_cast<size_t>(k_frame_count),
           static_cast<size_t>(k_model_dim),
@@ -601,6 +606,7 @@ bool compute_attention_block(
           workspace.dense_transposed_output,
           fixed_span<k_required_encoder_value_count>(workspace.query)) ||
       !emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+          workspace.kernel,
           qkv_transposed,
           static_cast<size_t>(k_frame_count),
           static_cast<size_t>(k_model_dim),
@@ -611,6 +617,7 @@ bool compute_attention_block(
           workspace.dense_transposed_output,
           fixed_span<k_required_encoder_value_count>(workspace.key)) ||
       !emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+          workspace.kernel,
           qkv_transposed,
           static_cast<size_t>(k_frame_count),
           static_cast<size_t>(k_model_dim),
@@ -653,6 +660,7 @@ bool compute_attention_block(
   }
 
   if (!emel::diarization::sortformer::detail::compute_dense_batch_residual_prepared(
+          workspace.kernel,
           fixed_span<k_required_encoder_value_count>(workspace.layer_result),
           static_cast<size_t>(k_frame_count),
           static_cast<size_t>(k_model_dim),
@@ -702,6 +710,7 @@ bool compute_convolution_block(
       static_cast<size_t>(k_frame_count * 2 * k_model_dim),
   };
   if (!emel::diarization::sortformer::detail::compute_dense_batch_prepared(
+          workspace.kernel,
           fixed_span<k_required_encoder_value_count>(workspace.layer_norm),
           static_cast<size_t>(k_frame_count),
           static_cast<size_t>(k_model_dim),
@@ -751,6 +760,7 @@ bool compute_convolution_block(
   }
 
   if (!emel::diarization::sortformer::detail::compute_dense_batch_residual_prepared(
+          workspace.kernel,
           fixed_span<k_required_encoder_value_count>(workspace.layer_output),
           static_cast<size_t>(k_frame_count),
           static_cast<size_t>(k_model_dim),
@@ -1200,6 +1210,7 @@ bool compute_encoder_frames_from_features(
   }
 
   if (!emel::diarization::sortformer::detail::compute_dense_batch_prepared(
+          workspace.kernel,
           workspace.pre_encoder_rows,
           static_cast<size_t>(k_frame_count),
           static_cast<size_t>(k_pre_expanded_dim),
diff --git a/src/emel/diarization/sortformer/encoder/detail.hpp b/src/emel/diarization/sortformer/encoder/detail.hpp
index ed03ff83..ccc78e36 100644
--- a/src/emel/diarization/sortformer/encoder/detail.hpp
+++ b/src/emel/diarization/sortformer/encoder/detail.hpp
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include "emel/diarization/sortformer/detail.hpp"
+#include "emel/kernel/sm.hpp"
 #include "emel/diarization/sortformer/encoder/feature_extractor/detail.hpp"
 #include "emel/model/data.hpp"
 
@@ -69,6 +70,8 @@ struct contract {
 struct pre_encoder_workspace {
   pre_encoder_workspace();
 
+  emel::kernel::sm kernel{emel::kernel::detect_host_kind()};
+
   std::array<std::array<float, k_conv0_row_value_count>, 3> conv0_rows = {};
   std::array<std::array<float, k_stage1_row_value_count>, 3> stage1_rows = {};
   std::array<float, k_stage1_row_value_count> stage1_depthwise = {};
diff --git a/src/emel/diarization/sortformer/executor/actions.hpp b/src/emel/diarization/sortformer/executor/actions.hpp
index 7bfee84e..f1c593fb 100644
--- a/src/emel/diarization/sortformer/executor/actions.hpp
+++ b/src/emel/diarization/sortformer/executor/actions.hpp
@@ -141,6 +141,7 @@ inline bool compute_encoder_projection_stage(
           *ctx.modules.encoder_projection_bias.tensor);
 
   return modules_detail::compute_encoder_projection_batch(
+      ctx.transformer_workspace.kernel,
       runtime_ev.request.encoder_frames,
       static_cast<size_t>(detail::k_frame_count),
       encoder_projection_weight,
diff --git a/src/emel/diarization/sortformer/modules/detail.cpp b/src/emel/diarization/sortformer/modules/detail.cpp
index aee0892c..2e944785 100644
--- a/src/emel/diarization/sortformer/modules/detail.cpp
+++ b/src/emel/diarization/sortformer/modules/detail.cpp
@@ -94,11 +94,13 @@ bool bind_contract(const emel::model::data & model_data,
   return true;
 }
 
-bool compute_encoder_projection(std::span<const float, k_encoder_dim> encoder_frame,
+bool compute_encoder_projection(emel::kernel::sm & kernel,
+                                std::span<const float, k_encoder_dim> encoder_frame,
                                 std::span<const float, k_hidden_dim * k_encoder_dim> weights,
                                 std::span<const float, k_hidden_dim> bias,
                                 std::span<float, k_hidden_dim> hidden_out) noexcept {
-  return emel::diarization::sortformer::detail::compute_dense(encoder_frame,
+  return emel::diarization::sortformer::detail::compute_dense(kernel,
+                                                              encoder_frame,
                                                               weights,
                                                               bias,
                                                               hidden_out);
@@ -115,6 +117,7 @@ bool prepare_encoder_projection_weight_cache(
 }
 
 bool compute_encoder_projection_batch(
+    emel::kernel::sm & kernel,
     std::span<const float> encoder_frames,
     const size_t frame_count,
     std::span<const float, k_hidden_dim * k_encoder_dim> weights,
@@ -124,6 +127,7 @@ bool compute_encoder_projection_batch(
     std::span<float> transposed_output,
     std::span<float> hidden_out) noexcept {
   return emel::diarization::sortformer::detail::compute_dense_batch_prepared(
+      kernel,
       encoder_frames,
       frame_count,
       static_cast<size_t>(k_encoder_dim),
diff --git a/src/emel/diarization/sortformer/modules/detail.hpp b/src/emel/diarization/sortformer/modules/detail.hpp
index 4efcdd0f..85cbac48 100644
--- a/src/emel/diarization/sortformer/modules/detail.hpp
+++ b/src/emel/diarization/sortformer/modules/detail.hpp
@@ -7,6 +7,7 @@
 #include <string_view>
 
 #include "emel/diarization/sortformer/detail.hpp"
+#include "emel/kernel/sm.hpp"
 #include "emel/model/data.hpp"
 
 namespace emel::diarization::sortformer::modules::detail {
@@ -39,7 +40,8 @@ struct contract {
 bool bind_contract(const emel::model::data & model_data,
                    contract & contract_out) noexcept;
 
-bool compute_encoder_projection(std::span<const float, k_encoder_dim> encoder_frame,
+bool compute_encoder_projection(emel::kernel::sm & kernel,
+                                std::span<const float, k_encoder_dim> encoder_frame,
                                 std::span<const float, k_hidden_dim * k_encoder_dim> weights,
                                 std::span<const float, k_hidden_dim> bias,
                                 std::span<float, k_hidden_dim> hidden_out) noexcept;
@@ -49,6 +51,7 @@ bool prepare_encoder_projection_weight_cache(
     emel::diarization::sortformer::detail::dense_weight_cache & cache) noexcept;
 
 bool compute_encoder_projection_batch(
+    emel::kernel::sm & kernel,
     std::span<const float> encoder_frames,
     size_t frame_count,
     std::span<const float, k_hidden_dim * k_encoder_dim> weights,
diff --git a/src/emel/diarization/sortformer/output/detail.cpp b/src/emel/diarization/sortformer/output/detail.cpp
index a0b6e457..f7658607 100644
--- a/src/emel/diarization/sortformer/output/detail.cpp
+++ b/src/emel/diarization/sortformer/output/detail.cpp
@@ -49,6 +49,7 @@ bool append_segment(std::span<segment_record> segments_out,
 }  // namespace
 
 bool compute_speaker_probabilities(
+    emel::kernel::sm & kernel,
     std::span<const float> hidden_frames,
     const emel::diarization::sortformer::modules::detail::contract & modules_contract,
     std::span<float> probabilities_out) noexcept {
@@ -82,6 +83,7 @@ bool compute_speaker_probabilities(
     }
 
     if (!emel::diarization::sortformer::detail::compute_dense(
+            kernel,
             intermediate, frame_hidden_weights, frame_hidden_bias, frame_hidden)) {
       return false;
     }
@@ -90,7 +92,7 @@ bool compute_speaker_probabilities(
       intermediate[index] = relu(frame_hidden[index]);
     }
 
-    if (!emel::diarization::sortformer::detail::compute_dense(intermediate, weights, bias, logits)) {
+    if (!emel::diarization::sortformer::detail::compute_dense(kernel, intermediate, weights, bias, logits)) {
       return false;
     }
 
diff --git a/src/emel/diarization/sortformer/output/detail.hpp b/src/emel/diarization/sortformer/output/detail.hpp
index 2986134c..7b873aeb 100644
--- a/src/emel/diarization/sortformer/output/detail.hpp
+++ b/src/emel/diarization/sortformer/output/detail.hpp
@@ -6,6 +6,7 @@
 #include <string_view>
 
 #include "emel/diarization/sortformer/modules/detail.hpp"
+#include "emel/kernel/sm.hpp"
 
 namespace emel::diarization::sortformer::output::detail {
 
@@ -27,6 +28,7 @@ struct segment_record {
 };
 
 bool compute_speaker_probabilities(
+    emel::kernel::sm & kernel,
     std::span<const float> hidden_frames,
     const emel::diarization::sortformer::modules::detail::contract & modules_contract,
     std::span<float> probabilities_out) noexcept;
diff --git a/src/emel/diarization/sortformer/pipeline/actions.hpp b/src/emel/diarization/sortformer/pipeline/actions.hpp
index 20c8aa70..4cfb171d 100644
--- a/src/emel/diarization/sortformer/pipeline/actions.hpp
+++ b/src/emel/diarization/sortformer/pipeline/actions.hpp
@@ -133,7 +133,8 @@ struct effect_compute_probabilities {
         static_cast<size_t>(detail::k_required_probability_value_count));
     const bool probability_ok =
         emel::diarization::sortformer::output::detail::compute_speaker_probabilities(
-            ctx.hidden, ctx.modules, probability_output);
+            ctx.encoder_workspace.kernel, ctx.hidden, ctx.modules,
+            probability_output);
     effect_store_kernel_result(runtime_ev.ctx, probability_ok);
     runtime_ev.request.probability_count_out =
         detail::k_required_probability_value_count *
diff --git a/src/emel/diarization/sortformer/transformer/detail.cpp b/src/emel/diarization/sortformer/transformer/detail.cpp
index 44c27f80..4807b58c 100644
--- a/src/emel/diarization/sortformer/transformer/detail.cpp
+++ b/src/emel/diarization/sortformer/transformer/detail.cpp
@@ -323,6 +323,7 @@ bool compute_transformer_layer(
           static_cast<size_t>(k_hidden_dim),
           qkv_transposed) ||
       !emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+          workspace.kernel,
           qkv_transposed,
           static_cast<size_t>(frame_count),
           static_cast<size_t>(k_hidden_dim),
@@ -333,6 +334,7 @@ bool compute_transformer_layer(
           workspace.dense_transposed_output,
           std::span<float>{workspace.query.data(), frame_value_count}) ||
       !emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+          workspace.kernel,
           qkv_transposed,
           static_cast<size_t>(frame_count),
           static_cast<size_t>(k_hidden_dim),
@@ -343,6 +345,7 @@ bool compute_transformer_layer(
           workspace.dense_transposed_output,
           std::span<float>{workspace.key.data(), frame_value_count}) ||
       !emel::diarization::sortformer::detail::compute_dense_batch_from_transposed_prepared(
+          workspace.kernel,
           qkv_transposed,
           static_cast<size_t>(frame_count),
           static_cast<size_t>(k_hidden_dim),
@@ -375,6 +378,7 @@ bool compute_transformer_layer(
   }
 
   if (!emel::diarization::sortformer::detail::compute_dense_batch_residual_prepared(
+          workspace.kernel,
           std::span<const float>{workspace.first_norm.data(), frame_value_count},
           static_cast<size_t>(frame_count),
           static_cast<size_t>(k_hidden_dim),
@@ -404,6 +408,7 @@ bool compute_transformer_layer(
   }
 
   if (!emel::diarization::sortformer::detail::compute_dense_batch_prepared(
+          workspace.kernel,
           std::span<const float>{workspace.first_norm.data(), frame_value_count},
           static_cast<size_t>(frame_count),
           static_cast<size_t>(k_hidden_dim),
@@ -426,6 +431,7 @@ bool compute_transformer_layer(
   }
 
   if (!emel::diarization::sortformer::detail::compute_dense_batch_residual_prepared(
+          workspace.kernel,
           std::span<const float>{workspace.feed_forward_rows.data(), feed_forward_value_count},
           static_cast<size_t>(frame_count),
           static_cast<size_t>(k_inner_dim),
diff --git a/src/emel/diarization/sortformer/transformer/detail.hpp b/src/emel/diarization/sortformer/transformer/detail.hpp
index c3afd201..42ffa75d 100644
--- a/src/emel/diarization/sortformer/transformer/detail.hpp
+++ b/src/emel/diarization/sortformer/transformer/detail.hpp
@@ -7,6 +7,7 @@
 #include <string_view>
 
 #include "emel/diarization/sortformer/detail.hpp"
+#include "emel/kernel/sm.hpp"
 #include "emel/model/data.hpp"
 
 namespace emel::diarization::sortformer::transformer::detail {
@@ -57,6 +58,8 @@ struct contract {
 struct layer_workspace {
   layer_workspace();
 
+  emel::kernel::sm kernel{emel::kernel::detect_host_kind()};
+
   std::array<float, k_max_frame_count * k_hidden_dim> query = {};
   std::array<float, k_max_frame_count * k_hidden_dim> key = {};
   std::array<float, k_max_frame_count * k_hidden_dim> value = {};
diff --git a/src/emel/embeddings/generator/context.hpp b/src/emel/embeddings/generator/context.hpp
index 6f11f33b..48c3b910 100644
--- a/src/emel/embeddings/generator/context.hpp
+++ b/src/emel/embeddings/generator/context.hpp
@@ -7,6 +7,7 @@
 
 #include "emel/kernel/detail.hpp"
 #include "emel/kernel/events.hpp"
+#include "emel/kernel/sm.hpp"
 #include "emel/model/data.hpp"
 #include "emel/model/omniembed/detail.hpp"
 #include "emel/text/conditioner/sm.hpp"
@@ -324,6 +325,7 @@ struct context {
   image_runtime image = {};
   audio_runtime audio = {};
   scratch_buffers scratch = {};
+  emel::kernel::sm kernel{emel::kernel::detect_host_kind()};
   bool initialized = false;
 };
 
diff --git a/src/emel/embeddings/generator/detail.hpp b/src/emel/embeddings/generator/detail.hpp
index 311656e2..300036e1 100644
--- a/src/emel/embeddings/generator/detail.hpp
+++ b/src/emel/embeddings/generator/detail.hpp
@@ -338,6 +338,14 @@ inline bool pack_standard_conv_kernel_major(const float * src,
 }
 
 inline emel::kernel::event::tensor_view make_dense_src0_view(const action::matrix_view & matrix) noexcept {
+  // Quantized rows use the byte-stride convention (nb[0] == 1); dense float
+  // tensors carry their element size so kernel dispatch validation accepts
+  // the layout.
+  const uint64_t elem_bytes =
+      emel::kernel::detail::is_native_quantized_dtype(matrix.dtype)
+          ? 1u
+          : static_cast<uint64_t>(
+                emel::kernel::detail::dtype_size_bytes(matrix.dtype));
   return emel::kernel::event::tensor_view{
     .data = matrix.data,
     .type = static_cast<emel::kernel::event::dtype>(matrix.dtype),
@@ -348,7 +356,7 @@ inline emel::kernel::event::tensor_view make_dense_src0_view(const action::matri
       1u,
     },
     .nb = {
-      1u,
+      elem_bytes,
       static_cast<uint64_t>(matrix.row_bytes),
       static_cast<uint64_t>(matrix.row_bytes) * static_cast<uint64_t>(matrix.rows),
       static_cast<uint64_t>(matrix.row_bytes) * static_cast<uint64_t>(matrix.rows),
@@ -2048,7 +2056,8 @@ inline void set_error(const event::embed_audio_run & runtime_ev, const error err
   runtime_ev.ctx.err = to_error(err);
 }
 
-inline bool matmul_q5_0(const action::matrix_view & matrix,
+inline bool matmul_q5_0(emel::kernel::sm & kernel,
+                        const action::matrix_view & matrix,
                         std::span<const float> input,
                         std::span<emel::kernel::detail::quant::block_q8_0> q8_input,
                         std::span<float> output) noexcept {
@@ -2067,31 +2076,19 @@ inline bool matmul_q5_0(const action::matrix_view & matrix,
     return false;
   }
 
-#if defined(__aarch64__) || defined(__ARM_NEON)
-  emel::kernel::event::op_mul_mat request{
+  // Activation quantization is kernel-owned; the q8 scratch stays for
+  // signature stability with callers that still pre-reserve it.
+  (void) q8_input;
+  const emel::kernel::event::op_mul_mat request{
     .src0 = make_dense_src0_view(matrix),
     .src1 = make_dense_vector_src1_view(input),
     .dst = make_dense_vector_dst_view(output),
-    .nth = 1u,
   };
-  return ::emel::kernel::aarch64::detail::execute_neon_mul_mat_q5_0_vector(request);
-#else
-  emel::kernel::detail::quant::quantize_row_q8_0_strided(
-      input.data(), 1u, q8_input.data(), matrix.cols);
-  const auto * base = static_cast<const uint8_t *>(matrix.data);
-  for (int32_t row = 0; row < matrix.rows; ++row) {
-    const auto * row_ptr = base + static_cast<size_t>(row) * matrix.row_bytes;
-    output[static_cast<size_t>(row)] =
-        emel::kernel::detail::dot_q5_0_q8_0_row_scalar(
-            reinterpret_cast<const emel::kernel::detail::quant::block_q5_0 *>(row_ptr),
-            q8_input.data(),
-            block_count);
-  }
-  return true;
-#endif
+  return kernel.process_event(request);
 }
 
-inline bool matmul_q8_0(const action::matrix_view & matrix,
+inline bool matmul_q8_0(emel::kernel::sm & kernel,
+                        const action::matrix_view & matrix,
                         std::span<const float> input,
                         std::span<emel::kernel::detail::quant::block_q8_0> q8_input,
                         std::span<float> output) noexcept {
@@ -2110,29 +2107,15 @@ inline bool matmul_q8_0(const action::matrix_view & matrix,
     return false;
   }
 
-#if defined(__aarch64__) || defined(__ARM_NEON)
-  emel::kernel::event::op_mul_mat request{
+  // Activation quantization is kernel-owned; the q8 scratch stays for
+  // signature stability with callers that still pre-reserve it.
+  (void) q8_input;
+  const emel::kernel::event::op_mul_mat request{
     .src0 = make_dense_src0_view(matrix),
     .src1 = make_dense_vector_src1_view(input),
     .dst = make_dense_vector_dst_view(output),
-    .nth = 1u,
   };
-  ::emel::kernel::aarch64::detail::execute_neon_mul_mat_q8_0_vector_unchecked(request);
-  return true;
-#else
-  emel::kernel::detail::quant::quantize_row_q8_0_strided(
-      input.data(), 1u, q8_input.data(), matrix.cols);
-  const auto * base = static_cast<const uint8_t *>(matrix.data);
-  for (int32_t row = 0; row < matrix.rows; ++row) {
-    const auto * row_ptr = base + static_cast<size_t>(row) * matrix.row_bytes;
-    output[static_cast<size_t>(row)] =
-        emel::kernel::detail::dot_q8_0_q8_0_row_scalar(
-            reinterpret_cast<const emel::kernel::detail::quant::block_q8_0 *>(row_ptr),
-            q8_input.data(),
-            block_count);
-  }
-  return true;
-#endif
+  return kernel.process_event(request);
 }
 
 inline bool matmul_f16(const action::matrix_view & matrix,
@@ -2160,7 +2143,8 @@ inline bool matmul_f16(const action::matrix_view & matrix,
   return true;
 }
 
-inline bool matmul_f32(const action::matrix_view & matrix,
+inline bool matmul_f32(emel::kernel::sm & kernel,
+                       const action::matrix_view & matrix,
                        std::span<const float> input,
                        std::span<float> output) noexcept {
   if (matrix.dtype != emel::kernel::detail::dtype_f32 ||
@@ -2171,30 +2155,16 @@ inline bool matmul_f32(const action::matrix_view & matrix,
     return false;
   }
 
-#if defined(__aarch64__) || defined(__ARM_NEON)
-  emel::kernel::event::op_mul_mat request{
+  const emel::kernel::event::op_mul_mat request{
     .src0 = make_dense_src0_view(matrix),
     .src1 = make_dense_vector_src1_view(input),
     .dst = make_dense_vector_dst_view(output),
-    .nth = 1u,
   };
-  return ::emel::kernel::aarch64::detail::execute_neon_mul_mat(request);
-#else
-  const auto * base = static_cast<const uint8_t *>(matrix.data);
-  for (int32_t row = 0; row < matrix.rows; ++row) {
-    const auto * weights =
-        reinterpret_cast<const float *>(base + static_cast<size_t>(row) * matrix.row_bytes);
-    float acc = 0.0f;
-    for (int32_t col = 0; col < matrix.cols; ++col) {
-      acc += weights[col] * input[static_cast<size_t>(col)];
-    }
-    output[static_cast<size_t>(row)] = acc;
-  }
-  return true;
-#endif
+  return kernel.process_event(request);
 }
 
-inline bool matmul_f32_matrix(const action::matrix_view & matrix,
+inline bool matmul_f32_matrix(emel::kernel::sm & kernel,
+                              const action::matrix_view & matrix,
                               const float * input,
                               const int32_t input_cols,
                               float * output) noexcept {
@@ -2207,51 +2177,30 @@ inline bool matmul_f32_matrix(const action::matrix_view & matrix,
     return false;
   }
 
-#if defined(__aarch64__) || defined(__ARM_NEON)
-  emel::kernel::event::op_mul_mat request{
+  const emel::kernel::event::op_mul_mat request{
     .src0 = make_dense_src0_view(matrix),
     .src1 = make_dense_matrix_src1_view(input, input_cols, matrix.cols),
     .dst = make_dense_matrix_dst_view(output, input_cols, matrix.rows),
-    .nth = 1u,
   };
-  return ::emel::kernel::aarch64::detail::execute_neon_mul_mat(request);
-#else
-  const auto * weights_base = static_cast<const float *>(matrix.data);
-  for (int32_t row = 0; row < matrix.rows; ++row) {
-    const auto * weights =
-        reinterpret_cast<const float *>(static_cast<const uint8_t *>(matrix.data) +
-                                        static_cast<size_t>(row) * matrix.row_bytes);
-    float * output_row = output + static_cast<size_t>(row) * static_cast<size_t>(input_cols);
-    for (int32_t col = 0; col < input_cols; ++col) {
-      float acc = 0.0f;
-      for (int32_t depth = 0; depth < matrix.cols; ++depth) {
-        acc += weights[static_cast<size_t>(depth)] *
-            input[static_cast<size_t>(depth) * static_cast<size_t>(input_cols) +
-                  static_cast<size_t>(col)];
-      }
-      output_row[static_cast<size_t>(col)] = acc;
-    }
-  }
-  (void) weights_base;
-  return true;
-#endif
+  return kernel.process_event(request);
 }
 
-inline bool matmul(const action::matrix_view & matrix,
+inline bool matmul(emel::kernel::sm & kernel,
+                   const action::matrix_view & matrix,
                    std::span<const float> input,
                    std::span<emel::kernel::detail::quant::block_q8_0> q8_input,
                    std::span<float> output) noexcept {
   if (matrix.dtype == emel::kernel::detail::dtype_q5_0) {
-    return matmul_q5_0(matrix, input, q8_input, output);
+    return matmul_q5_0(kernel, matrix, input, q8_input, output);
   }
   if (matrix.dtype == emel::kernel::detail::dtype_q8_0) {
-    return matmul_q8_0(matrix, input, q8_input, output);
+    return matmul_q8_0(kernel, matrix, input, q8_input, output);
   }
   if (matrix.dtype == emel::kernel::detail::dtype_f16) {
     return matmul_f16(matrix, input, output);
   }
   if (matrix.dtype == emel::kernel::detail::dtype_f32) {
-    return matmul_f32(matrix, input, output);
+    return matmul_f32(kernel, matrix, input, output);
   }
   return false;
 }
@@ -2473,11 +2422,11 @@ inline bool run_attention_layer(action::context & ctx,
     auto query = token_span(ctx.scratch.query.get(), token_index, ctx.text.hidden_size);
     auto key = token_span(ctx.scratch.key.get(), token_index, ctx.text.hidden_size);
     auto value = token_span(ctx.scratch.value.get(), token_index, ctx.text.hidden_size);
-    if (!matmul(layer.attention_query, input, q8_input, query) ||
+    if (!matmul(ctx.kernel, layer.attention_query, input, q8_input, query) ||
         !add_bias(query, layer.attention_query_bias) ||
-        !matmul(layer.attention_key, input, q8_input, key) ||
+        !matmul(ctx.kernel, layer.attention_key, input, q8_input, key) ||
         !add_bias(key, layer.attention_key_bias) ||
-        !matmul(layer.attention_value, input, q8_input, value) ||
+        !matmul(ctx.kernel, layer.attention_value, input, q8_input, value) ||
         !add_bias(value, layer.attention_value_bias)) {
       return false;
     }
@@ -2522,7 +2471,7 @@ inline bool run_attention_layer(action::context & ctx,
 
     auto hidden = std::span<float>{ctx.scratch.token_hidden.get(), static_cast<size_t>(ctx.text.hidden_size)};
     const auto input = token_span(sequence_in, token_index, ctx.text.hidden_size);
-    if (!matmul(layer.attention_output, context, q8_input, hidden) ||
+    if (!matmul(ctx.kernel, layer.attention_output, context, q8_input, hidden) ||
         !add_bias(hidden, layer.attention_output_bias)) {
       return false;
     }
@@ -2538,14 +2487,14 @@ inline bool run_attention_layer(action::context & ctx,
     }
 
     auto feed_forward = std::span<float>{ctx.scratch.feed_forward.get(), static_cast<size_t>(ctx.text.intermediate_size)};
-    if (!matmul(layer.intermediate, hidden, q8_input, feed_forward) ||
+    if (!matmul(ctx.kernel, layer.intermediate, hidden, q8_input, feed_forward) ||
         !add_bias(feed_forward, layer.intermediate_bias)) {
       return false;
     }
     apply_gelu(feed_forward);
 
     auto output = token_span(sequence_out, token_index, ctx.text.hidden_size);
-    if (!matmul(layer.output, feed_forward, q8_input, output) ||
+    if (!matmul(ctx.kernel, layer.output, feed_forward, q8_input, output) ||
         !add_bias(output, layer.output_bias)) {
       return false;
     }
@@ -2657,7 +2606,8 @@ inline void apply_batch_norm_hwc(float * values,
   }
 }
 
-inline bool pointwise_conv_hwc(const action::matrix_view & matrix,
+inline bool pointwise_conv_hwc(emel::kernel::sm & kernel,
+                               const action::matrix_view & matrix,
                                const float * input,
                                const int32_t pixel_count,
                                float * output) noexcept {
@@ -2680,7 +2630,8 @@ inline bool pointwise_conv_hwc(const action::matrix_view & matrix,
         output + static_cast<size_t>(pixel_index) * static_cast<size_t>(matrix.rows),
         static_cast<size_t>(matrix.rows),
     };
-    if (!matmul(matrix,
+    if (!matmul(kernel,
+                matrix,
                 input_pixel,
                 std::span<emel::kernel::detail::quant::block_q8_0>{},
                 output_pixel)) {
@@ -2697,8 +2648,6 @@ inline bool pointwise_conv_hwc_direct_f32_impl(const action::matrix_view & matri
                                                const action::batch_norm_view * batch_norm,
                                                float * output) noexcept {
   if (matrix.dtype != emel::kernel::detail::dtype_f32 ||
-      matrix.packed_rhs_f32 == nullptr ||
-      matrix.packed_rhs_cols != matrix.rows ||
       matrix.rows <= 0 ||
       matrix.cols <= 0 ||
       input == nullptr ||
@@ -2706,6 +2655,16 @@ inline bool pointwise_conv_hwc_direct_f32_impl(const action::matrix_view & matri
       pixel_count <= 0) {
     return false;
   }
+#if defined(__aarch64__) || defined(__ARM_NEON)
+  if (matrix.packed_rhs_f32 == nullptr ||
+      matrix.packed_rhs_cols != matrix.rows) {
+    return false;
+  }
+#else
+  if (matrix.transposed_f32 == nullptr) {
+    return false;
+  }
+#endif
   if constexpr (fuse_batch_norm) {
     if (batch_norm == nullptr ||
         batch_norm->scale == nullptr ||
@@ -2743,6 +2702,13 @@ inline bool pointwise_conv_hwc_direct_f32_impl(const action::matrix_view & matri
                                   static_cast<size_t>(matrix.rows) +
                                   static_cast<size_t>(output_channel)];
       }
+      if constexpr (fuse_batch_norm) {
+        acc = (acc * batch_norm->scale[static_cast<size_t>(output_channel)]) +
+            batch_norm->shift[static_cast<size_t>(output_channel)];
+        if constexpr (apply_relu) {
+          acc = std::max(acc, 0.0f);
+        }
+      }
       output_pixel[static_cast<size_t>(output_channel)] = acc;
     }
   }
@@ -2852,6 +2818,7 @@ inline bool standard_conv_hwc_impl(const action::conv2d_view & conv,
   const int32_t pad_w = same_padding(conv.kernel_w, stride);
   (void) patch_buffer;
   (void) patch_capacity;
+  (void) kernel_channel_stride;
 
   for (int32_t output_y = 0; output_y < output_spatial; ++output_y) {
     for (int32_t output_x = 0; output_x < output_spatial; ++output_x) {
@@ -3284,7 +3251,8 @@ inline void apply_batch_norm_hwc_rect(float * values,
   }
 }
 
-inline bool standard_conv_hwc_rect(const action::conv2d_view & conv,
+inline bool standard_conv_hwc_rect(emel::kernel::sm & kernel,
+                                   const action::conv2d_view & conv,
                                    const float * input,
                                    const int32_t input_height,
                                    const int32_t input_width,
@@ -3332,7 +3300,8 @@ inline bool standard_conv_hwc_rect(const action::conv2d_view & conv,
               ox,
               patch_buffer,
               patch_capacity) ||
-          !matmul_f32(matrix,
+          !matmul_f32(kernel,
+                      matrix,
                       std::span<const float>{patch_buffer, patch_size},
                       std::span<float>{output_pixel, static_cast<size_t>(conv.output_channels)})) {
         return false;
@@ -3665,12 +3634,12 @@ inline bool run_audio_se(action::context & ctx,
   };
 
   average_pool_hwc_rect(values, height, width, se.input_size, pooled.data());
-  if (!matmul(se.fc1, pooled, q8_input, hidden) ||
+  if (!matmul(ctx.kernel, se.fc1, pooled, q8_input, hidden) ||
       !add_bias(hidden, se.fc1_bias)) {
     return false;
   }
   apply_activation_in_place<true, false>(hidden);
-  if (!matmul(se.fc2, hidden, q8_input, scale) ||
+  if (!matmul(ctx.kernel, se.fc2, hidden, q8_input, scale) ||
       !add_bias(scale, se.fc2_bias)) {
     return false;
   }
@@ -3716,6 +3685,7 @@ inline bool run_audio_block(action::context & ctx,
   int32_t work_width = width;
   if (block.has_expand) {
     if (!pointwise_conv_hwc(
+            ctx.kernel,
             block.expand,
             current_buffer,
             height * width,
@@ -3777,6 +3747,7 @@ inline bool run_audio_block(action::context & ctx,
 
   float * project_dst = depthwise_dst == current_buffer ? alternate_buffer : current_buffer;
   if (!pointwise_conv_hwc(
+          ctx.kernel,
           block.project,
           depthwise_dst,
           output_height * output_width,
@@ -3822,7 +3793,8 @@ inline emel::error::type run_audio_embedding(action::context & ctx) noexcept {
 
   int32_t height = 0;
   int32_t width = 0;
-  if (!standard_conv_hwc_rect(ctx.audio.stem.conv,
+  if (!standard_conv_hwc_rect(ctx.kernel,
+                              ctx.audio.stem.conv,
                               ctx.scratch.audio_input.get(),
                               ctx.audio.num_mel_bins,
                               ctx.audio.time_frames,
@@ -3858,7 +3830,8 @@ inline emel::error::type run_audio_embedding(action::context & ctx) noexcept {
 
   int32_t head_height = 0;
   int32_t head_width = 0;
-  if (!standard_conv_hwc_rect(ctx.audio.head.conv,
+  if (!standard_conv_hwc_rect(ctx.kernel,
+                              ctx.audio.head.conv,
                               current_buffer,
                               height,
                               width,
@@ -4111,7 +4084,7 @@ inline bool run_projection_head(action::context & ctx,
   auto full_embedding = std::span<float>{
       ctx.scratch.full_embedding.get(), static_cast<size_t>(projection.output_size)};
 
-  if (!matmul(projection.expand, input_embedding, q8_input, projection_hidden) ||
+  if (!matmul(ctx.kernel, projection.expand, input_embedding, q8_input, projection_hidden) ||
       !add_bias(projection_hidden, projection.expand_bias)) {
     return false;
   }
@@ -4127,7 +4100,7 @@ inline bool run_projection_head(action::context & ctx,
   std::memcpy(projection_residual.data(),
               projection_hidden.data(),
               projection_hidden.size_bytes());
-  if (!matmul(projection.residual, projection_hidden, q8_input, projection_residual) ||
+  if (!matmul(ctx.kernel, projection.residual, projection_hidden, q8_input, projection_residual) ||
       !add_bias(projection_residual, projection.residual_bias)) {
     return false;
   }
@@ -4141,7 +4114,7 @@ inline bool run_projection_head(action::context & ctx,
     return false;
   }
 
-  return matmul(projection.project, projection_residual, q8_input, full_embedding) &&
+  return matmul(ctx.kernel, projection.project, projection_residual, q8_input, full_embedding) &&
       add_bias(full_embedding, projection.project_bias) &&
       l2_normalize(full_embedding);
 }
@@ -4154,7 +4127,7 @@ inline bool run_text_projection(action::context & ctx) noexcept {
   auto pooled = std::span<const float>{ctx.scratch.pooled.get(), static_cast<size_t>(ctx.text.hidden_size)};
   auto text_embedding =
       std::span<float>{ctx.scratch.text_embedding.get(), static_cast<size_t>(ctx.text.output_size)};
-  if (!matmul(ctx.text.dense, pooled, q8_input, text_embedding) ||
+  if (!matmul(ctx.kernel, ctx.text.dense, pooled, q8_input, text_embedding) ||
       !add_bias(text_embedding, ctx.text.dense_bias) ||
       !l2_normalize(text_embedding)) {
     return false;
diff --git a/src/emel/graph/actions.hpp b/src/emel/graph/actions.hpp
index 003e7ae9..2699fd33 100644
--- a/src/emel/graph/actions.hpp
+++ b/src/emel/graph/actions.hpp
@@ -95,6 +95,44 @@ inline void reset_compute_output(event::compute_output & output) noexcept {
   output.lifecycle = nullptr;
 }
 
+namespace detail {
+
+template <class compute_event>
+inline processor::event::execute bind_execute_request(const compute_event & ev,
+                                                      context & ctx,
+                                                      compute_capture & capture) noexcept {
+  return processor::event::execute{
+    .step_plan = ev.request.step_plan,
+    .output_out = &ev.ctx.execute_output,
+    .lifecycle = ev.request.lifecycle,
+    .tensor_machine = &ctx.tensor_actor,
+    .step_index = ev.request.step_index,
+    .step_size = ev.request.step_size,
+    .kv_tokens = ev.request.kv_tokens,
+    .memory_sm = ev.request.memory_sm,
+    .memory_view = ev.request.memory_view,
+    .expected_outputs = ev.request.expected_outputs,
+    .compute_ctx = ev.request.compute_ctx,
+    .positions = ev.request.positions,
+    .positions_count = ev.request.positions_count,
+    .seq_masks = ev.request.seq_masks,
+    .seq_mask_words = ev.request.seq_mask_words,
+    .seq_masks_count = ev.request.seq_masks_count,
+    .seq_primary_ids = ev.request.seq_primary_ids,
+    .seq_primary_ids_count = ev.request.seq_primary_ids_count,
+    .validate = ev.request.validate,
+    .prepare_graph = ev.request.prepare_graph,
+    .alloc_graph = ev.request.alloc_graph,
+    .bind_inputs = ev.request.bind_inputs,
+    .run_kernel = ev.request.run_kernel,
+    .extract_outputs = ev.request.extract_outputs,
+    .dispatch_done = {&capture, on_execute_done},
+    .dispatch_error = {&capture, on_execute_error},
+  };
+}
+
+}  // namespace detail
+
 struct reject_invalid_reserve_with_dispatch {
   void operator()(const event::reserve_graph & ev, context &) const noexcept {
     ev.ctx.err = emel::error::cast(error::invalid_request);
@@ -130,6 +168,17 @@ struct reject_invalid_compute_with_dispatch {
   }
 };
 
+struct effect_reject_invalid_reserved_compute_with_dispatch {
+  void operator()(const event::compute_reserved_graph & ev, context &) const noexcept {
+    ev.ctx.err = emel::error::cast(error::invalid_request);
+    reset_compute_output(*ev.request.output_out);
+    ev.request.dispatch_error(events::compute_error{
+      *ev.request.output_out,
+      static_cast<int32_t>(ev.ctx.err),
+    });
+  }
+};
+
 struct reject_invalid_compute_with_output_only {
   void operator()(const event::compute_graph & ev, context &) const noexcept {
     ev.ctx.err = emel::error::cast(error::invalid_request);
@@ -137,12 +186,25 @@ struct reject_invalid_compute_with_output_only {
   }
 };
 
+struct effect_reject_invalid_reserved_compute_with_output_only {
+  void operator()(const event::compute_reserved_graph & ev, context &) const noexcept {
+    ev.ctx.err = emel::error::cast(error::invalid_request);
+    reset_compute_output(*ev.request.output_out);
+  }
+};
+
 struct reject_invalid_compute_without_output {
   void operator()(const event::compute_graph & ev, context &) const noexcept {
     ev.ctx.err = emel::error::cast(error::invalid_request);
   }
 };
 
+struct effect_reject_invalid_reserved_compute_without_output {
+  void operator()(const event::compute_reserved_graph & ev, context &) const noexcept {
+    ev.ctx.err = emel::error::cast(error::invalid_request);
+  }
+};
+
 struct begin_reserve {
   void operator()(const event::reserve_graph & ev, context & ctx) const noexcept {
     ev.ctx.err = emel::error::cast(error::none);
@@ -166,6 +228,27 @@ struct begin_compute {
   }
 };
 
+struct effect_begin_reserved_compute {
+  void operator()(const event::compute_reserved_graph & ev, context & ctx) const noexcept {
+    ev.ctx.err = emel::error::cast(error::none);
+    ev.ctx.assemble_outcome = event::phase_outcome::unknown;
+    ev.ctx.execute_outcome = event::phase_outcome::unknown;
+    ev.ctx.assemble_output = {};
+    ev.ctx.execute_output = {};
+    ++ctx.dispatch_generation;
+    reset_compute_output(*ev.request.output_out);
+    ev.ctx.assemble_outcome = event::phase_outcome::done;
+    ev.ctx.assemble_output.graph_topology = ctx.reservation.graph_topology;
+    ev.ctx.assemble_output.node_count = ctx.reservation.node_count;
+    ev.ctx.assemble_output.tensor_count = ctx.reservation.tensor_count;
+    ev.ctx.assemble_output.required_buffer_bytes =
+        ctx.reservation.required_buffer_bytes;
+    ev.ctx.assemble_output.version = ctx.reservation.version;
+    ev.ctx.assemble_output.reused_topology = 1u;
+    ev.ctx.assemble_output.lifecycle = ctx.reservation.lifecycle;
+  }
+};
+
 struct request_reserve {
   void operator()(const event::reserve_graph & ev, context & ctx) const noexcept {
     detail::reserve_capture capture{&ev.ctx};
@@ -212,37 +295,19 @@ struct request_execute {
   void operator()(const event::compute_graph & ev, context & ctx) const noexcept {
     detail::compute_capture capture{&ev.ctx};
     ev.ctx.err = emel::error::cast(error::processor_failed);
+    const processor::event::execute request = detail::bind_execute_request(ev, ctx, capture);
 
-    const processor::event::execute request{
-      .step_plan = ev.request.step_plan,
-      .output_out = &ev.ctx.execute_output,
-      .lifecycle = ev.request.lifecycle,
-      .tensor_machine = &ctx.tensor_actor,
-      .step_index = ev.request.step_index,
-      .step_size = ev.request.step_size,
-      .kv_tokens = ev.request.kv_tokens,
-      .memory_sm = ev.request.memory_sm,
-      .memory_view = ev.request.memory_view,
-      .expected_outputs = ev.request.expected_outputs,
-      .compute_ctx = ev.request.compute_ctx,
-      .positions = ev.request.positions,
-      .positions_count = ev.request.positions_count,
-      .seq_masks = ev.request.seq_masks,
-      .seq_mask_words = ev.request.seq_mask_words,
-      .seq_masks_count = ev.request.seq_masks_count,
-      .seq_primary_ids = ev.request.seq_primary_ids,
-      .seq_primary_ids_count = ev.request.seq_primary_ids_count,
-      .validate = ev.request.validate,
-      .prepare_graph = ev.request.prepare_graph,
-      .alloc_graph = ev.request.alloc_graph,
-      .bind_inputs = ev.request.bind_inputs,
-      .run_kernel = ev.request.run_kernel,
-      .extract_outputs = ev.request.extract_outputs,
-      .dispatch_done = {&capture, detail::on_execute_done},
-      .dispatch_error = {&capture, detail::on_execute_error},
-    };
+    (void)ctx.processor_actor.process_event_async(request).result();
+  }
+};
+
+struct effect_request_reserved_execute {
+  void operator()(const event::compute_reserved_graph & ev, context & ctx) const noexcept {
+    detail::compute_capture capture{&ev.ctx};
+    ev.ctx.err = emel::error::cast(error::processor_failed);
+    const processor::event::execute request = detail::bind_execute_request(ev, ctx, capture);
 
-    (void)ctx.processor_actor.process_event(request);
+    (void)ctx.processor_actor.process_event_async(request).result();
   }
 };
 
@@ -286,13 +351,14 @@ struct request_tensor_reserve {
 };
 
 struct dispatch_reserve_done {
-  void operator()(const event::reserve_graph & ev, const context &) const noexcept {
+  void operator()(const event::reserve_graph & ev, context & ctx) const noexcept {
     ev.request.output_out->graph_topology = ev.ctx.reserve_output.graph_topology;
     ev.request.output_out->node_count = ev.ctx.reserve_output.node_count;
     ev.request.output_out->tensor_count = ev.ctx.reserve_output.tensor_count;
     ev.request.output_out->required_buffer_bytes = ev.ctx.reserve_output.required_buffer_bytes;
     ev.request.output_out->version = ev.ctx.reserve_output.version;
     ev.request.output_out->lifecycle = ev.ctx.reserve_output.lifecycle;
+    ctx.reservation = *ev.request.output_out;
 
     ev.request.dispatch_done(events::reserve_done{*ev.request.output_out});
   }
@@ -323,6 +389,22 @@ struct dispatch_compute_done {
   }
 };
 
+struct effect_dispatch_reserved_compute_done {
+  void operator()(const event::compute_reserved_graph & ev, const context &) const noexcept {
+    ev.request.output_out->graph_topology = ev.ctx.assemble_output.graph_topology;
+    ev.request.output_out->node_count = ev.ctx.assemble_output.node_count;
+    ev.request.output_out->tensor_count = ev.ctx.assemble_output.tensor_count;
+    ev.request.output_out->required_buffer_bytes = ev.ctx.assemble_output.required_buffer_bytes;
+    ev.request.output_out->version = ev.ctx.assemble_output.version;
+    ev.request.output_out->reused_topology = ev.ctx.assemble_output.reused_topology;
+    ev.request.output_out->outputs_produced = ev.ctx.execute_output.outputs_produced;
+    ev.request.output_out->graph_reused = ev.ctx.execute_output.graph_reused;
+    ev.request.output_out->lifecycle = ev.request.lifecycle;
+
+    ev.request.dispatch_done(events::compute_done{*ev.request.output_out});
+  }
+};
+
 struct dispatch_compute_error {
   void operator()(const event::compute_graph & ev, const context &) const noexcept {
     ev.request.dispatch_error(events::compute_error{
@@ -332,6 +414,15 @@ struct dispatch_compute_error {
   }
 };
 
+struct effect_dispatch_reserved_compute_error {
+  void operator()(const event::compute_reserved_graph & ev, const context &) const noexcept {
+    ev.request.dispatch_error(events::compute_error{
+      *ev.request.output_out,
+      static_cast<int32_t>(ev.ctx.err),
+    });
+  }
+};
+
 struct on_unexpected {
   template <class event_type>
   void operator()(const event_type & ev, context &) const noexcept {
@@ -345,18 +436,28 @@ inline constexpr reject_invalid_reserve_with_dispatch reject_invalid_reserve_wit
 inline constexpr reject_invalid_reserve_with_output_only reject_invalid_reserve_with_output_only{};
 inline constexpr reject_invalid_reserve_without_output reject_invalid_reserve_without_output{};
 inline constexpr reject_invalid_compute_with_dispatch reject_invalid_compute_with_dispatch{};
+inline constexpr effect_reject_invalid_reserved_compute_with_dispatch
+    effect_reject_invalid_reserved_compute_with_dispatch{};
 inline constexpr reject_invalid_compute_with_output_only reject_invalid_compute_with_output_only{};
+inline constexpr effect_reject_invalid_reserved_compute_with_output_only
+    effect_reject_invalid_reserved_compute_with_output_only{};
 inline constexpr reject_invalid_compute_without_output reject_invalid_compute_without_output{};
+inline constexpr effect_reject_invalid_reserved_compute_without_output
+    effect_reject_invalid_reserved_compute_without_output{};
 inline constexpr begin_reserve begin_reserve{};
 inline constexpr begin_compute begin_compute{};
+inline constexpr effect_begin_reserved_compute effect_begin_reserved_compute{};
 inline constexpr request_reserve request_reserve{};
 inline constexpr request_tensor_reserve request_tensor_reserve{};
 inline constexpr request_assemble request_assemble{};
 inline constexpr request_execute request_execute{};
+inline constexpr effect_request_reserved_execute effect_request_reserved_execute{};
 inline constexpr dispatch_reserve_done dispatch_reserve_done{};
 inline constexpr dispatch_reserve_error dispatch_reserve_error{};
 inline constexpr dispatch_compute_done dispatch_compute_done{};
+inline constexpr effect_dispatch_reserved_compute_done effect_dispatch_reserved_compute_done{};
 inline constexpr dispatch_compute_error dispatch_compute_error{};
+inline constexpr effect_dispatch_reserved_compute_error effect_dispatch_reserved_compute_error{};
 inline constexpr on_unexpected on_unexpected{};
 
 }  // namespace emel::graph::action
diff --git a/src/emel/graph/context.hpp b/src/emel/graph/context.hpp
index 6055c227..553adafb 100644
--- a/src/emel/graph/context.hpp
+++ b/src/emel/graph/context.hpp
@@ -3,6 +3,7 @@
 #include <cstdint>
 
 #include "emel/graph/assembler/sm.hpp"
+#include "emel/graph/events.hpp"
 #include "emel/graph/processor/sm.hpp"
 #include "emel/graph/tensor/sm.hpp"
 
@@ -12,6 +13,7 @@ struct context {
   assembler::sm assembler_actor = {};
   processor::sm processor_actor = {};
   tensor::sm tensor_actor = {};
+  event::reserve_output reservation = {};
   uint64_t dispatch_generation = 0;
 };
 
diff --git a/src/emel/graph/events.hpp b/src/emel/graph/events.hpp
index 09940d9b..39707293 100644
--- a/src/emel/graph/events.hpp
+++ b/src/emel/graph/events.hpp
@@ -92,6 +92,13 @@ struct compute {
   ::emel::callback<bool(const ::emel::graph::events::compute_error &)> dispatch_error = {};
 };
 
+struct compute_reserved {
+  explicit compute_reserved(const compute & request_ref) noexcept
+    : request(request_ref) {}
+
+  const compute & request;
+};
+
 // Internal context object carried via completion<reserve_graph>.
 enum class phase_outcome : uint8_t {
   unknown = 0,
@@ -127,6 +134,12 @@ struct compute_graph {
   compute_ctx & ctx;
 };
 
+// Internal event used by graph::sm wrapper; not part of public API.
+struct compute_reserved_graph {
+  const compute & request;
+  compute_ctx & ctx;
+};
+
 }  // namespace emel::graph::event
 
 namespace emel::graph::events {
diff --git a/src/emel/graph/guards.hpp b/src/emel/graph/guards.hpp
index 86c79534..ec8d89a1 100644
--- a/src/emel/graph/guards.hpp
+++ b/src/emel/graph/guards.hpp
@@ -10,6 +10,10 @@ inline emel::error::type runtime_error(const event::compute_graph & ev) noexcept
   return ev.ctx.err;
 }
 
+inline emel::error::type runtime_error(const event::compute_reserved_graph & ev) noexcept {
+  return ev.ctx.err;
+}
+
 inline bool error_is(const emel::error::type runtime_err,
                      const error expected) noexcept {
   return runtime_err == emel::error::cast(expected);
@@ -25,6 +29,29 @@ inline bool error_is_unknown(const emel::error::type runtime_err) noexcept {
          !error_is(runtime_err, error::untracked);
 }
 
+template <class compute_request>
+inline bool guard_valid_compute_execution_request(const compute_request & request) noexcept {
+  return request.step_plan != nullptr &&
+         request.output_out != nullptr &&
+         request.lifecycle != nullptr &&
+         request.lifecycle->tensors != nullptr &&
+         request.lifecycle->tensor_count > 0 &&
+         request.step_index >= 0 &&
+         request.step_size > 0 &&
+         request.kv_tokens >= 0 &&
+         request.expected_outputs >= 0 &&
+         request.positions_count >= 0 &&
+         request.seq_mask_words > 0 &&
+         request.seq_masks_count >= 0 &&
+         request.seq_primary_ids_count >= 0 &&
+         request.prepare_graph != nullptr &&
+         request.bind_inputs != nullptr &&
+         request.run_kernel != nullptr &&
+         request.extract_outputs != nullptr &&
+         static_cast<bool>(request.dispatch_done) &&
+         static_cast<bool>(request.dispatch_error);
+}
+
 struct valid_reserve {
   bool operator()(const event::reserve_graph & ev, const action::context &) const noexcept {
     return ev.request.model_topology != nullptr &&
@@ -71,27 +98,25 @@ struct invalid_reserve_without_output {
 
 struct valid_compute {
   bool operator()(const event::compute_graph & ev, const action::context &) const noexcept {
-    return ev.request.step_plan != nullptr &&
-           ev.request.output_out != nullptr &&
-           ev.request.lifecycle != nullptr &&
-           ev.request.lifecycle->tensors != nullptr &&
-           ev.request.lifecycle->tensor_count > 0 &&
+    return guard_valid_compute_execution_request(ev.request) &&
            ev.request.bytes_per_tensor != 0u &&
-           ev.request.workspace_capacity_bytes != 0u &&
-           ev.request.step_index >= 0 &&
-           ev.request.step_size > 0 &&
-           ev.request.kv_tokens >= 0 &&
-           ev.request.expected_outputs >= 0 &&
-           ev.request.positions_count >= 0 &&
-           ev.request.seq_mask_words > 0 &&
-           ev.request.seq_masks_count >= 0 &&
-           ev.request.seq_primary_ids_count >= 0 &&
-           ev.request.prepare_graph != nullptr &&
-           ev.request.bind_inputs != nullptr &&
-           ev.request.run_kernel != nullptr &&
-           ev.request.extract_outputs != nullptr &&
-           static_cast<bool>(ev.request.dispatch_done) &&
-           static_cast<bool>(ev.request.dispatch_error);
+           ev.request.workspace_capacity_bytes != 0u;
+  }
+};
+
+struct guard_valid_compute_reserved {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context & ctx) const noexcept {
+    return guard_valid_compute_execution_request(ev.request) &&
+           ev.request.lifecycle->phase != nullptr &&
+           ctx.reservation.graph_topology != nullptr &&
+           ctx.reservation.node_count != 0u &&
+           ctx.reservation.tensor_count != 0u &&
+           ctx.reservation.required_buffer_bytes != 0u &&
+           ctx.reservation.lifecycle != nullptr &&
+           ctx.reservation.lifecycle->tensors == ev.request.lifecycle->tensors &&
+           ctx.reservation.lifecycle->tensor_count ==
+               ev.request.lifecycle->tensor_count;
   }
 };
 
@@ -101,6 +126,13 @@ struct invalid_compute {
   }
 };
 
+struct guard_invalid_compute_reserved {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context & ctx) const noexcept {
+    return !guard_valid_compute_reserved{}(ev, ctx);
+  }
+};
+
 struct invalid_compute_with_dispatchable_output {
   bool operator()(const event::compute_graph & ev, const action::context & ctx) const noexcept {
     return invalid_compute{}(ev, ctx) &&
@@ -109,6 +141,15 @@ struct invalid_compute_with_dispatchable_output {
   }
 };
 
+struct guard_invalid_compute_reserved_with_dispatchable_output {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context & ctx) const noexcept {
+    return guard_invalid_compute_reserved{}(ev, ctx) &&
+           ev.request.output_out != nullptr &&
+           static_cast<bool>(ev.request.dispatch_error);
+  }
+};
+
 struct invalid_compute_with_output_only {
   bool operator()(const event::compute_graph & ev, const action::context & ctx) const noexcept {
     return invalid_compute{}(ev, ctx) &&
@@ -117,12 +158,28 @@ struct invalid_compute_with_output_only {
   }
 };
 
+struct guard_invalid_compute_reserved_with_output_only {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context & ctx) const noexcept {
+    return guard_invalid_compute_reserved{}(ev, ctx) &&
+           ev.request.output_out != nullptr &&
+           !static_cast<bool>(ev.request.dispatch_error);
+  }
+};
+
 struct invalid_compute_without_output {
   bool operator()(const event::compute_graph & ev, const action::context & ctx) const noexcept {
     return invalid_compute{}(ev, ctx) && ev.request.output_out == nullptr;
   }
 };
 
+struct guard_invalid_compute_reserved_without_output {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context & ctx) const noexcept {
+    return guard_invalid_compute_reserved{}(ev, ctx) && ev.request.output_out == nullptr;
+  }
+};
+
 struct reserve_done {
   bool operator()(const event::reserve_graph & ev, const action::context &) const noexcept {
     return ev.ctx.err == emel::error::cast(error::none) &&
@@ -172,6 +229,14 @@ struct execute_done {
   }
 };
 
+struct guard_reserved_execute_done {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context &) const noexcept {
+    return ev.ctx.err == emel::error::cast(error::none) &&
+           ev.ctx.execute_outcome == event::phase_outcome::done;
+  }
+};
+
 struct execute_failed {
   bool operator()(const event::compute_graph & ev, const action::context &) const noexcept {
     return ev.ctx.err != emel::error::cast(error::none) ||
@@ -179,52 +244,116 @@ struct execute_failed {
   }
 };
 
+struct guard_reserved_execute_failed {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context &) const noexcept {
+    return ev.ctx.err != emel::error::cast(error::none) ||
+           ev.ctx.execute_outcome == event::phase_outcome::failed;
+  }
+};
+
 struct compute_error_none {
   bool operator()(const event::compute_graph & ev, const action::context &) const noexcept {
     return error_is(runtime_error(ev), error::none);
   }
 };
 
+struct guard_reserved_compute_error_none {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context &) const noexcept {
+    return error_is(runtime_error(ev), error::none);
+  }
+};
+
 struct compute_error_invalid_request {
   bool operator()(const event::compute_graph & ev, const action::context &) const noexcept {
     return error_is(runtime_error(ev), error::invalid_request);
   }
 };
 
+struct guard_reserved_compute_error_invalid_request {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context &) const noexcept {
+    return error_is(runtime_error(ev), error::invalid_request);
+  }
+};
+
 struct compute_error_assembler_failed {
   bool operator()(const event::compute_graph & ev, const action::context &) const noexcept {
     return error_is(runtime_error(ev), error::assembler_failed);
   }
 };
 
+struct guard_reserved_compute_error_assembler_failed {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context &) const noexcept {
+    return error_is(runtime_error(ev), error::assembler_failed);
+  }
+};
+
 struct compute_error_processor_failed {
   bool operator()(const event::compute_graph & ev, const action::context &) const noexcept {
     return error_is(runtime_error(ev), error::processor_failed);
   }
 };
 
+struct guard_reserved_compute_error_processor_failed {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context &) const noexcept {
+    return error_is(runtime_error(ev), error::processor_failed);
+  }
+};
+
 struct compute_error_busy {
   bool operator()(const event::compute_graph & ev, const action::context &) const noexcept {
     return error_is(runtime_error(ev), error::busy);
   }
 };
 
+struct guard_reserved_compute_error_busy {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context &) const noexcept {
+    return error_is(runtime_error(ev), error::busy);
+  }
+};
+
 struct compute_error_internal_error {
   bool operator()(const event::compute_graph & ev, const action::context &) const noexcept {
     return error_is(runtime_error(ev), error::internal_error);
   }
 };
 
+struct guard_reserved_compute_error_internal_error {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context &) const noexcept {
+    return error_is(runtime_error(ev), error::internal_error);
+  }
+};
+
 struct compute_error_untracked {
   bool operator()(const event::compute_graph & ev, const action::context &) const noexcept {
     return error_is(runtime_error(ev), error::untracked);
   }
 };
 
+struct guard_reserved_compute_error_untracked {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context &) const noexcept {
+    return error_is(runtime_error(ev), error::untracked);
+  }
+};
+
 struct compute_error_unknown {
   bool operator()(const event::compute_graph & ev, const action::context &) const noexcept {
     return error_is_unknown(runtime_error(ev));
   }
 };
 
+struct guard_reserved_compute_error_unknown {
+  bool operator()(const event::compute_reserved_graph & ev,
+                  const action::context &) const noexcept {
+    return error_is_unknown(runtime_error(ev));
+  }
+};
+
 }  // namespace emel::graph::guard
diff --git a/src/emel/graph/processor/sm.hpp b/src/emel/graph/processor/sm.hpp
index b17a5824..4cd72d38 100644
--- a/src/emel/graph/processor/sm.hpp
+++ b/src/emel/graph/processor/sm.hpp
@@ -233,15 +233,23 @@ struct model {
   }
 };
 
-struct sm : public emel::sm<model, action::context> {
-  using base_type = emel::sm<model, action::context>;
+using inline_co_policy =
+    emel::policy::coroutine_scheduler<emel::policy::inline_scheduler>;
+
+struct sm : public emel::co_sm<model, action::context, inline_co_policy> {
+  using base_type = emel::co_sm<model, action::context, inline_co_policy>;
   using base_type::base_type;
 
   bool process_event(const event::execute & ev) {
+    return process_event_async(ev).result();
+  }
+
+  emel::bool_task process_event_async(const event::execute & ev) {
     event::execute_ctx ctx{};
     event::execute_step evt{ev, ctx};
-    const bool accepted = base_type::process_event(evt);
-    return accepted && ctx.err == emel::error::cast(error::none);
+    const bool accepted = base_type::process_event_async(evt).result();
+    return emel::bool_task::from_value(
+        accepted && ctx.err == emel::error::cast(error::none));
   }
 };
 
diff --git a/src/emel/graph/sm.hpp b/src/emel/graph/sm.hpp
index 12b3d521..12b7f1b8 100644
--- a/src/emel/graph/sm.hpp
+++ b/src/emel/graph/sm.hpp
@@ -118,6 +118,45 @@ struct model {
                  [ guard::invalid_compute_without_output{} ]
                  / action::reject_invalid_compute_without_output
 
+      //------------------------------------------------------------------------------//
+      // Reserved compute request validation. This path is for callers that already
+      // proved graph reservation compatibility and only need execution.
+      , sml::state<executing> <= sml::state<reserved> + sml::event<event::compute_reserved_graph>
+                 [ guard::guard_valid_compute_reserved{} ]
+                 / action::effect_begin_reserved_compute
+
+      , sml::state<reserved> <= sml::state<reserved> + sml::event<event::compute_reserved_graph>
+                 [ guard::guard_invalid_compute_reserved_with_dispatchable_output{} ]
+                 / action::effect_reject_invalid_reserved_compute_with_dispatch
+
+      , sml::state<reserved> <= sml::state<reserved> + sml::event<event::compute_reserved_graph>
+                 [ guard::guard_invalid_compute_reserved_with_output_only{} ]
+                 / action::effect_reject_invalid_reserved_compute_with_output_only
+
+      , sml::state<reserved> <= sml::state<reserved> + sml::event<event::compute_reserved_graph>
+                 [ guard::guard_invalid_compute_reserved_without_output{} ]
+                 / action::effect_reject_invalid_reserved_compute_without_output
+
+      , sml::state<uninitialized> <= sml::state<uninitialized>
+                 + sml::event<event::compute_reserved_graph>
+                 [ guard::guard_valid_compute_reserved{} ]
+                 / action::effect_reject_invalid_reserved_compute_with_dispatch
+
+      , sml::state<uninitialized> <= sml::state<uninitialized>
+                 + sml::event<event::compute_reserved_graph>
+                 [ guard::guard_invalid_compute_reserved_with_dispatchable_output{} ]
+                 / action::effect_reject_invalid_reserved_compute_with_dispatch
+
+      , sml::state<uninitialized> <= sml::state<uninitialized>
+                 + sml::event<event::compute_reserved_graph>
+                 [ guard::guard_invalid_compute_reserved_with_output_only{} ]
+                 / action::effect_reject_invalid_reserved_compute_with_output_only
+
+      , sml::state<uninitialized> <= sml::state<uninitialized>
+                 + sml::event<event::compute_reserved_graph>
+                 [ guard::guard_invalid_compute_reserved_without_output{} ]
+                 / action::effect_reject_invalid_reserved_compute_without_output
+
       //------------------------------------------------------------------------------//
       // Assemble phase.
       , sml::state<assemble_decision> <= sml::state<assembling> + sml::completion<event::compute_graph>
@@ -140,6 +179,18 @@ struct model {
       , sml::state<compute_decision> <= sml::state<execute_decision> + sml::completion<event::compute_graph>
                  [ guard::execute_failed{} ]
 
+      , sml::state<execute_decision> <= sml::state<executing>
+                 + sml::completion<event::compute_reserved_graph>
+                 / action::effect_request_reserved_execute
+
+      , sml::state<compute_decision> <= sml::state<execute_decision>
+                 + sml::completion<event::compute_reserved_graph>
+                 [ guard::guard_reserved_execute_done{} ]
+
+      , sml::state<compute_decision> <= sml::state<execute_decision>
+                 + sml::completion<event::compute_reserved_graph>
+                 [ guard::guard_reserved_execute_failed{} ]
+
       //------------------------------------------------------------------------------//
       // Compute finalization.
       , sml::state<reserved> <= sml::state<compute_decision> + sml::completion<event::compute_graph>
@@ -174,6 +225,46 @@ struct model {
                  [ guard::compute_error_unknown{} ]
                  / action::dispatch_compute_error
 
+      , sml::state<reserved> <= sml::state<compute_decision>
+                 + sml::completion<event::compute_reserved_graph>
+                 [ guard::guard_reserved_compute_error_none{} ]
+                 / action::effect_dispatch_reserved_compute_done
+
+      , sml::state<reserved> <= sml::state<compute_decision>
+                 + sml::completion<event::compute_reserved_graph>
+                 [ guard::guard_reserved_compute_error_invalid_request{} ]
+                 / action::effect_dispatch_reserved_compute_error
+
+      , sml::state<reserved> <= sml::state<compute_decision>
+                 + sml::completion<event::compute_reserved_graph>
+                 [ guard::guard_reserved_compute_error_assembler_failed{} ]
+                 / action::effect_dispatch_reserved_compute_error
+
+      , sml::state<reserved> <= sml::state<compute_decision>
+                 + sml::completion<event::compute_reserved_graph>
+                 [ guard::guard_reserved_compute_error_processor_failed{} ]
+                 / action::effect_dispatch_reserved_compute_error
+
+      , sml::state<reserved> <= sml::state<compute_decision>
+                 + sml::completion<event::compute_reserved_graph>
+                 [ guard::guard_reserved_compute_error_busy{} ]
+                 / action::effect_dispatch_reserved_compute_error
+
+      , sml::state<reserved> <= sml::state<compute_decision>
+                 + sml::completion<event::compute_reserved_graph>
+                 [ guard::guard_reserved_compute_error_internal_error{} ]
+                 / action::effect_dispatch_reserved_compute_error
+
+      , sml::state<reserved> <= sml::state<compute_decision>
+                 + sml::completion<event::compute_reserved_graph>
+                 [ guard::guard_reserved_compute_error_untracked{} ]
+                 / action::effect_dispatch_reserved_compute_error
+
+      , sml::state<reserved> <= sml::state<compute_decision>
+                 + sml::completion<event::compute_reserved_graph>
+                 [ guard::guard_reserved_compute_error_unknown{} ]
+                 / action::effect_dispatch_reserved_compute_error
+
       //------------------------------------------------------------------------------//
       // Unexpected events.
       , sml::state<uninitialized> <= sml::state<uninitialized> + sml::unexpected_event<sml::_>
@@ -221,6 +312,13 @@ struct sm : public emel::sm<model, action::context> {
     return accepted && ctx.err == emel::error::cast(error::none);
   }
 
+  bool process_event(const event::compute_reserved & ev) {
+    event::compute_ctx ctx{};
+    event::compute_reserved_graph evt{ev.request, ctx};
+    const bool accepted = base_type::process_event(evt);
+    return accepted && ctx.err == emel::error::cast(error::none);
+  }
+
   bool try_capture_tensor(const int32_t tensor_id,
                           tensor::event::tensor_state & state_out,
                           emel::error::type & err_out) noexcept {
diff --git a/src/emel/kernel/aarch64/actions.hpp b/src/emel/kernel/aarch64/actions.hpp
index b86132de..21e7edc4 100644
--- a/src/emel/kernel/aarch64/actions.hpp
+++ b/src/emel/kernel/aarch64/actions.hpp
@@ -2189,6 +2189,7 @@ inline bool execute_neon_sqrt(const event::op_sqrt &request) noexcept {
 #endif
 }
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
 inline uint8x16x2_t load_u8x16x2(const uint8_t *ptr) noexcept {
   uint8x16x2_t out{};
   out.val[0] = vld1q_u8(ptr);
@@ -2204,6 +2205,7 @@ inline int8x16x4_t load_s8x16x4(const int8_t *ptr) noexcept {
   out.val[3] = vld1q_s8(ptr + 48);
   return out;
 }
+#endif
 
 inline float dot_q2_k_q8_k_block_neon(
     const ::emel::kernel::detail::quant::block_q2_k &lhs,
@@ -2566,6 +2568,7 @@ decode_q4_k_scales_words(const ::emel::kernel::detail::quant::block_q4_k &lhs,
   decoded_words[0] &= kmask1;
 }
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
 inline int32_t q4_k_min_sum_neon(const uint8_t *mins,
                                  const int16x8_t q8_pair_sums) noexcept {
   const int16x8_t mins_s16 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(mins)));
@@ -2574,6 +2577,7 @@ inline int32_t q4_k_min_sum_neon(const uint8_t *mins,
       vmull_s16(vget_high_s16(q8_pair_sums), vget_high_s16(mins_s16)));
   return vaddvq_s32(min_prod);
 }
+#endif
 
 inline float dot_q4_k_q8_k_block_neon(
     const ::emel::kernel::detail::quant::block_q4_k &lhs,
@@ -3569,6 +3573,7 @@ dot_q6_k_q8_k_row_neon(const ::emel::kernel::detail::quant::block_q6_k *lhs,
   return sum;
 }
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
 inline int32_t horizontal_sum_s32_neon(const int32x4_t value) noexcept {
 #if defined(__aarch64__)
   return vaddvq_s32(value);
@@ -3604,6 +3609,7 @@ inline int32_t dot_q8_0_q8_0_block_sum_neon(
 #endif
   return horizontal_sum_s32_neon(acc);
 }
+#endif
 
 inline constexpr std::array<std::array<uint8_t, 8>, 256>
 make_q5_0_high_bit_lookup() noexcept {
@@ -3618,6 +3624,7 @@ make_q5_0_high_bit_lookup() noexcept {
 
 inline constexpr auto k_q5_0_high_bit_lookup = make_q5_0_high_bit_lookup();
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
 inline uint8x8_t load_q5_0_high_bit_mask_u8(const uint8_t bits) noexcept {
   return vld1_u8(k_q5_0_high_bit_lookup[bits].data());
 }
@@ -3640,6 +3647,7 @@ decode_q5_0_block_neon(const ::emel::kernel::detail::quant::block_q5_0 &block,
   high_out =
       vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(high_nibbles, high_mask)), bias);
 }
+#endif
 
 inline float
 dot_q5_0_q8_0_row_neon(const ::emel::kernel::detail::quant::block_q5_0 *lhs,
@@ -3918,8 +3926,9 @@ dot_q4_0_q8_0_4rows_neon(const ::emel::kernel::detail::quant::block_q4_0 *row0,
 #endif
 }
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
 inline int32_t horizontal_sum_s8_neon(const int8x16_t value) noexcept {
-#if defined(__aarch64__) && defined(__ARM_NEON)
+#if defined(__aarch64__)
   const int16x8_t widened =
       vaddq_s16(vmovl_s8(vget_low_s8(value)), vmovl_s8(vget_high_s8(value)));
   const int32x4_t pair = vpaddlq_s16(widened);
@@ -3929,6 +3938,7 @@ inline int32_t horizontal_sum_s8_neon(const int8x16_t value) noexcept {
   return 0;
 #endif
 }
+#endif
 
 inline float
 dot_q4_1_q8_0_row_neon(const ::emel::kernel::detail::quant::block_q4_1 *lhs,
@@ -4094,6 +4104,10 @@ inline float
 dot_q8_0_q8_0_row_neon(const ::emel::kernel::detail::quant::block_q8_0 *lhs,
                        const ::emel::kernel::detail::quant::block_q8_0 *rhs,
                        const uint64_t block_count) noexcept {
+#if !(defined(__aarch64__) || defined(__ARM_NEON))
+  return ::emel::kernel::detail::dot_q8_0_q8_0_row_scalar(lhs, rhs,
+                                                          block_count);
+#else
   float sum = 0.0f;
   for (uint64_t block = 0; block < block_count; ++block) {
     const int32_t sumi = dot_q8_0_q8_0_block_sum_neon(lhs[block], rhs[block]);
@@ -4102,8 +4116,10 @@ dot_q8_0_q8_0_row_neon(const ::emel::kernel::detail::quant::block_q8_0 *lhs,
             ::emel::kernel::detail::quant::fp16_to_fp32(rhs[block].d));
   }
   return sum;
+#endif
 }
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
 inline float32x4_t
 dot_q8_0_q8_0_4row_neon(const ::emel::kernel::detail::quant::block_q8_0 *row0,
                         const ::emel::kernel::detail::quant::block_q8_0 *row1,
@@ -4346,6 +4362,7 @@ inline int32x4_t q6_k_upper_half_dot_accumulate_neon(
   return next;
 #endif
 }
+#endif
 
 inline void
 dot_q6_k_q8_k_4rows_neon(const ::emel::kernel::detail::quant::block_q6_k *lhs0,
@@ -6963,6 +6980,7 @@ prepare_neon_mul_mat_f32_lhs_4row(const float *src, const uint64_t k,
   return true;
 }
 
+#if defined(__aarch64__) || defined(__ARM_NEON)
 template <int Lane>
 inline float32x4_t neon_fma_lane_f32(const float32x4_t acc,
                                      const float32x4_t rhs,
@@ -6973,6 +6991,7 @@ inline float32x4_t neon_fma_lane_f32(const float32x4_t acc,
   return vmlaq_n_f32(acc, rhs, vgetq_lane_f32(lhs, Lane));
 #endif
 }
+#endif
 
 inline bool execute_neon_mul_mat_prepared_f32_lhs_4row(
     const event::op_mul_mat &request, const float *prepared_lhs,
diff --git a/src/emel/kernel/any.hpp b/src/emel/kernel/any.hpp
index 27c50b6b..2efe49f9 100644
--- a/src/emel/kernel/any.hpp
+++ b/src/emel/kernel/any.hpp
@@ -17,6 +17,14 @@ enum class kernel_kind : uint8_t {
   aarch64 = 1,
 };
 
+constexpr kernel_kind detect_host_kind() noexcept {
+#if defined(__aarch64__) || defined(_M_ARM64)
+  return kernel_kind::aarch64;
+#else
+  return kernel_kind::x86_64;
+#endif
+}
+
 class any {
  public:
   any() = default;
diff --git a/src/emel/kernel/detail.hpp b/src/emel/kernel/detail.hpp
index 951e4d37..287690e4 100644
--- a/src/emel/kernel/detail.hpp
+++ b/src/emel/kernel/detail.hpp
@@ -1826,9 +1826,8 @@ template <class request_type>
 inline bool validate_dispatch_request(const request_type & request) noexcept {
   const bool has_required_buffers =
       has_required_src0(request) && has_required_src1(request) && has_required_dst(request);
-  const bool has_valid_threading = request.ith == 0 && request.nth == 1;
   const bool has_valid_params = request.op_params_size <= request.op_params.size();
-  return has_required_buffers && has_valid_threading && has_valid_params;
+  return has_required_buffers && has_valid_params;
 }
 
 template <class tensor_type>
@@ -3074,10 +3073,70 @@ inline bool run_mul_mat(const request_type & request) noexcept {
       request.dst.ne[2] != 1 || request.dst.ne[3] != 1;
   const bool valid = !(has_empty_dim || shape_mismatch || invalid_rank);
   const uint8_t src0_type = dtype_code(request.src0.type);
+  const bool q4_0_src0 = is_q4_0_dtype(src0_type);
+  const bool q4_1_src0 = is_q4_1_dtype(src0_type);
   const bool q5_0_src0 = is_q5_0_dtype(src0_type);
   const bool q8_0_src0 = is_q8_0_dtype(src0_type);
   const bool quantized_src0 = is_quantized_k_dtype(src0_type);
 
+  if (valid && q4_0_src0) {
+    const auto * b_dense = static_cast<const float *>(request.src1.data);
+    auto * c_dense = static_cast<float *>(request.dst.data);
+    const auto * a_base = static_cast<const uint8_t *>(request.src0.data);
+    const size_t row_bytes = request.src0.nb[1];
+    const uint64_t block_count = k / quant::QK4_0;
+    std::array<quant::block_q8_0, quant::MAX_Q8_0_BLOCKS> q8_blocks = {};
+    if (block_count > q8_blocks.size()) {
+      return false;
+    }
+
+    for (uint64_t j = 0; j < n; ++j) {
+      for (uint64_t i = 0; i < m; ++i) {
+        c_dense[i * n + j] = 0.0f;
+      }
+      for (uint64_t block = 0; block < block_count; ++block) {
+        quant::quantize_row_q8_0_strided(
+            b_dense + block * quant::QK4_0 * n + j, n, &q8_blocks[block], quant::QK4_0);
+      }
+      for (uint64_t i = 0; i < m; ++i) {
+        const uint8_t * row_ptr = a_base + i * row_bytes;
+        c_dense[i * n + j] = dot_q4_0_q8_0_row_scalar(
+            reinterpret_cast<const quant::block_q4_0 *>(row_ptr), q8_blocks.data(), block_count);
+      }
+    }
+
+    return true;
+  }
+
+  if (valid && q4_1_src0) {
+    const auto * b_dense = static_cast<const float *>(request.src1.data);
+    auto * c_dense = static_cast<float *>(request.dst.data);
+    const auto * a_base = static_cast<const uint8_t *>(request.src0.data);
+    const size_t row_bytes = request.src0.nb[1];
+    const uint64_t block_count = k / quant::QK4_1;
+    std::array<quant::block_q8_0, quant::MAX_Q8_0_BLOCKS> q8_blocks = {};
+    if (block_count > q8_blocks.size()) {
+      return false;
+    }
+
+    for (uint64_t j = 0; j < n; ++j) {
+      for (uint64_t i = 0; i < m; ++i) {
+        c_dense[i * n + j] = 0.0f;
+      }
+      for (uint64_t block = 0; block < block_count; ++block) {
+        quant::quantize_row_q8_0_strided(
+            b_dense + block * quant::QK4_1 * n + j, n, &q8_blocks[block], quant::QK4_1);
+      }
+      for (uint64_t i = 0; i < m; ++i) {
+        const uint8_t * row_ptr = a_base + i * row_bytes;
+        c_dense[i * n + j] = dot_q4_1_q8_0_row_scalar(
+            reinterpret_cast<const quant::block_q4_1 *>(row_ptr), q8_blocks.data(), block_count);
+      }
+    }
+
+    return true;
+  }
+
   if (valid && q5_0_src0) {
     const auto * b_dense = static_cast<const float *>(request.src1.data);
     auto * c_dense = static_cast<float *>(request.dst.data);
diff --git a/src/emel/kernel/events.hpp b/src/emel/kernel/events.hpp
index d5f75f8a..4b3f02fa 100644
--- a/src/emel/kernel/events.hpp
+++ b/src/emel/kernel/events.hpp
@@ -129,9 +129,7 @@ enum class glu_subop : uint8_t {
   tensor_view src2 = {};                     \
   tensor_view_mut dst = {};                  \
   std::array<uint8_t, 64> op_params = {};    \
-  uint32_t op_params_size = 0;               \
-  uint32_t ith = 0;                          \
-  uint32_t nth = 1;
+  uint32_t op_params_size = 0;
 
 #define EMEL_KERNEL_DECLARE_OP(name) \
   struct name {                      \
diff --git a/src/emel/kernel/x86_64/actions.hpp b/src/emel/kernel/x86_64/actions.hpp
index aa15074d..7154b18d 100644
--- a/src/emel/kernel/x86_64/actions.hpp
+++ b/src/emel/kernel/x86_64/actions.hpp
@@ -5,6 +5,7 @@
 #include <cmath>
 #include <cstdint>
 #include <cstring>
+#include <limits>
 #include <type_traits>
 
 #if defined(__x86_64__) || defined(_M_X64)
@@ -21,16 +22,23 @@
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__GNUC__) || defined(__clang__)
 #define EMEL_KERNEL_X86_AVX2_TARGET __attribute__((target("avx2")))
+#define EMEL_KERNEL_X86_AVX2_FMA_TARGET __attribute__((target("avx2,fma")))
+#define EMEL_KERNEL_X86_AVX2_FMA_F16C_TARGET                                   \
+  __attribute__((target("avx2,fma,f16c")))
 #else
 #define EMEL_KERNEL_X86_AVX2_TARGET
+#define EMEL_KERNEL_X86_AVX2_FMA_TARGET
+#define EMEL_KERNEL_X86_AVX2_FMA_F16C_TARGET
 #endif
 #else
 #define EMEL_KERNEL_X86_AVX2_TARGET
+#define EMEL_KERNEL_X86_AVX2_FMA_TARGET
+#define EMEL_KERNEL_X86_AVX2_FMA_F16C_TARGET
 #endif
 
 namespace emel::kernel::x86_64::detail {
 
-namespace event = ::emel::kernel::event;
+namespace event= ::emel::kernel::event;
 
 inline constexpr bool avx2_intrinsics_compiled =
 #if defined(__x86_64__) || defined(_M_X64)
@@ -43,21 +51,32 @@ inline constexpr bool avx2_intrinsics_compiled =
     false;
 #endif
 
-inline bool detect_avx2() noexcept {
+inline constexpr bool avx2_fma_intrinsics_compiled =
 #if defined(__x86_64__) || defined(_M_X64)
-#if defined(__GNUC__) || defined(__clang__)
-  __builtin_cpu_init();
-  return __builtin_cpu_supports("avx2");
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+    true;
 #else
-  return false;
+    false;
 #endif
 #else
-  return false;
+    false;
+#endif
+
+inline constexpr bool avx2_fma_f16c_intrinsics_compiled =
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)) ||            \
+    defined(__GNUC__) || defined(__clang__)
+    true;
+#else
+    false;
+#endif
+#else
+    false;
 #endif
-}
 
 template <class tensor_type>
-inline bool is_dense_contiguous(const tensor_type & tensor) noexcept {
+inline bool is_dense_contiguous(const tensor_type &tensor) noexcept {
   return ::emel::kernel::detail::is_dense_contiguous(tensor);
 }
 
@@ -73,7 +92,8 @@ inline constexpr bool simd_supported_request_v =
     std::is_same_v<request_type, event::op_mul_mat> ||
     std::is_same_v<request_type, event::op_unary>;
 
-inline bool unary_subop_supported_simd(const event::unary_subop subop) noexcept {
+inline bool
+unary_subop_supported_simd(const event::unary_subop subop) noexcept {
   const auto subop_code = static_cast<uint8_t>(subop);
   return subop_code == static_cast<uint8_t>(event::unary_subop::abs) ||
          subop_code == static_cast<uint8_t>(event::unary_subop::neg) ||
@@ -81,7 +101,8 @@ inline bool unary_subop_supported_simd(const event::unary_subop subop) noexcept
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline void execute_avx2_unary_abs(const float * src, float * dst, const uint64_t count) noexcept {
+inline void execute_avx2_unary_abs(const float *src, float *dst,
+                                   const uint64_t count) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
   uint64_t i = 0;
@@ -94,19 +115,20 @@ inline void execute_avx2_unary_abs(const float * src, float * dst, const uint64_
     dst[i] = std::fabs(src[i]);
   }
 #else
-  (void) src;
-  (void) dst;
-  (void) count;
+  (void)src;
+  (void)dst;
+  (void)count;
 #endif
 #else
-  (void) src;
-  (void) dst;
-  (void) count;
+  (void)src;
+  (void)dst;
+  (void)count;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline void execute_avx2_unary_neg(const float * src, float * dst, const uint64_t count) noexcept {
+inline void execute_avx2_unary_neg(const float *src, float *dst,
+                                   const uint64_t count) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
   uint64_t i = 0;
@@ -119,19 +141,20 @@ inline void execute_avx2_unary_neg(const float * src, float * dst, const uint64_
     dst[i] = -src[i];
   }
 #else
-  (void) src;
-  (void) dst;
-  (void) count;
+  (void)src;
+  (void)dst;
+  (void)count;
 #endif
 #else
-  (void) src;
-  (void) dst;
-  (void) count;
+  (void)src;
+  (void)dst;
+  (void)count;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline void execute_avx2_unary_relu(const float * src, float * dst, const uint64_t count) noexcept {
+inline void execute_avx2_unary_relu(const float *src, float *dst,
+                                    const uint64_t count) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
   uint64_t i = 0;
@@ -144,22 +167,23 @@ inline void execute_avx2_unary_relu(const float * src, float * dst, const uint64
     dst[i] = std::max(0.0f, src[i]);
   }
 #else
-  (void) src;
-  (void) dst;
-  (void) count;
+  (void)src;
+  (void)dst;
+  (void)count;
 #endif
 #else
-  (void) src;
-  (void) dst;
-  (void) count;
+  (void)src;
+  (void)dst;
+  (void)count;
 #endif
 }
 
 template <class request_type>
-inline bool can_use_avx2(const request_type & request, const bool avx2_available) noexcept {
+inline bool can_use_avx2(const request_type &request,
+                         const bool avx2_available) noexcept {
 #if !(defined(__x86_64__) || defined(_M_X64))
-  (void) request;
-  (void) avx2_available;
+  (void)request;
+  (void)avx2_available;
   return false;
 #else
   if constexpr (!simd_supported_request_v<request_type>) {
@@ -167,8 +191,7 @@ inline bool can_use_avx2(const request_type & request, const bool avx2_available
   }
 
   const bool base_supported =
-      avx2_available &&
-      avx2_intrinsics_compiled &&
+      avx2_available && avx2_intrinsics_compiled &&
       ::emel::kernel::detail::can_run_backend_request(request) &&
       ::emel::kernel::detail::dtype_code(request.src0.type) ==
           ::emel::kernel::detail::dtype_f32 &&
@@ -176,10 +199,9 @@ inline bool can_use_avx2(const request_type & request, const bool avx2_available
           ::emel::kernel::detail::dtype_f32;
   bool src1_supported = true;
   if constexpr (::emel::kernel::detail::requires_src1_v<request_type>) {
-    src1_supported =
-        ::emel::kernel::detail::dtype_code(request.src1.type) ==
-            ::emel::kernel::detail::dtype_f32 &&
-        is_dense_contiguous(request.src1);
+    src1_supported = ::emel::kernel::detail::dtype_code(request.src1.type) ==
+                         ::emel::kernel::detail::dtype_f32 &&
+                     is_dense_contiguous(request.src1);
   }
 
   bool unary_supported = true;
@@ -187,21 +209,1485 @@ inline bool can_use_avx2(const request_type & request, const bool avx2_available
     unary_supported = unary_subop_supported_simd(request.subop);
   }
 
-  return base_supported &&
-      src1_supported &&
-      unary_supported &&
-      is_dense_contiguous(request.src0) &&
-      is_dense_contiguous(request.dst);
+  return base_supported && src1_supported && unary_supported &&
+         is_dense_contiguous(request.src0) && is_dense_contiguous(request.dst);
+#endif
+}
+
+template <class request_type>
+inline bool can_use_avx2_fma_f16c_flash_attn_ext_f16kv_one_chunk(
+    const request_type &request,
+    const host_feature_contract &host_features) noexcept {
+#if !(defined(__x86_64__) || defined(_M_X64))
+  (void)request;
+  (void)host_features;
+  return false;
+#else
+  return host_features.avx2_fma_f16c_available() &&
+         avx2_fma_f16c_intrinsics_compiled &&
+         ::emel::kernel::detail::can_run_flash_attn_ext(request);
+#endif
+}
+
+template <class request_type>
+inline bool can_run_avx2_fma_f16c_flash_attn_ext_f16kv_one_chunk_request(
+    const request_type &request, const host_feature_contract &host_features,
+    const ::emel::kernel::detail::flash_attn_workspace &workspace) noexcept {
+  return can_use_avx2_fma_f16c_flash_attn_ext_f16kv_one_chunk(request,
+                                                              host_features) &&
+         ::emel::kernel::detail::can_run_flash_attn_ext_with_workspace(
+             request, workspace);
+}
+
+inline bool can_use_avx2_fma_q2_k_q8_k_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+#if !(defined(__x86_64__) || defined(_M_X64))
+  (void)request;
+  (void)host_features;
+  return false;
+#else
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t block_count = k / ::emel::kernel::detail::quant::QK_K;
+  return host_features.avx2_available &&
+         host_features.fma_available &&
+         avx2_fma_intrinsics_compiled &&
+         ::emel::kernel::detail::can_run_backend_request(request) &&
+         ::emel::kernel::detail::dtype_code(request.src0.type) ==
+             ::emel::kernel::detail::dtype_q2_k &&
+         ::emel::kernel::detail::dtype_code(request.src1.type) ==
+             ::emel::kernel::detail::dtype_f32 &&
+         ::emel::kernel::detail::dtype_code(request.dst.type) ==
+             ::emel::kernel::detail::dtype_f32 &&
+         k != 0u &&
+         (k % ::emel::kernel::detail::quant::QK_K) == 0u &&
+         block_count <= ::emel::kernel::detail::quant::MAX_Q8_K_BLOCKS;
+#endif
+}
+
+inline bool can_use_avx2_fma_q3_k_q8_k_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+#if !(defined(__x86_64__) || defined(_M_X64))
+  (void)request;
+  (void)host_features;
+  return false;
+#else
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t block_count = k / ::emel::kernel::detail::quant::QK_K;
+  return host_features.avx2_available &&
+         host_features.fma_available &&
+         avx2_fma_intrinsics_compiled &&
+         ::emel::kernel::detail::can_run_backend_request(request) &&
+         ::emel::kernel::detail::dtype_code(request.src0.type) ==
+             ::emel::kernel::detail::dtype_q3_k &&
+         ::emel::kernel::detail::dtype_code(request.src1.type) ==
+             ::emel::kernel::detail::dtype_f32 &&
+         ::emel::kernel::detail::dtype_code(request.dst.type) ==
+             ::emel::kernel::detail::dtype_f32 &&
+         k != 0u &&
+         (k % ::emel::kernel::detail::quant::QK_K) == 0u &&
+         block_count <= ::emel::kernel::detail::quant::MAX_Q8_K_BLOCKS;
+#endif
+}
+
+inline bool can_use_avx2_fma_f32_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+#if !(defined(__x86_64__) || defined(_M_X64))
+  (void)request;
+  (void)host_features;
+  return false;
+#else
+  return host_features.fma_available &&
+         avx2_fma_intrinsics_compiled &&
+         request.src1.ne[0] != 1u &&
+         can_use_avx2(request, host_features.avx2_available);
+#endif
+}
+
+inline bool can_use_avx2_fma_f32_vector_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+#if !(defined(__x86_64__) || defined(_M_X64))
+  (void)request;
+  (void)host_features;
+  return false;
+#else
+  return host_features.fma_available &&
+         avx2_fma_intrinsics_compiled &&
+         request.src1.ne[0] == 1u &&
+         can_use_avx2(request, host_features.avx2_available);
+#endif
+}
+
+inline bool can_use_avx2_fma_q4_k_q8_k_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+#if !(defined(__x86_64__) || defined(_M_X64))
+  (void)request;
+  (void)host_features;
+  return false;
+#else
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t block_count = k / ::emel::kernel::detail::quant::QK_K;
+  return host_features.avx2_available &&
+         host_features.fma_available &&
+         avx2_fma_intrinsics_compiled &&
+         ::emel::kernel::detail::can_run_backend_request(request) &&
+         ::emel::kernel::detail::dtype_code(request.src0.type) ==
+             ::emel::kernel::detail::dtype_q4_k &&
+         ::emel::kernel::detail::dtype_code(request.src1.type) ==
+             ::emel::kernel::detail::dtype_f32 &&
+         ::emel::kernel::detail::dtype_code(request.dst.type) ==
+             ::emel::kernel::detail::dtype_f32 &&
+         k != 0u &&
+         (k % ::emel::kernel::detail::quant::QK_K) == 0u &&
+         block_count <= ::emel::kernel::detail::quant::MAX_Q8_K_BLOCKS;
+#endif
+}
+
+inline bool can_use_avx2_fma_q6_k_q8_k_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+#if !(defined(__x86_64__) || defined(_M_X64))
+  (void)request;
+  (void)host_features;
+  return false;
+#else
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t block_count = k / ::emel::kernel::detail::quant::QK_K;
+  return host_features.avx2_available &&
+         host_features.fma_available &&
+         avx2_fma_intrinsics_compiled &&
+         ::emel::kernel::detail::can_run_backend_request(request) &&
+         ::emel::kernel::detail::dtype_code(request.src0.type) ==
+             ::emel::kernel::detail::dtype_q6_k &&
+         ::emel::kernel::detail::dtype_code(request.src1.type) ==
+             ::emel::kernel::detail::dtype_f32 &&
+         ::emel::kernel::detail::dtype_code(request.dst.type) ==
+             ::emel::kernel::detail::dtype_f32 &&
+         k != 0u &&
+         (k % ::emel::kernel::detail::quant::QK_K) == 0u &&
+         block_count <= ::emel::kernel::detail::quant::MAX_Q8_K_BLOCKS;
+#endif
+}
+
+template <uint8_t src0_dtype_code, uint64_t quant_block_size>
+inline bool can_use_avx2_fma_q8_0_rhs_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+#if !(defined(__x86_64__) || defined(_M_X64))
+  (void)request;
+  (void)host_features;
+  return false;
+#else
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t block_count = k / quant_block_size;
+  return host_features.avx2_available &&
+         host_features.fma_available &&
+         avx2_fma_intrinsics_compiled &&
+         ::emel::kernel::detail::can_run_backend_request(request) &&
+         ::emel::kernel::detail::dtype_code(request.src0.type) ==
+             src0_dtype_code &&
+         ::emel::kernel::detail::dtype_code(request.src1.type) ==
+             ::emel::kernel::detail::dtype_f32 &&
+         ::emel::kernel::detail::dtype_code(request.dst.type) ==
+             ::emel::kernel::detail::dtype_f32 &&
+         k != 0u &&
+         (k % quant_block_size) == 0u &&
+         block_count <= ::emel::kernel::detail::quant::MAX_Q8_0_BLOCKS;
+#endif
+}
+
+inline bool can_use_avx2_fma_q4_0_q8_0_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+  return can_use_avx2_fma_q8_0_rhs_mul_mat<
+      ::emel::kernel::detail::dtype_q4_0,
+      ::emel::kernel::detail::quant::QK4_0>(request, host_features);
+}
+
+inline bool can_use_avx2_fma_q4_1_q8_0_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+  return can_use_avx2_fma_q8_0_rhs_mul_mat<
+      ::emel::kernel::detail::dtype_q4_1,
+      ::emel::kernel::detail::quant::QK4_1>(request, host_features);
+}
+
+inline bool can_use_avx2_fma_q5_0_q8_0_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+  return can_use_avx2_fma_q8_0_rhs_mul_mat<
+      ::emel::kernel::detail::dtype_q5_0,
+      ::emel::kernel::detail::quant::QK5_0>(request, host_features);
+}
+
+inline bool can_use_avx2_fma_q8_0_q8_0_mul_mat(
+    const event::op_mul_mat &request,
+    const host_feature_contract &host_features) noexcept {
+  return can_use_avx2_fma_q8_0_rhs_mul_mat<
+      ::emel::kernel::detail::dtype_q8_0,
+      ::emel::kernel::detail::quant::QK8_0>(request, host_features);
+}
+
+#if defined(__x86_64__) || defined(_M_X64)
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline int32_t horizontal_sum_i32x8_avx2(const __m256i values) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const __m128i low = _mm256_castsi256_si128(values);
+  const __m128i high = _mm256_extracti128_si256(values, 1);
+  __m128i sum = _mm_add_epi32(low, high);
+  sum = _mm_hadd_epi32(sum, sum);
+  sum = _mm_hadd_epi32(sum, sum);
+  return _mm_cvtsi128_si32(sum);
+#else
+  (void)values;
+  return 0;
+#endif
+#else
+  (void)values;
+  return 0;
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline int32_t dot_u2_s8_16_avx2(const uint8_t *q2,
+                                 const int8_t *q8,
+                                 const int shift) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const __m128i q2_bytes =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(q2));
+  const __m128i q8_bytes =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(q8));
+  const __m256i q2_u16 = _mm256_and_si256(
+      _mm256_srli_epi16(_mm256_cvtepu8_epi16(q2_bytes), shift),
+      _mm256_set1_epi16(0x03));
+  const __m256i q8_i16 = _mm256_cvtepi8_epi16(q8_bytes);
+  return horizontal_sum_i32x8_avx2(_mm256_madd_epi16(q2_u16, q8_i16));
+#else
+  (void)q2;
+  (void)q8;
+  (void)shift;
+  return 0;
+#endif
+#else
+  (void)q2;
+  (void)q8;
+  (void)shift;
+  return 0;
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline int32_t dot_q3_s8_16_avx2(const uint8_t *q3,
+                                 const uint8_t *hmask,
+                                 const int8_t *q8,
+                                 const int shift,
+                                 const uint8_t mask) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const __m128i q3_bytes =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(q3));
+  const __m128i hmask_bytes =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(hmask));
+  const __m128i q8_bytes =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(q8));
+
+  const __m256i q3_u16 = _mm256_and_si256(
+      _mm256_srli_epi16(_mm256_cvtepu8_epi16(q3_bytes), shift),
+      _mm256_set1_epi16(0x03));
+  const __m256i hmask_u16 = _mm256_cvtepu8_epi16(hmask_bytes);
+  const __m256i missing_high_bit = _mm256_cmpeq_epi16(
+      _mm256_and_si256(hmask_u16, _mm256_set1_epi16(mask)),
+      _mm256_setzero_si256());
+  const __m256i q3_i16 =
+      _mm256_sub_epi16(q3_u16,
+                       _mm256_and_si256(missing_high_bit,
+                                        _mm256_set1_epi16(4)));
+  const __m256i q8_i16 = _mm256_cvtepi8_epi16(q8_bytes);
+  return horizontal_sum_i32x8_avx2(_mm256_madd_epi16(q3_i16, q8_i16));
+#else
+  (void)q3;
+  (void)hmask;
+  (void)q8;
+  (void)shift;
+  (void)mask;
+  return 0;
+#endif
+#else
+  (void)q3;
+  (void)hmask;
+  (void)q8;
+  (void)shift;
+  (void)mask;
+  return 0;
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline int32_t dot_u6_s8_16_avx2(const uint8_t *ql,
+                                 const uint8_t *qh,
+                                 const int8_t *q8,
+                                 const int low_shift,
+                                 const int high_shift) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const __m128i ql_bytes =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(ql));
+  const __m128i qh_bytes =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(qh));
+  const __m128i q8_bytes =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(q8));
+
+  const __m256i low_nibble = _mm256_and_si256(
+      _mm256_srli_epi16(_mm256_cvtepu8_epi16(ql_bytes), low_shift),
+      _mm256_set1_epi16(0x0f));
+  const __m256i high_bits = _mm256_slli_epi16(
+      _mm256_and_si256(
+          _mm256_srli_epi16(_mm256_cvtepu8_epi16(qh_bytes), high_shift),
+          _mm256_set1_epi16(0x03)),
+      4);
+  const __m256i q6_u16 = _mm256_or_si256(low_nibble, high_bits);
+  const __m256i q8_i16 = _mm256_cvtepi8_epi16(q8_bytes);
+  return horizontal_sum_i32x8_avx2(_mm256_madd_epi16(q6_u16, q8_i16));
+#else
+  (void)ql;
+  (void)qh;
+  (void)q8;
+  (void)low_shift;
+  (void)high_shift;
+  return 0;
+#endif
+#else
+  (void)ql;
+  (void)qh;
+  (void)q8;
+  (void)low_shift;
+  (void)high_shift;
+  return 0;
+#endif
+}
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline __m256i unpack_nibbles_32_avx2(const uint8_t *src) noexcept {
+  const __m128i packed =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+  const __m256i bytes = _mm256_insertf128_si256(
+      _mm256_castsi128_si256(packed), _mm_srli_epi16(packed, 4), 1);
+  return _mm256_and_si256(bytes, _mm256_set1_epi8(0x0f));
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline __m256i expand_high_bits_32_avx2(const uint8_t *qh) noexcept {
+  uint32_t bits = 0u;
+  std::memcpy(&bits, qh, sizeof(bits));
+  const __m256i lane_bytes = _mm256_shuffle_epi8(
+      _mm256_set1_epi32(static_cast<int32_t>(bits)),
+      _mm256_set_epi64x(0x0303030303030303ll, 0x0202020202020202ll,
+                        0x0101010101010101ll, 0x0000000000000000ll));
+  const __m256i bit_select =
+      _mm256_set1_epi64x(static_cast<int64_t>(0x7fbfdfeff7fbfdfeull));
+  const __m256i selected = _mm256_or_si256(lane_bytes, bit_select);
+  return _mm256_cmpeq_epi8(selected, _mm256_set1_epi64x(-1ll));
+}
+
+// Precondition: y lanes must be > -128. _mm256_sign_epi8 cannot negate -128
+// (two's-complement wrap), so a -128 y lane paired with a negative x lane
+// would flip the product sign. Every caller passes activations produced by
+// quantize_row_q8_0_strided, which clamps to [-127, 127]; x (weight lanes,
+// which may hold -128 in q8_0 model data) is only ever abs'd, where the
+// u8 reinterpretation of -128 as 128 is exact.
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline __m256i dot_i8_pairs_i32x8_avx2(const __m256i x,
+                                       const __m256i y) noexcept {
+  const __m256i abs_x = _mm256_sign_epi8(x, x);
+  const __m256i signed_y = _mm256_sign_epi8(y, x);
+  const __m256i pair_products = _mm256_maddubs_epi16(abs_x, signed_y);
+  return _mm256_madd_epi16(_mm256_set1_epi16(1), pair_products);
+}
+#endif
+#endif
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q2_k_q8_k_block_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q2_k &lhs,
+    const ::emel::kernel::detail::quant::block_q8_k &rhs) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const uint8_t *q2 = lhs.qs.data();
+  const int8_t *q8 = rhs.qs.data();
+  const uint8_t *scales = lhs.scales.data();
+
+  const __m128i scales_bytes =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(scales));
+  const __m256i mins_i16 = _mm256_srli_epi16(
+      _mm256_cvtepu8_epi16(scales_bytes), 4);
+  const __m256i bsums_i16 = _mm256_loadu_si256(
+      reinterpret_cast<const __m256i *>(rhs.bsums.data()));
+  const int32_t sum_mins =
+      horizontal_sum_i32x8_avx2(_mm256_madd_epi16(mins_i16, bsums_i16));
+
+  int32_t sum = 0;
+  int scale_index = 0;
+  for (uint64_t block = 0;
+       block < (::emel::kernel::detail::quant::QK_K / 128u); ++block) {
+    for (int shift = 0; shift < 8; shift += 2) {
+      sum += static_cast<int32_t>(scales[scale_index++] & 0x0fu) *
+             dot_u2_s8_16_avx2(q2, q8, shift);
+      sum += static_cast<int32_t>(scales[scale_index++] & 0x0fu) *
+             dot_u2_s8_16_avx2(q2 + 16, q8 + 16, shift);
+      q8 += 32;
+    }
+    q2 += 32;
+  }
+
+  const float d_all =
+      rhs.d * ::emel::kernel::detail::quant::fp16_to_fp32(lhs.d);
+  const float d_min =
+      rhs.d * ::emel::kernel::detail::quant::fp16_to_fp32(lhs.dmin);
+  const __m128 block_sum = _mm_fmadd_ss(
+      _mm_set_ss(d_all), _mm_set_ss(static_cast<float>(sum)),
+      _mm_set_ss(-d_min * static_cast<float>(sum_mins)));
+  return _mm_cvtss_f32(block_sum);
+#else
+  return ::emel::kernel::detail::dot_q2_k_q8_k_block_scalar(lhs, rhs);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q2_k_q8_k_block_scalar(lhs, rhs);
 #endif
 }
 
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q2_k_q8_k_row_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q2_k *lhs,
+    const ::emel::kernel::detail::quant::block_q8_k *rhs,
+    const uint64_t block_count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  float sum = 0.0f;
+  for (uint64_t block = 0; block < block_count; ++block) {
+    sum += dot_q2_k_q8_k_block_avx2_fma(lhs[block], rhs[block]);
+  }
+  return sum;
+#else
+  return ::emel::kernel::detail::dot_q2_k_q8_k_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q2_k_q8_k_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q3_k_q8_k_block_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q3_k &lhs,
+    const ::emel::kernel::detail::quant::block_q8_k &rhs) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  constexpr uint32_t kmask1 = 0x03030303u;
+  constexpr uint32_t kmask2 = 0x0f0f0f0fu;
+
+  uint32_t scale_words[4] = {};
+  std::memcpy(scale_words, lhs.scales.data(), lhs.scales.size());
+  const uint32_t tmp = scale_words[2];
+  scale_words[3] =
+      ((scale_words[1] >> 4u) & kmask2) | (((tmp >> 6u) & kmask1) << 4u);
+  scale_words[2] =
+      ((scale_words[0] >> 4u) & kmask2) | (((tmp >> 4u) & kmask1) << 4u);
+  scale_words[1] =
+      (scale_words[1] & kmask2) | (((tmp >> 2u) & kmask1) << 4u);
+  scale_words[0] =
+      (scale_words[0] & kmask2) | (((tmp >> 0u) & kmask1) << 4u);
+
+  auto *scale = reinterpret_cast<int8_t *>(scale_words);
+  for (uint64_t idx = 0; idx < 16u; ++idx) {
+    scale[idx] = static_cast<int8_t>(scale[idx] - 32);
+  }
+
+  const uint8_t *q3 = lhs.qs.data();
+  const uint8_t *hmask = lhs.hmask.data();
+  const int8_t *q8 = rhs.qs.data();
+  int32_t isum = 0;
+  uint8_t mask = 1u;
+  int scale_index = 0;
+  for (uint64_t block = 0;
+       block < (::emel::kernel::detail::quant::QK_K / 128u); ++block) {
+    for (int shift = 0; shift < 8; shift += 2) {
+      isum += static_cast<int32_t>(scale[scale_index++]) *
+              dot_q3_s8_16_avx2(q3, hmask, q8, shift, mask);
+      isum += static_cast<int32_t>(scale[scale_index++]) *
+              dot_q3_s8_16_avx2(q3 + 16, hmask + 16, q8 + 16, shift, mask);
+      q8 += 32;
+      mask = static_cast<uint8_t>(mask << 1u);
+    }
+    q3 += 32;
+  }
+
+  const float d =
+      rhs.d * ::emel::kernel::detail::quant::fp16_to_fp32(lhs.d);
+  const __m128 block_sum =
+      _mm_mul_ss(_mm_set_ss(d), _mm_set_ss(static_cast<float>(isum)));
+  return _mm_cvtss_f32(block_sum);
+#else
+  return ::emel::kernel::detail::dot_q3_k_q8_k_block_scalar(lhs, rhs);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q3_k_q8_k_block_scalar(lhs, rhs);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q3_k_q8_k_row_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q3_k *lhs,
+    const ::emel::kernel::detail::quant::block_q8_k *rhs,
+    const uint64_t block_count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  float sum = 0.0f;
+  for (uint64_t block = 0; block < block_count; ++block) {
+    sum += dot_q3_k_q8_k_block_avx2_fma(lhs[block], rhs[block]);
+  }
+  return sum;
+#else
+  return ::emel::kernel::detail::dot_q3_k_q8_k_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q3_k_q8_k_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q4_k_q8_k_block_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q4_k &lhs,
+    const ::emel::kernel::detail::quant::block_q8_k &rhs) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  constexpr uint32_t kmask1 = 0x3f3f3f3fu;
+  constexpr uint32_t kmask2 = 0x0f0f0f0fu;
+  constexpr uint32_t kmask3 = 0x03030303u;
+  alignas(32) static constexpr uint8_t SCALE_PAIR_SHUFFLE[256] = {
+      0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
+      0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
+      2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,
+      2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,
+      4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,
+      4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,
+      6,  7,  6,  7,  6,  7,  6,  7,  6,  7,  6,  7,  6,  7,  6,  7,
+      6,  7,  6,  7,  6,  7,  6,  7,  6,  7,  6,  7,  6,  7,  6,  7,
+      8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,
+      8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,
+      10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11,
+      10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11,
+      12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
+      12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
+      14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
+      14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
+  };
+
+  uint32_t scale_words[4] = {};
+  std::memcpy(scale_words, lhs.scales.data(), lhs.scales.size());
+  scale_words[3] = ((scale_words[2] >> 4u) & kmask2) |
+                   (((scale_words[1] >> 6u) & kmask3) << 4u);
+  const uint32_t scale_high = scale_words[1] & kmask1;
+  scale_words[1] = (scale_words[2] & kmask2) |
+                   (((scale_words[0] >> 6u) & kmask3) << 4u);
+  scale_words[2] = scale_high;
+  scale_words[0] &= kmask1;
+
+  const __m256i mins_and_scales = _mm256_cvtepu8_epi16(
+      _mm_set_epi32(static_cast<int32_t>(scale_words[3]),
+                    static_cast<int32_t>(scale_words[2]),
+                    static_cast<int32_t>(scale_words[1]),
+                    static_cast<int32_t>(scale_words[0])));
+
+  const __m256i bsums_i16 = _mm256_loadu_si256(
+      reinterpret_cast<const __m256i *>(rhs.bsums.data()));
+  const __m128i bsum_pairs =
+      _mm_hadd_epi16(_mm256_castsi256_si128(bsums_i16),
+                     _mm256_extracti128_si256(bsums_i16, 1));
+  const __m128i min_products = _mm_madd_epi16(
+      _mm256_extracti128_si256(mins_and_scales, 1), bsum_pairs);
+  __m128i min_sum = _mm_hadd_epi32(min_products, min_products);
+  min_sum = _mm_hadd_epi32(min_sum, min_sum);
+  const int32_t sum_mins = _mm_cvtsi128_si32(min_sum);
+
+  const __m128i scales_i16 = _mm256_castsi256_si128(mins_and_scales);
+  const __m256i scales_vec = _mm256_set_m128i(scales_i16, scales_i16);
+  const __m256i low_nibble_mask = _mm256_set1_epi8(0x0f);
+  const uint8_t *q4 = lhs.qs.data();
+  const int8_t *q8 = rhs.qs.data();
+  __m256i sum_i32 = _mm256_setzero_si256();
+
+  for (uint64_t pair = 0;
+       pair < (::emel::kernel::detail::quant::QK_K / 64u); ++pair) {
+    const __m256i scale_low = _mm256_shuffle_epi8(
+        scales_vec,
+        _mm256_load_si256(
+            reinterpret_cast<const __m256i *>(SCALE_PAIR_SHUFFLE) +
+            (2u * pair)));
+    const __m256i scale_high_vec = _mm256_shuffle_epi8(
+        scales_vec,
+        _mm256_load_si256(
+            reinterpret_cast<const __m256i *>(SCALE_PAIR_SHUFFLE) +
+            (2u * pair + 1u)));
+
+    const __m256i q4_bits =
+        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(q4));
+    q4 += 32;
+    const __m256i q4_low = _mm256_and_si256(q4_bits, low_nibble_mask);
+    const __m256i q4_high =
+        _mm256_and_si256(_mm256_srli_epi16(q4_bits, 4), low_nibble_mask);
+
+    const __m256i q8_low =
+        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(q8));
+    q8 += 32;
+    const __m256i products_low = _mm256_madd_epi16(
+        scale_low, _mm256_maddubs_epi16(q4_low, q8_low));
+
+    const __m256i q8_high =
+        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(q8));
+    q8 += 32;
+    const __m256i products_high = _mm256_madd_epi16(
+        scale_high_vec, _mm256_maddubs_epi16(q4_high, q8_high));
+
+    sum_i32 = _mm256_add_epi32(
+        sum_i32, _mm256_add_epi32(products_low, products_high));
+  }
+
+  const int32_t sum = horizontal_sum_i32x8_avx2(sum_i32);
+  const float d_all =
+      rhs.d * ::emel::kernel::detail::quant::fp16_to_fp32(lhs.d);
+  const float d_min =
+      rhs.d * ::emel::kernel::detail::quant::fp16_to_fp32(lhs.dmin);
+  const __m128 block_sum = _mm_fmadd_ss(
+      _mm_set_ss(d_all), _mm_set_ss(static_cast<float>(sum)),
+      _mm_set_ss(-d_min * static_cast<float>(sum_mins)));
+  return _mm_cvtss_f32(block_sum);
+#else
+  return ::emel::kernel::detail::dot_q4_k_q8_k_block_scalar(lhs, rhs);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q4_k_q8_k_block_scalar(lhs, rhs);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q4_k_q8_k_row_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q4_k *lhs,
+    const ::emel::kernel::detail::quant::block_q8_k *rhs,
+    const uint64_t block_count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  float sum = 0.0f;
+  for (uint64_t block = 0; block < block_count; ++block) {
+    sum += dot_q4_k_q8_k_block_avx2_fma(lhs[block], rhs[block]);
+  }
+  return sum;
+#else
+  return ::emel::kernel::detail::dot_q4_k_q8_k_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q4_k_q8_k_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q4_0_q8_0_row_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q4_0 *lhs,
+    const ::emel::kernel::detail::quant::block_q8_0 *rhs,
+    const uint64_t block_count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  float sum = 0.0f;
+  for (uint64_t block = 0; block < block_count; ++block) {
+    const __m256i nibbles = unpack_nibbles_32_avx2(lhs[block].qs.data());
+    const __m256i x = _mm256_sub_epi8(nibbles, _mm256_set1_epi8(8));
+    const __m256i y = _mm256_loadu_si256(
+        reinterpret_cast<const __m256i *>(rhs[block].qs.data()));
+    const int32_t sumi =
+        horizontal_sum_i32x8_avx2(dot_i8_pairs_i32x8_avx2(x, y));
+    sum += static_cast<float>(sumi) *
+           (::emel::kernel::detail::quant::fp16_to_fp32(lhs[block].d) *
+            ::emel::kernel::detail::quant::fp16_to_fp32(rhs[block].d));
+  }
+  return sum;
+#else
+  return ::emel::kernel::detail::dot_q4_0_q8_0_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q4_0_q8_0_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q4_1_q8_0_row_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q4_1 *lhs,
+    const ::emel::kernel::detail::quant::block_q8_0 *rhs,
+    const uint64_t block_count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  float sum = 0.0f;
+  for (uint64_t block = 0; block < block_count; ++block) {
+    const __m256i nibbles = unpack_nibbles_32_avx2(lhs[block].qs.data());
+    const __m256i y = _mm256_loadu_si256(
+        reinterpret_cast<const __m256i *>(rhs[block].qs.data()));
+    const int32_t sumi = horizontal_sum_i32x8_avx2(_mm256_madd_epi16(
+        _mm256_set1_epi16(1), _mm256_maddubs_epi16(nibbles, y)));
+    const int32_t rhs_sum = horizontal_sum_i32x8_avx2(_mm256_madd_epi16(
+        _mm256_set1_epi16(1),
+        _mm256_maddubs_epi16(_mm256_set1_epi8(1), y)));
+    const float rhs_d =
+        ::emel::kernel::detail::quant::fp16_to_fp32(rhs[block].d);
+    sum += rhs_d *
+           (::emel::kernel::detail::quant::fp16_to_fp32(lhs[block].d) *
+                static_cast<float>(sumi) +
+            ::emel::kernel::detail::quant::fp16_to_fp32(lhs[block].m) *
+                static_cast<float>(rhs_sum));
+  }
+  return sum;
+#else
+  return ::emel::kernel::detail::dot_q4_1_q8_0_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q4_1_q8_0_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q5_0_q8_0_row_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q5_0 *lhs,
+    const ::emel::kernel::detail::quant::block_q8_0 *rhs,
+    const uint64_t block_count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  float sum = 0.0f;
+  for (uint64_t block = 0; block < block_count; ++block) {
+    const __m256i nibbles = unpack_nibbles_32_avx2(lhs[block].qs.data());
+    const __m256i high_set = expand_high_bits_32_avx2(lhs[block].qh.data());
+    const __m256i x = _mm256_or_si256(
+        nibbles, _mm256_andnot_si256(
+                     high_set, _mm256_set1_epi8(static_cast<char>(0xf0))));
+    const __m256i y = _mm256_loadu_si256(
+        reinterpret_cast<const __m256i *>(rhs[block].qs.data()));
+    const int32_t sumi =
+        horizontal_sum_i32x8_avx2(dot_i8_pairs_i32x8_avx2(x, y));
+    sum += static_cast<float>(sumi) *
+           (::emel::kernel::detail::quant::fp16_to_fp32(lhs[block].d) *
+            ::emel::kernel::detail::quant::fp16_to_fp32(rhs[block].d));
+  }
+  return sum;
+#else
+  return ::emel::kernel::detail::dot_q5_0_q8_0_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q5_0_q8_0_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q8_0_q8_0_row_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q8_0 *lhs,
+    const ::emel::kernel::detail::quant::block_q8_0 *rhs,
+    const uint64_t block_count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  float sum = 0.0f;
+  for (uint64_t block = 0; block < block_count; ++block) {
+    const __m256i x = _mm256_loadu_si256(
+        reinterpret_cast<const __m256i *>(lhs[block].qs.data()));
+    const __m256i y = _mm256_loadu_si256(
+        reinterpret_cast<const __m256i *>(rhs[block].qs.data()));
+    const int32_t sumi =
+        horizontal_sum_i32x8_avx2(dot_i8_pairs_i32x8_avx2(x, y));
+    sum += static_cast<float>(sumi) *
+           (::emel::kernel::detail::quant::fp16_to_fp32(lhs[block].d) *
+            ::emel::kernel::detail::quant::fp16_to_fp32(rhs[block].d));
+  }
+  return sum;
+#else
+  return ::emel::kernel::detail::dot_q8_0_q8_0_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q8_0_q8_0_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q6_k_q8_k_block_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q6_k &lhs,
+    const ::emel::kernel::detail::quant::block_q8_k &rhs) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const __m128i scales_bytes =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(lhs.scales.data()));
+  const __m256i scales_i16 = _mm256_cvtepi8_epi16(scales_bytes);
+  const __m256i bsums_i16 = _mm256_loadu_si256(
+      reinterpret_cast<const __m256i *>(rhs.bsums.data()));
+  const int32_t sum_mins =
+      horizontal_sum_i32x8_avx2(_mm256_madd_epi16(scales_i16, bsums_i16));
+
+  const uint8_t *ql = lhs.ql.data();
+  const uint8_t *qh = lhs.qh.data();
+  const int8_t *q8 = rhs.qs.data();
+  const int8_t *scale = lhs.scales.data();
+  int32_t isum = 0;
+  int scale_index = 0;
+  for (uint64_t block = 0;
+       block < (::emel::kernel::detail::quant::QK_K / 128u); ++block) {
+    isum += static_cast<int32_t>(scale[scale_index++]) *
+            dot_u6_s8_16_avx2(ql + 0, qh + 0, q8 + 0, 0, 0);
+    isum += static_cast<int32_t>(scale[scale_index++]) *
+            dot_u6_s8_16_avx2(ql + 16, qh + 16, q8 + 16, 0, 0);
+    isum += static_cast<int32_t>(scale[scale_index++]) *
+            dot_u6_s8_16_avx2(ql + 32, qh + 0, q8 + 32, 0, 2);
+    isum += static_cast<int32_t>(scale[scale_index++]) *
+            dot_u6_s8_16_avx2(ql + 48, qh + 16, q8 + 48, 0, 2);
+    isum += static_cast<int32_t>(scale[scale_index++]) *
+            dot_u6_s8_16_avx2(ql + 0, qh + 0, q8 + 64, 4, 4);
+    isum += static_cast<int32_t>(scale[scale_index++]) *
+            dot_u6_s8_16_avx2(ql + 16, qh + 16, q8 + 80, 4, 4);
+    isum += static_cast<int32_t>(scale[scale_index++]) *
+            dot_u6_s8_16_avx2(ql + 32, qh + 0, q8 + 96, 4, 6);
+    isum += static_cast<int32_t>(scale[scale_index++]) *
+            dot_u6_s8_16_avx2(ql + 48, qh + 16, q8 + 112, 4, 6);
+    ql += 64;
+    qh += 32;
+    q8 += 128;
+  }
+
+  const int32_t adjusted = isum - (32 * sum_mins);
+  const float d =
+      rhs.d * ::emel::kernel::detail::quant::fp16_to_fp32(lhs.d);
+  const __m128 block_sum =
+      _mm_mul_ss(_mm_set_ss(d), _mm_set_ss(static_cast<float>(adjusted)));
+  return _mm_cvtss_f32(block_sum);
+#else
+  return ::emel::kernel::detail::dot_q6_k_q8_k_block_scalar(lhs, rhs);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q6_k_q8_k_block_scalar(lhs, rhs);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline float dot_q6_k_q8_k_row_avx2_fma(
+    const ::emel::kernel::detail::quant::block_q6_k *lhs,
+    const ::emel::kernel::detail::quant::block_q8_k *rhs,
+    const uint64_t block_count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  float sum = 0.0f;
+  for (uint64_t block = 0; block < block_count; ++block) {
+    sum += dot_q6_k_q8_k_block_avx2_fma(lhs[block], rhs[block]);
+  }
+  return sum;
+#else
+  return ::emel::kernel::detail::dot_q6_k_q8_k_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+#else
+  return ::emel::kernel::detail::dot_q6_k_q8_k_row_scalar(lhs, rhs,
+                                                          block_count);
+#endif
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline void execute_avx2_fma_mul_mat_q2_k_q8_k_unchecked(
+    const event::op_mul_mat &request) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t m = request.src0.ne[1];
+  const uint64_t n = request.src1.ne[0];
+  const uint64_t block_count = k / ::emel::kernel::detail::quant::QK_K;
+  const float *b = static_cast<const float *>(request.src1.data);
+  float *c = static_cast<float *>(request.dst.data);
+  const auto *a = static_cast<const uint8_t *>(request.src0.data);
+  const size_t row_bytes = request.src0.nb[1];
+  std::array<::emel::kernel::detail::quant::block_q8_k,
+             ::emel::kernel::detail::quant::MAX_Q8_K_BLOCKS>
+      q8_blocks = {};
+
+  for (uint64_t j = 0; j < n; ++j) {
+    for (uint64_t block = 0; block < block_count; ++block) {
+      ::emel::kernel::detail::quant::quantize_row_q8_k_strided(
+          b + block * ::emel::kernel::detail::quant::QK_K * n + j, n,
+          &q8_blocks[block], ::emel::kernel::detail::quant::QK_K);
+    }
+    for (uint64_t i = 0; i < m; ++i) {
+      const auto *row = reinterpret_cast<
+          const ::emel::kernel::detail::quant::block_q2_k *>(
+          a + i * row_bytes);
+      c[i * n + j] =
+          dot_q2_k_q8_k_row_avx2_fma(row, q8_blocks.data(), block_count);
+    }
+  }
+  return;
+#endif
+#endif
+  (void)request;
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline void execute_avx2_fma_mul_mat_q3_k_q8_k_unchecked(
+    const event::op_mul_mat &request) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t m = request.src0.ne[1];
+  const uint64_t n = request.src1.ne[0];
+  const uint64_t block_count = k / ::emel::kernel::detail::quant::QK_K;
+  const float *b = static_cast<const float *>(request.src1.data);
+  float *c = static_cast<float *>(request.dst.data);
+  const auto *a = static_cast<const uint8_t *>(request.src0.data);
+  const size_t row_bytes = request.src0.nb[1];
+  std::array<::emel::kernel::detail::quant::block_q8_k,
+             ::emel::kernel::detail::quant::MAX_Q8_K_BLOCKS>
+      q8_blocks = {};
+
+  for (uint64_t j = 0; j < n; ++j) {
+    for (uint64_t block = 0; block < block_count; ++block) {
+      ::emel::kernel::detail::quant::quantize_row_q8_k_strided(
+          b + block * ::emel::kernel::detail::quant::QK_K * n + j, n,
+          &q8_blocks[block], ::emel::kernel::detail::quant::QK_K);
+    }
+    for (uint64_t i = 0; i < m; ++i) {
+      const auto *row = reinterpret_cast<
+          const ::emel::kernel::detail::quant::block_q3_k *>(
+          a + i * row_bytes);
+      c[i * n + j] =
+          dot_q3_k_q8_k_row_avx2_fma(row, q8_blocks.data(), block_count);
+    }
+  }
+  return;
+#endif
+#endif
+  (void)request;
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline void execute_avx2_fma_mul_mat_q4_k_q8_k_unchecked(
+    const event::op_mul_mat &request) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t m = request.src0.ne[1];
+  const uint64_t n = request.src1.ne[0];
+  const uint64_t block_count = k / ::emel::kernel::detail::quant::QK_K;
+  const float *b = static_cast<const float *>(request.src1.data);
+  float *c = static_cast<float *>(request.dst.data);
+  const auto *a = static_cast<const uint8_t *>(request.src0.data);
+  const size_t row_bytes = request.src0.nb[1];
+  std::array<::emel::kernel::detail::quant::block_q8_k,
+             ::emel::kernel::detail::quant::MAX_Q8_K_BLOCKS>
+      q8_blocks = {};
+
+  for (uint64_t j = 0; j < n; ++j) {
+    for (uint64_t block = 0; block < block_count; ++block) {
+      ::emel::kernel::detail::quant::quantize_row_q8_k_strided(
+          b + block * ::emel::kernel::detail::quant::QK_K * n + j, n,
+          &q8_blocks[block], ::emel::kernel::detail::quant::QK_K);
+    }
+    for (uint64_t i = 0; i < m; ++i) {
+      const auto *row = reinterpret_cast<
+          const ::emel::kernel::detail::quant::block_q4_k *>(
+          a + i * row_bytes);
+      c[i * n + j] =
+          dot_q4_k_q8_k_row_avx2_fma(row, q8_blocks.data(), block_count);
+    }
+  }
+  return;
+#endif
+#endif
+  (void)request;
+}
+
+template <class block_type, uint64_t quant_block_size,
+          float (*row_dot)(const block_type *,
+                           const ::emel::kernel::detail::quant::block_q8_0 *,
+                           uint64_t)>
+inline void execute_avx2_fma_mul_mat_q8_0_rhs_unchecked(
+    const event::op_mul_mat &request) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t m = request.src0.ne[1];
+  const uint64_t n = request.src1.ne[0];
+  const uint64_t block_count = k / quant_block_size;
+  const float *b = static_cast<const float *>(request.src1.data);
+  float *c = static_cast<float *>(request.dst.data);
+  const auto *a = static_cast<const uint8_t *>(request.src0.data);
+  const size_t row_bytes = request.src0.nb[1];
+  std::array<::emel::kernel::detail::quant::block_q8_0,
+             ::emel::kernel::detail::quant::MAX_Q8_0_BLOCKS>
+      q8_blocks = {};
+
+  for (uint64_t j = 0; j < n; ++j) {
+    ::emel::kernel::detail::quant::quantize_row_q8_0_strided(
+        b + j, n, q8_blocks.data(), static_cast<int64_t>(k));
+    for (uint64_t i = 0; i < m; ++i) {
+      const auto *row =
+          reinterpret_cast<const block_type *>(a + i * row_bytes);
+      c[i * n + j] = row_dot(row, q8_blocks.data(), block_count);
+    }
+  }
+  return;
+#endif
+#endif
+  (void)request;
+}
+
+inline void execute_avx2_fma_mul_mat_q4_0_q8_0_unchecked(
+    const event::op_mul_mat &request) noexcept {
+  execute_avx2_fma_mul_mat_q8_0_rhs_unchecked<
+      ::emel::kernel::detail::quant::block_q4_0,
+      ::emel::kernel::detail::quant::QK4_0,
+      &dot_q4_0_q8_0_row_avx2_fma>(request);
+}
+
+inline void execute_avx2_fma_mul_mat_q4_1_q8_0_unchecked(
+    const event::op_mul_mat &request) noexcept {
+  execute_avx2_fma_mul_mat_q8_0_rhs_unchecked<
+      ::emel::kernel::detail::quant::block_q4_1,
+      ::emel::kernel::detail::quant::QK4_1,
+      &dot_q4_1_q8_0_row_avx2_fma>(request);
+}
+
+inline void execute_avx2_fma_mul_mat_q5_0_q8_0_unchecked(
+    const event::op_mul_mat &request) noexcept {
+  execute_avx2_fma_mul_mat_q8_0_rhs_unchecked<
+      ::emel::kernel::detail::quant::block_q5_0,
+      ::emel::kernel::detail::quant::QK5_0,
+      &dot_q5_0_q8_0_row_avx2_fma>(request);
+}
+
+inline void execute_avx2_fma_mul_mat_q8_0_q8_0_unchecked(
+    const event::op_mul_mat &request) noexcept {
+  execute_avx2_fma_mul_mat_q8_0_rhs_unchecked<
+      ::emel::kernel::detail::quant::block_q8_0,
+      ::emel::kernel::detail::quant::QK8_0,
+      &dot_q8_0_q8_0_row_avx2_fma>(request);
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline void execute_avx2_fma_mul_mat_q6_k_q8_k_unchecked(
+    const event::op_mul_mat &request) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t m = request.src0.ne[1];
+  const uint64_t n = request.src1.ne[0];
+  const uint64_t block_count = k / ::emel::kernel::detail::quant::QK_K;
+  const float *b = static_cast<const float *>(request.src1.data);
+  float *c = static_cast<float *>(request.dst.data);
+  const auto *a = static_cast<const uint8_t *>(request.src0.data);
+  const size_t row_bytes = request.src0.nb[1];
+  std::array<::emel::kernel::detail::quant::block_q8_k,
+             ::emel::kernel::detail::quant::MAX_Q8_K_BLOCKS>
+      q8_blocks = {};
+
+  for (uint64_t j = 0; j < n; ++j) {
+    for (uint64_t block = 0; block < block_count; ++block) {
+      ::emel::kernel::detail::quant::quantize_row_q8_k_strided(
+          b + block * ::emel::kernel::detail::quant::QK_K * n + j, n,
+          &q8_blocks[block], ::emel::kernel::detail::quant::QK_K);
+    }
+    for (uint64_t i = 0; i < m; ++i) {
+      const auto *row = reinterpret_cast<
+          const ::emel::kernel::detail::quant::block_q6_k *>(
+          a + i * row_bytes);
+      c[i * n + j] =
+          dot_q6_k_q8_k_row_avx2_fma(row, q8_blocks.data(), block_count);
+    }
+  }
+  return;
+#endif
+#endif
+  (void)request;
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_F16C_TARGET
+inline void convert_f32_to_f16_buffer_avx2_f16c(const float *src, uint16_t *dst,
+                                                const uint64_t count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__F16C__)) || defined(__GNUC__) ||           \
+    defined(__clang__)
+  constexpr int round_mode = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+  uint64_t idx = 0u;
+  for (; idx + 16u <= count; idx += 16u) {
+    const __m256 fp32_0 = _mm256_loadu_ps(src + idx + 0u);
+    const __m256 fp32_1 = _mm256_loadu_ps(src + idx + 8u);
+    const __m128i fp16_0 = _mm256_cvtps_ph(fp32_0, round_mode);
+    const __m128i fp16_1 = _mm256_cvtps_ph(fp32_1, round_mode);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + idx + 0u), fp16_0);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + idx + 8u), fp16_1);
+  }
+  for (; idx + 8u <= count; idx += 8u) {
+    const __m256 fp32 = _mm256_loadu_ps(src + idx);
+    const __m128i fp16 = _mm256_cvtps_ph(fp32, round_mode);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + idx), fp16);
+  }
+  for (; idx < count; ++idx) {
+    dst[idx] = ::emel::kernel::detail::quant::fp32_to_fp16(src[idx]);
+  }
+  return;
+#endif
+#endif
+  ::emel::kernel::detail::convert_f32_to_fp16_buffer_scalar(src, dst, count);
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_F16C_TARGET
+inline void convert_f16_buffer_to_f32_avx2_f16c(const uint16_t *src, float *dst,
+                                                const uint64_t count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__F16C__)) || defined(__GNUC__) ||           \
+    defined(__clang__)
+  uint64_t idx = 0u;
+  for (; idx + 16u <= count; idx += 16u) {
+    const __m128i fp16_0 =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + idx + 0u));
+    const __m128i fp16_1 =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + idx + 8u));
+    _mm256_storeu_ps(dst + idx + 0u, _mm256_cvtph_ps(fp16_0));
+    _mm256_storeu_ps(dst + idx + 8u, _mm256_cvtph_ps(fp16_1));
+  }
+  for (; idx + 8u <= count; idx += 8u) {
+    const __m128i fp16 =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + idx));
+    _mm256_storeu_ps(dst + idx, _mm256_cvtph_ps(fp16));
+  }
+  for (; idx < count; ++idx) {
+    dst[idx] = ::emel::kernel::detail::quant::fp16_to_fp32(src[idx]);
+  }
+  return;
+#endif
+#endif
+  ::emel::kernel::detail::convert_f16_buffer_to_f32_scalar(src, dst, count);
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_F16C_TARGET
+inline float
+dot_product_f16_f16_scores_avx2_fma(const uint16_t *lhs, const uint16_t *rhs,
+                                    const uint64_t count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)) ||            \
+    defined(__GNUC__) || defined(__clang__)
+  __m256 sum0 = _mm256_setzero_ps();
+  __m256 sum1 = _mm256_setzero_ps();
+  __m256 sum2 = _mm256_setzero_ps();
+  __m256 sum3 = _mm256_setzero_ps();
+  uint64_t idx = 0u;
+  for (; idx + 32u <= count; idx += 32u) {
+    const __m256 lhs0 = _mm256_cvtph_ps(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(lhs + idx + 0u)));
+    const __m256 rhs0 = _mm256_cvtph_ps(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(rhs + idx + 0u)));
+    const __m256 lhs1 = _mm256_cvtph_ps(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(lhs + idx + 8u)));
+    const __m256 rhs1 = _mm256_cvtph_ps(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(rhs + idx + 8u)));
+    const __m256 lhs2 = _mm256_cvtph_ps(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(lhs + idx + 16u)));
+    const __m256 rhs2 = _mm256_cvtph_ps(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(rhs + idx + 16u)));
+    const __m256 lhs3 = _mm256_cvtph_ps(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(lhs + idx + 24u)));
+    const __m256 rhs3 = _mm256_cvtph_ps(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(rhs + idx + 24u)));
+    sum0 = _mm256_fmadd_ps(lhs0, rhs0, sum0);
+    sum1 = _mm256_fmadd_ps(lhs1, rhs1, sum1);
+    sum2 = _mm256_fmadd_ps(lhs2, rhs2, sum2);
+    sum3 = _mm256_fmadd_ps(lhs3, rhs3, sum3);
+  }
+
+  double sumf = 0.0;
+  const __m256 sum01 = _mm256_add_ps(sum0, sum1);
+  const __m256 sum23 = _mm256_add_ps(sum2, sum3);
+  const __m256 sum = _mm256_add_ps(sum01, sum23);
+  alignas(32) float lanes[8] = {};
+  _mm256_store_ps(lanes, sum);
+  for (float lane : lanes) {
+    sumf += static_cast<double>(lane);
+  }
+  for (; idx < count; ++idx) {
+    sumf += static_cast<double>(
+                ::emel::kernel::detail::quant::fp16_to_fp32(lhs[idx])) *
+            static_cast<double>(
+                ::emel::kernel::detail::quant::fp16_to_fp32(rhs[idx]));
+  }
+  return static_cast<float>(sumf);
+#endif
+#endif
+  float scalar_sum = 0.0f;
+  for (uint64_t idx = 0u; idx < count; ++idx) {
+    scalar_sum += ::emel::kernel::detail::quant::fp16_to_fp32(lhs[idx]) *
+                  ::emel::kernel::detail::quant::fp16_to_fp32(rhs[idx]);
+  }
+  return scalar_sum;
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_F16C_TARGET
+inline void scale_f32_avx2(float *data, const float scale,
+                           const uint64_t count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
+  const __m256 scale_v = _mm256_set1_ps(scale);
+  uint64_t idx = 0u;
+  for (; idx + 8u <= count; idx += 8u) {
+    const __m256 data_v = _mm256_loadu_ps(data + idx);
+    _mm256_storeu_ps(data + idx, _mm256_mul_ps(data_v, scale_v));
+  }
+  for (; idx < count; ++idx) {
+    data[idx] *= scale;
+  }
+  return;
+#endif
+#endif
+  ::emel::kernel::detail::scale_f32_scalar(data, scale, count);
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_F16C_TARGET
+inline void scale_f16_buffer_avx2_f16c(uint16_t *data, const float scale,
+                                       const uint64_t count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__F16C__)) || defined(__GNUC__) ||           \
+    defined(__clang__)
+  constexpr int round_mode = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+  const float rounded_scale = ::emel::kernel::detail::round_fp16_scalar(scale);
+  const __m256 scale_v = _mm256_set1_ps(rounded_scale);
+  uint64_t idx = 0u;
+  for (; idx + 16u <= count; idx += 16u) {
+    const __m128i data0 =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + idx + 0u));
+    const __m128i data1 =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + idx + 8u));
+    const __m256 scaled0 = _mm256_mul_ps(_mm256_cvtph_ps(data0), scale_v);
+    const __m256 scaled1 = _mm256_mul_ps(_mm256_cvtph_ps(data1), scale_v);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(data + idx + 0u),
+                     _mm256_cvtps_ph(scaled0, round_mode));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(data + idx + 8u),
+                     _mm256_cvtps_ph(scaled1, round_mode));
+  }
+  for (; idx + 8u <= count; idx += 8u) {
+    const __m128i data_v =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + idx));
+    const __m256 scaled = _mm256_mul_ps(_mm256_cvtph_ps(data_v), scale_v);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(data + idx),
+                     _mm256_cvtps_ph(scaled, round_mode));
+  }
+  for (; idx < count; ++idx) {
+    const float rounded_value =
+        ::emel::kernel::detail::quant::fp16_to_fp32(data[idx]);
+    data[idx] = ::emel::kernel::detail::quant::fp32_to_fp16(
+        ::emel::kernel::detail::round_fp16_scalar(rounded_value *
+                                                  rounded_scale));
+  }
+  return;
+#endif
+#endif
+  ::emel::kernel::detail::scale_f16_buffer_scalar(data, scale, count);
+}
+
+EMEL_KERNEL_X86_AVX2_FMA_F16C_TARGET
+inline void axpy_f16_buffer_avx2_fma_f16c(uint16_t *dst, const uint16_t *src,
+                                          const float alpha,
+                                          const uint64_t count) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)) ||            \
+    defined(__GNUC__) || defined(__clang__)
+  constexpr int round_mode = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+  const float rounded_alpha = ::emel::kernel::detail::round_fp16_scalar(alpha);
+  const __m256 alpha_v = _mm256_set1_ps(rounded_alpha);
+  uint64_t idx = 0u;
+  for (; idx + 16u <= count; idx += 16u) {
+    const __m128i dst0 =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(dst + idx + 0u));
+    const __m128i dst1 =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(dst + idx + 8u));
+    const __m128i src0 =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + idx + 0u));
+    const __m128i src1 =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + idx + 8u));
+    const __m256 out0 =
+        _mm256_fmadd_ps(_mm256_cvtph_ps(src0), alpha_v, _mm256_cvtph_ps(dst0));
+    const __m256 out1 =
+        _mm256_fmadd_ps(_mm256_cvtph_ps(src1), alpha_v, _mm256_cvtph_ps(dst1));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + idx + 0u),
+                     _mm256_cvtps_ph(out0, round_mode));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + idx + 8u),
+                     _mm256_cvtps_ph(out1, round_mode));
+  }
+  for (; idx + 8u <= count; idx += 8u) {
+    const __m128i dst_v =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(dst + idx));
+    const __m128i src_v =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + idx));
+    const __m256 out = _mm256_fmadd_ps(_mm256_cvtph_ps(src_v), alpha_v,
+                                       _mm256_cvtph_ps(dst_v));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + idx),
+                     _mm256_cvtps_ph(out, round_mode));
+  }
+  for (; idx < count; ++idx) {
+    const float rounded_dst =
+        ::emel::kernel::detail::quant::fp16_to_fp32(dst[idx]);
+    const float rounded_src =
+        ::emel::kernel::detail::quant::fp16_to_fp32(src[idx]);
+    dst[idx] = ::emel::kernel::detail::quant::fp32_to_fp16(
+        ::emel::kernel::detail::round_fp16_scalar(rounded_dst +
+                                                  rounded_src * rounded_alpha));
+  }
+  return;
+#endif
+#endif
+  ::emel::kernel::detail::axpy_f16_buffer_scalar(dst, src, alpha, count);
+}
+
+template <class request_type>
+inline void prepare_flash_attn_ext_f16kv_one_chunk_workspace_avx2(
+    const request_type &request,
+    ::emel::kernel::detail::flash_attn_workspace &workspace) noexcept {
+  const uint64_t kv_tokens =
+      ::emel::kernel::detail::flash_attn_active_tokens(request);
+  const bool reusing = workspace.prepared_tokens == kv_tokens;
+  workspace.reuse_count += static_cast<uint64_t>(reusing);
+  workspace.prepared_tokens = kv_tokens;
+}
+
+template <class request_type>
+inline void run_flash_attn_ext_f16kv_one_chunk_avx2_fma_f16c_unchecked(
+    const request_type &request,
+    ::emel::kernel::detail::flash_attn_workspace &workspace) noexcept {
+  const uint64_t kv_tokens =
+      ::emel::kernel::detail::flash_attn_active_tokens(request);
+  prepare_flash_attn_ext_f16kv_one_chunk_workspace_avx2(request, workspace);
+  const uint64_t head_dim = request.src0.ne[0];
+  const uint64_t head_count = request.src0.ne[2];
+  const uint64_t kv_head_count = request.src1.ne[2];
+  const float scale = ::emel::kernel::detail::flash_attn_scale(request);
+  const uint64_t n_rep = head_count / kv_head_count;
+  for (uint64_t head = 0u; head < head_count; ++head) {
+    const uint64_t kv_head = head / n_rep;
+    const float *q =
+        ::emel::kernel::detail::tensor_row_ptr(request.src0, 0u, head);
+    uint16_t *accum = workspace.accum_buffer_f16.data();
+    float *dst =
+        ::emel::kernel::detail::tensor_row_ptr_mut(request.dst, 0u, head);
+
+    convert_f32_to_f16_buffer_avx2_f16c(q, workspace.q_buffer_f16.data(),
+                                        head_dim);
+    std::memset(accum, 0, sizeof(uint16_t) * head_dim);
+
+    const auto *k_head_base = static_cast<const char *>(request.src1.data) +
+                              kv_head * request.src1.nb[2];
+    const auto *v_head_base = static_cast<const char *>(request.src2.data) +
+                              kv_head * request.src2.nb[2];
+    const uint64_t k_stride = request.src1.nb[1];
+    const uint64_t v_stride = request.src2.nb[1];
+    const char *k_ptr_bytes = k_head_base;
+    const char *v_ptr_bytes = v_head_base;
+
+    float score_sum = 0.0f;
+    float max_score = -std::numeric_limits<float>::infinity();
+    for (uint64_t token = 0u; token < kv_tokens; ++token) {
+      const uint16_t *k = reinterpret_cast<const uint16_t *>(k_ptr_bytes);
+      const float score = dot_product_f16_f16_scores_avx2_fma(
+                              workspace.q_buffer_f16.data(), k, head_dim) *
+                          scale;
+      const float old_max = max_score;
+      float max_scale = 1.0f;
+      float value_scale = 1.0f;
+      if (score > max_score) {
+        max_score = score;
+        max_scale = std::exp(old_max - max_score);
+        scale_f16_buffer_avx2_f16c(accum, max_scale, head_dim);
+      } else {
+        value_scale = std::exp(score - max_score);
+      }
+
+      const uint16_t *v = reinterpret_cast<const uint16_t *>(v_ptr_bytes);
+      axpy_f16_buffer_avx2_fma_f16c(accum, v, value_scale, head_dim);
+      score_sum = score_sum * max_scale + value_scale;
+
+      k_ptr_bytes += k_stride;
+      v_ptr_bytes += v_stride;
+    }
+
+    convert_f16_buffer_to_f32_avx2_f16c(accum, dst, head_dim);
+    if (score_sum == 0.0f) {
+      std::fill_n(dst, head_dim, 0.0f);
+    } else {
+      scale_f32_avx2(dst, 1.0f / score_sum, head_dim);
+    }
+  }
+}
+
+template <class request_type>
+inline void run_flash_attn_ext_avx2_fma_f16c_unchecked(
+    const request_type &request,
+    ::emel::kernel::detail::flash_attn_workspace &workspace) noexcept {
+  run_flash_attn_ext_f16kv_one_chunk_avx2_fma_f16c_unchecked(request,
+                                                             workspace);
+}
+
+template <class request_type>
+inline bool run_flash_attn_ext_avx2_fma_f16c(
+    const request_type &request, const host_feature_contract &host_features,
+    ::emel::kernel::detail::flash_attn_workspace &workspace) noexcept {
+  if (!can_run_avx2_fma_f16c_flash_attn_ext_f16kv_one_chunk_request(
+          request, host_features, workspace)) {
+    return false;
+  }
+  run_flash_attn_ext_avx2_fma_f16c_unchecked(request, workspace);
+  return true;
+}
+
 EMEL_KERNEL_X86_AVX2_TARGET
-inline bool execute_avx2_dup(const event::op_dup & request) noexcept {
+inline bool execute_avx2_dup(const event::op_dup &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * src = static_cast<const float *>(request.src0.data);
-  float * dst = static_cast<float *>(request.dst.data);
+  const uint64_t count =
+      ::emel::kernel::detail::tensor_element_count(request.dst);
+  const float *src = static_cast<const float *>(request.src0.data);
+  float *dst = static_cast<float *>(request.dst.data);
 
   uint64_t i = 0;
   for (; i + 8 <= count; i += 8) {
@@ -213,23 +1699,24 @@ inline bool execute_avx2_dup(const event::op_dup & request) noexcept {
   }
   return true;
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline bool execute_avx2_add(const event::op_add & request) noexcept {
+inline bool execute_avx2_add(const event::op_add &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * lhs = static_cast<const float *>(request.src0.data);
-  const float * rhs = static_cast<const float *>(request.src1.data);
-  float * dst = static_cast<float *>(request.dst.data);
+  const uint64_t count =
+      ::emel::kernel::detail::tensor_element_count(request.dst);
+  const float *lhs = static_cast<const float *>(request.src0.data);
+  const float *rhs = static_cast<const float *>(request.src1.data);
+  float *dst = static_cast<float *>(request.dst.data);
 
   uint64_t i = 0;
   for (; i + 8 <= count; i += 8) {
@@ -242,23 +1729,24 @@ inline bool execute_avx2_add(const event::op_add & request) noexcept {
   }
   return true;
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline bool execute_avx2_sub(const event::op_sub & request) noexcept {
+inline bool execute_avx2_sub(const event::op_sub &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * lhs = static_cast<const float *>(request.src0.data);
-  const float * rhs = static_cast<const float *>(request.src1.data);
-  float * dst = static_cast<float *>(request.dst.data);
+  const uint64_t count =
+      ::emel::kernel::detail::tensor_element_count(request.dst);
+  const float *lhs = static_cast<const float *>(request.src0.data);
+  const float *rhs = static_cast<const float *>(request.src1.data);
+  float *dst = static_cast<float *>(request.dst.data);
 
   uint64_t i = 0;
   for (; i + 8 <= count; i += 8) {
@@ -271,23 +1759,24 @@ inline bool execute_avx2_sub(const event::op_sub & request) noexcept {
   }
   return true;
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline bool execute_avx2_mul(const event::op_mul & request) noexcept {
+inline bool execute_avx2_mul(const event::op_mul &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * lhs = static_cast<const float *>(request.src0.data);
-  const float * rhs = static_cast<const float *>(request.src1.data);
-  float * dst = static_cast<float *>(request.dst.data);
+  const uint64_t count =
+      ::emel::kernel::detail::tensor_element_count(request.dst);
+  const float *lhs = static_cast<const float *>(request.src0.data);
+  const float *rhs = static_cast<const float *>(request.src1.data);
+  float *dst = static_cast<float *>(request.dst.data);
 
   uint64_t i = 0;
   for (; i + 8 <= count; i += 8) {
@@ -300,23 +1789,24 @@ inline bool execute_avx2_mul(const event::op_mul & request) noexcept {
   }
   return true;
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline bool execute_avx2_div(const event::op_div & request) noexcept {
+inline bool execute_avx2_div(const event::op_div &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * lhs = static_cast<const float *>(request.src0.data);
-  const float * rhs = static_cast<const float *>(request.src1.data);
-  float * dst = static_cast<float *>(request.dst.data);
+  const uint64_t count =
+      ::emel::kernel::detail::tensor_element_count(request.dst);
+  const float *lhs = static_cast<const float *>(request.src0.data);
+  const float *rhs = static_cast<const float *>(request.src1.data);
+  float *dst = static_cast<float *>(request.dst.data);
 
   uint64_t i = 0;
   for (; i + 8 <= count; i += 8) {
@@ -329,22 +1819,23 @@ inline bool execute_avx2_div(const event::op_div & request) noexcept {
   }
   return true;
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline bool execute_avx2_sqr(const event::op_sqr & request) noexcept {
+inline bool execute_avx2_sqr(const event::op_sqr &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * src = static_cast<const float *>(request.src0.data);
-  float * dst = static_cast<float *>(request.dst.data);
+  const uint64_t count =
+      ::emel::kernel::detail::tensor_element_count(request.dst);
+  const float *src = static_cast<const float *>(request.src0.data);
+  float *dst = static_cast<float *>(request.dst.data);
 
   uint64_t i = 0;
   for (; i + 8 <= count; i += 8) {
@@ -356,22 +1847,23 @@ inline bool execute_avx2_sqr(const event::op_sqr & request) noexcept {
   }
   return true;
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline bool execute_avx2_sqrt(const event::op_sqrt & request) noexcept {
+inline bool execute_avx2_sqrt(const event::op_sqrt &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * src = static_cast<const float *>(request.src0.data);
-  float * dst = static_cast<float *>(request.dst.data);
+  const uint64_t count =
+      ::emel::kernel::detail::tensor_element_count(request.dst);
+  const float *src = static_cast<const float *>(request.src0.data);
+  float *dst = static_cast<float *>(request.dst.data);
 
   uint64_t i = 0;
   for (; i + 8 <= count; i += 8) {
@@ -383,30 +1875,30 @@ inline bool execute_avx2_sqrt(const event::op_sqrt & request) noexcept {
   }
   return true;
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline bool execute_avx2_mul_mat(const event::op_mul_mat & request) noexcept {
+inline bool execute_avx2_mul_mat(const event::op_mul_mat &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
   const uint64_t k = request.src0.ne[0];
   const uint64_t m = request.src0.ne[1];
   const uint64_t n = request.src1.ne[0];
   const bool valid_dims = k != 0 && m != 0 && n != 0;
-  const bool valid_layout =
-      request.src1.ne[1] == k && request.dst.ne[0] == n && request.dst.ne[1] == m;
+  const bool valid_layout = request.src1.ne[1] == k && request.dst.ne[0] == n &&
+                            request.dst.ne[1] == m;
   const bool valid = valid_dims && valid_layout;
   const uint64_t valid_u64 = static_cast<uint64_t>(valid);
-  const float * a = static_cast<const float *>(request.src0.data);
-  const float * b = static_cast<const float *>(request.src1.data);
-  float * c = static_cast<float *>(request.dst.data);
+  const float *a = static_cast<const float *>(request.src0.data);
+  const float *b = static_cast<const float *>(request.src1.data);
+  float *c = static_cast<float *>(request.dst.data);
 
   constexpr uint64_t row_block = 4;
   constexpr uint64_t col_vec = 8;
@@ -423,19 +1915,20 @@ inline bool execute_avx2_mul_mat(const event::op_mul_mat & request) noexcept {
       const uint64_t depth = std::min<uint64_t>(depth_block, k - pb);
       const bool first_depth_block = (pb == 0);
       const __m256 zero = _mm256_setzero_ps();
-      const __m256 depth_reset_mask =
-          _mm256_castsi256_ps(_mm256_set1_epi32(-static_cast<int32_t>(first_depth_block)));
+      const __m256 depth_reset_mask = _mm256_castsi256_ps(
+          _mm256_set1_epi32(-static_cast<int32_t>(first_depth_block)));
 
       for (uint64_t kk = 0; kk < depth; ++kk) {
-        const float * b_src = b + (pb + kk) * n + jb;
-        float * b_dst = packed_b + kk * vec_cols;
-        std::memcpy(b_dst, b_src, static_cast<size_t>(vec_cols) * sizeof(float));
+        const float *b_src = b + (pb + kk) * n + jb;
+        float *b_dst = packed_b + kk * vec_cols;
+        std::memcpy(b_dst, b_src,
+                    static_cast<size_t>(vec_cols) * sizeof(float));
 #if defined(__GNUC__) || defined(__clang__)
         const uint64_t prefetch_distance =
             16u * static_cast<uint64_t>((kk & 15u) == 0u && kk + 16u < depth);
-        _mm_prefetch(
-            reinterpret_cast<const char *>(b + (pb + kk + prefetch_distance) * n + jb),
-            _MM_HINT_T0);
+        _mm_prefetch(reinterpret_cast<const char *>(
+                         b + (pb + kk + prefetch_distance) * n + jb),
+                     _MM_HINT_T0);
 #endif
       }
 
@@ -453,15 +1946,20 @@ inline bool execute_avx2_mul_mat(const event::op_mul_mat & request) noexcept {
           acc3 = _mm256_blendv_ps(acc3, zero, depth_reset_mask);
 
           for (uint64_t kk = 0; kk < depth; ++kk) {
-            const __m256 bv = _mm256_loadu_ps(packed_b + kk * vec_cols + j_offset);
+            const __m256 bv =
+                _mm256_loadu_ps(packed_b + kk * vec_cols + j_offset);
             acc0 = _mm256_add_ps(
-                acc0, _mm256_mul_ps(_mm256_set1_ps(a[(i + 0) * k + pb + kk]), bv));
+                acc0,
+                _mm256_mul_ps(_mm256_set1_ps(a[(i + 0) * k + pb + kk]), bv));
             acc1 = _mm256_add_ps(
-                acc1, _mm256_mul_ps(_mm256_set1_ps(a[(i + 1) * k + pb + kk]), bv));
+                acc1,
+                _mm256_mul_ps(_mm256_set1_ps(a[(i + 1) * k + pb + kk]), bv));
             acc2 = _mm256_add_ps(
-                acc2, _mm256_mul_ps(_mm256_set1_ps(a[(i + 2) * k + pb + kk]), bv));
+                acc2,
+                _mm256_mul_ps(_mm256_set1_ps(a[(i + 2) * k + pb + kk]), bv));
             acc3 = _mm256_add_ps(
-                acc3, _mm256_mul_ps(_mm256_set1_ps(a[(i + 3) * k + pb + kk]), bv));
+                acc3,
+                _mm256_mul_ps(_mm256_set1_ps(a[(i + 3) * k + pb + kk]), bv));
           }
 
           _mm256_storeu_ps(c + (i + 0) * n + j, acc0);
@@ -474,7 +1972,8 @@ inline bool execute_avx2_mul_mat(const event::op_mul_mat & request) noexcept {
           __m256 acc = _mm256_loadu_ps(c + i * n + j);
           acc = _mm256_blendv_ps(acc, zero, depth_reset_mask);
           for (uint64_t kk = 0; kk < depth; ++kk) {
-            const __m256 bv = _mm256_loadu_ps(packed_b + kk * vec_cols + j_offset);
+            const __m256 bv =
+                _mm256_loadu_ps(packed_b + kk * vec_cols + j_offset);
             acc = _mm256_add_ps(
                 acc, _mm256_mul_ps(_mm256_set1_ps(a[i * k + pb + kk]), bv));
           }
@@ -502,100 +2001,249 @@ inline bool execute_avx2_mul_mat(const event::op_mul_mat & request) noexcept {
 
   return valid;
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 }
 
-EMEL_KERNEL_X86_AVX2_TARGET
-inline bool execute_avx2_unary(const event::op_unary & request) noexcept {
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline bool
+execute_avx2_fma_mul_mat(const event::op_mul_mat &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
-#if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint8_t subop_code = static_cast<uint8_t>(request.subop);
-  const size_t is_abs =
-      static_cast<size_t>(subop_code == static_cast<uint8_t>(event::unary_subop::abs));
-  const size_t is_neg =
-      static_cast<size_t>(subop_code == static_cast<uint8_t>(event::unary_subop::neg));
-  const size_t is_relu =
-      static_cast<size_t>(subop_code == static_cast<uint8_t>(event::unary_subop::relu));
-  const size_t kernel_index = is_abs * 1u + is_neg * 2u + is_relu * 3u;
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * src = static_cast<const float *>(request.src0.data);
-  float * dst = static_cast<float *>(request.dst.data);
-  using unary_kernel_t = void (*)(const float *, float *, uint64_t) noexcept;
-  constexpr unary_kernel_t noop_kernel = +[](const float *, float *, uint64_t) noexcept {};
-  constexpr std::array<unary_kernel_t, 4> kernels = {
-      noop_kernel,
-      execute_avx2_unary_abs,
-      execute_avx2_unary_neg,
-      execute_avx2_unary_relu,
-  };
-  kernels[kernel_index](src, dst, count);
-  return kernel_index != 0u;
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t m = request.src0.ne[1];
+  const uint64_t n = request.src1.ne[0];
+  const bool valid_dims = k != 0 && m != 0 && n != 0;
+  const bool valid_layout = request.src1.ne[1] == k && request.dst.ne[0] == n &&
+                            request.dst.ne[1] == m;
+  const bool valid = valid_dims && valid_layout;
+  const uint64_t valid_u64 = static_cast<uint64_t>(valid);
+  const float *a = static_cast<const float *>(request.src0.data);
+  const float *b = static_cast<const float *>(request.src1.data);
+  float *c = static_cast<float *>(request.dst.data);
+
+  constexpr uint64_t row_block = 4;
+  constexpr uint64_t col_vec = 8;
+  constexpr uint64_t col_block = 64;
+  constexpr uint64_t depth_block = 64;
+  alignas(64) static thread_local float packed_b[depth_block * col_block];
+
+  for (uint64_t jb = 0; jb < n * valid_u64; jb += col_block) {
+    const uint64_t j_end = std::min<uint64_t>(n, jb + col_block);
+    const uint64_t vec_cols = ((j_end - jb) / col_vec) * col_vec;
+    const uint64_t j_vec_end = jb + vec_cols;
+
+    for (uint64_t pb = 0; pb < k * valid_u64; pb += depth_block) {
+      const uint64_t depth = std::min<uint64_t>(depth_block, k - pb);
+      const bool first_depth_block = (pb == 0);
+      const __m256 zero = _mm256_setzero_ps();
+      const __m256 depth_reset_mask = _mm256_castsi256_ps(
+          _mm256_set1_epi32(-static_cast<int32_t>(first_depth_block)));
+
+      for (uint64_t kk = 0; kk < depth; ++kk) {
+        const float *b_src = b + (pb + kk) * n + jb;
+        float *b_dst = packed_b + kk * vec_cols;
+        std::memcpy(b_dst, b_src,
+                    static_cast<size_t>(vec_cols) * sizeof(float));
+#if defined(__GNUC__) || defined(__clang__)
+        const uint64_t prefetch_distance =
+            16u * static_cast<uint64_t>((kk & 15u) == 0u && kk + 16u < depth);
+        _mm_prefetch(reinterpret_cast<const char *>(
+                         b + (pb + kk + prefetch_distance) * n + jb),
+                     _MM_HINT_T0);
+#endif
+      }
+
+      for (uint64_t j = jb; j < j_vec_end; j += col_vec) {
+        const uint64_t j_offset = j - jb;
+        uint64_t i = 0;
+        for (; i + row_block <= m; i += row_block) {
+          __m256 acc0 = _mm256_loadu_ps(c + (i + 0) * n + j);
+          __m256 acc1 = _mm256_loadu_ps(c + (i + 1) * n + j);
+          __m256 acc2 = _mm256_loadu_ps(c + (i + 2) * n + j);
+          __m256 acc3 = _mm256_loadu_ps(c + (i + 3) * n + j);
+          acc0 = _mm256_blendv_ps(acc0, zero, depth_reset_mask);
+          acc1 = _mm256_blendv_ps(acc1, zero, depth_reset_mask);
+          acc2 = _mm256_blendv_ps(acc2, zero, depth_reset_mask);
+          acc3 = _mm256_blendv_ps(acc3, zero, depth_reset_mask);
+
+          for (uint64_t kk = 0; kk < depth; ++kk) {
+            const __m256 bv =
+                _mm256_loadu_ps(packed_b + kk * vec_cols + j_offset);
+            acc0 = _mm256_fmadd_ps(_mm256_set1_ps(a[(i + 0) * k + pb + kk]),
+                                   bv, acc0);
+            acc1 = _mm256_fmadd_ps(_mm256_set1_ps(a[(i + 1) * k + pb + kk]),
+                                   bv, acc1);
+            acc2 = _mm256_fmadd_ps(_mm256_set1_ps(a[(i + 2) * k + pb + kk]),
+                                   bv, acc2);
+            acc3 = _mm256_fmadd_ps(_mm256_set1_ps(a[(i + 3) * k + pb + kk]),
+                                   bv, acc3);
+          }
+
+          _mm256_storeu_ps(c + (i + 0) * n + j, acc0);
+          _mm256_storeu_ps(c + (i + 1) * n + j, acc1);
+          _mm256_storeu_ps(c + (i + 2) * n + j, acc2);
+          _mm256_storeu_ps(c + (i + 3) * n + j, acc3);
+        }
+
+        for (; i < m; ++i) {
+          __m256 acc = _mm256_loadu_ps(c + i * n + j);
+          acc = _mm256_blendv_ps(acc, zero, depth_reset_mask);
+          for (uint64_t kk = 0; kk < depth; ++kk) {
+            const __m256 bv =
+                _mm256_loadu_ps(packed_b + kk * vec_cols + j_offset);
+            acc = _mm256_fmadd_ps(_mm256_set1_ps(a[i * k + pb + kk]), bv, acc);
+          }
+          _mm256_storeu_ps(c + i * n + j, acc);
+        }
+      }
+
+      const uint32_t keep_existing_mask =
+          static_cast<uint32_t>(-static_cast<int32_t>(!first_depth_block));
+      for (uint64_t j = j_vec_end; j < j_end; ++j) {
+        for (uint64_t i = 0; i < m; ++i) {
+          uint32_t acc_bits = 0u;
+          std::memcpy(&acc_bits, c + i * n + j, sizeof(acc_bits));
+          acc_bits &= keep_existing_mask;
+          float acc = 0.0f;
+          std::memcpy(&acc, &acc_bits, sizeof(acc));
+          for (uint64_t kk = 0; kk < depth; ++kk) {
+            acc += a[i * k + pb + kk] * b[(pb + kk) * n + j];
+          }
+          c[i * n + j] = acc;
+        }
+      }
+    }
+  }
+
+  return valid;
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 #else
-  (void) request;
+  (void)request;
   return false;
 #endif
 }
 
+EMEL_KERNEL_X86_AVX2_FMA_TARGET
+inline void execute_avx2_fma_mul_mat_f32_vector_unchecked(
+    const event::op_mul_mat &request) noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+#if (defined(__AVX2__) && defined(__FMA__)) || defined(__GNUC__) ||            \
+    defined(__clang__)
+  const uint64_t k = request.src0.ne[0];
+  const uint64_t m = request.src0.ne[1];
+  const float *a = static_cast<const float *>(request.src0.data);
+  const float *b = static_cast<const float *>(request.src1.data);
+  float *c = static_cast<float *>(request.dst.data);
+
+  const uint64_t vec_depth = (k / 32u) * 32u;
+  for (uint64_t i = 0; i < m; ++i) {
+    const float *row = a + i * k;
+    __m256 acc0 = _mm256_setzero_ps();
+    __m256 acc1 = _mm256_setzero_ps();
+    __m256 acc2 = _mm256_setzero_ps();
+    __m256 acc3 = _mm256_setzero_ps();
+    for (uint64_t kk = 0; kk < vec_depth; kk += 32u) {
+      acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(row + kk),
+                             _mm256_loadu_ps(b + kk), acc0);
+      acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(row + kk + 8u),
+                             _mm256_loadu_ps(b + kk + 8u), acc1);
+      acc2 = _mm256_fmadd_ps(_mm256_loadu_ps(row + kk + 16u),
+                             _mm256_loadu_ps(b + kk + 16u), acc2);
+      acc3 = _mm256_fmadd_ps(_mm256_loadu_ps(row + kk + 24u),
+                             _mm256_loadu_ps(b + kk + 24u), acc3);
+    }
+    uint64_t kk = vec_depth;
+    for (; kk + 8u <= k; kk += 8u) {
+      acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(row + kk),
+                             _mm256_loadu_ps(b + kk), acc0);
+    }
+    const __m256 acc01 = _mm256_add_ps(acc0, acc1);
+    const __m256 acc23 = _mm256_add_ps(acc2, acc3);
+    const __m256 acc = _mm256_add_ps(acc01, acc23);
+    const __m128 low = _mm256_castps256_ps128(acc);
+    const __m128 high = _mm256_extractf128_ps(acc, 1);
+    __m128 sum = _mm_add_ps(low, high);
+    sum = _mm_hadd_ps(sum, sum);
+    sum = _mm_hadd_ps(sum, sum);
+    float total = _mm_cvtss_f32(sum);
+    for (; kk < k; ++kk) {
+      total += row[kk] * b[kk];
+    }
+    c[i] = total;
+  }
+  return;
+#endif
+#endif
+  (void)request;
+}
+
 EMEL_KERNEL_X86_AVX2_TARGET
-inline void execute_avx2_unary_abs_request(const event::op_unary & request) noexcept {
+inline void
+execute_avx2_unary_abs_request(const event::op_unary &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * src = static_cast<const float *>(request.src0.data);
-  float * dst = static_cast<float *>(request.dst.data);
+  const uint64_t count =
+      ::emel::kernel::detail::tensor_element_count(request.dst);
+  const float *src = static_cast<const float *>(request.src0.data);
+  float *dst = static_cast<float *>(request.dst.data);
   execute_avx2_unary_abs(src, dst, count);
 #else
-  (void) request;
+  (void)request;
 #endif
 #else
-  (void) request;
+  (void)request;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline void execute_avx2_unary_neg_request(const event::op_unary & request) noexcept {
+inline void
+execute_avx2_unary_neg_request(const event::op_unary &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * src = static_cast<const float *>(request.src0.data);
-  float * dst = static_cast<float *>(request.dst.data);
+  const uint64_t count =
+      ::emel::kernel::detail::tensor_element_count(request.dst);
+  const float *src = static_cast<const float *>(request.src0.data);
+  float *dst = static_cast<float *>(request.dst.data);
   execute_avx2_unary_neg(src, dst, count);
 #else
-  (void) request;
+  (void)request;
 #endif
 #else
-  (void) request;
+  (void)request;
 #endif
 }
 
 EMEL_KERNEL_X86_AVX2_TARGET
-inline void execute_avx2_unary_relu_request(const event::op_unary & request) noexcept {
+inline void
+execute_avx2_unary_relu_request(const event::op_unary &request) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
 #if defined(__AVX2__) || defined(__GNUC__) || defined(__clang__)
-  const uint64_t count = ::emel::kernel::detail::tensor_element_count(request.dst);
-  const float * src = static_cast<const float *>(request.src0.data);
-  float * dst = static_cast<float *>(request.dst.data);
+  const uint64_t count =
+      ::emel::kernel::detail::tensor_element_count(request.dst);
+  const float *src = static_cast<const float *>(request.src0.data);
+  float *dst = static_cast<float *>(request.dst.data);
   execute_avx2_unary_relu(src, dst, count);
 #else
-  (void) request;
+  (void)request;
 #endif
 #else
-  (void) request;
+  (void)request;
 #endif
 }
 
 template <event::unary_subop subop>
-inline void execute_simd_unary_subop_unchecked(const event::op_unary & request) noexcept {
+inline void
+execute_simd_unary_subop_unchecked(const event::op_unary &request) noexcept {
   if constexpr (subop == event::unary_subop::abs) {
     execute_avx2_unary_abs_request(request);
   }
@@ -608,38 +2256,35 @@ inline void execute_simd_unary_subop_unchecked(const event::op_unary & request)
 }
 
 template <class request_type>
-inline void execute_simd_unchecked(const request_type & request) noexcept {
+inline void execute_simd_unchecked(const request_type &request) noexcept {
   if constexpr (std::is_same_v<request_type, event::op_dup>) {
-    (void) execute_avx2_dup(request);
+    (void)execute_avx2_dup(request);
   }
   if constexpr (std::is_same_v<request_type, event::op_add>) {
-    (void) execute_avx2_add(request);
+    (void)execute_avx2_add(request);
   }
   if constexpr (std::is_same_v<request_type, event::op_sub>) {
-    (void) execute_avx2_sub(request);
+    (void)execute_avx2_sub(request);
   }
   if constexpr (std::is_same_v<request_type, event::op_mul>) {
-    (void) execute_avx2_mul(request);
+    (void)execute_avx2_mul(request);
   }
   if constexpr (std::is_same_v<request_type, event::op_div>) {
-    (void) execute_avx2_div(request);
+    (void)execute_avx2_div(request);
   }
   if constexpr (std::is_same_v<request_type, event::op_sqr>) {
-    (void) execute_avx2_sqr(request);
+    (void)execute_avx2_sqr(request);
   }
   if constexpr (std::is_same_v<request_type, event::op_sqrt>) {
-    (void) execute_avx2_sqrt(request);
+    (void)execute_avx2_sqrt(request);
   }
   if constexpr (std::is_same_v<request_type, event::op_mul_mat>) {
-    (void) execute_avx2_mul_mat(request);
-  }
-  if constexpr (std::is_same_v<request_type, event::op_unary>) {
-    (void) execute_avx2_unary(request);
+    (void)execute_avx2_mul_mat(request);
   }
 }
 
 template <class request_type>
-inline bool execute_simd(const request_type & request) noexcept {
+inline bool execute_simd(const request_type &request) noexcept {
   if constexpr (std::is_same_v<request_type, event::op_dup>) {
     return execute_avx2_dup(request);
   }
@@ -664,37 +2309,36 @@ inline bool execute_simd(const request_type & request) noexcept {
   if constexpr (std::is_same_v<request_type, event::op_mul_mat>) {
     return execute_avx2_mul_mat(request);
   }
-  if constexpr (std::is_same_v<request_type, event::op_unary>) {
-    return execute_avx2_unary(request);
-  }
   return false;
 }
 
 template <class request_type, class context_type>
-inline bool execute_request(const request_type & request, const context_type & ctx) noexcept {
+inline bool execute_request(const request_type &request,
+                            const context_type &ctx) noexcept {
 #if defined(__x86_64__) || defined(_M_X64)
-  const bool simd_succeeded = can_use_avx2(request, ctx.avx2_available) && execute_simd(request);
+  const bool simd_succeeded =
+      can_use_avx2(request, ctx.avx2_available) && execute_simd(request);
   return simd_succeeded || ::emel::kernel::detail::execute_scalar(request);
 #else
-  (void) ctx;
+  (void)ctx;
   return ::emel::kernel::detail::execute_scalar(request);
 #endif
 }
 
-}  // namespace emel::kernel::x86_64::detail
+} // namespace emel::kernel::x86_64::detail
 namespace emel::kernel::x86_64::action {
 
 namespace detail {
 
 template <class dispatch_event_type>
-inline void mark_done(const dispatch_event_type & ev, context & ctx) noexcept {
+inline void mark_done(const dispatch_event_type &ev, context &ctx) noexcept {
   ++ctx.dispatch_generation;
   ev.ctx.outcome = events::phase_outcome::done;
   ev.ctx.err = static_cast<int32_t>(emel::error::cast(error::none));
 }
 
 template <class dispatch_event_type>
-inline void mark_error(const dispatch_event_type & ev, context & ctx,
+inline void mark_error(const dispatch_event_type &ev, context &ctx,
                        const int32_t err) noexcept {
   ++ctx.dispatch_generation;
   ev.ctx.outcome = events::phase_outcome::failed;
@@ -703,78 +2347,219 @@ inline void mark_error(const dispatch_event_type & ev, context & ctx,
 
 struct mark_done_op {
   template <class dispatch_event_type>
-  void operator()(const dispatch_event_type & ev, context & ctx) const noexcept {
+  void operator()(const dispatch_event_type &ev, context &ctx) const noexcept {
     mark_done(ev, ctx);
   }
 };
 
 struct exec_dispatch {
-  void operator()(const ::emel::kernel::x86_64::event::dispatch_request & ev,
-                  context & ctx) const noexcept {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_request &ev,
+                  context &ctx) const noexcept {
     detail::mark_done(ev, ctx);
   }
 };
 
-template <class dispatch_event_type>
-struct exec_scalar_op {
-  void operator()(const dispatch_event_type & ev, context & ctx) const noexcept {
+template <class dispatch_event_type> struct exec_scalar_op {
+  void operator()(const dispatch_event_type &ev, context &ctx) const noexcept {
     using request_type = std::remove_cvref_t<decltype(ev.request)>;
-    if constexpr (std::is_same_v<request_type, ::emel::kernel::event::op_flash_attn_ext>) {
-      if (::emel::kernel::detail::run_flash_attn_ext_with_workspace(ev.request,
-                                                                    ctx.flash_attn_workspace)) {
+    if constexpr (std::is_same_v<request_type,
+                                 ::emel::kernel::event::op_flash_attn_ext>) {
+      if (::emel::kernel::detail::run_flash_attn_ext_with_workspace(
+              ev.request, ctx.flash_attn_workspace)) {
+        ++ctx.shared_flash_dispatch_count;
         detail::mark_done(ev, ctx);
       } else {
         detail::mark_error(
-            ev, ctx, static_cast<int32_t>(emel::error::cast(error::invalid_request)));
+            ev, ctx,
+            static_cast<int32_t>(emel::error::cast(error::invalid_request)));
       }
     } else {
+      if constexpr (std::is_same_v<request_type,
+                                   ::emel::kernel::event::op_mul_mat>) {
+        const uint8_t src0_type =
+            ::emel::kernel::detail::dtype_code(ev.request.src0.type);
+        ctx.shared_q2_dispatch_count += static_cast<uint64_t>(
+            src0_type == ::emel::kernel::detail::dtype_q2_k);
+        ctx.shared_q3_dispatch_count += static_cast<uint64_t>(
+            src0_type == ::emel::kernel::detail::dtype_q3_k);
+        ctx.shared_q4_dispatch_count += static_cast<uint64_t>(
+            src0_type == ::emel::kernel::detail::dtype_q4_k);
+        ctx.shared_q6_dispatch_count += static_cast<uint64_t>(
+            src0_type == ::emel::kernel::detail::dtype_q6_k);
+        ctx.shared_q4_0_dispatch_count += static_cast<uint64_t>(
+            src0_type == ::emel::kernel::detail::dtype_q4_0);
+        ctx.shared_q4_1_dispatch_count += static_cast<uint64_t>(
+            src0_type == ::emel::kernel::detail::dtype_q4_1);
+        ctx.shared_q5_0_dispatch_count += static_cast<uint64_t>(
+            src0_type == ::emel::kernel::detail::dtype_q5_0);
+        ctx.shared_q8_0_dispatch_count += static_cast<uint64_t>(
+            src0_type == ::emel::kernel::detail::dtype_q8_0);
+      }
       ::emel::kernel::detail::execute_scalar_unchecked(ev.request);
       detail::mark_done(ev, ctx);
     }
   }
 };
 
-template <class dispatch_event_type>
-struct exec_simd_op {
-  void operator()(const dispatch_event_type & ev, context & ctx) const noexcept {
+struct exec_simd_flash_attn_ext_f16kv_one_chunk {
+  void operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_flash_attn_ext &ev,
+      context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::
+        run_flash_attn_ext_f16kv_one_chunk_avx2_fma_f16c_unchecked(
+            ev.request, ctx.flash_attn_workspace);
+    ++ctx.optimized_flash_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+struct effect_exec_simd_q2_k_q8_k_op_mul_mat {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::
+        execute_avx2_fma_mul_mat_q2_k_q8_k_unchecked(ev.request);
+    ++ctx.optimized_q2_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+struct effect_exec_simd_q3_k_q8_k_op_mul_mat {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::
+        execute_avx2_fma_mul_mat_q3_k_q8_k_unchecked(ev.request);
+    ++ctx.optimized_q3_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+struct effect_exec_simd_f32_fma_vector_op_mul_mat {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::
+        execute_avx2_fma_mul_mat_f32_vector_unchecked(ev.request);
+    ++ctx.optimized_f32_fma_vector_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+struct effect_exec_simd_f32_fma_op_mul_mat {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::execute_avx2_fma_mul_mat(ev.request);
+    ++ctx.optimized_f32_fma_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+struct effect_exec_simd_q4_k_q8_k_op_mul_mat {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::
+        execute_avx2_fma_mul_mat_q4_k_q8_k_unchecked(ev.request);
+    ++ctx.optimized_q4_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+struct effect_exec_simd_q6_k_q8_k_op_mul_mat {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::
+        execute_avx2_fma_mul_mat_q6_k_q8_k_unchecked(ev.request);
+    ++ctx.optimized_q6_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+struct effect_exec_simd_q4_0_q8_0_op_mul_mat {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::
+        execute_avx2_fma_mul_mat_q4_0_q8_0_unchecked(ev.request);
+    ++ctx.optimized_q4_0_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+struct effect_exec_simd_q4_1_q8_0_op_mul_mat {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::
+        execute_avx2_fma_mul_mat_q4_1_q8_0_unchecked(ev.request);
+    ++ctx.optimized_q4_1_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+struct effect_exec_simd_q5_0_q8_0_op_mul_mat {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::
+        execute_avx2_fma_mul_mat_q5_0_q8_0_unchecked(ev.request);
+    ++ctx.optimized_q5_0_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+struct effect_exec_simd_q8_0_q8_0_op_mul_mat {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::
+        execute_avx2_fma_mul_mat_q8_0_q8_0_unchecked(ev.request);
+    ++ctx.optimized_q8_0_dispatch_count;
+    detail::mark_done(ev, ctx);
+  }
+};
+
+template <class dispatch_event_type> struct exec_simd_op {
+  void operator()(const dispatch_event_type &ev, context &ctx) const noexcept {
     ::emel::kernel::x86_64::detail::execute_simd_unchecked(ev.request);
     detail::mark_done(ev, ctx);
   }
 };
 
-template <::emel::kernel::event::unary_subop subop>
-struct exec_simd_unary_op {
-  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_unary & ev,
-                  context & ctx) const noexcept {
-    ::emel::kernel::x86_64::detail::execute_simd_unary_subop_unchecked<subop>(ev.request);
+template <::emel::kernel::event::unary_subop subop> struct exec_simd_unary_op {
+  void operator()(const ::emel::kernel::x86_64::event::dispatch_op_unary &ev,
+                  context &ctx) const noexcept {
+    ::emel::kernel::x86_64::detail::execute_simd_unary_subop_unchecked<subop>(
+        ev.request);
     detail::mark_done(ev, ctx);
   }
 };
 
-template <class dispatch_event_type>
-struct reject_op {
-  void operator()(const dispatch_event_type & ev, context & ctx) const noexcept {
-    detail::mark_error(ev, ctx, static_cast<int32_t>(emel::error::cast(error::invalid_request)));
+template <class dispatch_event_type> struct reject_op {
+  void operator()(const dispatch_event_type &ev, context &ctx) const noexcept {
+    detail::mark_error(
+        ev, ctx,
+        static_cast<int32_t>(emel::error::cast(error::invalid_request)));
   }
 };
 
-}  // namespace detail
+} // namespace detail
 
 using exec_dispatch_t = detail::exec_dispatch;
 
-#define EMEL_KERNEL_DECLARE_RUN_TYPE(op_name)                                \
-  using exec_##op_name##_t =                                                  \
-      detail::exec_scalar_op<::emel::kernel::x86_64::event::dispatch_##op_name>;
+#define EMEL_KERNEL_DECLARE_RUN_TYPE(op_name)                                  \
+  using exec_##op_name##_t = detail::exec_scalar_op<                           \
+      ::emel::kernel::x86_64::event::dispatch_##op_name>;
 EMEL_KERNEL_OP_EVENT_LIST(EMEL_KERNEL_DECLARE_RUN_TYPE)
 #undef EMEL_KERNEL_DECLARE_RUN_TYPE
 
-using exec_simd_op_dup_t = detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_dup>;
-using exec_simd_op_add_t = detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_add>;
-using exec_simd_op_sub_t = detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_sub>;
-using exec_simd_op_mul_t = detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_mul>;
-using exec_simd_op_div_t = detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_div>;
-using exec_simd_op_sqr_t = detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_sqr>;
-using exec_simd_op_sqrt_t = detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_sqrt>;
+using exec_simd_op_dup_t =
+    detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_dup>;
+using exec_simd_op_add_t =
+    detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_add>;
+using exec_simd_op_sub_t =
+    detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_sub>;
+using exec_simd_op_mul_t =
+    detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_mul>;
+using exec_simd_op_div_t =
+    detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_div>;
+using exec_simd_op_sqr_t =
+    detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_sqr>;
+using exec_simd_op_sqrt_t =
+    detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_sqrt>;
 using exec_simd_op_mul_mat_t =
     detail::exec_simd_op<::emel::kernel::x86_64::event::dispatch_op_mul_mat>;
 using exec_simd_op_unary_abs_t =
@@ -783,30 +2568,55 @@ using exec_simd_op_unary_neg_t =
     detail::exec_simd_unary_op<::emel::kernel::event::unary_subop::neg>;
 using exec_simd_op_unary_relu_t =
     detail::exec_simd_unary_op<::emel::kernel::event::unary_subop::relu>;
+using exec_simd_op_flash_attn_ext_f16kv_one_chunk_t =
+    detail::exec_simd_flash_attn_ext_f16kv_one_chunk;
+using effect_exec_simd_op_mul_mat_q2_k_q8_k_t =
+    detail::effect_exec_simd_q2_k_q8_k_op_mul_mat;
+using effect_exec_simd_op_mul_mat_q3_k_q8_k_t =
+    detail::effect_exec_simd_q3_k_q8_k_op_mul_mat;
+using effect_exec_simd_op_mul_mat_f32_fma_vector_t =
+    detail::effect_exec_simd_f32_fma_vector_op_mul_mat;
+using effect_exec_simd_op_mul_mat_f32_fma_t =
+    detail::effect_exec_simd_f32_fma_op_mul_mat;
+using effect_exec_simd_op_mul_mat_q4_k_q8_k_t =
+    detail::effect_exec_simd_q4_k_q8_k_op_mul_mat;
+using effect_exec_simd_op_mul_mat_q6_k_q8_k_t =
+    detail::effect_exec_simd_q6_k_q8_k_op_mul_mat;
+using effect_exec_simd_op_mul_mat_q4_0_q8_0_t =
+    detail::effect_exec_simd_q4_0_q8_0_op_mul_mat;
+using effect_exec_simd_op_mul_mat_q4_1_q8_0_t =
+    detail::effect_exec_simd_q4_1_q8_0_op_mul_mat;
+using effect_exec_simd_op_mul_mat_q5_0_q8_0_t =
+    detail::effect_exec_simd_q5_0_q8_0_op_mul_mat;
+using effect_exec_simd_op_mul_mat_q8_0_q8_0_t =
+    detail::effect_exec_simd_q8_0_q8_0_op_mul_mat;
 using exec_scalar_op_unary_abs_t = ::emel::kernel::detail::exec_scalar_unary_op<
-    ::emel::kernel::x86_64::event::dispatch_op_unary, context, detail::mark_done_op,
-    ::emel::kernel::event::unary_subop::abs>;
+    ::emel::kernel::x86_64::event::dispatch_op_unary, context,
+    detail::mark_done_op, ::emel::kernel::event::unary_subop::abs>;
 using exec_scalar_op_unary_neg_t = ::emel::kernel::detail::exec_scalar_unary_op<
-    ::emel::kernel::x86_64::event::dispatch_op_unary, context, detail::mark_done_op,
-    ::emel::kernel::event::unary_subop::neg>;
-using exec_scalar_op_unary_relu_t = ::emel::kernel::detail::exec_scalar_unary_op<
-    ::emel::kernel::x86_64::event::dispatch_op_unary, context, detail::mark_done_op,
-    ::emel::kernel::event::unary_subop::relu>;
+    ::emel::kernel::x86_64::event::dispatch_op_unary, context,
+    detail::mark_done_op, ::emel::kernel::event::unary_subop::neg>;
+using exec_scalar_op_unary_relu_t =
+    ::emel::kernel::detail::exec_scalar_unary_op<
+        ::emel::kernel::x86_64::event::dispatch_op_unary, context,
+        detail::mark_done_op, ::emel::kernel::event::unary_subop::relu>;
 using exec_scalar_op_unary_exp_t = ::emel::kernel::detail::exec_scalar_unary_op<
-    ::emel::kernel::x86_64::event::dispatch_op_unary, context, detail::mark_done_op,
-    ::emel::kernel::event::unary_subop::exp>;
+    ::emel::kernel::x86_64::event::dispatch_op_unary, context,
+    detail::mark_done_op, ::emel::kernel::event::unary_subop::exp>;
 
-#define EMEL_KERNEL_DECLARE_REJECT_TYPE(op_name)                                      \
-  using reject_invalid_##op_name##_t =                                                \
+#define EMEL_KERNEL_DECLARE_REJECT_TYPE(op_name)                               \
+  using reject_invalid_##op_name##_t =                                         \
       detail::reject_op<::emel::kernel::x86_64::event::dispatch_##op_name>;
 EMEL_KERNEL_OP_EVENT_LIST(EMEL_KERNEL_DECLARE_REJECT_TYPE)
 #undef EMEL_KERNEL_DECLARE_REJECT_TYPE
 
 struct on_unexpected {
   template <class event_type>
-  void operator()(const event_type & ev, context & ctx) const noexcept {
+  void operator()(const event_type &ev, context &ctx) const noexcept {
     if constexpr (requires { ev.ctx; }) {
-      detail::mark_error(ev, ctx, static_cast<int32_t>(emel::error::cast(error::internal_error)));
+      detail::mark_error(
+          ev, ctx,
+          static_cast<int32_t>(emel::error::cast(error::internal_error)));
     } else {
       ++ctx.dispatch_generation;
     }
@@ -825,21 +2635,43 @@ inline constexpr exec_simd_op_mul_mat_t exec_simd_op_mul_mat{};
 inline constexpr exec_simd_op_unary_abs_t exec_simd_op_unary_abs{};
 inline constexpr exec_simd_op_unary_neg_t exec_simd_op_unary_neg{};
 inline constexpr exec_simd_op_unary_relu_t exec_simd_op_unary_relu{};
+inline constexpr exec_simd_op_flash_attn_ext_f16kv_one_chunk_t
+    exec_simd_op_flash_attn_ext_f16kv_one_chunk{};
+inline constexpr effect_exec_simd_op_mul_mat_q2_k_q8_k_t
+    effect_exec_simd_op_mul_mat_q2_k_q8_k{};
+inline constexpr effect_exec_simd_op_mul_mat_q3_k_q8_k_t
+    effect_exec_simd_op_mul_mat_q3_k_q8_k{};
+inline constexpr effect_exec_simd_op_mul_mat_f32_fma_vector_t
+    effect_exec_simd_op_mul_mat_f32_fma_vector{};
+inline constexpr effect_exec_simd_op_mul_mat_f32_fma_t
+    effect_exec_simd_op_mul_mat_f32_fma{};
+inline constexpr effect_exec_simd_op_mul_mat_q4_k_q8_k_t
+    effect_exec_simd_op_mul_mat_q4_k_q8_k{};
+inline constexpr effect_exec_simd_op_mul_mat_q6_k_q8_k_t
+    effect_exec_simd_op_mul_mat_q6_k_q8_k{};
+inline constexpr effect_exec_simd_op_mul_mat_q4_0_q8_0_t
+    effect_exec_simd_op_mul_mat_q4_0_q8_0{};
+inline constexpr effect_exec_simd_op_mul_mat_q4_1_q8_0_t
+    effect_exec_simd_op_mul_mat_q4_1_q8_0{};
+inline constexpr effect_exec_simd_op_mul_mat_q5_0_q8_0_t
+    effect_exec_simd_op_mul_mat_q5_0_q8_0{};
+inline constexpr effect_exec_simd_op_mul_mat_q8_0_q8_0_t
+    effect_exec_simd_op_mul_mat_q8_0_q8_0{};
 inline constexpr exec_scalar_op_unary_abs_t exec_scalar_op_unary_abs{};
 inline constexpr exec_scalar_op_unary_neg_t exec_scalar_op_unary_neg{};
 inline constexpr exec_scalar_op_unary_relu_t exec_scalar_op_unary_relu{};
 inline constexpr exec_scalar_op_unary_exp_t exec_scalar_op_unary_exp{};
 
-#define EMEL_KERNEL_DEFINE_RUN_ACTION(op_name) \
+#define EMEL_KERNEL_DEFINE_RUN_ACTION(op_name)                                 \
   inline constexpr exec_##op_name##_t exec_##op_name{};
 EMEL_KERNEL_OP_EVENT_LIST(EMEL_KERNEL_DEFINE_RUN_ACTION)
 #undef EMEL_KERNEL_DEFINE_RUN_ACTION
 
-#define EMEL_KERNEL_DEFINE_REJECT_ACTION(op_name)            \
+#define EMEL_KERNEL_DEFINE_REJECT_ACTION(op_name)                              \
   inline constexpr reject_invalid_##op_name##_t reject_invalid_##op_name{};
 EMEL_KERNEL_OP_EVENT_LIST(EMEL_KERNEL_DEFINE_REJECT_ACTION)
 #undef EMEL_KERNEL_DEFINE_REJECT_ACTION
 
 inline constexpr on_unexpected on_unexpected{};
 
-}  // namespace emel::kernel::x86_64::action
+} // namespace emel::kernel::x86_64::action
diff --git a/src/emel/kernel/x86_64/context.hpp b/src/emel/kernel/x86_64/context.hpp
index a060db14..aa41d7d9 100644
--- a/src/emel/kernel/x86_64/context.hpp
+++ b/src/emel/kernel/x86_64/context.hpp
@@ -2,32 +2,243 @@
 
 #include <cstdint>
 
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#elif (defined(__GNUC__) || defined(__clang__)) &&                             \
+    (defined(__x86_64__) || defined(__i386__))
+#include <cpuid.h>
+#endif
+
 #include "emel/kernel/detail.hpp"
 
-namespace emel::kernel::x86_64::action {
+namespace emel::kernel::x86_64::detail {
 
-namespace detail {
+struct cpuid_registers {
+  uint32_t eax = 0;
+  uint32_t ebx = 0;
+  uint32_t ecx = 0;
+  uint32_t edx = 0;
+};
+
+inline cpuid_registers read_cpuid(const uint32_t leaf,
+                                  const uint32_t subleaf) noexcept {
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+  int regs[4] = {};
+  __cpuidex(regs, static_cast<int>(leaf), static_cast<int>(subleaf));
+  return cpuid_registers{
+      .eax = static_cast<uint32_t>(regs[0]),
+      .ebx = static_cast<uint32_t>(regs[1]),
+      .ecx = static_cast<uint32_t>(regs[2]),
+      .edx = static_cast<uint32_t>(regs[3]),
+  };
+#elif (defined(__GNUC__) || defined(__clang__)) &&                             \
+    (defined(__x86_64__) || defined(__i386__))
+  uint32_t eax = 0;
+  uint32_t ebx = 0;
+  uint32_t ecx = 0;
+  uint32_t edx = 0;
+  __cpuid_count(leaf, subleaf, eax, ebx, ecx, edx);
+  return cpuid_registers{.eax = eax, .ebx = ebx, .ecx = ecx, .edx = edx};
+#else
+  (void)leaf;
+  (void)subleaf;
+  return {};
+#endif
+}
+
+inline uint64_t read_xcr0() noexcept {
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+  return _xgetbv(0);
+#elif (defined(__GNUC__) || defined(__clang__)) &&                             \
+    (defined(__x86_64__) || defined(__i386__))
+  uint32_t eax = 0;
+  uint32_t edx = 0;
+  __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
+  return (static_cast<uint64_t>(edx) << 32u) | eax;
+#else
+  return 0u;
+#endif
+}
+
+inline bool os_supports_avx_state() noexcept {
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||             \
+    defined(_M_IX86)
+  constexpr uint32_t XSAVE_BIT = 1u << 26u;
+  constexpr uint32_t OSXSAVE_BIT = 1u << 27u;
+  constexpr uint64_t XMM_YMM_STATE_BITS = 0x6u;
+  const cpuid_registers leaf1 = read_cpuid(1u, 0u);
+  if ((leaf1.ecx & XSAVE_BIT) == 0u || (leaf1.ecx & OSXSAVE_BIT) == 0u) {
+    return false;
+  }
+  return (read_xcr0() & XMM_YMM_STATE_BITS) == XMM_YMM_STATE_BITS;
+#else
+  return false;
+#endif
+}
 
 inline bool detect_avx2() noexcept {
-#if defined(__x86_64__) || defined(_M_X64)
-#if defined(__GNUC__) || defined(__clang__)
-  __builtin_cpu_init();
-  return __builtin_cpu_supports("avx2");
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||             \
+    defined(_M_IX86)
+  constexpr uint32_t AVX_BIT = 1u << 28u;
+  constexpr uint32_t AVX2_BIT = 1u << 5u;
+  const cpuid_registers max_leaf = read_cpuid(0u, 0u);
+  if (max_leaf.eax < 7u || !os_supports_avx_state()) {
+    return false;
+  }
+  const cpuid_registers leaf1 = read_cpuid(1u, 0u);
+  const cpuid_registers leaf7 = read_cpuid(7u, 0u);
+  return (leaf1.ecx & AVX_BIT) != 0u && (leaf7.ebx & AVX2_BIT) != 0u;
+#else
+  return false;
+#endif
+}
+
+inline bool detect_fma() noexcept {
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||             \
+    defined(_M_IX86)
+  constexpr uint32_t AVX_BIT = 1u << 28u;
+  constexpr uint32_t FMA_BIT = 1u << 12u;
+  if (!os_supports_avx_state()) {
+    return false;
+  }
+  const cpuid_registers leaf1 = read_cpuid(1u, 0u);
+  return (leaf1.ecx & AVX_BIT) != 0u && (leaf1.ecx & FMA_BIT) != 0u;
 #else
   return false;
 #endif
+}
+
+inline bool detect_f16c() noexcept {
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||             \
+    defined(_M_IX86)
+  constexpr uint32_t AVX_BIT = 1u << 28u;
+  constexpr uint32_t F16C_BIT = 1u << 29u;
+  if (!os_supports_avx_state()) {
+    return false;
+  }
+  const cpuid_registers leaf1 = read_cpuid(1u, 0u);
+  return (leaf1.ecx & AVX_BIT) != 0u && (leaf1.ecx & F16C_BIT) != 0u;
 #else
   return false;
 #endif
 }
 
-}  // namespace detail
+struct host_feature_contract {
+  bool avx2_available = false;
+  bool fma_available = false;
+  bool f16c_available = false;
+  bool avx512_claimed = false;
+  bool avx_vnni_claimed = false;
+  bool amx_claimed = false;
+  bool bf16_claimed = false;
+  bool native_fp16_claimed = false;
+
+  bool avx2_fma_f16c_available() const noexcept {
+    return avx2_available && fma_available && f16c_available;
+  }
+};
+
+inline host_feature_contract detect_host_feature_contract() noexcept {
+  return host_feature_contract{
+      .avx2_available = detect_avx2(),
+      .fma_available = detect_fma(),
+      .f16c_available = detect_f16c(),
+      .avx512_claimed = false,
+      .avx_vnni_claimed = false,
+      .amx_claimed = false,
+      .bf16_claimed = false,
+      .native_fp16_claimed = false,
+  };
+}
+
+} // namespace emel::kernel::x86_64::detail
+
+namespace emel::kernel::x86_64::action {
+
+namespace detail {
+
+inline bool detect_avx2() noexcept {
+  return ::emel::kernel::x86_64::detail::detect_avx2();
+}
+
+inline bool detect_fma() noexcept {
+  return ::emel::kernel::x86_64::detail::detect_fma();
+}
+
+inline bool detect_f16c() noexcept {
+  return ::emel::kernel::x86_64::detail::detect_f16c();
+}
+
+} // namespace detail
 
 struct context {
-  const bool avx2_available = detail::detect_avx2();
-  ::emel::kernel::detail::flash_attn_workspace flash_attn_workspace = {};
-  // TODO(emel): remove once dispatch observability no longer relies on this counter.
+  using host_feature_contract =::emel::kernel::x86_64::detail::host_feature_contract;
+
+  context() noexcept
+      : context(::emel::kernel::x86_64::detail::detect_host_feature_contract(),
+                {}, 0) {}
+
+  context(const bool avx2,
+          const ::emel::kernel::detail::flash_attn_workspace &workspace,
+          const uint64_t generation) noexcept
+      : context(
+            host_feature_contract{
+                .avx2_available = avx2,
+                .fma_available = detail::detect_fma(),
+                .f16c_available = detail::detect_f16c(),
+                .avx512_claimed = false,
+                .avx_vnni_claimed = false,
+                .amx_claimed = false,
+                .bf16_claimed = false,
+                .native_fp16_claimed = false,
+            },
+            workspace, generation) {}
+
+  context(const host_feature_contract &contract,
+          const ::emel::kernel::detail::flash_attn_workspace &workspace,
+          const uint64_t generation) noexcept
+      : host_features(contract), avx2_available(contract.avx2_available),
+        fma_available(contract.fma_available),
+        f16c_available(contract.f16c_available),
+        avx512_claimed(contract.avx512_claimed),
+        avx_vnni_claimed(contract.avx_vnni_claimed),
+        amx_claimed(contract.amx_claimed), bf16_claimed(contract.bf16_claimed),
+        native_fp16_claimed(contract.native_fp16_claimed),
+        flash_attn_workspace(workspace), dispatch_generation(generation) {}
+
+  const host_feature_contract host_features;
+  const bool avx2_available;
+  const bool fma_available;
+  const bool f16c_available;
+  const bool avx512_claimed;
+  const bool avx_vnni_claimed;
+  const bool amx_claimed;
+  const bool bf16_claimed;
+  const bool native_fp16_claimed;
+  ::emel::kernel::detail::flash_attn_workspace flash_attn_workspace;
+  uint64_t optimized_flash_dispatch_count = 0;
+  uint64_t shared_flash_dispatch_count = 0;
+  uint64_t optimized_q2_dispatch_count = 0;
+  uint64_t shared_q2_dispatch_count = 0;
+  uint64_t optimized_q3_dispatch_count = 0;
+  uint64_t shared_q3_dispatch_count = 0;
+  uint64_t optimized_f32_fma_dispatch_count = 0;
+  uint64_t optimized_f32_fma_vector_dispatch_count = 0;
+  uint64_t optimized_q4_dispatch_count = 0;
+  uint64_t shared_q4_dispatch_count = 0;
+  uint64_t optimized_q6_dispatch_count = 0;
+  uint64_t shared_q6_dispatch_count = 0;
+  uint64_t optimized_q4_0_dispatch_count = 0;
+  uint64_t shared_q4_0_dispatch_count = 0;
+  uint64_t optimized_q4_1_dispatch_count = 0;
+  uint64_t shared_q4_1_dispatch_count = 0;
+  uint64_t optimized_q5_0_dispatch_count = 0;
+  uint64_t shared_q5_0_dispatch_count = 0;
+  uint64_t optimized_q8_0_dispatch_count = 0;
+  uint64_t shared_q8_0_dispatch_count = 0;
+  // TODO(emel): remove once dispatch observability no longer relies on this
+  // counter.
   uint64_t dispatch_generation = 0;
 };
 
-}  // namespace emel::kernel::x86_64::action
+} // namespace emel::kernel::x86_64::action
diff --git a/src/emel/kernel/x86_64/guards.hpp b/src/emel/kernel/x86_64/guards.hpp
index 9a1f5b9e..76d4ed7f 100644
--- a/src/emel/kernel/x86_64/guards.hpp
+++ b/src/emel/kernel/x86_64/guards.hpp
@@ -1,46 +1,239 @@
 #pragma once
 
-#include "emel/kernel/x86_64/actions.hpp"
 #include "emel/kernel/detail.hpp"
+#include "emel/kernel/x86_64/actions.hpp"
 #include "emel/kernel/x86_64/context.hpp"
 #include "emel/kernel/x86_64/events.hpp"
 
 namespace emel::kernel::x86_64::guard {
 
-template <class dispatch_event_type>
-struct simd_op {
-  bool operator()(const dispatch_event_type & ev, const action::context & ctx) const noexcept {
+template <class dispatch_event_type> struct simd_op {
+  bool operator()(const dispatch_event_type &ev,
+                  const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false ;
+    }
+    return ::emel::kernel::x86_64::detail::can_use_avx2(ev.request,
+                                                        ctx.avx2_available);
+  }
+};
+
+struct guard_simd_op_mul_mat_f32_fma {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::x86_64::detail::can_use_avx2_fma_f32_mul_mat(
+        ev.request, ctx.host_features);
+  }
+};
+
+struct guard_simd_op_mul_mat_f32_fma_vector {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::x86_64::detail::can_use_avx2_fma_f32_vector_mul_mat(
+        ev.request, ctx.host_features);
+  }
+};
+
+struct guard_simd_op_mul_mat_f32_avx2_only {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
+    return simd_op<::emel::kernel::x86_64::event::dispatch_op_mul_mat>{}(
+               ev, ctx) &&
+           !guard_simd_op_mul_mat_f32_fma{}(ev, ctx) &&
+           !guard_simd_op_mul_mat_f32_fma_vector{}(ev, ctx);
+  }
+};
+
+struct guard_simd_op_mul_mat_q2_k_q8_k {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
     if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
       return false;
     }
-    return ::emel::kernel::x86_64::detail::can_use_avx2(ev.request, ctx.avx2_available);
+    return ::emel::kernel::x86_64::detail::
+        can_use_avx2_fma_q2_k_q8_k_mul_mat(ev.request, ctx.host_features);
   }
 };
 
-template <class dispatch_event_type>
-struct valid_op {
-  bool operator()(const dispatch_event_type & ev, const action::context & ctx) const noexcept {
+struct guard_simd_op_mul_mat_q3_k_q8_k {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::x86_64::detail::
+        can_use_avx2_fma_q3_k_q8_k_mul_mat(ev.request, ctx.host_features);
+  }
+};
+
+struct guard_simd_op_mul_mat_q4_k_q8_k {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::x86_64::detail::
+        can_use_avx2_fma_q4_k_q8_k_mul_mat(ev.request, ctx.host_features);
+  }
+};
+
+struct guard_simd_op_mul_mat_q6_k_q8_k {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::x86_64::detail::
+        can_use_avx2_fma_q6_k_q8_k_mul_mat(ev.request, ctx.host_features);
+  }
+};
+
+struct guard_simd_op_mul_mat_q4_0_q8_0 {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::x86_64::detail::
+        can_use_avx2_fma_q4_0_q8_0_mul_mat(ev.request, ctx.host_features);
+  }
+};
+
+struct guard_simd_op_mul_mat_q4_1_q8_0 {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::x86_64::detail::
+        can_use_avx2_fma_q4_1_q8_0_mul_mat(ev.request, ctx.host_features);
+  }
+};
+
+struct guard_simd_op_mul_mat_q5_0_q8_0 {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::x86_64::detail::
+        can_use_avx2_fma_q5_0_q8_0_mul_mat(ev.request, ctx.host_features);
+  }
+};
+
+struct guard_simd_op_mul_mat_q8_0_q8_0 {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_mul_mat &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::x86_64::detail::
+        can_use_avx2_fma_q8_0_q8_0_mul_mat(ev.request, ctx.host_features);
+  }
+};
+
+template <class dispatch_event_type> struct valid_op {
+  bool operator()(const dispatch_event_type &ev,
+                  const action::context &ctx) const noexcept {
     if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
       return false;
     }
     if (!::emel::kernel::detail::can_run_backend_request(ev.request)) {
       return false;
     }
+    if constexpr (std::is_same_v<
+                      dispatch_event_type,
+                      ::emel::kernel::x86_64::event::dispatch_op_mul_mat>) {
+      return !simd_op<dispatch_event_type>{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q2_k_q8_k{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q3_k_q8_k{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q4_k_q8_k{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q6_k_q8_k{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q4_0_q8_0{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q4_1_q8_0{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q5_0_q8_0{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q8_0_q8_0{}(ev, ctx);
+    }
     return !simd_op<dispatch_event_type>{}(ev, ctx);
   }
 };
 
-template <class dispatch_event_type>
-struct invalid_op {
-  bool operator()(const dispatch_event_type & ev, const action::context & ctx) const noexcept {
+struct simd_op_flash_attn_ext_f16kv_one_chunk {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_flash_attn_ext &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::x86_64::detail::
+        can_run_avx2_fma_f16c_flash_attn_ext_f16kv_one_chunk_request(
+            ev.request, ctx.host_features, ctx.flash_attn_workspace);
+  }
+};
+
+struct valid_op_flash_attn_ext_shared {
+  bool operator()(
+      const ::emel::kernel::x86_64::event::dispatch_op_flash_attn_ext &ev,
+      const action::context &ctx) const noexcept {
+    if (!::emel::kernel::detail::validate_dispatch_request(ev.request)) {
+      return false;
+    }
+    if (!::emel::kernel::detail::can_run_backend_request(ev.request)) {
+      return false;
+    }
+    return ::emel::kernel::detail::can_run_flash_attn_ext_with_workspace(
+               ev.request, ctx.flash_attn_workspace) &&
+           !simd_op_flash_attn_ext_f16kv_one_chunk{}(ev, ctx);
+  }
+};
+
+template <class dispatch_event_type> struct invalid_op {
+  bool operator()(const dispatch_event_type &ev,
+                  const action::context &ctx) const noexcept {
+    if constexpr (std::is_same_v<dispatch_event_type,
+                                 ::emel::kernel::x86_64::event::
+                                     dispatch_op_flash_attn_ext>) {
+      return !simd_op_flash_attn_ext_f16kv_one_chunk{}(ev, ctx) &&
+             !valid_op_flash_attn_ext_shared{}(ev, ctx);
+    }
+    if constexpr (std::is_same_v<
+                      dispatch_event_type,
+                      ::emel::kernel::x86_64::event::dispatch_op_mul_mat>) {
+      return !simd_op<dispatch_event_type>{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q2_k_q8_k{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q3_k_q8_k{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q4_k_q8_k{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q6_k_q8_k{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q4_0_q8_0{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q4_1_q8_0{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q5_0_q8_0{}(ev, ctx) &&
+             !guard_simd_op_mul_mat_q8_0_q8_0{}(ev, ctx) &&
+             !valid_op<dispatch_event_type>{}(ev, ctx);
+    }
     return !simd_op<dispatch_event_type>{}(ev, ctx) &&
            !valid_op<dispatch_event_type>{}(ev, ctx);
   }
 };
 
-template <::emel::kernel::event::unary_subop subop>
-struct unary_subop_is {
-  bool operator()(const ::emel::kernel::x86_64::event::dispatch_op_unary & ev,
+template <::emel::kernel::event::unary_subop subop> struct unary_subop_is {
+  bool operator()(const ::emel::kernel::x86_64::event::dispatch_op_unary &ev,
                   const action::context &) const noexcept {
     return ev.request.subop == subop;
   }
@@ -49,29 +242,38 @@ struct unary_subop_is {
 template <::emel::kernel::event::unary_subop subop>
 using simd_op_unary_subop = ::emel::kernel::detail::simd_unary_subop_guard<
     ::emel::kernel::x86_64::event::dispatch_op_unary, action::context,
-    simd_op<::emel::kernel::x86_64::event::dispatch_op_unary>, unary_subop_is<subop>>;
+    simd_op<::emel::kernel::x86_64::event::dispatch_op_unary>,
+    unary_subop_is<subop>>;
 
 template <::emel::kernel::event::unary_subop subop>
 using valid_op_unary_subop = ::emel::kernel::detail::valid_unary_subop_guard<
     ::emel::kernel::x86_64::event::dispatch_op_unary, action::context,
-    valid_op<::emel::kernel::x86_64::event::dispatch_op_unary>, unary_subop_is<subop>>;
-
-using simd_op_unary_abs = simd_op_unary_subop<::emel::kernel::event::unary_subop::abs>;
-using simd_op_unary_neg = simd_op_unary_subop<::emel::kernel::event::unary_subop::neg>;
-using simd_op_unary_relu = simd_op_unary_subop<::emel::kernel::event::unary_subop::relu>;
-using valid_op_unary_abs = valid_op_unary_subop<::emel::kernel::event::unary_subop::abs>;
-using valid_op_unary_neg = valid_op_unary_subop<::emel::kernel::event::unary_subop::neg>;
-using valid_op_unary_relu = valid_op_unary_subop<::emel::kernel::event::unary_subop::relu>;
-using valid_op_unary_exp = valid_op_unary_subop<::emel::kernel::event::unary_subop::exp>;
-
-#define EMEL_KERNEL_DECLARE_GUARD_ALIAS(op_name)                                 \
-  using simd_##op_name =                                                         \
-      simd_op<::emel::kernel::x86_64::event::dispatch_##op_name>;                \
-  using valid_##op_name =                                                        \
-      valid_op<::emel::kernel::x86_64::event::dispatch_##op_name>;               \
-  using invalid_##op_name =                                                      \
+    valid_op<::emel::kernel::x86_64::event::dispatch_op_unary>,
+    unary_subop_is<subop>>;
+
+using simd_op_unary_abs =
+    simd_op_unary_subop<::emel::kernel::event::unary_subop::abs>;
+using simd_op_unary_neg =
+    simd_op_unary_subop<::emel::kernel::event::unary_subop::neg>;
+using simd_op_unary_relu =
+    simd_op_unary_subop<::emel::kernel::event::unary_subop::relu>;
+using valid_op_unary_abs =
+    valid_op_unary_subop<::emel::kernel::event::unary_subop::abs>;
+using valid_op_unary_neg =
+    valid_op_unary_subop<::emel::kernel::event::unary_subop::neg>;
+using valid_op_unary_relu =
+    valid_op_unary_subop<::emel::kernel::event::unary_subop::relu>;
+using valid_op_unary_exp =
+    valid_op_unary_subop<::emel::kernel::event::unary_subop::exp>;
+
+#define EMEL_KERNEL_DECLARE_GUARD_ALIAS(op_name)                               \
+  using simd_##op_name =                                                       \
+      simd_op<::emel::kernel::x86_64::event::dispatch_##op_name>;              \
+  using valid_##op_name =                                                      \
+      valid_op<::emel::kernel::x86_64::event::dispatch_##op_name>;             \
+  using invalid_##op_name =                                                    \
       invalid_op<::emel::kernel::x86_64::event::dispatch_##op_name>;
 EMEL_KERNEL_OP_EVENT_LIST(EMEL_KERNEL_DECLARE_GUARD_ALIAS)
 #undef EMEL_KERNEL_DECLARE_GUARD_ALIAS
 
-}  // namespace emel::kernel::x86_64::guard
+} // namespace emel::kernel::x86_64::guard
diff --git a/src/emel/kernel/x86_64/sm.hpp b/src/emel/kernel/x86_64/sm.hpp
index 4f6163d9..69bc5221 100644
--- a/src/emel/kernel/x86_64/sm.hpp
+++ b/src/emel/kernel/x86_64/sm.hpp
@@ -2,11 +2,11 @@
 
 // benchmark: kernel
 #include "emel/emel.h"
+#include "emel/kernel/detail.hpp"
 #include "emel/kernel/x86_64/actions.hpp"
 #include "emel/kernel/x86_64/errors.hpp"
 #include "emel/kernel/x86_64/events.hpp"
 #include "emel/kernel/x86_64/guards.hpp"
-#include "emel/kernel/detail.hpp"
 #include "emel/sm.hpp"
 
 namespace emel::kernel::x86_64 {
@@ -344,7 +344,57 @@ struct model {
 
       , sml::state<ready> <= sml::state<ready> +
                sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
-                 [ guard::simd_op_mul_mat{} ]
+                 [ guard::guard_simd_op_mul_mat_q2_k_q8_k{} ]
+                 / action::effect_exec_simd_op_mul_mat_q2_k_q8_k
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
+                 [ guard::guard_simd_op_mul_mat_q3_k_q8_k{} ]
+                 / action::effect_exec_simd_op_mul_mat_q3_k_q8_k
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
+                 [ guard::guard_simd_op_mul_mat_q4_k_q8_k{} ]
+                 / action::effect_exec_simd_op_mul_mat_q4_k_q8_k
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
+                 [ guard::guard_simd_op_mul_mat_q6_k_q8_k{} ]
+                 / action::effect_exec_simd_op_mul_mat_q6_k_q8_k
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
+                 [ guard::guard_simd_op_mul_mat_q4_0_q8_0{} ]
+                 / action::effect_exec_simd_op_mul_mat_q4_0_q8_0
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
+                 [ guard::guard_simd_op_mul_mat_q4_1_q8_0{} ]
+                 / action::effect_exec_simd_op_mul_mat_q4_1_q8_0
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
+                 [ guard::guard_simd_op_mul_mat_q5_0_q8_0{} ]
+                 / action::effect_exec_simd_op_mul_mat_q5_0_q8_0
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
+                 [ guard::guard_simd_op_mul_mat_q8_0_q8_0{} ]
+                 / action::effect_exec_simd_op_mul_mat_q8_0_q8_0
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
+                 [ guard::guard_simd_op_mul_mat_f32_fma_vector{} ]
+                 / action::effect_exec_simd_op_mul_mat_f32_fma_vector
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
+                 [ guard::guard_simd_op_mul_mat_f32_fma{} ]
+                 / action::effect_exec_simd_op_mul_mat_f32_fma
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_mul_mat>
+                 [ guard::guard_simd_op_mul_mat_f32_avx2_only{} ]
                  / action::exec_simd_op_mul_mat
 
       , sml::state<ready> <= sml::state<ready> +
@@ -799,7 +849,12 @@ struct model {
 
       , sml::state<ready> <= sml::state<ready> +
                sml::event<::emel::kernel::x86_64::event::dispatch_op_flash_attn_ext>
-                 [ guard::valid_op_flash_attn_ext{} ]
+                 [ guard::simd_op_flash_attn_ext_f16kv_one_chunk{} ]
+                 / action::exec_simd_op_flash_attn_ext_f16kv_one_chunk
+
+      , sml::state<ready> <= sml::state<ready> +
+               sml::event<::emel::kernel::x86_64::event::dispatch_op_flash_attn_ext>
+                 [ guard::valid_op_flash_attn_ext_shared{} ]
                  / action::exec_op_flash_attn_ext
 
       , sml::state<ready> <= sml::state<ready> +
@@ -1060,7 +1115,7 @@ struct sm : public emel::sm<model, action::context> {
   using base_type = emel::sm<model, action::context>;
   using base_type::base_type;
 
-  bool process_event(const ::emel::kernel::event::dispatch & ev) {
+  bool process_event(const ::emel::kernel::event::dispatch &ev) {
     event::dispatch_ctx ctx{};
     const event::dispatch_request dispatch{ev, ctx};
     return process_dispatch_event(dispatch);
@@ -1068,19 +1123,132 @@ struct sm : public emel::sm<model, action::context> {
 
   template <class event_type>
     requires(::emel::kernel::is_op_event_v<event_type>)
-  bool process_event(const event_type & ev) {
+  bool process_event(const event_type &ev) {
     event::dispatch_ctx ctx{};
     using dispatch_event_type = event::dispatch_event_for_t<event_type>;
     const dispatch_event_type dispatch{ev, ctx};
     return process_dispatch_event(dispatch);
   }
 
+  bool avx2_available() const noexcept { return this->context_.avx2_available; }
+
+  bool fma_available() const noexcept { return this->context_.fma_available; }
+
+  bool f16c_available() const noexcept { return this->context_.f16c_available; }
+
+  bool avx2_fma_f16c_available() const noexcept {
+    return this->context_.host_features.avx2_fma_f16c_available();
+  }
+
+  bool avx512_claimed() const noexcept { return this->context_.avx512_claimed; }
+
+  bool avx_vnni_claimed() const noexcept {
+    return this->context_.avx_vnni_claimed;
+  }
+
+  bool amx_claimed() const noexcept { return this->context_.amx_claimed; }
+
+  bool bf16_claimed() const noexcept { return this->context_.bf16_claimed; }
+
+  bool native_fp16_claimed() const noexcept {
+    return this->context_.native_fp16_claimed;
+  }
+
+  uint64_t optimized_flash_dispatch_count() const noexcept {
+    return this->context_.optimized_flash_dispatch_count;
+  }
+
+  uint64_t shared_flash_dispatch_count() const noexcept {
+    return this->context_.shared_flash_dispatch_count;
+  }
+
+  uint64_t optimized_q2_dispatch_count() const noexcept {
+    return this->context_.optimized_q2_dispatch_count;
+  }
+
+  uint64_t shared_q2_dispatch_count() const noexcept {
+    return this->context_.shared_q2_dispatch_count;
+  }
+
+  uint64_t optimized_q3_dispatch_count() const noexcept {
+    return this->context_.optimized_q3_dispatch_count;
+  }
+
+  uint64_t shared_q3_dispatch_count() const noexcept {
+    return this->context_.shared_q3_dispatch_count;
+  }
+
+  uint64_t optimized_f32_fma_dispatch_count() const noexcept {
+    return this->context_.optimized_f32_fma_dispatch_count;
+  }
+
+  uint64_t optimized_f32_fma_vector_dispatch_count() const noexcept {
+    return this->context_.optimized_f32_fma_vector_dispatch_count;
+  }
+
+  uint64_t optimized_q4_dispatch_count() const noexcept {
+    return this->context_.optimized_q4_dispatch_count;
+  }
+
+  uint64_t shared_q4_dispatch_count() const noexcept {
+    return this->context_.shared_q4_dispatch_count;
+  }
+
+  uint64_t optimized_q6_dispatch_count() const noexcept {
+    return this->context_.optimized_q6_dispatch_count;
+  }
+
+  uint64_t shared_q6_dispatch_count() const noexcept {
+    return this->context_.shared_q6_dispatch_count;
+  }
+
+  uint64_t optimized_q4_0_dispatch_count() const noexcept {
+    return this->context_.optimized_q4_0_dispatch_count;
+  }
+
+  uint64_t shared_q4_0_dispatch_count() const noexcept {
+    return this->context_.shared_q4_0_dispatch_count;
+  }
+
+  uint64_t optimized_q4_1_dispatch_count() const noexcept {
+    return this->context_.optimized_q4_1_dispatch_count;
+  }
+
+  uint64_t shared_q4_1_dispatch_count() const noexcept {
+    return this->context_.shared_q4_1_dispatch_count;
+  }
+
+  uint64_t optimized_q5_0_dispatch_count() const noexcept {
+    return this->context_.optimized_q5_0_dispatch_count;
+  }
+
+  uint64_t shared_q5_0_dispatch_count() const noexcept {
+    return this->context_.shared_q5_0_dispatch_count;
+  }
+
+  uint64_t optimized_q8_0_dispatch_count() const noexcept {
+    return this->context_.optimized_q8_0_dispatch_count;
+  }
+
+  uint64_t shared_q8_0_dispatch_count() const noexcept {
+    return this->context_.shared_q8_0_dispatch_count;
+  }
+
+  uint64_t flash_attn_workspace_prepared_tokens() const noexcept {
+    return this->context_.flash_attn_workspace.prepared_tokens;
+  }
+
+  uint64_t flash_attn_workspace_reuse_count() const noexcept {
+    return this->context_.flash_attn_workspace.reuse_count;
+  }
+
  private:
   template <class dispatch_event_type>
-  bool process_dispatch_event(const dispatch_event_type & ev) {
+  bool process_dispatch_event(const dispatch_event_type &ev) {
     const bool accepted = base_type::process_event(ev);
-    return accepted && ev.ctx.err == static_cast<int32_t>(emel::error::cast(error::none));
+    return accepted &&
+           ev.ctx.err == static_cast<int32_t>(emel::error::cast(error::none));
   }
 };
 
-}  // namespace emel::kernel::x86_64
+} // namespace emel::kernel::x86_64
diff --git a/src/emel/model/data.cpp b/src/emel/model/data.cpp
index 0948ece5..2079dfe9 100644
--- a/src/emel/model/data.cpp
+++ b/src/emel/model/data.cpp
@@ -51,7 +51,16 @@ bool uses_attention_qk_norm(const std::string_view architecture) noexcept {
          uses_gemma4_block_contract(architecture);
 }
 
-bool is_lfm2_attention_layer(const int32_t block_index) noexcept {
+bool is_lfm2_attention_layer(const data &model_data,
+                             const int32_t block_index) noexcept {
+  const uint32_t pattern_count =
+      model_data.params.attention_layer_pattern_count;
+  if (pattern_count > 0u) {
+    return block_index >= 0 &&
+           static_cast<uint32_t>(block_index) < pattern_count &&
+           model_data.params.attention_layer_pattern_flags[static_cast<size_t>(
+               block_index)] != 0u;
+  }
   for (const int32_t candidate : k_lfm2_attention_layers) {
     if (candidate == block_index) {
       return true;
@@ -524,8 +533,9 @@ emel::error::type lookup_block_view(const execution_view &execution,
     return emel::error::cast(emel::model::loader::error::model_invalid);
   }
 
-  block_out.uses_attention = !uses_lfm2_block_contract(architecture) ||
-                             is_lfm2_attention_layer(block_index);
+  block_out.uses_attention =
+      !uses_lfm2_block_contract(architecture) ||
+      is_lfm2_attention_layer(*execution.model, block_index);
   if (!block_out.uses_attention) {
     const bool shortconv_ok =
         bind("shortconv.conv.weight", block_out.shortconv_conv) &&
@@ -618,7 +628,7 @@ emel::error::type build_topology(const execution_view &execution,
     uint32_t tensor_count = k_global_tensor_count;
     for (int32_t block_index = 0; block_index < execution.block_count;
          ++block_index) {
-      tensor_count += is_lfm2_attention_layer(block_index)
+      tensor_count += is_lfm2_attention_layer(*execution.model, block_index)
                           ? k_lfm2_attention_block_tensor_count
                           : k_lfm2_conv_block_tensor_count;
     }
diff --git a/src/emel/model/data.hpp b/src/emel/model/data.hpp
index bf588fd2..1f0f063c 100644
--- a/src/emel/model/data.hpp
+++ b/src/emel/model/data.hpp
@@ -189,6 +189,9 @@ struct data {
     uint32_t attention_sliding_window_pattern_count = 0;
     std::array<uint8_t, k_max_metadata_arrays>
         attention_sliding_window_pattern_flags = {};
+    uint32_t attention_layer_pattern_count = 0;
+    std::array<uint8_t, k_max_metadata_arrays> attention_layer_pattern_flags =
+        {};
     float attention_scale = 0.0f;
     float attention_output_scale = 0.0f;
     int32_t attention_temperature_length = 0;
@@ -202,6 +205,10 @@ struct data {
     float rope_freq_base = 0.0f;
     float rope_freq_base_swa = 0.0f;
     int32_t n_rot_swa = 0;
+    int32_t rope_pair_x0_stride = 2;
+    int32_t rope_pair_x1_stride = 2;
+    int32_t rope_pair_x1_offset = 1;
+    int32_t rope_pair_x1_half_rot_offset = 0;
     float rope_scale_linear = 0.0f;
     int32_t rope_dimension_sections_count = 0;
     std::array<int32_t, k_max_rope_dimension_sections> rope_dimension_sections =
diff --git a/src/emel/model/gemma4/detail.cpp b/src/emel/model/gemma4/detail.cpp
index a909a5f1..7c476fdf 100644
--- a/src/emel/model/gemma4/detail.cpp
+++ b/src/emel/model/gemma4/detail.cpp
@@ -93,6 +93,10 @@ bool load_hparams(const emel::model::detail::hparam_loader & loader,
   model_out.params.full_attention_interval = 5;
   model_out.params.n_embd_out = model_out.params.n_embd;
   model_out.params.tie_word_embeddings = true;
+  model_out.params.rope_pair_x0_stride = 1;
+  model_out.params.rope_pair_x1_stride = 1;
+  model_out.params.rope_pair_x1_offset = 0;
+  model_out.params.rope_pair_x1_half_rot_offset = 1;
   return true;
 }
 
diff --git a/src/emel/model/lfm2/detail.cpp b/src/emel/model/lfm2/detail.cpp
index 0780b4d4..7bce6d01 100644
--- a/src/emel/model/lfm2/detail.cpp
+++ b/src/emel/model/lfm2/detail.cpp
@@ -1,6 +1,7 @@
 #include "emel/model/lfm2/detail.hpp"
 
 #include <array>
+#include <span>
 
 #include "emel/model/detail.hpp"
 #include "emel/model/llama/detail.hpp"
@@ -13,6 +14,7 @@ namespace {
 constexpr std::string_view k_architecture = "lfm2";
 constexpr std::string_view k_token_embedding_name = "token_embd.weight";
 constexpr std::string_view k_output_norm_name = "token_embd_norm.weight";
+// Maintained LFM2.5-1.2B geometry.
 constexpr int32_t k_block_count = 16;
 constexpr int32_t k_context_length = 128000;
 constexpr int32_t k_embedding_length = 2048;
@@ -21,9 +23,23 @@ constexpr int32_t k_head_count_kv = 8;
 constexpr int32_t k_vocab_size = 65536;
 constexpr int32_t k_shortconv_l_cache = 3;
 constexpr float k_rope_freq_base = 1000000.0f;
+// Maintained LFM2.5-230M geometry (shares vocab, l_cache, ctx, and freq base).
+constexpr int32_t k_230m_block_count = 14;
+constexpr int32_t k_230m_embedding_length = 1024;
+constexpr int32_t k_230m_head_count = 16;
+// 1.2B layout fallback for model data without a bound per-layer kv-head
+// pattern (synthetic fixtures); real models classify from
+// `lfm2.attention.head_count_kv`, where a nonzero entry marks attention.
 constexpr std::array<int32_t, 6> k_attention_layers = {2, 5, 8, 10, 12, 14};
 
-bool is_attention_layer(const int32_t block_index) noexcept {
+bool is_attention_layer(const emel::model::data & model_data,
+                        const int32_t block_index) noexcept {
+  const uint32_t pattern_count = model_data.params.attention_layer_pattern_count;
+  if (pattern_count > 0u) {
+    return block_index >= 0 && static_cast<uint32_t>(block_index) < pattern_count &&
+           model_data.params
+                   .attention_layer_pattern_flags[static_cast<size_t>(block_index)] != 0u;
+  }
   for (const int32_t candidate : k_attention_layers) {
     if (candidate == block_index) {
       return true;
@@ -51,12 +67,20 @@ bool load_hparams(const emel::model::detail::hparam_loader & loader,
           "lfm2.attention.layer_norm_rms_epsilon", model_out.params.attention_layer_norm_rms_epsilon) ||
       !loader.assign_f32("lfm2.rope.freq_base", model_out.params.rope_freq_base) ||
       !loader.assign_first_nonzero_i32_from_array(
-          "lfm2.attention.head_count_kv", model_out.params.n_head_kv)) {
+          "lfm2.attention.head_count_kv", model_out.params.n_head_kv) ||
+      !loader.copy_flag_array(
+          "lfm2.attention.head_count_kv",
+          std::span<uint8_t>{model_out.params.attention_layer_pattern_flags},
+          model_out.params.attention_layer_pattern_count)) {
     return false;
   }
 
   model_out.params.n_embd_out = model_out.params.n_embd;
   model_out.params.tie_word_embeddings = true;
+  model_out.params.rope_pair_x0_stride = 1;
+  model_out.params.rope_pair_x1_stride = 1;
+  model_out.params.rope_pair_x1_offset = 0;
+  model_out.params.rope_pair_x1_half_rot_offset = 1;
   return true;
 }
 
@@ -64,12 +88,19 @@ namespace {
 
 emel::error::type validate_contract(const emel::model::data & model_data,
                                     const bool strict_metadata) noexcept {
+  const bool metadata_1_2b =
+      model_data.n_layers == k_block_count &&
+      model_data.params.n_layer == k_block_count &&
+      model_data.params.n_embd == k_embedding_length &&
+      model_data.params.n_head == k_head_count;
+  const bool metadata_230m =
+      model_data.n_layers == k_230m_block_count &&
+      model_data.params.n_layer == k_230m_block_count &&
+      model_data.params.n_embd == k_230m_embedding_length &&
+      model_data.params.n_head == k_230m_head_count;
   const bool metadata_ok = strict_metadata
-      ? model_data.n_layers == k_block_count &&
-            model_data.params.n_layer == k_block_count &&
+      ? (metadata_1_2b || metadata_230m) &&
             model_data.params.n_ctx == k_context_length &&
-            model_data.params.n_embd == k_embedding_length &&
-            model_data.params.n_head == k_head_count &&
             model_data.params.n_head_kv == k_head_count_kv &&
             model_data.params.n_vocab == k_vocab_size &&
             model_data.params.shortconv_l_cache == k_shortconv_l_cache &&
@@ -106,7 +137,7 @@ emel::error::type validate_contract(const emel::model::data & model_data,
       return emel::error::cast(emel::model::loader::error::model_invalid);
     }
 
-    const bool attention_layer = is_attention_layer(block_index);
+    const bool attention_layer = is_attention_layer(model_data, block_index);
     const bool hybrid_ok =
         attention_layer
             ? emel::model::llama::detail::require_block_tensor(
diff --git a/src/emel/model/qwen3/detail.cpp b/src/emel/model/qwen3/detail.cpp
index 1183910b..8c694fe1 100644
--- a/src/emel/model/qwen3/detail.cpp
+++ b/src/emel/model/qwen3/detail.cpp
@@ -26,6 +26,10 @@ bool load_hparams(const emel::model::detail::hparam_loader & loader,
   model_out.params.attention_value_length = value_length;
   model_out.params.n_embd_out = model_out.params.n_embd;
   model_out.params.tie_word_embeddings = true;
+  model_out.params.rope_pair_x0_stride = 1;
+  model_out.params.rope_pair_x1_stride = 1;
+  model_out.params.rope_pair_x1_offset = 0;
+  model_out.params.rope_pair_x1_half_rot_offset = 1;
   if (model_out.params.n_rot == 0) {
     model_out.params.n_rot = key_length;
   }
diff --git a/src/emel/sm.hpp b/src/emel/sm.hpp
index 2a8a328a..26d2252e 100644
--- a/src/emel/sm.hpp
+++ b/src/emel/sm.hpp
@@ -1,24 +1,665 @@
 #pragma once
 
 #include <stateforward/sml.hpp>
+#include <stateforward/sml/utility/co_sm.hpp>
 #include <algorithm>
 #include <array>
+#include <atomic>
 #include <concepts>
 #include <coroutine>
 #include <cstddef>
+#include <cstdint>
 #include <cstdlib>
 #include <exception>
 #include <memory>
 #include <new>
+#include <semaphore>
 #include <stdexcept>
+#include <thread>
 #include <type_traits>
 #include <tuple>
 #include <utility>
 
 namespace emel {
 
+namespace policy {
+
+using inline_scheduler = stateforward::sml::utility::policy::inline_scheduler;
+
+template <std::size_t capacity = 1024, std::size_t inline_task_bytes = 64>
+using fifo_scheduler =
+    stateforward::sml::utility::policy::fifo_scheduler<capacity, inline_task_bytes>;
+
+template <class scheduler>
+using coroutine_scheduler =
+    stateforward::sml::utility::policy::coroutine_scheduler<scheduler>;
+
+template <class allocator>
+using coroutine_allocator =
+    stateforward::sml::utility::policy::coroutine_allocator<allocator>;
+
+template <std::size_t slot_size = 1024, std::size_t slot_count = 64>
+class fixed_coroutine_allocator {
+ public:
+  static_assert(slot_size > 0,
+                "fixed_coroutine_allocator slot size must be non-zero");
+  static_assert(slot_count > 0,
+                "fixed_coroutine_allocator slot count must be non-zero");
+
+  fixed_coroutine_allocator() noexcept { reset_freelist(); }
+
+  void * allocate(const std::size_t size, const std::size_t alignment) noexcept {
+    const bool fits = size <= slot_size && alignment <= alignof(pool_slot);
+    const bool available = free_head_ != invalid_index;
+    if (fits && available) {
+      const std::size_t slot_index = free_head_;
+      free_head_ = next_free_[slot_index];
+      return static_cast<void *>(slots_[slot_index].storage.data());
+    }
+    return nullptr;
+  }
+
+  void deallocate(void * ptr, const std::size_t size,
+                  const std::size_t alignment) noexcept {
+    const bool reusable = ptr != nullptr && size <= slot_size &&
+                          alignment <= alignof(pool_slot) &&
+                          is_pool_pointer(ptr);
+    if (reusable) {
+      const std::size_t slot_index = slot_index_for(ptr);
+      next_free_[slot_index] = free_head_;
+      free_head_ = slot_index;
+    }
+  }
+
+ private:
+  static constexpr std::size_t invalid_index = slot_count;
+
+  struct pool_slot {
+    alignas(std::max_align_t) std::array<unsigned char, slot_size> storage{};
+  };
+
+  bool is_pool_pointer(void * ptr) const noexcept {
+    const auto begin = reinterpret_cast<std::uintptr_t>(slots_.data());
+    const auto end = begin + sizeof(slots_);
+    const auto candidate = reinterpret_cast<std::uintptr_t>(ptr);
+    const bool in_range = candidate >= begin && candidate < end;
+    if (in_range) {
+      const std::size_t offset = static_cast<std::size_t>(candidate - begin);
+      return (offset % sizeof(pool_slot)) == 0;
+    }
+    return false;
+  }
+
+  std::size_t slot_index_for(void * ptr) const noexcept {
+    const auto begin = reinterpret_cast<std::uintptr_t>(slots_.data());
+    const auto candidate = reinterpret_cast<std::uintptr_t>(ptr);
+    const std::size_t offset = static_cast<std::size_t>(candidate - begin);
+    return offset / sizeof(pool_slot);
+  }
+
+  void reset_freelist() noexcept {
+    for (std::size_t i = 0; i + 1 < slot_count; ++i) {
+      next_free_[i] = i + 1;
+    }
+    next_free_[slot_count - 1] = invalid_index;
+    free_head_ = 0;
+  }
+
+  std::array<pool_slot, slot_count> slots_{};
+  std::array<std::size_t, slot_count> next_free_{};
+  std::size_t free_head_ = 0;
+};
+
+// Architecture-appropriate spin hint for short busy-waits (lock-free join /
+// queue retries). Reduces contention and power on hyperthreads without yielding
+// the scheduler.
+inline void cpu_relax() noexcept {
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
+  __builtin_ia32_pause();
+#elif defined(__aarch64__) || defined(_M_ARM64)
+  __asm__ __volatile__("yield" ::: "memory");
+#else
+  std::atomic_signal_fence(std::memory_order_seq_cst);
+#endif
+}
+
+template <std::size_t worker_count = 2, std::size_t capacity = 1024,
+          std::size_t inline_task_bytes = 64>
+class thread_pool_scheduler {
+ public:
+  static_assert(worker_count > 0,
+                "thread_pool_scheduler worker count must be non-zero");
+  static_assert(capacity > 1,
+                "thread_pool_scheduler capacity must be greater than one");
+  static_assert((capacity & (capacity - 1)) == 0,
+                "thread_pool_scheduler capacity must be a power of two");
+  static_assert(inline_task_bytes > 0,
+                "thread_pool_scheduler inline storage must be non-zero");
+
+  static constexpr bool guarantees_fifo = false;
+  static constexpr bool single_consumer = false;
+  static constexpr bool multi_consumer = true;
+  static constexpr bool owns_workers = true;
+  static constexpr bool run_to_completion = false;
+  static constexpr std::size_t static_worker_count = worker_count;
+  static constexpr std::size_t static_capacity = capacity;
+
+  // std::thread may allocate OS/runtime resources here; dispatch uses only the
+  // fixed task ring below.
+  thread_pool_scheduler() { start_workers(); }
+
+  ~thread_pool_scheduler() { stop_workers(); }
+
+  thread_pool_scheduler(const thread_pool_scheduler &) = delete;
+  thread_pool_scheduler & operator=(const thread_pool_scheduler &) = delete;
+  thread_pool_scheduler(thread_pool_scheduler &&) = delete;
+  thread_pool_scheduler & operator=(thread_pool_scheduler &&) = delete;
+
+  template <class fn>
+  bool try_run_immediate(fn && fn_in) noexcept(noexcept(std::forward<fn>(fn_in)())) {
+    if (queued_or_running_.load(std::memory_order_acquire) != 0u) {
+      return false;
+    }
+
+    bool expected = false;
+    if (!inline_active_.compare_exchange_strong(
+            expected, true, std::memory_order_acq_rel,
+            std::memory_order_acquire)) {
+      return false;
+    }
+
+    struct reset_inline {
+      std::atomic<bool> & active;
+      ~reset_inline() noexcept { active.store(false, std::memory_order_release); }
+    } reset{inline_active_};
+
+    if (queued_or_running_.load(std::memory_order_acquire) != 0u) {
+      return false;
+    }
+
+    std::forward<fn>(fn_in)();
+    immediate_run_count_.fetch_add(1u, std::memory_order_relaxed);
+    return true;
+  }
+
+  template <class fn>
+  bool try_submit(fn && fn_in) noexcept {
+    return try_submit_with_completion(std::forward<fn>(fn_in), nullptr, nullptr);
+  }
+
+  // Detached hard-contract wrapper for call sites that have already proven
+  // scheduler lifetime and queue capacity. Actor-facing RTC paths must use
+  // thread_pool_scheduler_ref::schedule or run_or_schedule_and_wait.
+  template <class fn>
+  void submit(fn && fn_in) noexcept {
+    if (!try_submit(std::forward<fn>(fn_in))) {
+      std::terminate();
+    }
+  }
+
+  template <class fn>
+  bool run_or_schedule_and_wait(fn && fn_in) noexcept(noexcept(std::forward<fn>(fn_in)())) {
+    if (try_run_immediate(std::forward<fn>(fn_in))) {
+      return true;
+    }
+    if (running_on_this_worker()) {
+      return false;
+    }
+
+    // Spin-join on a local flag rather than blocking on a semaphore: the worker
+    // sets done last, so the waiter returns only after the worker's final write
+    // and can safely let the flag go out of scope (no destroy-during-notify
+    // fault). The wait is a bounded RTC join over one already-submitted task.
+    std::atomic<bool> done{false};
+    const bool scheduled = try_submit_and_signal(
+        [&fn_in]() noexcept(noexcept(fn_in())) { fn_in(); }, done);
+    if (!scheduled) {
+      return false;
+    }
+    while (!done.load(std::memory_order_acquire)) {
+      cpu_relax();
+    }
+    return true;
+  }
+
+  uint64_t immediate_run_count() const noexcept {
+    return immediate_run_count_.load(std::memory_order_relaxed);
+  }
+
+  uint64_t scheduled_run_count() const noexcept {
+    return scheduled_run_count_.load(std::memory_order_relaxed);
+  }
+
+  uint64_t worker_run_count() const noexcept {
+    return worker_run_count_.load(std::memory_order_relaxed);
+  }
+
+  bool is_current_thread_worker() const noexcept { return running_on_this_worker(); }
+
+  template <class fn>
+  bool try_submit_with_completion(
+      fn && fn_in,
+      void * completion_ctx,
+      void (*completion_fn)(void *) noexcept) noexcept {
+    if (stopping_.load(std::memory_order_acquire)) {
+      return false;
+    }
+
+    queued_or_running_.fetch_add(1u, std::memory_order_acq_rel);
+    const bool enqueued = enqueue(
+        std::forward<fn>(fn_in), completion_ctx, completion_fn);
+    if (!enqueued) {
+      queued_or_running_.fetch_sub(1u, std::memory_order_acq_rel);
+      return false;
+    }
+
+    scheduled_run_count_.fetch_add(1u, std::memory_order_relaxed);
+    ready_.release();
+    return true;
+  }
+
+ private:
+  struct task_slot {
+    using invoke_fn = void (*)(void *) noexcept;
+    using destroy_fn = void (*)(void *) noexcept;
+
+    alignas(std::max_align_t) std::array<unsigned char, inline_task_bytes> storage{};
+    std::atomic<std::size_t> sequence = 0u;
+    invoke_fn invoke = nullptr;
+    destroy_fn destroy = nullptr;
+    void * completion_ctx = nullptr;
+    void (*completion_fn)(void *) noexcept = nullptr;
+
+    template <class fn>
+    void set(fn && fn_in,
+             void * completion_ctx_in,
+             void (*completion_fn_in)(void *) noexcept) noexcept {
+      using fn_type = std::decay_t<fn>;
+      static_assert(sizeof(fn_type) <= inline_task_bytes,
+                    "scheduled task exceeds inline storage capacity");
+      static_assert(alignof(fn_type) <= alignof(std::max_align_t),
+                    "scheduled task alignment exceeds scheduler storage alignment");
+
+      new (storage.data()) fn_type(std::forward<fn>(fn_in));
+      invoke = [](void * ptr) noexcept { (*static_cast<fn_type *>(ptr))(); };
+      destroy = [](void * ptr) noexcept { static_cast<fn_type *>(ptr)->~fn_type(); };
+      completion_ctx = completion_ctx_in;
+      completion_fn = completion_fn_in;
+    }
+
+    void run() noexcept {
+      invoke(storage.data());
+      destroy(storage.data());
+      invoke = nullptr;
+      destroy = nullptr;
+    }
+
+    void reset() noexcept {
+      if (destroy != nullptr) {
+        destroy(storage.data());
+      }
+      invoke = nullptr;
+      destroy = nullptr;
+      completion_ctx = nullptr;
+      completion_fn = nullptr;
+    }
+  };
+
+  static constexpr std::size_t index_mask = capacity - 1u;
+
+  void start_workers() {
+    for (std::size_t i = 0; i < capacity; ++i) {
+      tasks_[i].sequence.store(i, std::memory_order_relaxed);
+    }
+    std::size_t started = 0u;
+    try {
+      for (; started < worker_count; ++started) {
+        workers_[started] = std::thread([this]() noexcept { worker_loop(); });
+      }
+    } catch (...) {
+      stopping_.store(true, std::memory_order_release);
+      for (std::size_t i = 0; i < started; ++i) {
+        ready_.release();
+      }
+      for (std::size_t i = 0; i < started; ++i) {
+        if (workers_[i].joinable()) {
+          workers_[i].join();
+        }
+      }
+      throw;
+    }
+  }
+
+  void stop_workers() noexcept {
+    const bool was_stopping = stopping_.exchange(true, std::memory_order_acq_rel);
+    if (was_stopping) {
+      return;
+    }
+
+    for (std::size_t i = 0; i < worker_count; ++i) {
+      ready_.release();
+    }
+
+    for (auto & worker : workers_) {
+      if (worker.joinable()) {
+        worker.join();
+      }
+    }
+
+    clear_unrun_tasks();
+  }
+
+  template <class fn>
+  bool try_submit_and_signal(fn && fn_in, std::atomic<bool> & done) noexcept {
+    return try_submit_with_completion(
+        std::forward<fn>(fn_in), &done, signal_done_flag);
+  }
+
+  template <class fn>
+  bool enqueue(fn && fn_in,
+               void * completion_ctx,
+               void (*completion_fn)(void *) noexcept) noexcept {
+    task_slot * slot = nullptr;
+    std::size_t pos = enqueue_pos_.load(std::memory_order_relaxed);
+    for (;;) {
+      slot = &tasks_[pos & index_mask];
+      const std::size_t seq = slot->sequence.load(std::memory_order_acquire);
+      const auto diff = static_cast<std::intptr_t>(seq) -
+                        static_cast<std::intptr_t>(pos);
+      if (diff == 0) {
+        if (enqueue_pos_.compare_exchange_weak(
+                pos, pos + 1u, std::memory_order_relaxed,
+                std::memory_order_relaxed)) {
+          break;
+        }
+      } else if (diff < 0) {
+        return false;
+      } else {
+        pos = enqueue_pos_.load(std::memory_order_relaxed);
+      }
+    }
+
+    slot->set(std::forward<fn>(fn_in), completion_ctx, completion_fn);
+    slot->sequence.store(pos + 1u, std::memory_order_release);
+    return true;
+  }
+
+  bool try_dequeue_and_run() noexcept {
+    task_slot * slot = nullptr;
+    std::size_t pos = dequeue_pos_.load(std::memory_order_relaxed);
+    for (;;) {
+      slot = &tasks_[pos & index_mask];
+      const std::size_t seq = slot->sequence.load(std::memory_order_acquire);
+      const auto diff = static_cast<std::intptr_t>(seq) -
+                        static_cast<std::intptr_t>(pos + 1u);
+      if (diff == 0) {
+        if (dequeue_pos_.compare_exchange_weak(
+                pos, pos + 1u, std::memory_order_relaxed,
+                std::memory_order_relaxed)) {
+          break;
+        }
+      } else if (diff < 0) {
+        return false;
+      } else {
+        pos = dequeue_pos_.load(std::memory_order_relaxed);
+      }
+    }
+
+    slot->run();
+    void * completion_ctx = slot->completion_ctx;
+    void (*completion_fn)(void *) noexcept = slot->completion_fn;
+    slot->completion_ctx = nullptr;
+    slot->completion_fn = nullptr;
+    worker_run_count_.fetch_add(1u, std::memory_order_relaxed);
+    queued_or_running_.fetch_sub(1u, std::memory_order_acq_rel);
+    slot->sequence.store(pos + capacity, std::memory_order_release);
+    if (completion_fn != nullptr) {
+      completion_fn(completion_ctx);
+    }
+    return true;
+  }
+
+  static void signal_done_flag(void * ctx) noexcept {
+    static_cast<std::atomic<bool> *>(ctx)->store(true, std::memory_order_release);
+  }
+
+  void worker_loop() noexcept {
+    struct worker_scope {
+      const thread_pool_scheduler * previous;
+      explicit worker_scope(const thread_pool_scheduler * current) noexcept
+          : previous(active_worker_scheduler_) {
+        active_worker_scheduler_ = current;
+      }
+      ~worker_scope() noexcept { active_worker_scheduler_ = previous; }
+    } scope{this};
+
+    for (;;) {
+      // Claim exactly one wake permit before dequeuing, preserving the
+      // permit-per-task invariant. Spin-claim first so back-to-back fork/joins
+      // (e.g. a decode burst) keep the worker warm and skip resleep/wakeup
+      // latency between rounds, the same warm-polling strategy optimized native
+      // threadpools use; fall back to a blocking acquire once genuinely idle so
+      // a quiescent pool does not burn a core.
+      bool claimed = false;
+      for (std::size_t spin = 0; spin < k_idle_spin_budget; ++spin) {
+        if (ready_.try_acquire()) {
+          claimed = true;
+          break;
+        }
+        if (stopping_.load(std::memory_order_acquire)) {
+          return;
+        }
+        cpu_relax();
+      }
+      if (!claimed) {
+        ready_.acquire();
+      }
+      // The claimed permit promises a published task or a stop signal. The task
+      // may not be visible at dequeue_pos for a few cycles, so retry rather than
+      // drop the permit (which would strand the task); re-check stop to exit.
+      while (!try_dequeue_and_run()) {
+        if (stopping_.load(std::memory_order_acquire)) {
+          return;
+        }
+        cpu_relax();
+      }
+    }
+  }
+
+  static constexpr std::size_t k_idle_spin_budget = 2048;
+
+  bool running_on_this_worker() const noexcept {
+    return active_worker_scheduler_ == this;
+  }
+
+  void clear_unrun_tasks() noexcept {
+    while (try_dequeue_and_run()) {
+    }
+    for (auto & task : tasks_) {
+      task.reset();
+    }
+  }
+
+  std::array<task_slot, capacity> tasks_{};
+  std::array<std::thread, worker_count> workers_{};
+  std::counting_semaphore<> ready_{0};
+  std::atomic<std::size_t> enqueue_pos_ = 0u;
+  std::atomic<std::size_t> dequeue_pos_ = 0u;
+  std::atomic<std::size_t> queued_or_running_ = 0u;
+  std::atomic<bool> inline_active_ = false;
+  std::atomic<bool> stopping_ = false;
+  std::atomic<uint64_t> immediate_run_count_ = 0u;
+  std::atomic<uint64_t> scheduled_run_count_ = 0u;
+  std::atomic<uint64_t> worker_run_count_ = 0u;
+  inline static thread_local const thread_pool_scheduler *
+      active_worker_scheduler_ = nullptr;
+};
+
+template <class scheduler>
+class thread_pool_scheduler_ref {
+ public:
+  static constexpr bool guarantees_fifo = scheduler::guarantees_fifo;
+  static constexpr bool single_consumer = scheduler::single_consumer;
+  static constexpr bool multi_consumer = scheduler::multi_consumer;
+  static constexpr bool owns_workers = false;
+  static constexpr bool run_to_completion = true;
+  static constexpr std::size_t static_worker_count = scheduler::static_worker_count;
+  static constexpr std::size_t static_capacity = scheduler::static_capacity;
+
+  thread_pool_scheduler_ref() = delete;
+  explicit thread_pool_scheduler_ref(scheduler & scheduler_in) noexcept
+      : scheduler_(&scheduler_in) {}
+
+  class join_group {
+   public:
+    join_group() = default;
+    ~join_group() = default;
+
+    join_group(const join_group &) = delete;
+    join_group & operator=(const join_group &) = delete;
+    join_group(join_group &&) = delete;
+    join_group & operator=(join_group &&) = delete;
+
+    bool wait() noexcept {
+      // Spin-join on pending_ rather than blocking on a per-group semaphore.
+      // The group is caller-owned and typically stack-reused across fork/joins,
+      // so a notify-based wakeup is unsafe: the waiter could observe completion,
+      // return, and destroy the group before the last completer finishes its
+      // release()/notify, faulting on freed semaphore state. With a plain spin,
+      // a completer's final touch of the group is its pending_ decrement, and
+      // wait() returns only after observing pending_ == 0 (all decrements done),
+      // so nothing accesses the group after the caller may destroy it. The wait
+      // is a bounded RTC fork/join over already-submitted lanes, so the producer
+      // core would otherwise be idle; spinning gives the lowest join latency.
+      while (pending_.load(std::memory_order_acquire) != 0u) {
+        cpu_relax();
+      }
+      return accepted_.load(std::memory_order_acquire);
+    }
+
+   private:
+    friend class thread_pool_scheduler_ref;
+
+    void start_one() noexcept {
+      pending_.fetch_add(1u, std::memory_order_acq_rel);
+    }
+
+    void reject_one() noexcept {
+      accepted_.store(false, std::memory_order_release);
+      complete_one();
+    }
+
+    void reject() noexcept {
+      accepted_.store(false, std::memory_order_release);
+    }
+
+    void complete_one() noexcept {
+      pending_.fetch_sub(1u, std::memory_order_acq_rel);
+    }
+
+    static void complete_one(void * ctx) noexcept {
+      static_cast<join_group *>(ctx)->complete_one();
+    }
+
+    std::atomic<uint32_t> pending_ = 0u;
+    std::atomic<bool> accepted_ = true;
+  };
+
+  template <class fn>
+  bool try_run_immediate(fn && fn_in) noexcept(noexcept(std::forward<fn>(fn_in)())) {
+    return scheduler_->try_run_immediate(std::forward<fn>(fn_in));
+  }
+
+  template <class fn>
+  bool try_submit(join_group & group, fn && fn_in) noexcept {
+    if (scheduler_->is_current_thread_worker()) {
+      group.reject();
+      return false;
+    }
+
+    group.start_one();
+    const bool submitted = scheduler_->try_submit_with_completion(
+        std::forward<fn>(fn_in), &group, join_group::complete_one);
+    if (!submitted) {
+      group.reject_one();
+      return false;
+    }
+    return true;
+  }
+
+  template <class fn>
+  void schedule(fn && fn_in) noexcept(noexcept(std::forward<fn>(fn_in)())) {
+    if (!scheduler_->run_or_schedule_and_wait(std::forward<fn>(fn_in))) {
+      std::terminate();
+    }
+  }
+
+  template <class fn>
+  bool run_or_schedule_and_wait(fn && fn_in) noexcept(noexcept(std::forward<fn>(fn_in)())) {
+    return scheduler_->run_or_schedule_and_wait(std::forward<fn>(fn_in));
+  }
+
+  uint64_t immediate_run_count() const noexcept {
+    return scheduler_->immediate_run_count();
+  }
+
+  uint64_t scheduled_run_count() const noexcept {
+    return scheduler_->scheduled_run_count();
+  }
+
+  uint64_t worker_run_count() const noexcept {
+    return scheduler_->worker_run_count();
+  }
+
+ private:
+  scheduler * scheduler_ = nullptr;
+};
+
+using default_coroutine_scheduler = coroutine_scheduler<inline_scheduler>;
+using default_coroutine_allocator =
+    coroutine_allocator<fixed_coroutine_allocator<>>;
+
+template <class scheduler>
+concept strict_ordering_scheduler_contract =
+    requires {
+      { scheduler::guarantees_fifo } -> std::convertible_to<bool>;
+      { scheduler::single_consumer } -> std::convertible_to<bool>;
+      { scheduler::run_to_completion } -> std::convertible_to<bool>;
+    } && static_cast<bool>(scheduler::guarantees_fifo) &&
+    static_cast<bool>(scheduler::single_consumer) &&
+    static_cast<bool>(scheduler::run_to_completion);
+
+}  // namespace policy
+
+using bool_task = stateforward::sml::utility::bool_task;
+
 namespace detail {
 
+class dispatch_scope {
+ public:
+  explicit dispatch_scope(std::atomic<bool> & active) noexcept
+      : active_(&active) {
+    bool expected = false;
+    acquired_ = active_->compare_exchange_strong(
+        expected, true, std::memory_order_acq_rel, std::memory_order_acquire);
+  }
+
+  dispatch_scope(const dispatch_scope &) = delete;
+  dispatch_scope & operator=(const dispatch_scope &) = delete;
+
+  ~dispatch_scope() noexcept {
+    if (acquired_) {
+      active_->store(false, std::memory_order_release);
+    }
+  }
+
+  explicit operator bool() const noexcept { return acquired_; }
+
+ private:
+  std::atomic<bool> * active_ = nullptr;
+  bool acquired_ = false;
+};
+
 template <class event>
 constexpr bool normalize_event_result(const event & ev, const bool accepted) noexcept {
   const bool accepted_ok = accepted;
@@ -218,6 +859,12 @@ struct sm_any_visit<stateforward::sml::aux::type_list<types...>> {
 template <class model, class context = void, class... policies>
 class sm;
 
+template <class model, class context = void,
+          class scheduler_policy = policy::default_coroutine_scheduler,
+          class allocator_policy = policy::default_coroutine_allocator,
+          class... policies>
+class co_sm;
+
 template <class model, class... policies>
 class sm<model, void, policies...> {
  public:
@@ -318,6 +965,369 @@ class sm {
   state_machine_type state_machine_;
 };
 
+namespace detail {
+
+template <class model, class scheduler_policy, class allocator_policy,
+          class... policies>
+class multi_consumer_co_sm_backend {
+ public:
+  using model_type = model;
+  using scheduler_policy_type = scheduler_policy;
+  using scheduler_type = typename scheduler_policy_type::scheduler_type;
+  using allocator_policy_type = allocator_policy;
+  using allocator_type = typename allocator_policy_type::allocator_type;
+  using state_machine_type = stateforward::sml::sm<model, policies...>;
+
+  static_assert(requires { scheduler_type::owns_workers; } &&
+                    !static_cast<bool>(scheduler_type::owns_workers),
+                "multi-consumer co_sm requires a non-owning scheduler reference; "
+                "construct a shared thread_pool_scheduler and pass "
+                "thread_pool_scheduler_ref<pool_type>");
+
+  multi_consumer_co_sm_backend()
+    requires std::default_initializable<scheduler_type>
+  = default;
+  ~multi_consumer_co_sm_backend() = default;
+
+  multi_consumer_co_sm_backend(const multi_consumer_co_sm_backend &) = delete;
+  multi_consumer_co_sm_backend &
+  operator=(const multi_consumer_co_sm_backend &) = delete;
+  multi_consumer_co_sm_backend(multi_consumer_co_sm_backend &&) = delete;
+  multi_consumer_co_sm_backend &
+  operator=(multi_consumer_co_sm_backend &&) = delete;
+
+  explicit multi_consumer_co_sm_backend(scheduler_type scheduler_in)
+      : scheduler_(scheduler_in) {}
+
+  template <class... args>
+  explicit multi_consumer_co_sm_backend(args &&... args_in)
+      : state_machine_(std::forward<args>(args_in)...) {}
+
+  template <class... args>
+  explicit multi_consumer_co_sm_backend(scheduler_type scheduler_in,
+                                        args &&... args_in)
+      : state_machine_(std::forward<args>(args_in)...),
+        scheduler_(scheduler_in) {}
+
+  template <class event>
+  bool process_event(const event & ev) {
+    // The shared pool may have many consumers, but one actor still has a single
+    // writer. Concurrent external dispatch is rejected instead of queued.
+    dispatch_scope dispatch{dispatch_active_};
+    if (!dispatch) {
+      return false;
+    }
+    return state_machine_.process_event(ev);
+  }
+
+  template <class event>
+  bool_task process_event_async(const event & ev) {
+    // This remains RTC: the returned task is ready only after inline or worker
+    // execution completes, and concurrent dispatch on the same actor is rejected.
+    dispatch_scope dispatch{dispatch_active_};
+    if (!dispatch) {
+      return bool_task::from_value(false);
+    }
+    bool accepted = false;
+    const bool completed = scheduler_.run_or_schedule_and_wait([this, &ev, &accepted]() {
+      accepted = state_machine_.process_event(ev);
+    });
+    return bool_task::from_value(completed && accepted);
+  }
+
+  template <class state>
+  bool is(state state_value = {}) const {
+    dispatch_scope dispatch{dispatch_active_};
+    if (!dispatch) {
+      return false;
+    }
+    return state_machine_.is(state_value);
+  }
+
+  template <class visitor>
+  void visit_current_states(visitor && visitor_fn) {
+    dispatch_scope dispatch{dispatch_active_};
+    if (!dispatch) {
+      return;
+    }
+    state_machine_.visit_current_states(std::forward<visitor>(visitor_fn));
+  }
+
+  scheduler_type & scheduler() noexcept { return scheduler_; }
+  const scheduler_type & scheduler() const noexcept { return scheduler_; }
+
+  allocator_type & allocator() noexcept { return allocator_; }
+  const allocator_type & allocator() const noexcept { return allocator_; }
+
+ protected:
+  state_machine_type & raw_sm() { return state_machine_; }
+  const state_machine_type & raw_sm() const { return state_machine_; }
+
+ private:
+  state_machine_type state_machine_{};
+  scheduler_type scheduler_;
+  allocator_type allocator_{};
+  mutable std::atomic<bool> dispatch_active_ = false;
+};
+
+template <bool multi_consumer, class model, class scheduler_policy,
+          class allocator_policy, class... policies>
+struct co_sm_backend_selector;
+
+template <class model, class scheduler_policy, class allocator_policy,
+          class... policies>
+struct co_sm_backend_selector<false, model, scheduler_policy, allocator_policy,
+                              policies...> {
+  using type =
+      stateforward::sml::utility::co_sm<model, scheduler_policy,
+                                        allocator_policy, policies...>;
+};
+
+template <class model, class scheduler_policy, class allocator_policy,
+          class... policies>
+struct co_sm_backend_selector<true, model, scheduler_policy, allocator_policy,
+                              policies...> {
+  using type =
+      multi_consumer_co_sm_backend<model, scheduler_policy, allocator_policy,
+                                   policies...>;
+};
+
+template <class scheduler, class = void>
+struct is_multi_consumer_scheduler : std::false_type {};
+
+template <class scheduler>
+struct is_multi_consumer_scheduler<
+    scheduler, std::void_t<decltype(scheduler::multi_consumer)>>
+    : std::bool_constant<static_cast<bool>(scheduler::multi_consumer)> {};
+
+template <class scheduler>
+inline constexpr bool is_multi_consumer_scheduler_v =
+    is_multi_consumer_scheduler<scheduler>::value;
+
+template <class model, class scheduler_policy, class allocator_policy,
+          class... policies>
+using co_sm_backend_t = typename co_sm_backend_selector<
+    is_multi_consumer_scheduler_v<typename scheduler_policy::scheduler_type>,
+    model, scheduler_policy, allocator_policy, policies...>::type;
+
+}  // namespace detail
+
+template <class model, class scheduler_policy, class allocator_policy,
+          class... policies>
+class co_sm<model, void, scheduler_policy, allocator_policy, policies...> {
+ public:
+  using model_type = model;
+  using context_type = void;
+  using scheduler_policy_type = scheduler_policy;
+  using scheduler_type = typename scheduler_policy_type::scheduler_type;
+  using allocator_policy_type = allocator_policy;
+  using allocator_type = typename allocator_policy_type::allocator_type;
+  using state_machine_type =
+      detail::co_sm_backend_t<model, scheduler_policy, allocator_policy,
+                              policies...>;
+
+  co_sm() = default;
+  ~co_sm() = default;
+
+  co_sm(const co_sm &) = default;
+  co_sm(co_sm &&) = default;
+  co_sm & operator=(const co_sm &) = default;
+  co_sm & operator=(co_sm &&) = default;
+
+  template <class... args>
+  explicit co_sm(args &&... args_in)
+      : state_machine_(std::forward<args>(args_in)...) {}
+
+  explicit co_sm(scheduler_type scheduler_in)
+    requires detail::is_multi_consumer_scheduler_v<scheduler_type>
+      : state_machine_(scheduler_in) {}
+
+  template <class... args>
+  explicit co_sm(scheduler_type scheduler_in, args &&... args_in)
+    requires detail::is_multi_consumer_scheduler_v<scheduler_type>
+      : state_machine_(scheduler_in, std::forward<args>(args_in)...) {}
+
+  template <class event>
+  bool process_event(const event & ev) {
+    const bool accepted = state_machine_.process_event(ev);
+    return detail::normalize_event_result(ev, accepted);
+  }
+
+  template <class event>
+  bool_task process_event_async(const event & ev) {
+    if constexpr (std::is_same_v<scheduler_type, policy::inline_scheduler>) {
+      const bool accepted = state_machine_.process_event_async(ev).result();
+      return bool_task::from_value(detail::normalize_event_result(ev, accepted));
+    } else if constexpr (detail::is_multi_consumer_scheduler_v<scheduler_type>) {
+      const bool accepted = state_machine_.process_event_async(ev).result();
+      return bool_task::from_value(detail::normalize_event_result(ev, accepted));
+    } else if constexpr (requires(scheduler_type & scheduler) {
+                           {
+                             scheduler.try_run_immediate([]() noexcept {})
+                           } -> std::same_as<bool>;
+                         }) {
+      bool accepted = false;
+      const bool completed = state_machine_.scheduler().try_run_immediate(
+          [this, &ev, &accepted]() {
+            accepted = state_machine_.process_event(ev);
+          });
+      return bool_task::from_value(
+          completed && detail::normalize_event_result(ev, accepted));
+    } else {
+      auto task = state_machine_.process_event_async(ev);
+      if (!task.await_ready()) {
+        std::terminate();
+      }
+      const bool accepted = task.result();
+      return bool_task::from_value(detail::normalize_event_result(ev, accepted));
+    }
+  }
+
+  template <class state>
+  bool is(state state_value = {}) const {
+    return state_machine_.is(state_value);
+  }
+
+  template <class visitor>
+  void visit_current_states(visitor && visitor_fn) {
+    state_machine_.visit_current_states(std::forward<visitor>(visitor_fn));
+  }
+
+  scheduler_type & scheduler() noexcept { return state_machine_.scheduler(); }
+  const scheduler_type & scheduler() const noexcept {
+    return state_machine_.scheduler();
+  }
+
+  allocator_type & allocator() noexcept { return state_machine_.allocator(); }
+  const allocator_type & allocator() const noexcept {
+    return state_machine_.allocator();
+  }
+
+ protected:
+  state_machine_type & raw_sm() { return state_machine_; }
+  const state_machine_type & raw_sm() const { return state_machine_; }
+
+ private:
+  state_machine_type state_machine_;
+};
+
+template <class model, class context, class scheduler_policy,
+          class allocator_policy, class... policies>
+class co_sm {
+ public:
+  static_assert(!std::is_void_v<context>, "contextful co_sm requires a non-void context type");
+
+  using model_type = model;
+  using context_type = context;
+  using scheduler_policy_type = scheduler_policy;
+  using scheduler_type = typename scheduler_policy_type::scheduler_type;
+  using allocator_policy_type = allocator_policy;
+  using allocator_type = typename allocator_policy_type::allocator_type;
+  using state_machine_type =
+      detail::co_sm_backend_t<model, scheduler_policy, allocator_policy,
+                              policies...>;
+
+  co_sm() : state_machine_(context_) {}
+  explicit co_sm(const context_type & context_in)
+      : context_(context_in), state_machine_(context_) {}
+  explicit co_sm(context_type && context_in)
+      : context_(std::move(context_in)), state_machine_(context_) {}
+
+  explicit co_sm(scheduler_type scheduler_in)
+    requires detail::is_multi_consumer_scheduler_v<scheduler_type>
+      : state_machine_(scheduler_in, context_) {}
+
+  co_sm(scheduler_type scheduler_in, const context_type & context_in)
+    requires detail::is_multi_consumer_scheduler_v<scheduler_type>
+      : context_(context_in), state_machine_(scheduler_in, context_) {}
+
+  co_sm(scheduler_type scheduler_in, context_type && context_in)
+    requires detail::is_multi_consumer_scheduler_v<scheduler_type>
+      : context_(std::move(context_in)), state_machine_(scheduler_in, context_) {}
+
+  template <class... args>
+    requires (sizeof...(args) > 0)
+  explicit co_sm(const context_type & context_in, args &&... args_in)
+      : context_(context_in),
+        state_machine_(context_, std::forward<args>(args_in)...) {}
+
+  template <class... args>
+    requires (sizeof...(args) > 0)
+  explicit co_sm(context_type && context_in, args &&... args_in)
+      : context_(std::move(context_in)),
+        state_machine_(context_, std::forward<args>(args_in)...) {}
+
+  co_sm(const co_sm &) = default;
+  co_sm(co_sm &&) = default;
+  co_sm & operator=(const co_sm &) = default;
+  co_sm & operator=(co_sm &&) = default;
+  ~co_sm() = default;
+
+  template <class event>
+  bool process_event(const event & ev) {
+    const bool accepted = state_machine_.process_event(ev);
+    return detail::normalize_event_result(ev, accepted);
+  }
+
+  template <class event>
+  bool_task process_event_async(const event & ev) {
+    if constexpr (std::is_same_v<scheduler_type, policy::inline_scheduler>) {
+      const bool accepted = state_machine_.process_event_async(ev).result();
+      return bool_task::from_value(detail::normalize_event_result(ev, accepted));
+    } else if constexpr (detail::is_multi_consumer_scheduler_v<scheduler_type>) {
+      const bool accepted = state_machine_.process_event_async(ev).result();
+      return bool_task::from_value(detail::normalize_event_result(ev, accepted));
+    } else if constexpr (requires(scheduler_type & scheduler) {
+                           {
+                             scheduler.try_run_immediate([]() noexcept {})
+                           } -> std::same_as<bool>;
+                         }) {
+      bool accepted = false;
+      const bool completed = state_machine_.scheduler().try_run_immediate(
+          [this, &ev, &accepted]() {
+            accepted = state_machine_.process_event(ev);
+          });
+      return bool_task::from_value(
+          completed && detail::normalize_event_result(ev, accepted));
+    } else {
+      auto task = state_machine_.process_event_async(ev);
+      if (!task.await_ready()) {
+        std::terminate();
+      }
+      const bool accepted = task.result();
+      return bool_task::from_value(detail::normalize_event_result(ev, accepted));
+    }
+  }
+
+  template <class state>
+  bool is(state state_value = {}) const {
+    return state_machine_.is(state_value);
+  }
+
+  template <class visitor>
+  void visit_current_states(visitor && visitor_fn) {
+    state_machine_.visit_current_states(std::forward<visitor>(visitor_fn));
+  }
+
+  scheduler_type & scheduler() noexcept { return state_machine_.scheduler(); }
+  const scheduler_type & scheduler() const noexcept {
+    return state_machine_.scheduler();
+  }
+
+  allocator_type & allocator() noexcept { return state_machine_.allocator(); }
+  const allocator_type & allocator() const noexcept {
+    return state_machine_.allocator();
+  }
+
+ protected:
+  context_type context_{};
+  state_machine_type & raw_sm() { return state_machine_; }
+  const state_machine_type & raw_sm() const { return state_machine_; }
+
+ private:
+  state_machine_type state_machine_;
+};
+
 template <class kind_enum, class sm_list, class event_list>
 class sm_any {
  public:
diff --git a/src/emel/speech/decoder/whisper/actions.hpp b/src/emel/speech/decoder/whisper/actions.hpp
index 8089dad0..c89cb0ed 100644
--- a/src/emel/speech/decoder/whisper/actions.hpp
+++ b/src/emel/speech/decoder/whisper/actions.hpp
@@ -86,6 +86,7 @@ struct effect_run_decoder_variant {
         .space = tokens.space,
     };
     const uint64_t digest = kdetail::run_decoder_sequence<Variant, Aux>(
+        ctx.kernel,
         *runtime_ev.request.contract.model,
         runtime_ev.request.encoder_state.data(),
         static_cast<uint64_t>(runtime_ev.request.encoder_frame_count),
diff --git a/src/emel/speech/decoder/whisper/context.hpp b/src/emel/speech/decoder/whisper/context.hpp
index f3aa3c73..46703590 100644
--- a/src/emel/speech/decoder/whisper/context.hpp
+++ b/src/emel/speech/decoder/whisper/context.hpp
@@ -2,9 +2,12 @@
 
 #include <cstdint>
 
+#include "emel/kernel/sm.hpp"
+
 namespace emel::speech::decoder::whisper::action {
 
 struct context {
+  emel::kernel::sm kernel{emel::kernel::detect_host_kind()};
   uint64_t q8_0_dispatch_count = 0;
   uint64_t q4_0_dispatch_count = 0;
   uint64_t q4_1_dispatch_count = 0;
diff --git a/src/emel/speech/decoder/whisper/detail.hpp b/src/emel/speech/decoder/whisper/detail.hpp
index cd3e57ed..215898bc 100644
--- a/src/emel/speech/decoder/whisper/detail.hpp
+++ b/src/emel/speech/decoder/whisper/detail.hpp
@@ -12,7 +12,7 @@
 #include <arm_neon.h>
 #endif
 
-#include "emel/kernel/aarch64/actions.hpp"
+#include "emel/kernel/sm.hpp"
 #include "emel/model/data.hpp"
 #include "emel/speech/decoder/whisper/errors.hpp"
 
@@ -286,14 +286,14 @@ inline float dot_linear_row(const emel::model::data::tensor_record &weight,
 template <uint64_t In, uint64_t Out, bool HasBias,
           aux_weight_variant Aux = aux_weight_variant::q8_0>
 inline void
-linear_q8_0_quantized_input(const emel::model::data::tensor_record &weight,
+linear_q8_0_quantized_input(::emel::kernel::sm &kernel,
+                            const emel::model::data::tensor_record &weight,
                             const emel::model::data::tensor_record *bias,
                             const float *input, float *output) noexcept {
   const uint64_t row_bytes =
       ::emel::kernel::detail::quantized_row_storage_bytes(
           ::emel::kernel::detail::dtype_q8_0, In);
-#if defined(__aarch64__) || defined(__ARM_NEON)
-  ::emel::kernel::event::op_mul_mat request{
+  const ::emel::kernel::event::op_mul_mat request{
       .src0 =
           {
               .data = weight.data,
@@ -327,27 +327,8 @@ linear_q8_0_quantized_input(const emel::model::data::tensor_record &weight,
                       sizeof(float) * Out,
                   },
           },
-      .nth = 1u,
   };
-  ::emel::kernel::aarch64::detail::execute_neon_mul_mat_q8_0_vector_unchecked(
-      request);
-#else
-  constexpr uint64_t block_count = In / ::emel::kernel::detail::quant::QK8_0;
-  std::array<::emel::kernel::detail::quant::block_q8_0,
-             static_cast<size_t>(block_count)>
-      input_blocks = {};
-  ::emel::kernel::detail::quant::quantize_row_q8_0_strided(
-      input, 1u, input_blocks.data(), static_cast<int64_t>(In));
-  for (uint64_t row = 0; row < Out; ++row) {
-    const auto *row_base =
-        static_cast<const uint8_t *>(weight.data) + row * row_bytes;
-    const auto *weight_blocks =
-        reinterpret_cast<const ::emel::kernel::detail::quant::block_q8_0 *>(
-            row_base);
-    output[row] = ::emel::kernel::detail::dot_q8_0_q8_0_row_scalar(
-        weight_blocks, input_blocks.data(), block_count);
-  }
-#endif
+  (void) kernel.process_event(request);
   if constexpr (HasBias) {
     for (uint64_t row = 0; row < Out; ++row) {
       output[row] += read_aux_vector<Aux>(*bias, row);
@@ -392,13 +373,15 @@ inline void layer_norm_frame(const float *input,
 
 template <linear_weight_variant Variant, uint64_t In, uint64_t Out,
           aux_weight_variant Aux = aux_weight_variant::q8_0>
-inline void linear(const emel::model::data::tensor_record &weight,
+inline void linear(::emel::kernel::sm &kernel,
+                   const emel::model::data::tensor_record &weight,
                    const emel::model::data::tensor_record &bias,
                    const float *input, float *output) noexcept {
   if constexpr (Variant == linear_weight_variant::q8_0) {
-    linear_q8_0_quantized_input<In, Out, true, Aux>(weight, &bias, input,
-                                                    output);
+    linear_q8_0_quantized_input<In, Out, true, Aux>(kernel, weight, &bias,
+                                                    input, output);
   } else {
+    (void) kernel;
     for (uint64_t row = 0; row < Out; ++row) {
       output[row] = dot_linear_row<Variant>(weight, row, input, In) +
                     read_aux_vector<Aux>(bias, row);
@@ -407,11 +390,14 @@ inline void linear(const emel::model::data::tensor_record &weight,
 }
 
 template <linear_weight_variant Variant, uint64_t In, uint64_t Out>
-inline void linear_no_bias(const emel::model::data::tensor_record &weight,
+inline void linear_no_bias(::emel::kernel::sm &kernel,
+                           const emel::model::data::tensor_record &weight,
                            const float *input, float *output) noexcept {
   if constexpr (Variant == linear_weight_variant::q8_0) {
-    linear_q8_0_quantized_input<In, Out, false>(weight, nullptr, input, output);
+    linear_q8_0_quantized_input<In, Out, false>(kernel, weight, nullptr, input,
+                                                output);
   } else {
+    (void) kernel;
     for (uint64_t row = 0; row < Out; ++row) {
       output[row] = dot_linear_row<Variant>(weight, row, input, In);
     }
@@ -455,7 +441,8 @@ inline uint64_t write_layer_tensor_name(char *output, const char *prefix,
 }
 
 template <linear_weight_variant Variant, aux_weight_variant Aux>
-inline void compute_decoder_cross_cache(const emel::model::data &model,
+inline void compute_decoder_cross_cache(::emel::kernel::sm &kernel,
+                                        const emel::model::data &model,
                                         const float *encoder_state,
                                         const uint64_t encoder_frames,
                                         float *cross_k_cache,
@@ -479,16 +466,18 @@ inline void compute_decoder_cross_cache(const emel::model::data &model,
     for (uint64_t frame = 0; frame < encoder_frames; ++frame) {
       const float *frame_in = encoder_state + frame * width;
       linear_no_bias<Variant, k_embedding_length, k_embedding_length>(
-          cross_k_w, frame_in, layer_cross_k + frame * width);
+          kernel, cross_k_w, frame_in, layer_cross_k + frame * width);
       linear<Variant, k_embedding_length, k_embedding_length, Aux>(
-          cross_v_w, cross_v_b, frame_in, layer_cross_v + frame * width);
+          kernel, cross_v_w, cross_v_b, frame_in,
+          layer_cross_v + frame * width);
     }
   }
 }
 
 template <linear_weight_variant Variant, aux_weight_variant Aux>
 inline void
-run_decoder_layer_sequence(const emel::model::data &model, const uint64_t layer,
+run_decoder_layer_sequence(::emel::kernel::sm &kernel,
+                           const emel::model::data &model, const uint64_t layer,
                            const uint64_t encoder_frames,
                            const uint64_t token_count, const float *cross_k,
                            const float *cross_v, float *hidden, float *next,
@@ -528,11 +517,11 @@ run_decoder_layer_sequence(const emel::model::data &model, const uint64_t layer,
   for (uint64_t token = 0; token < token_count; ++token) {
     layer_norm_frame<Aux>(hidden + token * width, self_ln_w, self_ln_b, norm);
     linear<Variant, k_embedding_length, k_embedding_length, Aux>(
-        self_q_w, self_q_b, norm, q + token * width);
+        kernel, self_q_w, self_q_b, norm, q + token * width);
     linear_no_bias<Variant, k_embedding_length, k_embedding_length>(
-        self_k_w, norm, k + token * width);
+        kernel, self_k_w, norm, k + token * width);
     linear<Variant, k_embedding_length, k_embedding_length, Aux>(
-        self_v_w, self_v_b, norm, v + token * width);
+        kernel, self_v_w, self_v_b, norm, v + token * width);
   }
 
   const float scale =
@@ -563,7 +552,7 @@ run_decoder_layer_sequence(const emel::model::data &model, const uint64_t layer,
       }
     }
     linear<Variant, k_embedding_length, k_embedding_length, Aux>(
-        self_o_w, self_o_b, attn + token * width, norm);
+        kernel, self_o_w, self_o_b, attn + token * width, norm);
     for (uint64_t dim = 0; dim < width; ++dim) {
       next[token * width + dim] = hidden[token * width + dim] + norm[dim];
     }
@@ -572,7 +561,7 @@ run_decoder_layer_sequence(const emel::model::data &model, const uint64_t layer,
   for (uint64_t token = 0; token < token_count; ++token) {
     layer_norm_frame<Aux>(next + token * width, cross_ln_w, cross_ln_b, norm);
     linear<Variant, k_embedding_length, k_embedding_length, Aux>(
-        cross_q_w, cross_q_b, norm, q + token * width);
+        kernel, cross_q_w, cross_q_b, norm, q + token * width);
     std::fill_n(attn + token * width, static_cast<size_t>(width), 0.0f);
     for (uint64_t head = 0;
          head < static_cast<uint64_t>(k_attention_head_count); ++head) {
@@ -602,7 +591,7 @@ run_decoder_layer_sequence(const emel::model::data &model, const uint64_t layer,
       }
     }
     linear<Variant, k_embedding_length, k_embedding_length, Aux>(
-        cross_o_w, cross_o_b, attn + token * width, norm);
+        kernel, cross_o_w, cross_o_b, attn + token * width, norm);
     for (uint64_t dim = 0; dim < width; ++dim) {
       hidden[token * width + dim] = next[token * width + dim] + norm[dim];
     }
@@ -611,13 +600,13 @@ run_decoder_layer_sequence(const emel::model::data &model, const uint64_t layer,
   for (uint64_t token = 0; token < token_count; ++token) {
     layer_norm_frame<Aux>(hidden + token * width, final_ln_w, final_ln_b, norm);
     linear<Variant, k_embedding_length, k_feed_forward_length, Aux>(
-        fc1_w, fc1_b, norm, ff);
+        kernel, fc1_w, fc1_b, norm, ff);
     for (uint64_t dim = 0; dim < static_cast<uint64_t>(k_feed_forward_length);
          ++dim) {
       ff[dim] = gelu(ff[dim]);
     }
     linear<Variant, k_feed_forward_length, k_embedding_length, Aux>(
-        fc2_w, fc2_b, ff, norm);
+        kernel, fc2_w, fc2_b, ff, norm);
     for (uint64_t dim = 0; dim < width; ++dim) {
       hidden[token * width + dim] += norm[dim];
     }
@@ -638,6 +627,7 @@ inline uint64_t digest_f32(const float *values, const uint64_t count) noexcept {
 template <linear_weight_variant Variant,
           aux_weight_variant Aux = aux_weight_variant::q8_0>
 inline void compute_decoder_logits_for_tokens(
+    ::emel::kernel::sm &kernel,
     const emel::model::data &model, const uint64_t encoder_frames,
     const float *cross_k_cache, const float *cross_v_cache,
     const int32_t *tokens, const uint64_t token_count, float *workspace,
@@ -678,9 +668,9 @@ inline void compute_decoder_logits_for_tokens(
     const uint64_t layer_offset =
         layer * encoder_frames * static_cast<uint64_t>(k_embedding_length);
     run_decoder_layer_sequence<Variant, Aux>(
-        model, layer, encoder_frames, token_count, cross_k_cache + layer_offset,
-        cross_v_cache + layer_offset, hidden, next, q, k, v, attn, norm, ff,
-        scores);
+        kernel, model, layer, encoder_frames, token_count,
+        cross_k_cache + layer_offset, cross_v_cache + layer_offset, hidden,
+        next, q, k, v, attn, norm, ff, scores);
   }
 
   const auto &final_w = *find_tensor(
@@ -693,8 +683,8 @@ inline void compute_decoder_logits_for_tokens(
                         final_w, final_b, norm);
 
   if constexpr (Variant == linear_weight_variant::q8_0) {
-    linear_no_bias<Variant, k_embedding_length, k_vocab_size>(token_embedding,
-                                                              norm, logits);
+    linear_no_bias<Variant, k_embedding_length, k_vocab_size>(
+        kernel, token_embedding, norm, logits);
   } else {
     for (uint64_t token = 0; token < static_cast<uint64_t>(k_vocab_size);
          ++token) {
@@ -801,6 +791,7 @@ inline int32_t select_greedy_timestamp_aware_token(
 template <linear_weight_variant Variant,
           aux_weight_variant Aux = aux_weight_variant::q8_0>
 inline uint64_t run_decoder_sequence(
+    ::emel::kernel::sm &kernel,
     const emel::model::data &model, const float *encoder_state,
     const uint64_t encoder_frames, const decode_policy_runtime &policy,
     const int32_t *prompt_tokens, const uint64_t prompt_token_count,
@@ -824,14 +815,16 @@ inline uint64_t run_decoder_sequence(
   float *cross_v_cache = cross_k_cache + cross_cache_count;
   float *step_workspace = cross_v_cache + cross_cache_count;
   compute_decoder_cross_cache<Variant, Aux>(
-      model, encoder_state, encoder_frames, cross_k_cache, cross_v_cache);
+      kernel, model, encoder_state, encoder_frames, cross_k_cache,
+      cross_v_cache);
   uint64_t token_count = prompt_token_count;
   uint64_t digest = 0u;
   generated_token_count_out = 0u;
   for (uint64_t step = 0; step < generation_limit; ++step) {
     float raw_confidence = 0.0f;
     compute_decoder_logits_for_tokens<Variant, Aux>(
-        model, encoder_frames, cross_k_cache, cross_v_cache, tokens.data(),
+        kernel, model, encoder_frames, cross_k_cache, cross_v_cache,
+        tokens.data(),
         token_count, step_workspace, logits, raw_confidence, digest);
     const int32_t next_token = select_greedy_timestamp_aware_token(
         policy, logits, generated_tokens, step, step == 0u, confidence_out);
diff --git a/src/emel/speech/encoder/whisper/actions.hpp b/src/emel/speech/encoder/whisper/actions.hpp
index bec26911..a3b2b1f2 100644
--- a/src/emel/speech/encoder/whisper/actions.hpp
+++ b/src/emel/speech/encoder/whisper/actions.hpp
@@ -76,7 +76,8 @@ struct effect_run_encoder_variant {
                   context &ctx) const noexcept {
     uint64_t frame_count = 0u;
     const uint64_t digest = kdetail::run_encoder<Variant, Aux>(
-        *runtime_ev.request.contract.model, runtime_ev.request.pcm.data(),
+        ctx.kernel, *runtime_ev.request.contract.model,
+        runtime_ev.request.pcm.data(),
         static_cast<uint64_t>(runtime_ev.request.pcm.size()),
         runtime_ev.request.workspace.data(),
         runtime_ev.request.encoder_state.data(), frame_count);
diff --git a/src/emel/speech/encoder/whisper/context.hpp b/src/emel/speech/encoder/whisper/context.hpp
index 1624efa0..c8e172dc 100644
--- a/src/emel/speech/encoder/whisper/context.hpp
+++ b/src/emel/speech/encoder/whisper/context.hpp
@@ -2,9 +2,12 @@
 
 #include <cstdint>
 
+#include "emel/kernel/sm.hpp"
+
 namespace emel::speech::encoder::whisper::action {
 
 struct context {
+  emel::kernel::sm kernel{emel::kernel::detect_host_kind()};
   uint64_t q8_0_dispatch_count = 0;
   uint64_t q4_0_dispatch_count = 0;
   uint64_t q4_1_dispatch_count = 0;
diff --git a/src/emel/speech/encoder/whisper/detail.hpp b/src/emel/speech/encoder/whisper/detail.hpp
index 53cab70f..17b0c14b 100644
--- a/src/emel/speech/encoder/whisper/detail.hpp
+++ b/src/emel/speech/encoder/whisper/detail.hpp
@@ -14,9 +14,9 @@
 #endif
 
 #include "emel/error/error.hpp"
-#include "emel/kernel/aarch64/actions.hpp"
 #include "emel/kernel/detail.hpp"
 #include "emel/kernel/events.hpp"
+#include "emel/kernel/sm.hpp"
 #include "emel/model/data.hpp"
 #include "emel/speech/encoder/whisper/errors.hpp"
 
@@ -310,14 +310,14 @@ inline float dot_linear_row(const emel::model::data::tensor_record &weight,
 template <uint64_t In, uint64_t Out, bool HasBias,
           aux_weight_variant Aux = aux_weight_variant::q8_0>
 inline void
-linear_q8_0_quantized_input(const emel::model::data::tensor_record &weight,
+linear_q8_0_quantized_input(::emel::kernel::sm &kernel,
+                            const emel::model::data::tensor_record &weight,
                             const emel::model::data::tensor_record *bias,
                             const float *input, float *output) noexcept {
   const uint64_t row_bytes =
       ::emel::kernel::detail::quantized_row_storage_bytes(
           ::emel::kernel::detail::dtype_q8_0, In);
-#if defined(__aarch64__) || defined(__ARM_NEON)
-  ::emel::kernel::event::op_mul_mat request{
+  const ::emel::kernel::event::op_mul_mat request{
       .src0 =
           {
               .data = weight.data,
@@ -351,27 +351,8 @@ linear_q8_0_quantized_input(const emel::model::data::tensor_record &weight,
                       sizeof(float) * Out,
                   },
           },
-      .nth = 1u,
   };
-  ::emel::kernel::aarch64::detail::execute_neon_mul_mat_q8_0_vector_unchecked(
-      request);
-#else
-  constexpr uint64_t block_count = In / ::emel::kernel::detail::quant::QK8_0;
-  std::array<::emel::kernel::detail::quant::block_q8_0,
-             static_cast<size_t>(block_count)>
-      input_blocks = {};
-  ::emel::kernel::detail::quant::quantize_row_q8_0_strided(
-      input, 1u, input_blocks.data(), static_cast<int64_t>(In));
-  for (uint64_t row = 0; row < Out; ++row) {
-    const auto *row_base =
-        static_cast<const uint8_t *>(weight.data) + row * row_bytes;
-    const auto *weight_blocks =
-        reinterpret_cast<const ::emel::kernel::detail::quant::block_q8_0 *>(
-            row_base);
-    output[row] = ::emel::kernel::detail::dot_q8_0_q8_0_row_scalar(
-        weight_blocks, input_blocks.data(), block_count);
-  }
-#endif
+  (void)kernel.process_event(request);
   if constexpr (HasBias) {
     for (uint64_t row = 0; row < Out; ++row) {
       output[row] += read_aux_vector<Aux>(*bias, row);
@@ -594,13 +575,15 @@ inline void layer_norm_frame(const float *input,
 
 template <linear_weight_variant Variant, uint64_t In, uint64_t Out,
           aux_weight_variant Aux = aux_weight_variant::q8_0>
-inline void linear(const emel::model::data::tensor_record &weight,
+inline void linear(::emel::kernel::sm &kernel,
+                   const emel::model::data::tensor_record &weight,
                    const emel::model::data::tensor_record &bias,
                    const float *input, float *output) noexcept {
   if constexpr (Variant == linear_weight_variant::q8_0) {
-    linear_q8_0_quantized_input<In, Out, true, Aux>(weight, &bias, input,
-                                                    output);
+    linear_q8_0_quantized_input<In, Out, true, Aux>(kernel, weight, &bias,
+                                                    input, output);
   } else {
+    (void)kernel;
     for (uint64_t row = 0; row < Out; ++row) {
       output[row] = dot_linear_row<Variant>(weight, row, input, In) +
                     read_aux_vector<Aux>(bias, row);
@@ -609,11 +592,14 @@ inline void linear(const emel::model::data::tensor_record &weight,
 }
 
 template <linear_weight_variant Variant, uint64_t In, uint64_t Out>
-inline void linear_no_bias(const emel::model::data::tensor_record &weight,
+inline void linear_no_bias(::emel::kernel::sm &kernel,
+                           const emel::model::data::tensor_record &weight,
                            const float *input, float *output) noexcept {
   if constexpr (Variant == linear_weight_variant::q8_0) {
-    linear_q8_0_quantized_input<In, Out, false>(weight, nullptr, input, output);
+    linear_q8_0_quantized_input<In, Out, false>(kernel, weight, nullptr, input,
+                                                output);
   } else {
+    (void)kernel;
     for (uint64_t row = 0; row < Out; ++row) {
       output[row] = dot_linear_row<Variant>(weight, row, input, In);
     }
@@ -773,10 +759,10 @@ inline uint64_t write_layer_tensor_name(char *output, const char *prefix,
 
 template <linear_weight_variant Variant, aux_weight_variant Aux>
 inline void
-run_encoder_layer(const emel::model::data &model, const uint64_t layer,
-                  const uint64_t encoder_frames, float *hidden, float *next,
-                  float *q, float *k, float *v, float *attn, float *norm,
-                  float *ff, float *scores) noexcept {
+run_encoder_layer(::emel::kernel::sm &kernel, const emel::model::data &model,
+                  const uint64_t layer, const uint64_t encoder_frames,
+                  float *hidden, float *next, float *q, float *k, float *v,
+                  float *attn, float *norm, float *ff, float *scores) noexcept {
   char name[96] = {};
   const auto layer_tensor = [&](const char *suffix) noexcept {
     const uint64_t name_size =
@@ -806,11 +792,14 @@ run_encoder_layer(const emel::model::data &model, const uint64_t layer,
         hidden + frame * static_cast<uint64_t>(k_embedding_length);
     layer_norm_frame<Aux>(frame_in, ln1_w, ln1_b, norm);
     linear<Variant, k_embedding_length, k_embedding_length, Aux>(
-        q_w, q_b, norm, q + frame * static_cast<uint64_t>(k_embedding_length));
+        kernel, q_w, q_b, norm,
+        q + frame * static_cast<uint64_t>(k_embedding_length));
     linear_no_bias<Variant, k_embedding_length, k_embedding_length>(
-        k_w, norm, k + frame * static_cast<uint64_t>(k_embedding_length));
+        kernel, k_w, norm,
+        k + frame * static_cast<uint64_t>(k_embedding_length));
     linear<Variant, k_embedding_length, k_embedding_length, Aux>(
-        v_w, v_b, norm, v + frame * static_cast<uint64_t>(k_embedding_length));
+        kernel, v_w, v_b, norm,
+        v + frame * static_cast<uint64_t>(k_embedding_length));
   }
 
   const float scale =
@@ -845,8 +834,8 @@ run_encoder_layer(const emel::model::data &model, const uint64_t layer,
       }
     }
     linear<Variant, k_embedding_length, k_embedding_length, Aux>(
-        o_w, o_b, attn + frame * static_cast<uint64_t>(k_embedding_length),
-        norm);
+        kernel, o_w, o_b,
+        attn + frame * static_cast<uint64_t>(k_embedding_length), norm);
     for (uint64_t dim = 0; dim < static_cast<uint64_t>(k_embedding_length);
          ++dim) {
       next[frame * static_cast<uint64_t>(k_embedding_length) + dim] =
@@ -860,13 +849,13 @@ run_encoder_layer(const emel::model::data &model, const uint64_t layer,
         next + frame * static_cast<uint64_t>(k_embedding_length);
     layer_norm_frame<Aux>(frame_data, ln2_w, ln2_b, norm);
     linear<Variant, k_embedding_length, k_feed_forward_length, Aux>(
-        fc1_w, fc1_b, norm, ff);
+        kernel, fc1_w, fc1_b, norm, ff);
     for (uint64_t dim = 0; dim < static_cast<uint64_t>(k_feed_forward_length);
          ++dim) {
       ff[dim] = gelu(ff[dim]);
     }
     linear<Variant, k_feed_forward_length, k_embedding_length, Aux>(
-        fc2_w, fc2_b, ff, norm);
+        kernel, fc2_w, fc2_b, ff, norm);
     for (uint64_t dim = 0; dim < static_cast<uint64_t>(k_embedding_length);
          ++dim) {
       hidden[frame * static_cast<uint64_t>(k_embedding_length) + dim] =
@@ -888,10 +877,10 @@ inline uint64_t digest_f32(const float *values, const uint64_t count) noexcept {
 
 template <linear_weight_variant Variant,
           aux_weight_variant Aux = aux_weight_variant::q8_0>
-inline uint64_t run_encoder(const emel::model::data &model, const float *pcm,
-                            const uint64_t sample_count, float *workspace,
-                            float *output,
-                            uint64_t &encoder_frames_out) noexcept {
+inline uint64_t
+run_encoder(::emel::kernel::sm &kernel, const emel::model::data &model,
+            const float *pcm, const uint64_t sample_count, float *workspace,
+            float *output, uint64_t &encoder_frames_out) noexcept {
   const uint64_t mel_frames = mel_frame_count_for_samples(sample_count);
   const uint64_t encoder_frames =
       encoder_frame_count_for_mel_frames(mel_frames);
@@ -941,8 +930,9 @@ inline uint64_t run_encoder(const emel::model::data &model, const float *pcm,
 
   for (uint64_t layer = 0; layer < static_cast<uint64_t>(k_encoder_block_count);
        ++layer) {
-    run_encoder_layer<Variant, Aux>(model, layer, encoder_frames, hidden, next,
-                                    q, k, v, attn, norm, ff, scores);
+    run_encoder_layer<Variant, Aux>(kernel, model, layer, encoder_frames,
+                                    hidden, next, q, k, v, attn, norm, ff,
+                                    scores);
   }
 
   const auto &final_w = *find_tensor(
diff --git a/src/emel/text/generator/actions.hpp b/src/emel/text/generator/actions.hpp
index 2b784b5c..d4b9fd7d 100644
--- a/src/emel/text/generator/actions.hpp
+++ b/src/emel/text/generator/actions.hpp
@@ -278,7 +278,8 @@ struct request_memory_snapshot {
   }
 };
 
-template <emel::text::generator::detail::step_kind kind, auto run_kernel_fn>
+template <emel::text::generator::detail::step_kind kind,
+          auto run_kernel_fn>
 inline void request_phase_compute(const event::generate_run & ev, context & ctx) noexcept {
   ev.ctx.phase_code = static_cast<int32_t>(emel::error::cast(emel::graph::error::none));
   ev.ctx.graph_output = {};
@@ -363,7 +364,8 @@ inline void request_phase_compute(const event::generate_run & ev, context & ctx)
   ev.ctx.phase_accepted = ctx.graph.process_event(compute_ev);
 }
 
-template <emel::text::generator::detail::step_kind kind, auto run_kernel_fn>
+template <emel::text::generator::detail::step_kind kind,
+          auto run_kernel_fn>
 inline void request_phase_compute_preselected_argmax(const event::generate_run & ev,
                                                      context & ctx) noexcept {
   ev.ctx.phase_code = static_cast<int32_t>(emel::error::cast(emel::graph::error::none));
@@ -502,6 +504,47 @@ struct request_decode_compute_flash_kernel {
   }
 };
 
+struct request_decode_compute_flash_parallel_packed_q8_0 {
+  void operator()(const event::generate_run & ev, context & ctx) const noexcept {
+    request_phase_compute<emel::text::generator::detail::step_kind::decode,
+                          emel::text::generator::detail::
+                              run_kernel_flash_decode_parallel_packed_q8_0>(ev, ctx);
+  }
+};
+
+struct request_decode_compute_flash_parallel_q8_k {
+  void operator()(const event::generate_run & ev, context & ctx) const noexcept {
+    request_phase_compute<emel::text::generator::detail::step_kind::decode,
+                          emel::text::generator::detail::
+                              run_kernel_flash_decode_parallel_q8_k>(ev, ctx);
+  }
+};
+
+struct request_decode_compute_flash_parallel_native_quantized {
+  void operator()(const event::generate_run & ev, context & ctx) const noexcept {
+    request_phase_compute<emel::text::generator::detail::step_kind::decode,
+                          emel::text::generator::detail::
+                              run_kernel_flash_decode_parallel_native_quantized>(ev, ctx);
+  }
+};
+
+struct request_decode_compute_flash_parallel_native_quantized_q8_k_logits {
+  void operator()(const event::generate_run & ev, context & ctx) const noexcept {
+    request_phase_compute<emel::text::generator::detail::step_kind::decode,
+                          emel::text::generator::detail::
+                              run_kernel_flash_decode_parallel_native_quantized_q8_k_logits>(
+        ev, ctx);
+  }
+};
+
+struct request_decode_compute_flash_parallel_kernel {
+  void operator()(const event::generate_run & ev, context & ctx) const noexcept {
+    request_phase_compute<emel::text::generator::detail::step_kind::decode,
+                          emel::text::generator::detail::
+                              run_kernel_flash_decode_parallel_kernel>(ev, ctx);
+  }
+};
+
 struct request_decode_compute_nonflash_packed_q8_0 {
   void operator()(const event::generate_run & ev, context & ctx) const noexcept {
     request_phase_compute<emel::text::generator::detail::step_kind::decode,
@@ -582,6 +625,46 @@ struct request_decode_compute_flash_preselected_argmax_kernel {
   }
 };
 
+struct request_decode_compute_flash_parallel_preselected_argmax_q8_k {
+  void operator()(const event::generate_run & ev, context & ctx) const noexcept {
+    request_phase_compute_preselected_argmax<
+        emel::text::generator::detail::step_kind::decode,
+        emel::text::generator::detail::
+            run_kernel_flash_decode_parallel_preselected_argmax_q8_k>(
+        ev, ctx);
+  }
+};
+
+struct request_decode_compute_flash_parallel_preselected_argmax_native_quantized_q8_k {
+  void operator()(const event::generate_run & ev, context & ctx) const noexcept {
+    request_phase_compute_preselected_argmax<
+        emel::text::generator::detail::step_kind::decode,
+        emel::text::generator::detail::
+            run_kernel_flash_decode_parallel_preselected_argmax_native_quantized_q8_k>(
+        ev, ctx);
+  }
+};
+
+struct request_decode_compute_flash_parallel_preselected_argmax_native_quantized_kernel {
+  void operator()(const event::generate_run & ev, context & ctx) const noexcept {
+    request_phase_compute_preselected_argmax<
+        emel::text::generator::detail::step_kind::decode,
+        emel::text::generator::detail::
+            run_kernel_flash_decode_parallel_preselected_argmax_native_quantized_kernel>(
+        ev, ctx);
+  }
+};
+
+struct request_decode_compute_flash_parallel_preselected_argmax_kernel {
+  void operator()(const event::generate_run & ev, context & ctx) const noexcept {
+    request_phase_compute_preselected_argmax<
+        emel::text::generator::detail::step_kind::decode,
+        emel::text::generator::detail::
+            run_kernel_flash_decode_parallel_preselected_argmax_kernel>(
+        ev, ctx);
+  }
+};
+
 struct request_decode_compute_nonflash_preselected_argmax_q8_k {
   void operator()(const event::generate_run & ev, context & ctx) const noexcept {
     request_phase_compute_preselected_argmax<emel::text::generator::detail::step_kind::decode,
@@ -880,46 +963,50 @@ struct on_unexpected {
 
 struct capture_diagnostics {
   void operator()(const event::capture_diagnostics & ev, const context & ctx) const noexcept {
-    ev.out.kernel_kind = ctx.compute.backend.kernel_kind;
-    ev.out.kernel_dispatch_calls = ctx.compute.backend.kernel_dispatch_calls;
-    ev.out.native_q8_0_dispatch_calls = ctx.compute.backend.native_q8_0_dispatch_calls;
-    ev.out.packed_q8_0_dispatch_calls = ctx.compute.backend.packed_q8_0_dispatch_calls;
-    ev.out.flash_attention_dispatch_calls = ctx.compute.backend.flash_attention_dispatch_calls;
+    const auto & backend = ctx.compute.backend;
+    const auto total = [&backend](auto counter) noexcept {
+      return detail::compute_kernel_counter_total(backend, counter);
+    };
+    ev.out.kernel_kind = backend.kernel_kind;
+    ev.out.kernel_dispatch_calls = backend.kernel_dispatch_calls;
+    ev.out.native_q8_0_dispatch_calls = backend.native_q8_0_dispatch_calls;
+    ev.out.packed_q8_0_dispatch_calls = backend.packed_q8_0_dispatch_calls;
+    ev.out.flash_attention_dispatch_calls = backend.flash_attention_dispatch_calls;
     ev.out.optimized_flash_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_flash_dispatch_count();
-    ev.out.shared_flash_dispatch_calls = ctx.compute.backend.kernel.shared_flash_dispatch_count();
-    ev.out.optimized_q2_dispatch_calls = ctx.compute.backend.kernel.optimized_q2_dispatch_count();
-    ev.out.shared_q2_dispatch_calls = ctx.compute.backend.kernel.shared_q2_dispatch_count();
-    ev.out.optimized_q3_dispatch_calls = ctx.compute.backend.kernel.optimized_q3_dispatch_count();
-    ev.out.shared_q3_dispatch_calls = ctx.compute.backend.kernel.shared_q3_dispatch_count();
-    ev.out.optimized_q4_dispatch_calls = ctx.compute.backend.kernel.optimized_q4_dispatch_count();
+        total(&emel::kernel::sm::optimized_flash_dispatch_count);
+    ev.out.shared_flash_dispatch_calls = total(&emel::kernel::sm::shared_flash_dispatch_count);
+    ev.out.optimized_q2_dispatch_calls = total(&emel::kernel::sm::optimized_q2_dispatch_count);
+    ev.out.shared_q2_dispatch_calls = total(&emel::kernel::sm::shared_q2_dispatch_count);
+    ev.out.optimized_q3_dispatch_calls = total(&emel::kernel::sm::optimized_q3_dispatch_count);
+    ev.out.shared_q3_dispatch_calls = total(&emel::kernel::sm::shared_q3_dispatch_count);
+    ev.out.optimized_q4_dispatch_calls = total(&emel::kernel::sm::optimized_q4_dispatch_count);
     ev.out.optimized_q4_vector_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q4_vector_dispatch_count();
+        total(&emel::kernel::sm::optimized_q4_vector_dispatch_count);
     ev.out.optimized_q4_vector_packed_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q4_vector_packed_dispatch_count();
+        total(&emel::kernel::sm::optimized_q4_vector_packed_dispatch_count);
     ev.out.optimized_q4_vector_packed_q8_rhs_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q4_vector_packed_q8_rhs_dispatch_count();
-    ev.out.shared_q4_dispatch_calls = ctx.compute.backend.kernel.shared_q4_dispatch_count();
-    ev.out.optimized_q6_dispatch_calls = ctx.compute.backend.kernel.optimized_q6_dispatch_count();
+        total(&emel::kernel::sm::optimized_q4_vector_packed_q8_rhs_dispatch_count);
+    ev.out.shared_q4_dispatch_calls = total(&emel::kernel::sm::shared_q4_dispatch_count);
+    ev.out.optimized_q6_dispatch_calls = total(&emel::kernel::sm::optimized_q6_dispatch_count);
     ev.out.optimized_q6_vector_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q6_vector_dispatch_count();
+        total(&emel::kernel::sm::optimized_q6_vector_dispatch_count);
     ev.out.optimized_q6_vector_argmax_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q6_vector_argmax_dispatch_count();
+        total(&emel::kernel::sm::optimized_q6_vector_argmax_dispatch_count);
     ev.out.optimized_q6_vector_packed_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q6_vector_packed_dispatch_count();
+        total(&emel::kernel::sm::optimized_q6_vector_packed_dispatch_count);
     ev.out.optimized_q6_vector_packed_q8_rhs_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q6_vector_packed_q8_rhs_dispatch_count();
+        total(&emel::kernel::sm::optimized_q6_vector_packed_q8_rhs_dispatch_count);
     ev.out.optimized_q6_vector_packed_q8_rhs_argmax_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q6_vector_packed_q8_rhs_argmax_dispatch_count();
+        total(&emel::kernel::sm::optimized_q6_vector_packed_q8_rhs_argmax_dispatch_count);
     ev.out.optimized_q6_vector_prepared_q8_rhs_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q6_vector_prepared_q8_rhs_dispatch_count();
+        total(&emel::kernel::sm::optimized_q6_vector_prepared_q8_rhs_dispatch_count);
     ev.out.optimized_q6_vector_prepared_q8_rhs_i8mm_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q6_vector_prepared_q8_rhs_i8mm_dispatch_count();
+        total(&emel::kernel::sm::optimized_q6_vector_prepared_q8_rhs_i8mm_dispatch_count);
     ev.out.optimized_q6_vector_prepared_q8_rhs_argmax_i8mm_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q6_vector_prepared_q8_rhs_argmax_i8mm_dispatch_count();
+        total(&emel::kernel::sm::optimized_q6_vector_prepared_q8_rhs_argmax_i8mm_dispatch_count);
     ev.out.optimized_q6_vector_q8_argmax_prepared_i8mm_dispatch_calls =
-        ctx.compute.backend.kernel.optimized_q6_vector_q8_argmax_prepared_i8mm_dispatch_count();
-    ev.out.shared_q6_dispatch_calls = ctx.compute.backend.kernel.shared_q6_dispatch_count();
+        total(&emel::kernel::sm::optimized_q6_vector_q8_argmax_prepared_i8mm_dispatch_count);
+    ev.out.shared_q6_dispatch_calls = total(&emel::kernel::sm::shared_q6_dispatch_count);
     ev.out.native_quantized_stage_count = detail::quantized_contract_stage_count(
         ctx.compute.backend,
         emel::model::llama::detail::quantized_contract_kind::native_quantized);
@@ -993,6 +1080,16 @@ inline constexpr request_decode_compute_flash_native_quantized
 inline constexpr request_decode_compute_flash_native_quantized_q8_k_logits
     request_decode_compute_flash_native_quantized_q8_k_logits{};
 inline constexpr request_decode_compute_flash_kernel request_decode_compute_flash_kernel{};
+inline constexpr request_decode_compute_flash_parallel_packed_q8_0
+    request_decode_compute_flash_parallel_packed_q8_0{};
+inline constexpr request_decode_compute_flash_parallel_q8_k
+    request_decode_compute_flash_parallel_q8_k{};
+inline constexpr request_decode_compute_flash_parallel_native_quantized
+    request_decode_compute_flash_parallel_native_quantized{};
+inline constexpr request_decode_compute_flash_parallel_native_quantized_q8_k_logits
+    request_decode_compute_flash_parallel_native_quantized_q8_k_logits{};
+inline constexpr request_decode_compute_flash_parallel_kernel
+    request_decode_compute_flash_parallel_kernel{};
 inline constexpr request_decode_compute_nonflash_packed_q8_0
     request_decode_compute_nonflash_packed_q8_0{};
 inline constexpr request_decode_compute_nonflash_q8_k request_decode_compute_nonflash_q8_k{};
@@ -1009,6 +1106,14 @@ inline constexpr request_decode_compute_flash_preselected_argmax_native_quantize
     request_decode_compute_flash_preselected_argmax_native_quantized_kernel{};
 inline constexpr request_decode_compute_flash_preselected_argmax_kernel
     request_decode_compute_flash_preselected_argmax_kernel{};
+inline constexpr request_decode_compute_flash_parallel_preselected_argmax_q8_k
+    request_decode_compute_flash_parallel_preselected_argmax_q8_k{};
+inline constexpr request_decode_compute_flash_parallel_preselected_argmax_native_quantized_q8_k
+    request_decode_compute_flash_parallel_preselected_argmax_native_quantized_q8_k{};
+inline constexpr request_decode_compute_flash_parallel_preselected_argmax_native_quantized_kernel
+    request_decode_compute_flash_parallel_preselected_argmax_native_quantized_kernel{};
+inline constexpr request_decode_compute_flash_parallel_preselected_argmax_kernel
+    request_decode_compute_flash_parallel_preselected_argmax_kernel{};
 inline constexpr request_decode_compute_nonflash_preselected_argmax_q8_k
     request_decode_compute_nonflash_preselected_argmax_q8_k{};
 inline constexpr request_decode_compute_nonflash_preselected_argmax_native_quantized_q8_k
diff --git a/src/emel/text/generator/context.hpp b/src/emel/text/generator/context.hpp
index 0bd4d012..860b6c63 100644
--- a/src/emel/text/generator/context.hpp
+++ b/src/emel/text/generator/context.hpp
@@ -116,7 +116,7 @@ struct context {
   session_limits limits = {};
   session_buffers buffers = {};
   session_state state = {};
-  renderer_session renderer_session = {};
+  emel::text::generator::action::renderer_session renderer_session = {};
 };
 
 }  // namespace emel::text::generator::action
diff --git a/src/emel/text/generator/decode_wavefront/actions.hpp b/src/emel/text/generator/decode_wavefront/actions.hpp
new file mode 100644
index 00000000..b5dd9496
--- /dev/null
+++ b/src/emel/text/generator/decode_wavefront/actions.hpp
@@ -0,0 +1,116 @@
+#pragma once
+
+#include <cstddef>
+
+#include "emel/graph/sm.hpp"
+#include "emel/text/generator/decode_wavefront/context.hpp"
+#include "emel/text/generator/decode_wavefront/events.hpp"
+
+namespace emel::text::generator::decode_wavefront::action {
+
+struct effect_begin_run {
+  void operator()(const event::run & ev, context &) const noexcept {
+    ev.out = {};
+    ev.out.err = emel::error::cast(error::none);
+  }
+};
+
+struct effect_mark_single_lane {
+  void operator()(const event::run & ev, context &) const noexcept {
+    ev.out.grouped = false;
+  }
+};
+
+struct effect_mark_grouped_lanes {
+  void operator()(const event::run & ev, context &) const noexcept {
+    ev.out.grouped = true;
+  }
+};
+
+struct effect_reject_invalid_request {
+  void operator()(const event::run & ev, context &) const noexcept {
+    ev.out.err = emel::error::cast(error::invalid_request);
+  }
+};
+
+struct effect_reject_incompatible_lanes {
+  void operator()(const event::run & ev, context &) const noexcept {
+    ev.out.err = emel::error::cast(error::incompatible_lanes);
+  }
+};
+
+template <size_t lane_index>
+struct effect_dispatch_lane {
+  void operator()(const event::run & ev, context &) const noexcept {
+    auto & lane = ev.lanes[lane_index];
+    const emel::graph::event::compute_reserved reserved_compute{lane.compute};
+    lane.accepted = lane.graph.process_event(reserved_compute);
+    ev.out.dispatched_lanes = static_cast<int32_t>(lane_index + 1u);
+  }
+};
+
+struct effect_dispatch_parallel_lanes {
+  void operator()(const event::run & ev, context & ctx) const noexcept {
+    for (auto & lane : ev.lanes) {
+      lane.accepted = false;
+    }
+
+    lane_scheduler scheduler{*ctx.pool};
+    lane_scheduler::join_group group{};
+    for (auto & lane : ev.lanes) {
+      auto * lane_ptr = &lane;
+      const auto run_lane = [lane_ptr]() noexcept {
+        auto & current_lane = *lane_ptr;
+        const emel::graph::event::compute_reserved reserved_compute{
+            current_lane.compute};
+        current_lane.accepted =
+            current_lane.graph.process_event(reserved_compute);
+      };
+      const bool submitted = scheduler.try_submit(group, run_lane);
+      // Bounded backpressure handling: a rejected submit (queue full, or the
+      // caller is already a pool worker) runs the same lane inline. The
+      // algorithm and output are identical either way; only placement
+      // differs.
+      if (!submitted) {
+        run_lane();
+      }
+    }
+    (void)group.wait();
+    ev.out.dispatched_lanes = static_cast<int32_t>(ev.lanes.size());
+  }
+};
+
+template <size_t lane_index>
+struct effect_mark_lane_rejected {
+  void operator()(const event::run & ev, context &) const noexcept {
+    ev.out.err = emel::error::cast(error::lane_rejected);
+    ev.out.failed_lane = static_cast<int32_t>(lane_index);
+  }
+};
+
+struct effect_commit_done {
+  void operator()(const event::run & ev, context &) const noexcept {
+    ev.out.err = emel::error::cast(error::none);
+    ev.out.failed_lane = event::k_no_failed_lane;
+  }
+};
+
+struct effect_on_unexpected {
+  template <class event_type>
+  void operator()(const event_type & ev, context &) const noexcept {
+    if constexpr (requires { ev.out.err; }) {
+      ev.out.err = emel::error::cast(error::backend);
+    }
+  }
+};
+
+inline constexpr effect_begin_run effect_begin_run{};
+inline constexpr effect_mark_single_lane effect_mark_single_lane{};
+inline constexpr effect_mark_grouped_lanes effect_mark_grouped_lanes{};
+inline constexpr effect_reject_invalid_request effect_reject_invalid_request{};
+inline constexpr effect_reject_incompatible_lanes effect_reject_incompatible_lanes{};
+inline constexpr effect_dispatch_parallel_lanes effect_dispatch_parallel_lanes{};
+inline constexpr effect_commit_done effect_commit_done{};
+inline constexpr effect_on_unexpected effect_on_unexpected{};
+
+}  // namespace emel::text::generator::decode_wavefront::action
diff --git a/src/emel/text/generator/decode_wavefront/context.hpp b/src/emel/text/generator/decode_wavefront/context.hpp
new file mode 100644
index 00000000..8fb03ac1
--- /dev/null
+++ b/src/emel/text/generator/decode_wavefront/context.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "emel/sm.hpp"
+#include "emel/text/generator/decode_wavefront/events.hpp"
+
+namespace emel::text::generator::decode_wavefront::action {
+
+using lane_pool =
+    emel::policy::thread_pool_scheduler<event::k_max_lanes, 16u, 128u>;
+using lane_scheduler = emel::policy::thread_pool_scheduler_ref<lane_pool>;
+
+struct context {
+  lane_pool * pool = nullptr;
+};
+
+}  // namespace emel::text::generator::decode_wavefront::action
diff --git a/src/emel/text/generator/decode_wavefront/errors.hpp b/src/emel/text/generator/decode_wavefront/errors.hpp
new file mode 100644
index 00000000..b90ee02c
--- /dev/null
+++ b/src/emel/text/generator/decode_wavefront/errors.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "emel/error/error.hpp"
+
+namespace emel::text::generator::decode_wavefront {
+
+enum class error : emel::error::type {
+  none = 0u,
+  invalid_request = (1u << 0),
+  incompatible_lanes = (1u << 1),
+  lane_rejected = (1u << 2),
+  backend = (1u << 3),
+};
+
+}  // namespace emel::text::generator::decode_wavefront
diff --git a/src/emel/text/generator/decode_wavefront/events.hpp b/src/emel/text/generator/decode_wavefront/events.hpp
new file mode 100644
index 00000000..6e56d917
--- /dev/null
+++ b/src/emel/text/generator/decode_wavefront/events.hpp
@@ -0,0 +1,77 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <span>
+
+#include "emel/error/error.hpp"
+#include "emel/graph/events.hpp"
+#include "emel/kernel/any.hpp"
+#include "emel/text/generator/decode_wavefront/errors.hpp"
+#include "emel/text/generator/events.hpp"
+
+namespace emel::graph {
+struct sm;
+}
+
+namespace emel::text::generator::decode_wavefront::event {
+
+inline constexpr size_t k_max_lanes = 8u;
+inline constexpr int32_t k_no_failed_lane = -1;
+
+enum class kernel_route : uint8_t {
+  packed_q8_0,
+  q8_k,
+  native_quantized,
+  native_quantized_q8_k_logits,
+  kernel,
+};
+
+enum class output_contract : uint8_t {
+  materialized_logits,
+  preselected_argmax,
+};
+
+struct compatibility_key {
+  const void * model_identity = nullptr;
+  const void * backend_identity = nullptr;
+  emel::kernel::kernel_kind kernel_kind = emel::kernel::kernel_kind::x86_64;
+  emel::text::generator::attention_mode attention =
+      emel::text::generator::attention_mode::nonflash;
+  kernel_route route = kernel_route::kernel;
+  output_contract output = output_contract::materialized_logits;
+  uint32_t dtype_layout_contract = 0u;
+  uint32_t quantized_contract = 0u;
+  int32_t step_size = 1;
+  int32_t token_count = 1;
+};
+
+struct dispatch_summary {
+  emel::error::type err = emel::error::cast(error::none);
+  bool grouped = false;
+  int32_t dispatched_lanes = 0;
+  int32_t failed_lane = k_no_failed_lane;
+};
+
+struct lane {
+  lane(emel::graph::sm & graph_ref,
+       emel::graph::event::compute & compute_ref,
+       const compatibility_key key_ref,
+       bool & accepted_ref) noexcept
+    : graph(graph_ref), compute(compute_ref), key(key_ref), accepted(accepted_ref) {}
+
+  emel::graph::sm & graph;
+  emel::graph::event::compute & compute;
+  compatibility_key key;
+  bool & accepted;
+};
+
+struct run {
+  run(std::span<lane> lanes_ref, dispatch_summary & out_ref) noexcept
+    : lanes(lanes_ref), out(out_ref) {}
+
+  std::span<lane> lanes = {};
+  dispatch_summary & out;
+};
+
+}  // namespace emel::text::generator::decode_wavefront::event
diff --git a/src/emel/text/generator/decode_wavefront/guards.hpp b/src/emel/text/generator/decode_wavefront/guards.hpp
new file mode 100644
index 00000000..05772115
--- /dev/null
+++ b/src/emel/text/generator/decode_wavefront/guards.hpp
@@ -0,0 +1,152 @@
+#pragma once
+
+#include "emel/text/generator/decode_wavefront/context.hpp"
+#include "emel/text/generator/decode_wavefront/events.hpp"
+
+namespace emel::text::generator::decode_wavefront::guard {
+
+namespace detail {
+
+inline bool compatible_key(const event::compatibility_key & lhs,
+                           const event::compatibility_key & rhs) noexcept {
+  return lhs.model_identity == rhs.model_identity &&
+         lhs.backend_identity == rhs.backend_identity &&
+         lhs.kernel_kind == rhs.kernel_kind &&
+         lhs.attention == rhs.attention &&
+         lhs.route == rhs.route &&
+         lhs.output == rhs.output &&
+         lhs.dtype_layout_contract == rhs.dtype_layout_contract &&
+         lhs.quantized_contract == rhs.quantized_contract &&
+         lhs.step_size == rhs.step_size &&
+         lhs.token_count == rhs.token_count;
+}
+
+inline bool all_lanes_compatible(const event::run & ev) noexcept {
+  const size_t lane_count = ev.lanes.size();
+  if (lane_count == 0u || lane_count > event::k_max_lanes) {
+    return false;
+  }
+
+  const auto & first = ev.lanes[0].key;
+  for (size_t lane_index = 1u; lane_index < lane_count; ++lane_index) {
+    if (!compatible_key(first, ev.lanes[lane_index].key)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline bool valid_lane_count(const event::run & ev) noexcept {
+  return ev.lanes.size() > 0u && ev.lanes.size() <= event::k_max_lanes;
+}
+
+// Parallel dispatch requires one graph actor per lane: concurrent
+// process_event on a shared actor would break the RTC single-writer
+// contract. Lane count is bounded by k_max_lanes, so the pairwise scan is
+// statically bounded.
+inline bool all_lane_graphs_distinct(const event::run & ev) noexcept {
+  const size_t lane_count = ev.lanes.size();
+  for (size_t i = 0u; i < lane_count; ++i) {
+    for (size_t j = i + 1u; j < lane_count; ++j) {
+      if (&ev.lanes[i].graph == &ev.lanes[j].graph) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace detail
+
+struct guard_valid_request {
+  bool operator()(const event::run & ev, const action::context &) const noexcept {
+    return detail::valid_lane_count(ev) && detail::all_lanes_compatible(ev);
+  }
+};
+
+struct guard_invalid_request {
+  bool operator()(const event::run & ev, const action::context &) const noexcept {
+    return !detail::valid_lane_count(ev);
+  }
+};
+
+struct guard_single_lane {
+  bool operator()(const event::run & ev, const action::context &) const noexcept {
+    return ev.lanes.size() == 1u;
+  }
+};
+
+struct guard_multi_lane_compatible {
+  bool operator()(const event::run & ev, const action::context &) const noexcept {
+    return ev.lanes.size() > 1u && detail::all_lanes_compatible(ev);
+  }
+};
+
+struct guard_serial_dispatch {
+  bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
+    return ctx.pool == nullptr || ev.lanes.size() == 1u ||
+           !detail::all_lane_graphs_distinct(ev);
+  }
+};
+
+struct guard_parallel_dispatch {
+  bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
+    return ctx.pool != nullptr && ev.lanes.size() > 1u &&
+           detail::all_lane_graphs_distinct(ev);
+  }
+};
+
+struct guard_multi_lane_incompatible {
+  bool operator()(const event::run & ev, const action::context &) const noexcept {
+    return ev.lanes.size() > 1u && !detail::all_lanes_compatible(ev);
+  }
+};
+
+template <size_t lane_index>
+struct guard_lane_rejected {
+  bool operator()(const event::run & ev, const action::context &) const noexcept {
+    return !ev.lanes[lane_index].accepted;
+  }
+};
+
+template <size_t lane_index>
+struct guard_lane_accepted_and_last {
+  bool operator()(const event::run & ev, const action::context &) const noexcept {
+    return ev.lanes[lane_index].accepted && ev.lanes.size() == lane_index + 1u;
+  }
+};
+
+template <size_t lane_index>
+struct guard_lane_accepted_and_more {
+  bool operator()(const event::run & ev, const action::context &) const noexcept {
+    return ev.lanes[lane_index].accepted && ev.lanes.size() > lane_index + 1u;
+  }
+};
+
+template <size_t lane_index>
+struct guard_parallel_lane_rejected {
+  bool operator()(const event::run & ev, const action::context &) const noexcept {
+    if (ev.lanes.size() <= lane_index) {
+      return false;
+    }
+    for (size_t index = 0u; index < lane_index; ++index) {
+      if (!ev.lanes[index].accepted) {
+        return false;
+      }
+    }
+    return !ev.lanes[lane_index].accepted;
+  }
+};
+
+struct guard_parallel_all_lanes_accepted {
+  bool operator()(const event::run & ev, const action::context &) const noexcept {
+    for (const auto & lane : ev.lanes) {
+      if (!lane.accepted) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+}  // namespace emel::text::generator::decode_wavefront::guard
diff --git a/src/emel/text/generator/decode_wavefront/sm.hpp b/src/emel/text/generator/decode_wavefront/sm.hpp
new file mode 100644
index 00000000..67c85f73
--- /dev/null
+++ b/src/emel/text/generator/decode_wavefront/sm.hpp
@@ -0,0 +1,298 @@
+#pragma once
+// benchmark: designed
+
+#include "emel/sm.hpp"
+#include "emel/text/generator/decode_wavefront/actions.hpp"
+#include "emel/text/generator/decode_wavefront/guards.hpp"
+
+namespace emel::text::generator::decode_wavefront {
+
+// Public alias for the lane thread pool the sm constructor requires, so callers
+// (integrators, benchmarks) can name it without reaching into the action
+// namespace.
+using lane_pool = action::lane_pool;
+
+struct state_idle {};
+struct state_validation_decision {};
+struct state_group_ready {};
+struct state_lane0_decision {};
+struct state_lane1_decision {};
+struct state_lane2_decision {};
+struct state_lane3_decision {};
+struct state_lane4_decision {};
+struct state_lane5_decision {};
+struct state_lane6_decision {};
+struct state_lane7_decision {};
+struct state_parallel_decision {};
+
+struct model {
+  auto operator()() const {
+    namespace sml = stateforward::sml;
+
+    // clang-format off
+    return sml::make_transition_table(
+      //------------------------------------------------------------------------------//
+      // Request validation and compatibility grouping.
+        sml::state<state_validation_decision> <= *sml::state<state_idle>
+                 + sml::event<event::run>
+                 / action::effect_begin_run
+
+      , sml::state<state_idle> <= sml::state<state_validation_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_invalid_request{} ]
+                 / action::effect_reject_invalid_request
+
+      , sml::state<state_idle> <= sml::state<state_validation_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_multi_lane_incompatible{} ]
+                 / action::effect_reject_incompatible_lanes
+
+      , sml::state<state_group_ready> <= sml::state<state_validation_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_single_lane{} ]
+                 / action::effect_mark_single_lane
+
+      , sml::state<state_group_ready> <= sml::state<state_validation_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_multi_lane_compatible{} ]
+                 / action::effect_mark_grouped_lanes
+
+      //------------------------------------------------------------------------------//
+      // Bounded lane dispatch. Serial lanes use explicit transition stages;
+      // pool-backed multi-lane groups fork/join once inside the RTC chain.
+      , sml::state<state_parallel_decision> <= sml::state<state_group_ready>
+                 + sml::completion<event::run>
+                 [ guard::guard_parallel_dispatch{} ]
+                 / action::effect_dispatch_parallel_lanes
+
+      , sml::state<state_lane0_decision> <= sml::state<state_group_ready>
+                 + sml::completion<event::run>
+                 [ guard::guard_serial_dispatch{} ]
+                 / action::effect_dispatch_lane<0>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane0_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_rejected<0>{} ]
+                 / action::effect_mark_lane_rejected<0>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane0_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_last<0>{} ]
+                 / action::effect_commit_done
+
+      , sml::state<state_lane1_decision> <= sml::state<state_lane0_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_more<0>{} ]
+                 / action::effect_dispatch_lane<1>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane1_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_rejected<1>{} ]
+                 / action::effect_mark_lane_rejected<1>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane1_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_last<1>{} ]
+                 / action::effect_commit_done
+
+      , sml::state<state_lane2_decision> <= sml::state<state_lane1_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_more<1>{} ]
+                 / action::effect_dispatch_lane<2>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane2_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_rejected<2>{} ]
+                 / action::effect_mark_lane_rejected<2>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane2_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_last<2>{} ]
+                 / action::effect_commit_done
+
+      , sml::state<state_lane3_decision> <= sml::state<state_lane2_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_more<2>{} ]
+                 / action::effect_dispatch_lane<3>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane3_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_rejected<3>{} ]
+                 / action::effect_mark_lane_rejected<3>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane3_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_last<3>{} ]
+                 / action::effect_commit_done
+
+      , sml::state<state_lane4_decision> <= sml::state<state_lane3_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_more<3>{} ]
+                 / action::effect_dispatch_lane<4>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane4_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_rejected<4>{} ]
+                 / action::effect_mark_lane_rejected<4>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane4_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_last<4>{} ]
+                 / action::effect_commit_done
+
+      , sml::state<state_lane5_decision> <= sml::state<state_lane4_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_more<4>{} ]
+                 / action::effect_dispatch_lane<5>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane5_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_rejected<5>{} ]
+                 / action::effect_mark_lane_rejected<5>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane5_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_last<5>{} ]
+                 / action::effect_commit_done
+
+      , sml::state<state_lane6_decision> <= sml::state<state_lane5_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_more<5>{} ]
+                 / action::effect_dispatch_lane<6>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane6_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_rejected<6>{} ]
+                 / action::effect_mark_lane_rejected<6>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane6_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_last<6>{} ]
+                 / action::effect_commit_done
+
+      , sml::state<state_lane7_decision> <= sml::state<state_lane6_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_more<6>{} ]
+                 / action::effect_dispatch_lane<7>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane7_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_rejected<7>{} ]
+                 / action::effect_mark_lane_rejected<7>{}
+
+      , sml::state<state_idle> <= sml::state<state_lane7_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_lane_accepted_and_last<7>{} ]
+                 / action::effect_commit_done
+
+      , sml::state<state_idle> <= sml::state<state_parallel_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_parallel_lane_rejected<0>{} ]
+                 / action::effect_mark_lane_rejected<0>{}
+
+      , sml::state<state_idle> <= sml::state<state_parallel_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_parallel_lane_rejected<1>{} ]
+                 / action::effect_mark_lane_rejected<1>{}
+
+      , sml::state<state_idle> <= sml::state<state_parallel_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_parallel_lane_rejected<2>{} ]
+                 / action::effect_mark_lane_rejected<2>{}
+
+      , sml::state<state_idle> <= sml::state<state_parallel_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_parallel_lane_rejected<3>{} ]
+                 / action::effect_mark_lane_rejected<3>{}
+
+      , sml::state<state_idle> <= sml::state<state_parallel_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_parallel_lane_rejected<4>{} ]
+                 / action::effect_mark_lane_rejected<4>{}
+
+      , sml::state<state_idle> <= sml::state<state_parallel_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_parallel_lane_rejected<5>{} ]
+                 / action::effect_mark_lane_rejected<5>{}
+
+      , sml::state<state_idle> <= sml::state<state_parallel_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_parallel_lane_rejected<6>{} ]
+                 / action::effect_mark_lane_rejected<6>{}
+
+      , sml::state<state_idle> <= sml::state<state_parallel_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_parallel_lane_rejected<7>{} ]
+                 / action::effect_mark_lane_rejected<7>{}
+
+      , sml::state<state_idle> <= sml::state<state_parallel_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_parallel_all_lanes_accepted{} ]
+                 / action::effect_commit_done
+
+      //------------------------------------------------------------------------------//
+      // Unexpected events.
+      , sml::state<state_idle> <= sml::state<state_idle> + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_validation_decision>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_group_ready>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_lane0_decision>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_lane1_decision>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_lane2_decision>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_lane3_decision>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_lane4_decision>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_lane5_decision>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_lane6_decision>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_lane7_decision>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+      , sml::state<state_idle> <= sml::state<state_parallel_decision>
+                 + sml::unexpected_event<sml::_>
+                 / action::effect_on_unexpected
+    );
+    // clang-format on
+  }
+};
+
+using static_co_policy =
+    emel::policy::coroutine_scheduler<emel::policy::fifo_scheduler<16u, 64u>>;
+
+struct sm : public emel::co_sm<model, action::context, static_co_policy> {
+  using base_type = emel::co_sm<model, action::context, static_co_policy>;
+  using base_type::is;
+  using base_type::visit_current_states;
+
+  sm() = default;
+  explicit sm(action::lane_pool & pool) : base_type(action::context{.pool = &pool}) {}
+
+  bool process_event(const event::run & ev) {
+    const bool accepted = process_event_async(ev).result();
+    return accepted && ev.out.err == emel::error::cast(error::none);
+  }
+
+  emel::bool_task process_event_async(const event::run & ev) {
+    const bool accepted = base_type::process_event_async(ev).result();
+    return emel::bool_task::from_value(
+        accepted && ev.out.err == emel::error::cast(error::none));
+  }
+};
+
+}  // namespace emel::text::generator::decode_wavefront
diff --git a/src/emel/text/generator/detail.hpp b/src/emel/text/generator/detail.hpp
index 135ee577..e45c2d28 100644
--- a/src/emel/text/generator/detail.hpp
+++ b/src/emel/text/generator/detail.hpp
@@ -5,8 +5,10 @@
 #include <cmath>
 #include <cstdint>
 #include <cstring>
+#include <functional>
 #include <limits>
 #include <memory>
+#include <optional>
 #include <span>
 #include <vector>
 
@@ -29,6 +31,19 @@ struct tensor_matrix {
   int32_t cols = 0;
 };
 
+struct rope_pairing {
+  int32_t x0_stride = 2;
+  int32_t x1_stride = 2;
+  int32_t x1_offset = 1;
+  int32_t x1_half_rot_offset = 0;
+};
+
+inline constexpr rope_pairing normal_rope_pairing() noexcept { return {}; }
+
+inline constexpr rope_pairing neox_rope_pairing() noexcept {
+  return rope_pairing{1, 1, 0, 1};
+}
+
 struct packed_matrix_binding {
   emel::model::data::tensor_record tensor = {};
   std::vector<uint8_t> storage = {};
@@ -42,6 +57,7 @@ struct block_weights {
   int32_t attention_head_dim_kv = 0;
   int32_t attention_rope_dim = 0;
   float attention_rope_freq_base = 10000.0f;
+  rope_pairing attention_rope_pairing = {};
   std::vector<float> attention_norm = {};
   tensor_matrix attention_q = {};
   packed_matrix_binding attention_q_packed = {};
@@ -67,6 +83,19 @@ struct block_weights {
   packed_matrix_binding feed_forward_up_packed = {};
 };
 
+// View-sliced parallel matmul lanes: one kernel actor per pool worker. A
+// parallel dispatch forks one logical matmul into per-lane row-slice events
+// and joins before the enclosing action returns, so no work escapes the RTC
+// boundary and no lane actor is entered concurrently.
+constexpr size_t k_matmul_lanes = 8;
+using matmul_lane_pool = emel::policy::thread_pool_scheduler<k_matmul_lanes, 16u, 128u>;
+using matmul_lane_scheduler = emel::policy::thread_pool_scheduler_ref<matmul_lane_pool>;
+
+enum class matmul_lane_mode : uint8_t {
+  serial = 0,
+  parallel = 1,
+};
+
 struct native_backend {
   const emel::model::data * model = nullptr;
   emel::model::llama::detail::execution_view execution = {};
@@ -74,6 +103,11 @@ struct native_backend {
   emel::model::llama::detail::step_plan prefill_plan = {};
   emel::model::llama::detail::step_plan decode_plan = {};
   emel::kernel::sm kernel = {};
+  // Parallel matmul lane actors and their pool. Worker threads are one-time
+  // prepare() construction (in-place, no heap); slice dispatch reuses these
+  // actors and allocates nothing.
+  std::array<emel::kernel::sm, k_matmul_lanes> lane_kernels = {};
+  std::optional<matmul_lane_pool> lane_pool = {};
   emel::kernel::kernel_kind kernel_kind = emel::kernel::kernel_kind::x86_64;
   uint64_t kernel_dispatch_calls = 0;
   uint64_t native_q8_0_dispatch_calls = 0;
@@ -228,6 +262,12 @@ constexpr int32_t k_error_ok = 0;
 constexpr int32_t k_error_invalid = 1;
 constexpr int32_t k_prefill_q8_chunk_rows = 4;
 constexpr int32_t k_prefill_q8_chunk8_rows = 8;
+// Minimum prompt size before the parallel matmul lanes are worth the fork
+// overhead; route guards read this so the choice stays in transition rows.
+constexpr int32_t k_parallel_min_prefill_tokens = 8;
+// Decode GEMV lanes only pay off once per-matmul work dwarfs the fork
+// overhead; route guards require this minimum model width for parallel decode.
+constexpr int32_t k_parallel_min_gemv_dim = 1024;
 constexpr emel::kernel::kernel_kind detect_host_kernel_kind() noexcept {
 #if defined(__aarch64__) || defined(_M_ARM64)
   return emel::kernel::kernel_kind::aarch64;
@@ -1322,6 +1362,7 @@ inline bool prepare_packed_q6_matrix_layout(tensor_matrix & matrix,
 inline bool prepare_native_matrix_layout(native_backend & backend,
                                          tensor_matrix & matrix,
                                          packed_matrix_binding & packed) noexcept {
+  (void)packed;
   if (matrix.tensor == nullptr) {
     return false;
   }
@@ -2061,12 +2102,14 @@ inline const emel::graph::processor::event::lifecycle_manifest * decode_lifecycl
 inline bool prepare_packed_q8_0_input(native_backend & backend,
                                       std::span<const float> input) noexcept;
 
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_vector_prepared_packed_q8_0_input(
     native_backend & backend,
     const tensor_matrix & matrix,
     const int32_t input_cols,
     std::span<float> output) noexcept;
 
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_vector_q8_input(
     native_backend & backend,
     const tensor_matrix & matrix,
@@ -2078,6 +2121,7 @@ inline bool prepare_packed_q8_0_chunk4_input(native_backend & backend,
                                              std::span<const float> input,
                                              const int32_t input_cols) noexcept;
 
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_chunk4_prepared_packed_q8_0_input(
     native_backend & backend,
     const tensor_matrix & matrix,
@@ -2088,6 +2132,7 @@ inline bool prepare_q8_chunk4_input(native_backend & backend,
                                     std::span<const float> input,
                                     const int32_t input_cols) noexcept;
 
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_chunk4_q8_input(native_backend & backend,
                                    const tensor_matrix & matrix,
                                    const int32_t input_cols,
@@ -2106,6 +2151,152 @@ inline bool valid_matmul_vector_shape(const tensor_matrix & matrix,
       static_cast<size_t>(matrix.rows) == output.size();
 }
 
+struct matmul_row_slice {
+  int32_t row_begin = 0;
+  int32_t row_count = 0;
+};
+
+// Row-group granularity for view slicing: packed interleaved dtypes store
+// multiple logical rows per storage group, so slice boundaries must land on
+// group multiples; plain row-major dtypes slice on any row boundary.
+inline uint64_t matmul_slice_group_rows(const emel::kernel::event::dtype type) noexcept {
+  const uint8_t code = emel::kernel::detail::dtype_code(type);
+  const uint64_t x8_group =
+      static_cast<uint64_t>(code == emel::kernel::detail::dtype_q4_k_x8_bl4 ||
+                            code == emel::kernel::detail::dtype_q4_k_x8_bl8 ||
+                            code == emel::kernel::detail::dtype_q6_k_x8 ||
+                            code == emel::kernel::detail::dtype_q6_k_x8_q8_prepared ||
+                            code == emel::kernel::detail::dtype_q6_k_x8_q8_argmax_prepared) *
+      emel::kernel::detail::quant::Q4_K_X8_ROWS;
+  const uint64_t x4_group =
+      static_cast<uint64_t>(code == emel::kernel::detail::dtype_q8_0_x4_bl4 ||
+                            code == emel::kernel::detail::dtype_q8_0_x4_bl8) *
+      emel::kernel::detail::quant::Q8_0_X4_ROWS;
+  return std::max<uint64_t>(x8_group + x4_group, 1u);
+}
+
+// Partition weight rows into at most k_matmul_lanes contiguous group-aligned
+// slices. Pure bounded partition arithmetic; a ragged tail (rows not a group
+// multiple) lands in the final slice, matching the padded storage group the
+// packed formats already carry.
+inline size_t compute_matmul_row_slices(
+    const uint64_t rows,
+    const uint64_t group_rows,
+    std::array<matmul_row_slice, k_matmul_lanes> & slices) noexcept {
+  const uint64_t groups = (rows + group_rows - 1u) / group_rows;
+  const uint64_t lane_count = std::min<uint64_t>(k_matmul_lanes, std::max<uint64_t>(groups, 1u));
+  const uint64_t groups_per_lane = groups / lane_count;
+  const uint64_t extra_groups = groups % lane_count;
+  uint64_t begin_group = 0u;
+  for (uint64_t lane = 0u; lane < lane_count; ++lane) {
+    const uint64_t lane_groups = groups_per_lane + static_cast<uint64_t>(lane < extra_groups);
+    const uint64_t begin_row = begin_group * group_rows;
+    const uint64_t end_row = std::min(rows, (begin_group + lane_groups) * group_rows);
+    slices[lane].row_begin = static_cast<int32_t>(begin_row);
+    slices[lane].row_count = static_cast<int32_t>(end_row - begin_row);
+    begin_group += lane_groups;
+  }
+  return static_cast<size_t>(lane_count);
+}
+
+// Derive a slice event from a full mul_mat event: sliced src0/dst views over
+// a contiguous group-aligned row range, shared src1 input. The event remains
+// a complete work description; lane kernels never learn about slicing.
+inline emel::kernel::event::op_mul_mat compute_sliced_mul_mat_event(
+    const emel::kernel::event::op_mul_mat & ev,
+    const uint64_t group_rows,
+    const matmul_row_slice slice) noexcept {
+  emel::kernel::event::op_mul_mat sliced = ev;
+  const uint64_t begin = static_cast<uint64_t>(slice.row_begin);
+  const uint64_t count = static_cast<uint64_t>(slice.row_count);
+  const uint64_t slice_groups = (count + group_rows - 1u) / group_rows;
+  sliced.src0.data =
+      static_cast<const uint8_t *>(ev.src0.data) + (begin / group_rows) * ev.src0.nb[1];
+  sliced.src0.ne[1] = count;
+  sliced.src0.nb[2] = ev.src0.nb[1] * slice_groups;
+  sliced.src0.nb[3] = sliced.src0.nb[2];
+  sliced.dst.data = static_cast<uint8_t *>(ev.dst.data) + begin * ev.dst.nb[1];
+  sliced.dst.ne[1] = count;
+  sliced.dst.nb[2] = ev.dst.nb[1] * count;
+  sliced.dst.nb[3] = sliced.dst.nb[2];
+  return sliced;
+}
+
+// Fork/join slice dispatch across the lane kernel actors. The caller computes
+// the first slice while pool workers compute the rest; every slice joins
+// before this helper returns, so no work escapes the RTC boundary. Slices
+// write disjoint dst rows and reorder no reductions, so the output is
+// bit-identical to the serial dispatch.
+inline bool compute_mul_mat_sliced_parallel(
+    native_backend & backend,
+    const emel::kernel::event::op_mul_mat & ev) noexcept {
+  if (!backend.lane_pool.has_value() || ev.src0.ne[1] == 0u) {
+    return false;
+  }
+
+  std::array<matmul_row_slice, k_matmul_lanes> slices = {};
+  std::array<emel::kernel::event::op_mul_mat, k_matmul_lanes> lane_events = {};
+  std::array<bool, k_matmul_lanes> lane_ok = {};
+  const uint64_t group_rows = matmul_slice_group_rows(ev.src0.type);
+  const size_t lane_count = compute_matmul_row_slices(ev.src0.ne[1], group_rows, slices);
+
+  matmul_lane_scheduler scheduler{*backend.lane_pool};
+  matmul_lane_scheduler::join_group group{};
+  for (size_t lane = 1u; lane < lane_count; ++lane) {
+    lane_events[lane] = compute_sliced_mul_mat_event(ev, group_rows, slices[lane]);
+    auto & lane_kernel = backend.lane_kernels[lane];
+    lane_kernel.set_kind(backend.kernel_kind);
+    const auto & lane_ev = lane_events[lane];
+    auto & ok_flag = lane_ok[lane];
+    const bool submitted =
+        scheduler.try_submit(group, [&lane_kernel, &lane_ev, &ok_flag]() noexcept {
+          ok_flag = lane_kernel.process_event(lane_ev);
+        });
+    // Bounded backpressure handling: a rejected submit (queue full, or the
+    // caller is already a pool worker) runs the same slice inline. The
+    // algorithm and output are identical either way; only placement differs.
+    if (!submitted) {
+      ok_flag = lane_kernel.process_event(lane_ev);
+    }
+  }
+  lane_events[0] = compute_sliced_mul_mat_event(ev, group_rows, slices[0]);
+  backend.lane_kernels[0].set_kind(backend.kernel_kind);
+  lane_ok[0] = backend.lane_kernels[0].process_event(lane_events[0]);
+  (void)group.wait();
+
+  bool all_ok = true;
+  for (size_t lane = 0u; lane < lane_count; ++lane) {
+    all_ok = all_ok && lane_ok[lane];
+  }
+  return all_ok;
+}
+
+// Compile-time lane-mode seam for mul_mat dispatch. Route guards choose the
+// lane mode; this helper only executes the already-chosen mode.
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
+inline bool compute_mul_mat(native_backend & backend,
+                            const emel::kernel::event::op_mul_mat & ev) noexcept {
+  if constexpr (lanes == matmul_lane_mode::parallel) {
+    return compute_mul_mat_sliced_parallel(backend, ev);
+  } else {
+    backend.kernel.set_kind(backend.kernel_kind);
+    return backend.kernel.process_event(ev);
+  }
+}
+
+// Evidence counters live per kernel actor; parallel slices accrue on lane
+// actors, so audit reads must sum the primary kernel and every lane.
+template <class counter_fn>
+inline uint64_t compute_kernel_counter_total(const native_backend & backend,
+                                             counter_fn && counter) noexcept {
+  uint64_t total = std::invoke(counter, backend.kernel);
+  for (const auto & lane_kernel : backend.lane_kernels) {
+    total += std::invoke(counter, lane_kernel);
+  }
+  return total;
+}
+
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_vector_packed_q8_0(native_backend & backend,
                                       const tensor_matrix & matrix,
                                       std::span<const float> input,
@@ -2115,9 +2306,10 @@ inline bool matmul_vector_packed_q8_0(native_backend & backend,
   }
 
   return prepare_packed_q8_0_input(backend, input) &&
-      matmul_vector_prepared_packed_q8_0_input(backend, matrix, matrix.cols, output);
+      matmul_vector_prepared_packed_q8_0_input<lanes>(backend, matrix, matrix.cols, output);
 }
 
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_vector_q8_k(native_backend & backend,
                                const tensor_matrix & matrix,
                                std::span<const float> input,
@@ -2134,21 +2326,24 @@ inline bool matmul_vector_q8_k(native_backend & backend,
   auto q8_input = std::span<emel::kernel::detail::quant::block_q8_k>(
       backend.q8_input_storage.data(), block_count);
   return quantize_vector_q8_k(input, q8_input) &&
-      matmul_vector_q8_input(backend, matrix, q8_input, matrix.cols, output);
+      matmul_vector_q8_input<lanes>(backend, matrix, q8_input, matrix.cols, output);
 }
 
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_vector(native_backend & backend,
                           const tensor_matrix & matrix,
                           std::span<const float> input,
                           std::span<float> output) noexcept;
 
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_vector_native_quantized(native_backend & backend,
                                            const tensor_matrix & matrix,
                                            std::span<const float> input,
                                            std::span<float> output) noexcept {
-  return matmul_vector(backend, matrix, input, output);
+  return matmul_vector<lanes>(backend, matrix, input, output);
 }
 
+template <matmul_lane_mode lanes>
 inline bool matmul_vector(native_backend & backend,
                           const tensor_matrix & matrix,
                           std::span<const float> input,
@@ -2169,10 +2364,8 @@ inline bool matmul_vector(native_backend & backend,
           static_cast<uint64_t>(input.size())),
       .dst = make_dst_view(
           output.data(), static_cast<uint64_t>(1u), static_cast<uint64_t>(output.size())),
-      .nth = 1,
   };
-  backend.kernel.set_kind(backend.kernel_kind);
-  const bool ok = backend.kernel.process_event(ev);
+  const bool ok = compute_mul_mat<lanes>(backend, ev);
   backend.kernel_dispatch_calls += 1;
   backend.native_q8_0_dispatch_calls += static_cast<uint64_t>(
       matrix.tensor != nullptr &&
@@ -2180,20 +2373,20 @@ inline bool matmul_vector(native_backend & backend,
   return ok;
 }
 
-template <scalar_matmul_route route>
+template <scalar_matmul_route route, matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_vector_routed(native_backend & backend,
                                  const tensor_matrix & matrix,
                                  std::span<const float> input,
                                  std::span<float> output) noexcept {
   if constexpr (route == scalar_matmul_route::packed_q8_0) {
-    return matmul_vector_packed_q8_0(backend, matrix, input, output);
+    return matmul_vector_packed_q8_0<lanes>(backend, matrix, input, output);
   } else if constexpr (route == scalar_matmul_route::q8_k) {
-    return matmul_vector_q8_k(backend, matrix, input, output);
+    return matmul_vector_q8_k<lanes>(backend, matrix, input, output);
   } else if constexpr (route == scalar_matmul_route::native_quantized ||
                        route == scalar_matmul_route::native_quantized_q8_k_logits) {
-    return matmul_vector_native_quantized(backend, matrix, input, output);
+    return matmul_vector_native_quantized<lanes>(backend, matrix, input, output);
   } else {
-    return matmul_vector(backend, matrix, input, output);
+    return matmul_vector<lanes>(backend, matrix, input, output);
   }
 }
 
@@ -2216,7 +2409,6 @@ inline bool matmul_vector_argmax(native_backend & backend,
           static_cast<uint64_t>(1u),
           static_cast<uint64_t>(input.size())),
       .dst = make_dst_view(&selected_score, static_cast<uint64_t>(1u), static_cast<uint64_t>(1u)),
-      .nth = 1,
       .index_out = &selected_index,
   };
   backend.kernel.set_kind(backend.kernel_kind);
@@ -2260,6 +2452,7 @@ inline bool quantize_vector_q8_0(
   return true;
 }
 
+template <matmul_lane_mode lanes>
 inline bool matmul_vector_q8_input(
     native_backend & backend,
     const tensor_matrix & matrix,
@@ -2281,14 +2474,13 @@ inline bool matmul_vector_q8_input(
       .src1 = make_q8_k_vector_view(input.data(), static_cast<uint64_t>(input_cols)),
       .dst = make_dst_view(
           output.data(), static_cast<uint64_t>(1u), static_cast<uint64_t>(output.size())),
-      .nth = 1,
   };
-  backend.kernel.set_kind(backend.kernel_kind);
-  const bool ok = backend.kernel.process_event(ev);
+  const bool ok = compute_mul_mat<lanes>(backend, ev);
   backend.kernel_dispatch_calls += 1;
   return ok;
 }
 
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_vector_q8_0_input(
     native_backend & backend,
     const tensor_matrix & matrix,
@@ -2310,10 +2502,8 @@ inline bool matmul_vector_q8_0_input(
       .src1 = make_q8_0_vector_view(input.data(), static_cast<uint64_t>(input_cols)),
       .dst = make_dst_view(
           output.data(), static_cast<uint64_t>(1u), static_cast<uint64_t>(output.size())),
-      .nth = 1,
   };
-  backend.kernel.set_kind(backend.kernel_kind);
-  const bool ok = backend.kernel.process_event(ev);
+  const bool ok = compute_mul_mat<lanes>(backend, ev);
   backend.kernel_dispatch_calls += 1;
   backend.packed_q8_0_dispatch_calls += static_cast<uint64_t>(
       matrix.tensor != nullptr &&
@@ -2451,6 +2641,7 @@ inline bool prepare_q8_chunk8_input(native_backend & backend,
   return true;
 }
 
+template <matmul_lane_mode lanes>
 inline bool matmul_vector_prepared_packed_q8_0_input(
     native_backend & backend,
     const tensor_matrix & matrix,
@@ -2468,7 +2659,7 @@ inline bool matmul_vector_prepared_packed_q8_0_input(
     return false;
   }
 
-  return matmul_vector_q8_0_input(
+  return matmul_vector_q8_0_input<lanes>(
       backend,
       matrix,
       std::span<const emel::kernel::detail::quant::block_q8_0>(
@@ -2477,6 +2668,7 @@ inline bool matmul_vector_prepared_packed_q8_0_input(
       output);
 }
 
+template <matmul_lane_mode lanes>
 inline bool matmul_chunk4_prepared_packed_q8_0_input(
     native_backend & backend,
     const tensor_matrix & matrix,
@@ -2500,15 +2692,14 @@ inline bool matmul_chunk4_prepared_packed_q8_0_input(
           output.data(),
           static_cast<uint64_t>(k_prefill_q8_chunk_rows),
           static_cast<uint64_t>(matrix.rows)),
-      .nth = 1,
   };
-  backend.kernel.set_kind(backend.kernel_kind);
-  const bool ok = backend.kernel.process_event(ev);
+  const bool ok = compute_mul_mat<lanes>(backend, ev);
   backend.kernel_dispatch_calls += 1;
   backend.packed_q8_0_dispatch_calls += static_cast<uint64_t>(ok);
   return ok;
 }
 
+template <matmul_lane_mode lanes>
 inline bool matmul_chunk4_q8_input(native_backend & backend,
                                    const tensor_matrix & matrix,
                                    const int32_t input_cols,
@@ -2537,14 +2728,13 @@ inline bool matmul_chunk4_q8_input(native_backend & backend,
           output.data(),
           static_cast<uint64_t>(k_prefill_q8_chunk_rows),
           static_cast<uint64_t>(matrix.rows)),
-      .nth = 1,
   };
-  backend.kernel.set_kind(backend.kernel_kind);
-  const bool ok = backend.kernel.process_event(ev);
+  const bool ok = compute_mul_mat<lanes>(backend, ev);
   backend.kernel_dispatch_calls += 1;
   return ok;
 }
 
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_chunk8_q8_input(native_backend & backend,
                                    const tensor_matrix & matrix,
                                    const int32_t input_cols,
@@ -2573,10 +2763,8 @@ inline bool matmul_chunk8_q8_input(native_backend & backend,
           output.data(),
           static_cast<uint64_t>(k_prefill_q8_chunk8_rows),
           static_cast<uint64_t>(matrix.rows)),
-      .nth = 1,
   };
-  backend.kernel.set_kind(backend.kernel_kind);
-  const bool ok = backend.kernel.process_event(ev);
+  const bool ok = compute_mul_mat<lanes>(backend, ev);
   backend.kernel_dispatch_calls += 1;
   return ok;
 }
@@ -2592,15 +2780,15 @@ inline bool prepare_chunk4_rhs(native_backend & backend,
   }
 }
 
-template <chunk4_rhs_route route>
+template <chunk4_rhs_route route, matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool matmul_chunk4_prepared(native_backend & backend,
                                    const tensor_matrix & matrix,
                                    const int32_t input_cols,
                                    std::span<float> output) noexcept {
   if constexpr (route == chunk4_rhs_route::packed_q8_0) {
-    return matmul_chunk4_prepared_packed_q8_0_input(backend, matrix, input_cols, output);
+    return matmul_chunk4_prepared_packed_q8_0_input<lanes>(backend, matrix, input_cols, output);
   } else {
-    return matmul_chunk4_q8_input(backend, matrix, input_cols, output);
+    return matmul_chunk4_q8_input<lanes>(backend, matrix, input_cols, output);
   }
 }
 
@@ -2652,7 +2840,6 @@ inline bool matmul_vector_q8_input_argmax(
       .src0 = make_src_view(matrix),
       .src1 = make_q8_k_vector_view(input.data(), static_cast<uint64_t>(input_cols)),
       .dst = make_dst_view(&selected_score, static_cast<uint64_t>(1u), static_cast<uint64_t>(1u)),
-      .nth = 1,
       .index_out = &selected_index,
   };
   backend.kernel.set_kind(backend.kernel_kind);
@@ -2838,34 +3025,71 @@ inline void round_q_for_nonflash(std::span<const float> q_source,
   }
 }
 
-inline void apply_rope(std::span<float> vector,
-                       const int32_t head_count,
-                       const int32_t head_dim,
-                       const int32_t n_rot,
-                       const int32_t position,
-                       const float rope_freq_base) noexcept {
+inline void apply_rope_pairing(std::span<float> vector,
+                               const rope_pairing pairing,
+                               const int32_t head_count,
+                               const int32_t head_dim,
+                               const int32_t n_rot,
+                               const int32_t position,
+                               const float rope_freq_base) noexcept {
   const int32_t rot_dim = std::min(n_rot, head_dim);
   if (head_count <= 0 || head_dim <= 1 || rot_dim <= 1) {
     return;
   }
 
   const float theta_scale = ::powf(rope_freq_base, -2.0f / static_cast<float>(rot_dim));
+  const int32_t pair_count = rot_dim / 2;
+  const int32_t x1_base =
+      pairing.x1_offset + pairing.x1_half_rot_offset * pair_count;
   for (int32_t head = 0; head < head_count; ++head) {
     float * head_ptr =
         vector.data() + (static_cast<size_t>(head) * static_cast<size_t>(head_dim));
     float theta = static_cast<float>(position);
-    for (int32_t dim = 0; dim + 1 < rot_dim; dim += 2) {
+    for (int32_t pair = 0; pair < pair_count; ++pair) {
       const float cos_theta = ::cosf(theta);
       const float sin_theta = ::sinf(theta);
-      const float x0 = head_ptr[dim];
-      const float x1 = head_ptr[dim + 1];
-      head_ptr[dim] = x0 * cos_theta - x1 * sin_theta;
-      head_ptr[dim + 1] = x0 * sin_theta + x1 * cos_theta;
+      const int32_t dim0 = pair * pairing.x0_stride;
+      const int32_t dim1 = pair * pairing.x1_stride + x1_base;
+      const float x0 = head_ptr[dim0];
+      const float x1 = head_ptr[dim1];
+      head_ptr[dim0] = x0 * cos_theta - x1 * sin_theta;
+      head_ptr[dim1] = x0 * sin_theta + x1 * cos_theta;
       theta *= theta_scale;
     }
   }
 }
 
+inline void apply_rope(std::span<float> vector,
+                       const int32_t head_count,
+                       const int32_t head_dim,
+                       const int32_t n_rot,
+                       const int32_t position,
+                       const float rope_freq_base) noexcept {
+  apply_rope_pairing(vector,
+                     normal_rope_pairing(),
+                     head_count,
+                     head_dim,
+                     n_rot,
+                     position,
+                     rope_freq_base);
+}
+
+inline void apply_attention_rope(std::span<float> vector,
+                                 const block_weights & block,
+                                 const int32_t head_count,
+                                 const int32_t head_dim,
+                                 const int32_t n_rot,
+                                 const int32_t position,
+                                 const float rope_freq_base) noexcept {
+  apply_rope_pairing(vector,
+                     block.attention_rope_pairing,
+                     head_count,
+                     head_dim,
+                     n_rot,
+                     position,
+                     rope_freq_base);
+}
+
 #if defined(__ARM_NEON) && defined(__aarch64__)
 inline float32x4_t silu4_neon(float32x4_t x) noexcept {
   const float32x4_t one = vdupq_n_f32(1.0f);
@@ -3151,7 +3375,6 @@ inline emel::kernel::event::op_flash_attn_ext make_flash_attn_request(
       sizeof(uint16_t) * static_cast<uint64_t>(backend.n_ctx) * kv_head_dim);
   request.dst = make_dst_view_3d(
       attn_ctx.data(), head_dim, 1u, head_count);
-  request.nth = 1;
   std::memcpy(request.op_params.data(), &scale, sizeof(scale));
   std::memcpy(request.op_params.data() + sizeof(scale),
               &masked_total_tokens,
@@ -3246,7 +3469,7 @@ inline bool run_attention(native_backend & backend,
           backend.q.data(), static_cast<size_t>(effective_attention_q_dim(backend, block))));
 }
 
-template <scalar_matmul_route route>
+template <scalar_matmul_route route, matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_shortconv_block(native_backend & backend,
                                 const block_weights & block,
                                 const int32_t layer_index) noexcept {
@@ -3269,7 +3492,7 @@ inline bool run_shortconv_block(native_backend & backend,
     return false;
   }
 
-  if (!matmul_vector_routed<route>(
+  if (!matmul_vector_routed<route, lanes>(
           backend, block.shortconv_in_proj, backend.norm, backend.shortconv_bcx)) {
     return false;
   }
@@ -3316,7 +3539,7 @@ inline bool run_shortconv_block(native_backend & backend,
       backend.shortconv_bx.data(),
       static_cast<size_t>(backend.n_embd) * sizeof(float));
 
-  if (!matmul_vector_routed<route>(
+  if (!matmul_vector_routed<route, lanes>(
           backend, block.shortconv_out_proj, backend.shortconv_conv_out, backend.projected)) {
     return false;
   }
@@ -3419,7 +3642,9 @@ inline bool run_shortconv_block_chunk4(native_backend & backend,
   return true;
 }
 
-template <emel::text::generator::attention_mode mode, scalar_matmul_route route>
+template <emel::text::generator::attention_mode mode,
+          scalar_matmul_route route,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_layer(native_backend & backend,
                       const int32_t layer_index,
                       const int32_t position) noexcept {
@@ -3438,11 +3663,11 @@ inline bool run_layer(native_backend & backend,
         std::span<const float>(backend.attn_ctx.data(), static_cast<size_t>(q_dim));
     if constexpr (route == scalar_matmul_route::packed_q8_0) {
       if (!prepare_packed_q8_0_input(backend, backend.norm) ||
-          !matmul_vector_prepared_packed_q8_0_input(
+          !matmul_vector_prepared_packed_q8_0_input<lanes>(
               backend, block.attention_q, block.attention_q.cols, q) ||
-          !matmul_vector_prepared_packed_q8_0_input(
+          !matmul_vector_prepared_packed_q8_0_input<lanes>(
               backend, block.attention_k, block.attention_k.cols, k) ||
-          !matmul_vector_prepared_packed_q8_0_input(
+          !matmul_vector_prepared_packed_q8_0_input<lanes>(
               backend, block.attention_v, block.attention_v.cols, v)) {
         return false;
       }
@@ -3455,25 +3680,25 @@ inline bool run_layer(native_backend & backend,
       auto q8_input = std::span<emel::kernel::detail::quant::block_q8_k>(
           backend.q8_input_storage.data(), block_count);
       if (!quantize_vector_q8_k(backend.norm, q8_input) ||
-          !matmul_vector_q8_input(
+          !matmul_vector_q8_input<lanes>(
               backend, block.attention_q, q8_input, block.attention_q.cols, q) ||
-          !matmul_vector_q8_input(
+          !matmul_vector_q8_input<lanes>(
               backend, block.attention_k, q8_input, block.attention_k.cols, k) ||
-          !matmul_vector_q8_input(
+          !matmul_vector_q8_input<lanes>(
               backend, block.attention_v, q8_input, block.attention_v.cols, v)) {
         return false;
       }
     } else if constexpr (route == scalar_matmul_route::native_quantized ||
                          route == scalar_matmul_route::native_quantized_q8_k_logits) {
-      if (!matmul_vector_native_quantized(backend, block.attention_q, backend.norm, q) ||
-          !matmul_vector_native_quantized(backend, block.attention_k, backend.norm, k) ||
-          !matmul_vector_native_quantized(backend, block.attention_v, backend.norm, v)) {
+      if (!matmul_vector_native_quantized<lanes>(backend, block.attention_q, backend.norm, q) ||
+          !matmul_vector_native_quantized<lanes>(backend, block.attention_k, backend.norm, k) ||
+          !matmul_vector_native_quantized<lanes>(backend, block.attention_v, backend.norm, v)) {
         return false;
       }
     } else {
-      if (!matmul_vector(backend, block.attention_q, backend.norm, q) ||
-          !matmul_vector(backend, block.attention_k, backend.norm, k) ||
-          !matmul_vector(backend, block.attention_v, backend.norm, v)) {
+      if (!matmul_vector<lanes>(backend, block.attention_q, backend.norm, q) ||
+          !matmul_vector<lanes>(backend, block.attention_k, backend.norm, k) ||
+          !matmul_vector<lanes>(backend, block.attention_v, backend.norm, v)) {
         return false;
       }
     }
@@ -3487,18 +3712,20 @@ inline bool run_layer(native_backend & backend,
       return false;
     }
 
-    apply_rope(q,
-               backend.n_head,
-               effective_attention_head_dim(backend, block),
-               effective_attention_rope_dim(backend, block),
-               position,
-               effective_attention_rope_freq_base(backend, block));
-    apply_rope(k,
-               backend.n_head_kv,
-               effective_attention_head_dim_kv(backend, block),
-               effective_attention_rope_dim(backend, block),
-               position,
-               effective_attention_rope_freq_base(backend, block));
+    apply_attention_rope(q,
+                         block,
+                         backend.n_head,
+                         effective_attention_head_dim(backend, block),
+                         effective_attention_rope_dim(backend, block),
+                         position,
+                         effective_attention_rope_freq_base(backend, block));
+    apply_attention_rope(k,
+                         block,
+                         backend.n_head_kv,
+                         effective_attention_head_dim_kv(backend, block),
+                         effective_attention_rope_dim(backend, block),
+                         position,
+                         effective_attention_rope_freq_base(backend, block));
     if (!store_attention_kv_cache(
             backend,
             block,
@@ -3507,7 +3734,7 @@ inline bool run_layer(native_backend & backend,
             k,
             v) ||
         !run_attention<mode>(backend, block, layer_index, position) ||
-        !matmul_vector_routed<route>(
+        !matmul_vector_routed<route, lanes>(
             backend, block.attention_output, attn_ctx, backend.projected)) {
       return false;
     }
@@ -3515,7 +3742,7 @@ inline bool run_layer(native_backend & backend,
     for (int32_t idx = 0; idx < backend.n_embd; ++idx) {
       backend.hidden[static_cast<size_t>(idx)] += backend.projected[static_cast<size_t>(idx)];
     }
-  } else if (!run_shortconv_block<route>(backend, block, layer_index)) {
+  } else if (!run_shortconv_block<route, lanes>(backend, block, layer_index)) {
     return false;
   }
 
@@ -3529,9 +3756,9 @@ inline bool run_layer(native_backend & backend,
   auto ffn_hidden = std::span<float>(backend.ffn_hidden.data(), static_cast<size_t>(ffn_dim));
   if constexpr (route == scalar_matmul_route::packed_q8_0) {
     if (!prepare_packed_q8_0_input(backend, backend.norm) ||
-        !matmul_vector_prepared_packed_q8_0_input(
+        !matmul_vector_prepared_packed_q8_0_input<lanes>(
             backend, block.feed_forward_gate, block.feed_forward_gate.cols, gate) ||
-        !matmul_vector_prepared_packed_q8_0_input(
+        !matmul_vector_prepared_packed_q8_0_input<lanes>(
             backend, block.feed_forward_up, block.feed_forward_up.cols, up)) {
       return false;
     }
@@ -3544,13 +3771,13 @@ inline bool run_layer(native_backend & backend,
     auto q8_input = std::span<emel::kernel::detail::quant::block_q8_k>(
         backend.q8_input_storage.data(), block_count);
     if (!quantize_vector_q8_k(backend.norm, q8_input) ||
-        !matmul_vector_q8_input(
+        !matmul_vector_q8_input<lanes>(
             backend,
             block.feed_forward_gate,
             q8_input,
             block.feed_forward_gate.cols,
             gate) ||
-        !matmul_vector_q8_input(
+        !matmul_vector_q8_input<lanes>(
             backend,
             block.feed_forward_up,
             q8_input,
@@ -3560,13 +3787,13 @@ inline bool run_layer(native_backend & backend,
     }
   } else if constexpr (route == scalar_matmul_route::native_quantized ||
                        route == scalar_matmul_route::native_quantized_q8_k_logits) {
-    if (!matmul_vector_native_quantized(backend, block.feed_forward_gate, backend.norm, gate) ||
-        !matmul_vector_native_quantized(backend, block.feed_forward_up, backend.norm, up)) {
+    if (!matmul_vector_native_quantized<lanes>(backend, block.feed_forward_gate, backend.norm, gate) ||
+        !matmul_vector_native_quantized<lanes>(backend, block.feed_forward_up, backend.norm, up)) {
       return false;
     }
   } else {
-    if (!matmul_vector(backend, block.feed_forward_gate, backend.norm, gate) ||
-        !matmul_vector(backend, block.feed_forward_up, backend.norm, up)) {
+    if (!matmul_vector<lanes>(backend, block.feed_forward_gate, backend.norm, gate) ||
+        !matmul_vector<lanes>(backend, block.feed_forward_up, backend.norm, up)) {
       return false;
     }
   }
@@ -3575,7 +3802,7 @@ inline bool run_layer(native_backend & backend,
     ffn_hidden[idx] = silu(gate[idx]) * up[idx];
   }
 
-  if (!matmul_vector_routed<route>(
+  if (!matmul_vector_routed<route, lanes>(
           backend, block.feed_forward_down, ffn_hidden, backend.projected)) {
     return false;
   }
@@ -3601,7 +3828,7 @@ inline bool run_layer_nonflash(native_backend & backend,
       backend, layer_index, position);
 }
 
-template <scalar_matmul_route route>
+template <scalar_matmul_route route, matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool compute_logits(native_backend & backend) noexcept {
   if (!rms_norm(backend.hidden, backend.output_norm, backend.rms_epsilon, backend.norm)) {
     return false;
@@ -3609,7 +3836,7 @@ inline bool compute_logits(native_backend & backend) noexcept {
 
   if constexpr (route == scalar_matmul_route::packed_q8_0) {
     return prepare_packed_q8_0_input(backend, backend.norm) &&
-        matmul_vector_prepared_packed_q8_0_input(
+        matmul_vector_prepared_packed_q8_0_input<lanes>(
                backend, backend.output, backend.n_embd, backend.bound_logits);
   } else if constexpr (route == scalar_matmul_route::q8_k) {
     const size_t block_count =
@@ -3617,14 +3844,14 @@ inline bool compute_logits(native_backend & backend) noexcept {
     auto q8_input = std::span<emel::kernel::detail::quant::block_q8_k>(
         backend.q8_input_storage.data(), block_count);
     return quantize_vector_q8_k(backend.norm, q8_input) &&
-        matmul_vector_q8_input(
+        matmul_vector_q8_input<lanes>(
                backend,
                backend.output,
                q8_input,
                backend.n_embd,
                backend.bound_logits);
   } else if constexpr (route == scalar_matmul_route::native_quantized) {
-    return matmul_vector_native_quantized(
+    return matmul_vector_native_quantized<lanes>(
         backend, backend.output, backend.norm, backend.bound_logits);
   } else if constexpr (route == scalar_matmul_route::native_quantized_q8_k_logits) {
     const size_t block_count =
@@ -3632,14 +3859,14 @@ inline bool compute_logits(native_backend & backend) noexcept {
     auto q8_input = std::span<emel::kernel::detail::quant::block_q8_k>(
         backend.q8_input_storage.data(), block_count);
     return quantize_vector_q8_k(backend.norm, q8_input) &&
-        matmul_vector_q8_input(
+        matmul_vector_q8_input<lanes>(
                backend,
                backend.output,
                q8_input,
                backend.n_embd,
                backend.bound_logits);
   } else {
-    return matmul_vector(backend, backend.output, backend.norm, backend.bound_logits);
+    return matmul_vector<lanes>(backend, backend.output, backend.norm, backend.bound_logits);
   }
 }
 
@@ -3701,7 +3928,7 @@ inline bool run_prefill_scalar_tokens(native_backend & backend,
   return true;
 }
 
-template <chunk4_rhs_route route>
+template <chunk4_rhs_route route, matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_shortconv_block_chunk4(native_backend & backend,
                                        const block_weights & block,
                                        const int32_t layer_index) noexcept {
@@ -3727,7 +3954,7 @@ inline bool run_shortconv_block_chunk4(native_backend & backend,
   }
 
   if (!prepare_chunk4_rhs<route>(backend, backend.norm_chunk4, backend.n_embd) ||
-      !matmul_chunk4_prepared<route>(
+      !matmul_chunk4_prepared<route, lanes>(
           backend, block.shortconv_in_proj, backend.n_embd, backend.shortconv_bcx_chunk4)) {
     return false;
   }
@@ -3778,7 +4005,7 @@ inline bool run_shortconv_block_chunk4(native_backend & backend,
   }
 
   if (!prepare_chunk4_rhs<route>(backend, backend.shortconv_conv_out_chunk4, backend.n_embd) ||
-      !matmul_chunk4_prepared<route>(
+      !matmul_chunk4_prepared<route, lanes>(
           backend, block.shortconv_out_proj, backend.n_embd, backend.projected_chunk4) ||
       !add_chunk4_rows_in_place(backend.hidden_chunk4, backend.projected_chunk4, backend.n_embd)) {
     return false;
@@ -3787,6 +4014,7 @@ inline bool run_shortconv_block_chunk4(native_backend & backend,
   return true;
 }
 
+template <matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_shortconv_block_chunk8_q8_k(native_backend & backend,
                                             const block_weights & block,
                                             const int32_t layer_index) noexcept {
@@ -3812,7 +4040,7 @@ inline bool run_shortconv_block_chunk8_q8_k(native_backend & backend,
   }
 
   if (!prepare_q8_chunk8_input(backend, backend.norm_chunk8, backend.n_embd) ||
-      !matmul_chunk8_q8_input(
+      !matmul_chunk8_q8_input<lanes>(
           backend, block.shortconv_in_proj, backend.n_embd, backend.shortconv_bcx_chunk8)) {
     return false;
   }
@@ -3863,7 +4091,7 @@ inline bool run_shortconv_block_chunk8_q8_k(native_backend & backend,
   }
 
   if (!prepare_q8_chunk8_input(backend, backend.shortconv_conv_out_chunk8, backend.n_embd) ||
-      !matmul_chunk8_q8_input(
+      !matmul_chunk8_q8_input<lanes>(
           backend, block.shortconv_out_proj, backend.n_embd, backend.projected_chunk8) ||
       !add_chunk8_rows_in_place(backend.hidden_chunk8, backend.projected_chunk8, backend.n_embd)) {
     return false;
@@ -3872,7 +4100,9 @@ inline bool run_shortconv_block_chunk8_q8_k(native_backend & backend,
   return true;
 }
 
-template <emel::text::generator::attention_mode mode, chunk4_rhs_route route>
+template <emel::text::generator::attention_mode mode,
+          chunk4_rhs_route route,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_layer_chunk4(native_backend & backend,
                              const int32_t layer_index,
                              const size_t token_base) noexcept {
@@ -3904,11 +4134,11 @@ inline bool run_layer_chunk4(native_backend & backend,
 
   if (block.uses_attention) {
     if (!prepare_chunk4_rhs<route>(backend, backend.norm_chunk4, backend.n_embd) ||
-        !matmul_chunk4_prepared<route>(
+        !matmul_chunk4_prepared<route, lanes>(
             backend, block.attention_q, backend.n_embd, q_chunk) ||
-        !matmul_chunk4_prepared<route>(
+        !matmul_chunk4_prepared<route, lanes>(
             backend, block.attention_k, backend.n_embd, k_chunk) ||
-        !matmul_chunk4_prepared<route>(
+        !matmul_chunk4_prepared<route, lanes>(
             backend, block.attention_v, backend.n_embd, v_chunk)) {
       return false;
     }
@@ -3941,18 +4171,20 @@ inline bool run_layer_chunk4(native_backend & backend,
         return false;
       }
 
-      apply_rope(q_row,
-                 backend.n_head,
-                 effective_attention_head_dim(backend, block),
-                 effective_attention_rope_dim(backend, block),
-                 position,
-                 effective_attention_rope_freq_base(backend, block));
-      apply_rope(k_row,
-                 backend.n_head_kv,
-                 effective_attention_head_dim_kv(backend, block),
-                 effective_attention_rope_dim(backend, block),
-                 position,
-                 effective_attention_rope_freq_base(backend, block));
+      apply_attention_rope(q_row,
+                           block,
+                           backend.n_head,
+                           effective_attention_head_dim(backend, block),
+                           effective_attention_rope_dim(backend, block),
+                           position,
+                           effective_attention_rope_freq_base(backend, block));
+      apply_attention_rope(k_row,
+                           block,
+                           backend.n_head_kv,
+                           effective_attention_head_dim_kv(backend, block),
+                           effective_attention_rope_dim(backend, block),
+                           position,
+                           effective_attention_rope_freq_base(backend, block));
 
       if (!store_attention_kv_cache(backend, block, layer_index, position, k_row, v_row) ||
           !run_attention_for_q_vector<mode>(backend, block, layer_index, position, q_row)) {
@@ -3967,13 +4199,13 @@ inline bool run_layer_chunk4(native_backend & backend,
     }
 
     if (!prepare_chunk4_rhs<route>(backend, attn_ctx_chunk, q_dim) ||
-        !matmul_chunk4_prepared<route>(
+        !matmul_chunk4_prepared<route, lanes>(
             backend, block.attention_output, q_dim, backend.projected_chunk4) ||
         !add_chunk4_rows_in_place(
             backend.hidden_chunk4, backend.projected_chunk4, backend.n_embd)) {
       return false;
     }
-  } else if (!run_shortconv_block_chunk4<route>(backend, block, layer_index)) {
+  } else if (!run_shortconv_block_chunk4<route, lanes>(backend, block, layer_index)) {
     return false;
   }
 
@@ -3996,9 +4228,9 @@ inline bool run_layer_chunk4(native_backend & backend,
       std::span<float>(backend.ffn_hidden_chunk4.data(),
                        static_cast<size_t>(k_prefill_q8_chunk_rows) * static_cast<size_t>(ffn_dim));
   if (!prepare_chunk4_rhs<route>(backend, backend.norm_chunk4, backend.n_embd) ||
-      !matmul_chunk4_prepared<route>(
+      !matmul_chunk4_prepared<route, lanes>(
           backend, block.feed_forward_gate, backend.n_embd, gate_chunk) ||
-      !matmul_chunk4_prepared<route>(
+      !matmul_chunk4_prepared<route, lanes>(
           backend, block.feed_forward_up, backend.n_embd, up_chunk)) {
     return false;
   }
@@ -4006,7 +4238,7 @@ inline bool run_layer_chunk4(native_backend & backend,
   if (!apply_silu_mul_chunk4(
           gate_chunk, up_chunk, ffn_dim, ffn_hidden_chunk) ||
       !prepare_chunk4_rhs<route>(backend, ffn_hidden_chunk, ffn_dim) ||
-      !matmul_chunk4_prepared<route>(
+      !matmul_chunk4_prepared<route, lanes>(
           backend, block.feed_forward_down, ffn_dim, backend.projected_chunk4) ||
       !add_chunk4_rows_in_place(backend.hidden_chunk4, backend.projected_chunk4, backend.n_embd)) {
     return false;
@@ -4015,7 +4247,8 @@ inline bool run_layer_chunk4(native_backend & backend,
   return true;
 }
 
-template <emel::text::generator::attention_mode mode>
+template <emel::text::generator::attention_mode mode,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_layer_chunk8_q8_k(native_backend & backend,
                                   const int32_t layer_index,
                                   const size_t token_base) noexcept {
@@ -4047,9 +4280,9 @@ inline bool run_layer_chunk8_q8_k(native_backend & backend,
 
   if (block.uses_attention) {
     if (!prepare_q8_chunk8_input(backend, backend.norm_chunk8, backend.n_embd) ||
-        !matmul_chunk8_q8_input(backend, block.attention_q, backend.n_embd, q_chunk) ||
-        !matmul_chunk8_q8_input(backend, block.attention_k, backend.n_embd, k_chunk) ||
-        !matmul_chunk8_q8_input(backend, block.attention_v, backend.n_embd, v_chunk)) {
+        !matmul_chunk8_q8_input<lanes>(backend, block.attention_q, backend.n_embd, q_chunk) ||
+        !matmul_chunk8_q8_input<lanes>(backend, block.attention_k, backend.n_embd, k_chunk) ||
+        !matmul_chunk8_q8_input<lanes>(backend, block.attention_v, backend.n_embd, v_chunk)) {
       return false;
     }
 
@@ -4081,18 +4314,20 @@ inline bool run_layer_chunk8_q8_k(native_backend & backend,
         return false;
       }
 
-      apply_rope(q_row,
-                 backend.n_head,
-                 effective_attention_head_dim(backend, block),
-                 effective_attention_rope_dim(backend, block),
-                 position,
-                 effective_attention_rope_freq_base(backend, block));
-      apply_rope(k_row,
-                 backend.n_head_kv,
-                 effective_attention_head_dim_kv(backend, block),
-                 effective_attention_rope_dim(backend, block),
-                 position,
-                 effective_attention_rope_freq_base(backend, block));
+      apply_attention_rope(q_row,
+                           block,
+                           backend.n_head,
+                           effective_attention_head_dim(backend, block),
+                           effective_attention_rope_dim(backend, block),
+                           position,
+                           effective_attention_rope_freq_base(backend, block));
+      apply_attention_rope(k_row,
+                           block,
+                           backend.n_head_kv,
+                           effective_attention_head_dim_kv(backend, block),
+                           effective_attention_rope_dim(backend, block),
+                           position,
+                           effective_attention_rope_freq_base(backend, block));
 
       if (!store_attention_kv_cache(backend, block, layer_index, position, k_row, v_row) ||
           !run_attention_for_q_vector<mode>(backend, block, layer_index, position, q_row)) {
@@ -4107,12 +4342,12 @@ inline bool run_layer_chunk8_q8_k(native_backend & backend,
     }
 
     if (!prepare_q8_chunk8_input(backend, attn_ctx_chunk, q_dim) ||
-        !matmul_chunk8_q8_input(
+        !matmul_chunk8_q8_input<lanes>(
             backend, block.attention_output, q_dim, backend.projected_chunk8) ||
         !add_chunk8_rows_in_place(backend.hidden_chunk8, backend.projected_chunk8, backend.n_embd)) {
       return false;
     }
-  } else if (!run_shortconv_block_chunk8_q8_k(backend, block, layer_index)) {
+  } else if (!run_shortconv_block_chunk8_q8_k<lanes>(backend, block, layer_index)) {
     return false;
   }
 
@@ -4154,7 +4389,9 @@ inline bool run_layer_chunk8_q8_k(native_backend & backend,
   return true;
 }
 
-template <emel::text::generator::attention_mode mode, chunk4_rhs_route route>
+template <emel::text::generator::attention_mode mode,
+          chunk4_rhs_route route,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_prefill_chunk4_tokens(native_backend & backend,
                                       const size_t token_limit) noexcept {
   for (size_t token_base = 0; token_base < token_limit;
@@ -4177,7 +4414,7 @@ inline bool run_prefill_chunk4_tokens(native_backend & backend,
     }
 
     for (int32_t layer = 0; layer < backend.n_layer; ++layer) {
-      if (!run_layer_chunk4<mode, route>(backend, layer, token_base)) {
+      if (!run_layer_chunk4<mode, route, lanes>(backend, layer, token_base)) {
         return false;
       }
     }
@@ -4199,7 +4436,8 @@ inline bool run_prefill_chunk4_tokens(native_backend & backend,
   return true;
 }
 
-template <emel::text::generator::attention_mode mode>
+template <emel::text::generator::attention_mode mode,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_prefill_chunk8_tokens_q8_k(native_backend & backend,
                                            const size_t token_limit) noexcept {
   for (size_t token_base = 0; token_base < token_limit;
@@ -4222,7 +4460,7 @@ inline bool run_prefill_chunk8_tokens_q8_k(native_backend & backend,
     }
 
     for (int32_t layer = 0; layer < backend.n_layer; ++layer) {
-      if (!run_layer_chunk8_q8_k<mode>(backend, layer, token_base)) {
+      if (!run_layer_chunk8_q8_k<mode, lanes>(backend, layer, token_base)) {
         return false;
       }
     }
@@ -4257,7 +4495,9 @@ inline bool run_prefill(native_backend & backend) noexcept {
   return compute_logits<route>(backend);
 }
 
-template <emel::text::generator::attention_mode mode, chunk4_rhs_route route>
+template <emel::text::generator::attention_mode mode,
+          chunk4_rhs_route route,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_prefill_chunk4(native_backend & backend) noexcept {
   backend.kv_cache_tokens = 0;
   reset_shortconv_cache(backend);
@@ -4266,17 +4506,18 @@ inline bool run_prefill_chunk4(native_backend & backend) noexcept {
   const size_t chunk_limit =
       token_count - (token_count % static_cast<size_t>(k_prefill_q8_chunk_rows));
   if (chunk_limit == 0u ||
-      !run_prefill_chunk4_tokens<mode, route>(backend, chunk_limit) ||
+      !run_prefill_chunk4_tokens<mode, route, lanes>(backend, chunk_limit) ||
       !run_prefill_scalar_tokens<
           mode,
           static_cast<scalar_matmul_route>(route)>(backend, chunk_limit, token_count)) {
     return false;
   }
 
-  return compute_logits<static_cast<scalar_matmul_route>(route)>(backend);
+  return compute_logits<static_cast<scalar_matmul_route>(route), lanes>(backend);
 }
 
-template <emel::text::generator::attention_mode mode>
+template <emel::text::generator::attention_mode mode,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_prefill_chunk8_q8_k(native_backend & backend) noexcept {
   backend.kv_cache_tokens = 0;
   reset_shortconv_cache(backend);
@@ -4285,13 +4526,13 @@ inline bool run_prefill_chunk8_q8_k(native_backend & backend) noexcept {
   const size_t chunk_limit =
       token_count - (token_count % static_cast<size_t>(k_prefill_q8_chunk8_rows));
   if (chunk_limit == 0u ||
-      !run_prefill_chunk8_tokens_q8_k<mode>(backend, chunk_limit) ||
+      !run_prefill_chunk8_tokens_q8_k<mode, lanes>(backend, chunk_limit) ||
       !run_prefill_scalar_tokens<mode, scalar_matmul_route::q8_k>(
           backend, chunk_limit, token_count)) {
     return false;
   }
 
-  return compute_logits<scalar_matmul_route::q8_k>(backend);
+  return compute_logits<scalar_matmul_route::q8_k, lanes>(backend);
 }
 
 inline bool run_prefill_flash(native_backend & backend) noexcept {
@@ -4323,7 +4564,9 @@ inline bool run_prefill_preselected_argmax(native_backend & backend,
       backend, selected_index, selected_score);
 }
 
-template <emel::text::generator::attention_mode mode, chunk4_rhs_route route>
+template <emel::text::generator::attention_mode mode,
+          chunk4_rhs_route route,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_prefill_chunk4_preselected_argmax(native_backend & backend,
                                                   int32_t & selected_index,
                                                   float & selected_score) noexcept {
@@ -4334,7 +4577,7 @@ inline bool run_prefill_chunk4_preselected_argmax(native_backend & backend,
   const size_t chunk_limit =
       token_count - (token_count % static_cast<size_t>(k_prefill_q8_chunk_rows));
   if (chunk_limit == 0u ||
-      !run_prefill_chunk4_tokens<mode, route>(backend, chunk_limit) ||
+      !run_prefill_chunk4_tokens<mode, route, lanes>(backend, chunk_limit) ||
       !run_prefill_scalar_tokens<
           mode,
           static_cast<scalar_matmul_route>(route)>(backend, chunk_limit, token_count)) {
@@ -4350,7 +4593,8 @@ inline bool run_prefill_chunk4_preselected_argmax(native_backend & backend,
   }
 }
 
-template <emel::text::generator::attention_mode mode>
+template <emel::text::generator::attention_mode mode,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_prefill_chunk8_preselected_argmax_q8_k(native_backend & backend,
                                                        int32_t & selected_index,
                                                        float & selected_score) noexcept {
@@ -4361,7 +4605,7 @@ inline bool run_prefill_chunk8_preselected_argmax_q8_k(native_backend & backend,
   const size_t chunk_limit =
       token_count - (token_count % static_cast<size_t>(k_prefill_q8_chunk8_rows));
   if (chunk_limit == 0u ||
-      !run_prefill_chunk8_tokens_q8_k<mode>(backend, chunk_limit) ||
+      !run_prefill_chunk8_tokens_q8_k<mode, lanes>(backend, chunk_limit) ||
       !run_prefill_scalar_tokens<mode, scalar_matmul_route::q8_k>(
           backend, chunk_limit, token_count)) {
     return false;
@@ -4371,7 +4615,9 @@ inline bool run_prefill_chunk8_preselected_argmax_q8_k(native_backend & backend,
       backend, selected_index, selected_score);
 }
 
-template <emel::text::generator::attention_mode mode, scalar_matmul_route route>
+template <emel::text::generator::attention_mode mode,
+          scalar_matmul_route route,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_decode(native_backend & backend,
                        const emel::graph::processor::event::execute & request) noexcept {
   if (backend.bound_token_count != 1 ||
@@ -4395,17 +4641,18 @@ inline bool run_decode(native_backend & backend,
   }
 
   for (int32_t layer = 0; layer < backend.n_layer; ++layer) {
-    if (!run_layer<mode, route>(backend, layer, position)) {
+    if (!run_layer<mode, route, lanes>(backend, layer, position)) {
       return false;
     }
   }
   backend.kv_cache_tokens = position + 1;
-  return compute_logits<route>(backend);
+  return compute_logits<route, lanes>(backend);
 }
 
 template <emel::text::generator::attention_mode mode,
           scalar_matmul_route route,
-          scalar_argmax_route argmax_route>
+          scalar_argmax_route argmax_route,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_decode_preselected_argmax(native_backend & backend,
                                           const emel::graph::processor::event::execute & request,
                                           int32_t & selected_index,
@@ -4431,7 +4678,7 @@ inline bool run_decode_preselected_argmax(native_backend & backend,
   }
 
   for (int32_t layer = 0; layer < backend.n_layer; ++layer) {
-    if (!run_layer<mode, route>(backend, layer, position)) {
+    if (!run_layer<mode, route, lanes>(backend, layer, position)) {
       return false;
     }
   }
@@ -4513,6 +4760,12 @@ inline emel::error::type prepare(native_backend & backend,
 
     auto & weights = backend.blocks[static_cast<size_t>(layer)];
     weights.uses_attention = block.uses_attention;
+    weights.attention_rope_pairing = {
+        model_data.params.rope_pair_x0_stride,
+        model_data.params.rope_pair_x1_stride,
+        model_data.params.rope_pair_x1_offset,
+        model_data.params.rope_pair_x1_half_rot_offset,
+    };
     const bool common_ok =
         dequantize_tensor_vector(*block.attention_norm.tensor, weights.attention_norm) &&
         dequantize_tensor_vector(*block.feed_forward_norm.tensor, weights.feed_forward_norm) &&
@@ -4759,6 +5012,13 @@ inline emel::error::type prepare(native_backend & backend,
   backend.ffn_hidden_chunk8.resize(backend.gate_chunk8.size());
   build_lifecycle(backend);
 
+  for (auto & lane_kernel : backend.lane_kernels) {
+    lane_kernel.set_kind(backend.kernel_kind);
+  }
+  // One-time worker-thread construction for the parallel matmul lanes; the
+  // pool is engaged here so no thread creation ever happens during dispatch.
+  backend.lane_pool.emplace();
+
   return emel::error::cast(emel::model::loader::error::none);
 }
 
@@ -4832,7 +5092,8 @@ inline bool bind_guarded_inputs(const emel::graph::processor::event::execute & r
 
 template <emel::text::generator::attention_mode mode,
           scalar_matmul_route route,
-          step_kind expected_kind>
+          step_kind expected_kind,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_kernel_scalar_mode(const emel::graph::processor::event::execute & request,
                                    int32_t * err_out) noexcept {
   (void)err_out;
@@ -4840,14 +5101,15 @@ inline bool run_kernel_scalar_mode(const emel::graph::processor::event::execute
   if constexpr (expected_kind == step_kind::prefill) {
     return run_prefill<mode, route>(backend);
   } else {
-    return run_decode<mode, route>(backend, request);
+    return run_decode<mode, route, lanes>(backend, request);
   }
 }
 
 template <emel::text::generator::attention_mode mode,
           scalar_matmul_route route,
           scalar_argmax_route argmax_route,
-          step_kind expected_kind>
+          step_kind expected_kind,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_kernel_scalar_preselected_argmax_mode(
     const emel::graph::processor::event::execute & request,
     int32_t * err_out) noexcept {
@@ -4858,7 +5120,7 @@ inline bool run_kernel_scalar_preselected_argmax_mode(
     return run_prefill_preselected_argmax<mode, route, argmax_route>(
         backend, *io.selected_token_out, *io.selected_score_out);
   } else {
-    return run_decode_preselected_argmax<mode, route, argmax_route>(
+    return run_decode_preselected_argmax<mode, route, argmax_route, lanes>(
         backend, request, *io.selected_token_out, *io.selected_score_out);
   }
 }
@@ -4998,6 +5260,56 @@ inline bool run_kernel_flash_decode_kernel(
       step_kind::decode>(request, err_out);
 }
 
+inline bool run_kernel_flash_decode_parallel_packed_q8_0(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_scalar_mode<
+      emel::text::generator::attention_mode::flash,
+      scalar_matmul_route::packed_q8_0,
+      step_kind::decode,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
+inline bool run_kernel_flash_decode_parallel_q8_k(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_scalar_mode<
+      emel::text::generator::attention_mode::flash,
+      scalar_matmul_route::q8_k,
+      step_kind::decode,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
+inline bool run_kernel_flash_decode_parallel_native_quantized(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_scalar_mode<
+      emel::text::generator::attention_mode::flash,
+      scalar_matmul_route::native_quantized,
+      step_kind::decode,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
+inline bool run_kernel_flash_decode_parallel_native_quantized_q8_k_logits(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_scalar_mode<
+      emel::text::generator::attention_mode::flash,
+      scalar_matmul_route::native_quantized_q8_k_logits,
+      step_kind::decode,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
+inline bool run_kernel_flash_decode_parallel_kernel(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_scalar_mode<
+      emel::text::generator::attention_mode::flash,
+      scalar_matmul_route::kernel,
+      step_kind::decode,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
 inline bool run_kernel_nonflash_decode_packed_q8_0(
     const emel::graph::processor::event::execute & request,
     int32_t * err_out) noexcept {
@@ -5043,12 +5355,13 @@ inline bool run_kernel_nonflash_decode_kernel(
       step_kind::decode>(request, err_out);
 }
 
-template <emel::text::generator::attention_mode mode>
+template <emel::text::generator::attention_mode mode,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_kernel_prefill_chunk8_q8_k_mode(
     const emel::graph::processor::event::execute & request,
     int32_t * err_out) noexcept {
   (void)err_out;
-  return run_prefill_chunk8_q8_k<mode>(bind_native_backend(request));
+  return run_prefill_chunk8_q8_k<mode, lanes>(bind_native_backend(request));
 }
 
 inline bool run_kernel_flash_prefill_chunk8_q8_k(
@@ -5065,12 +5378,14 @@ inline bool run_kernel_nonflash_prefill_chunk8_q8_k(
       request, err_out);
 }
 
-template <emel::text::generator::attention_mode mode, chunk4_rhs_route route>
+template <emel::text::generator::attention_mode mode,
+          chunk4_rhs_route route,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_kernel_prefill_chunk4_mode(
     const emel::graph::processor::event::execute & request,
     int32_t * err_out) noexcept {
   (void)err_out;
-  return run_prefill_chunk4<mode, route>(bind_native_backend(request));
+  return run_prefill_chunk4<mode, route, lanes>(bind_native_backend(request));
 }
 
 inline bool run_kernel_flash_prefill_chunk4_packed_q8_0(
@@ -5109,6 +5424,35 @@ inline bool run_kernel_nonflash_prefill_chunk4_q8_k(
       request, err_out);
 }
 
+inline bool run_kernel_flash_prefill_parallel_chunk8_q8_k(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_prefill_chunk8_q8_k_mode<
+      emel::text::generator::attention_mode::flash,
+      matmul_lane_mode::parallel>(
+      request, err_out);
+}
+
+inline bool run_kernel_flash_prefill_parallel_chunk4_packed_q8_0(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_prefill_chunk4_mode<
+      emel::text::generator::attention_mode::flash,
+      chunk4_rhs_route::packed_q8_0,
+      matmul_lane_mode::parallel>(
+      request, err_out);
+}
+
+inline bool run_kernel_flash_prefill_parallel_chunk4_q8_k(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_prefill_chunk4_mode<
+      emel::text::generator::attention_mode::flash,
+      chunk4_rhs_route::q8_k,
+      matmul_lane_mode::parallel>(
+      request, err_out);
+}
+
 inline bool run_kernel_flash_prefill_scalar_preselected_argmax_q8_k(
     const emel::graph::processor::event::execute & request,
     int32_t * err_out) noexcept {
@@ -5229,6 +5573,50 @@ inline bool run_kernel_flash_decode_preselected_argmax_kernel(
       step_kind::decode>(request, err_out);
 }
 
+inline bool run_kernel_flash_decode_parallel_preselected_argmax_q8_k(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_scalar_preselected_argmax_mode<
+      emel::text::generator::attention_mode::flash,
+      scalar_matmul_route::q8_k,
+      scalar_argmax_route::q8_k,
+      step_kind::decode,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
+inline bool run_kernel_flash_decode_parallel_preselected_argmax_native_quantized_q8_k(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_scalar_preselected_argmax_mode<
+      emel::text::generator::attention_mode::flash,
+      scalar_matmul_route::native_quantized,
+      scalar_argmax_route::q8_k,
+      step_kind::decode,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
+inline bool run_kernel_flash_decode_parallel_preselected_argmax_native_quantized_kernel(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_scalar_preselected_argmax_mode<
+      emel::text::generator::attention_mode::flash,
+      scalar_matmul_route::native_quantized,
+      scalar_argmax_route::kernel,
+      step_kind::decode,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
+inline bool run_kernel_flash_decode_parallel_preselected_argmax_kernel(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_scalar_preselected_argmax_mode<
+      emel::text::generator::attention_mode::flash,
+      scalar_matmul_route::kernel,
+      scalar_argmax_route::kernel,
+      step_kind::decode,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
 inline bool run_kernel_nonflash_decode_preselected_argmax_q8_k(
     const emel::graph::processor::event::execute & request,
     int32_t * err_out) noexcept {
@@ -5269,23 +5657,26 @@ inline bool run_kernel_nonflash_decode_preselected_argmax_kernel(
       step_kind::decode>(request, err_out);
 }
 
-template <emel::text::generator::attention_mode mode, chunk4_rhs_route route>
+template <emel::text::generator::attention_mode mode,
+          chunk4_rhs_route route,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_kernel_prefill_chunk4_preselected_argmax_mode(
     const emel::graph::processor::event::execute & request,
     int32_t * err_out) noexcept {
   (void)err_out;
   auto & io = bind_compute_io(request);
-  return run_prefill_chunk4_preselected_argmax<mode, route>(
+  return run_prefill_chunk4_preselected_argmax<mode, route, lanes>(
       bind_native_backend(request), *io.selected_token_out, *io.selected_score_out);
 }
 
-template <emel::text::generator::attention_mode mode>
+template <emel::text::generator::attention_mode mode,
+          matmul_lane_mode lanes = matmul_lane_mode::serial>
 inline bool run_kernel_prefill_chunk8_preselected_argmax_q8_k_mode(
     const emel::graph::processor::event::execute & request,
     int32_t * err_out) noexcept {
   (void)err_out;
   auto & io = bind_compute_io(request);
-  return run_prefill_chunk8_preselected_argmax_q8_k<mode>(
+  return run_prefill_chunk8_preselected_argmax_q8_k<mode, lanes>(
       bind_native_backend(request), *io.selected_token_out, *io.selected_score_out);
 }
 
@@ -5335,6 +5726,32 @@ inline bool run_kernel_nonflash_prefill_chunk4_preselected_argmax_q8_k(
       chunk4_rhs_route::q8_k>(request, err_out);
 }
 
+inline bool run_kernel_flash_prefill_parallel_chunk8_preselected_argmax_q8_k(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_prefill_chunk8_preselected_argmax_q8_k_mode<
+      emel::text::generator::attention_mode::flash,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
+inline bool run_kernel_flash_prefill_parallel_chunk4_preselected_argmax_packed_q8_0(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_prefill_chunk4_preselected_argmax_mode<
+      emel::text::generator::attention_mode::flash,
+      chunk4_rhs_route::packed_q8_0,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
+inline bool run_kernel_flash_prefill_parallel_chunk4_preselected_argmax_q8_k(
+    const emel::graph::processor::event::execute & request,
+    int32_t * err_out) noexcept {
+  return run_kernel_prefill_chunk4_preselected_argmax_mode<
+      emel::text::generator::attention_mode::flash,
+      chunk4_rhs_route::q8_k,
+      matmul_lane_mode::parallel>(request, err_out);
+}
+
 inline bool extract_guarded_outputs(const emel::graph::processor::event::execute & request,
                                     int32_t * outputs_out,
                                     int32_t * err_out) noexcept {
diff --git a/src/emel/text/generator/guards.hpp b/src/emel/text/generator/guards.hpp
index 99caea2d..e1e2476c 100644
--- a/src/emel/text/generator/guards.hpp
+++ b/src/emel/text/generator/guards.hpp
@@ -986,6 +986,14 @@ struct guard_decode_compute_backend_unavailable {
   }
 };
 
+struct guard_decode_parallel_lanes_ready {
+  bool operator()(const event::generate_run &, const action::context & ctx) const noexcept {
+    return ctx.compute.backend.lane_pool.has_value() &&
+           ctx.compute.backend.n_embd >=
+               emel::text::generator::detail::k_parallel_min_gemv_dim;
+  }
+};
+
 struct guard_decode_materialized_scalar_packed_q8_0_ready {
   bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
     return detail::guard_decode_materialized_compute_ready(ev.ctx, ctx) &&
@@ -993,6 +1001,13 @@ struct guard_decode_materialized_scalar_packed_q8_0_ready {
   }
 };
 
+struct guard_decode_materialized_parallel_scalar_packed_q8_0_ready {
+  bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
+    return guard_decode_parallel_lanes_ready{}(ev, ctx) &&
+           guard_decode_materialized_scalar_packed_q8_0_ready{}(ev, ctx);
+  }
+};
+
 struct guard_decode_materialized_scalar_q8_k_ready {
   bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
     return detail::guard_decode_materialized_compute_ready(ev.ctx, ctx) &&
@@ -1000,6 +1015,13 @@ struct guard_decode_materialized_scalar_q8_k_ready {
   }
 };
 
+struct guard_decode_materialized_parallel_scalar_q8_k_ready {
+  bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
+    return guard_decode_parallel_lanes_ready{}(ev, ctx) &&
+           guard_decode_materialized_scalar_q8_k_ready{}(ev, ctx);
+  }
+};
+
 struct guard_decode_materialized_scalar_native_quantized_q8_k_ready {
   bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
     return detail::guard_decode_materialized_compute_ready(ev.ctx, ctx) &&
@@ -1007,6 +1029,13 @@ struct guard_decode_materialized_scalar_native_quantized_q8_k_ready {
   }
 };
 
+struct guard_decode_materialized_parallel_scalar_native_quantized_q8_k_ready {
+  bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
+    return guard_decode_parallel_lanes_ready{}(ev, ctx) &&
+           guard_decode_materialized_scalar_native_quantized_q8_k_ready{}(ev, ctx);
+  }
+};
+
 struct guard_decode_materialized_scalar_native_quantized_kernel_ready {
   bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
     return detail::guard_decode_materialized_compute_ready(ev.ctx, ctx) &&
@@ -1014,6 +1043,13 @@ struct guard_decode_materialized_scalar_native_quantized_kernel_ready {
   }
 };
 
+struct guard_decode_materialized_parallel_scalar_native_quantized_kernel_ready {
+  bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
+    return guard_decode_parallel_lanes_ready{}(ev, ctx) &&
+           guard_decode_materialized_scalar_native_quantized_kernel_ready{}(ev, ctx);
+  }
+};
+
 struct guard_decode_materialized_scalar_kernel_ready {
   bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
     return detail::guard_decode_materialized_compute_ready(ev.ctx, ctx) &&
@@ -1021,6 +1057,13 @@ struct guard_decode_materialized_scalar_kernel_ready {
   }
 };
 
+struct guard_decode_materialized_parallel_scalar_kernel_ready {
+  bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
+    return guard_decode_parallel_lanes_ready{}(ev, ctx) &&
+           guard_decode_materialized_scalar_kernel_ready{}(ev, ctx);
+  }
+};
+
 struct guard_decode_preselected_direct_ready {
   bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
     return detail::guard_decode_preselected_compute_ready(ev.ctx, ctx) &&
@@ -1056,6 +1099,34 @@ struct guard_decode_preselected_argmax_kernel_ready {
   }
 };
 
+struct guard_decode_preselected_parallel_argmax_q8_k_ready {
+  bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
+    return guard_decode_parallel_lanes_ready{}(ev, ctx) &&
+           guard_decode_preselected_argmax_q8_k_ready{}(ev, ctx);
+  }
+};
+
+struct guard_decode_preselected_parallel_argmax_native_quantized_q8_k_ready {
+  bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
+    return guard_decode_parallel_lanes_ready{}(ev, ctx) &&
+           guard_decode_preselected_argmax_native_quantized_q8_k_ready{}(ev, ctx);
+  }
+};
+
+struct guard_decode_preselected_parallel_argmax_native_quantized_kernel_ready {
+  bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
+    return guard_decode_parallel_lanes_ready{}(ev, ctx) &&
+           guard_decode_preselected_argmax_native_quantized_kernel_ready{}(ev, ctx);
+  }
+};
+
+struct guard_decode_preselected_parallel_argmax_kernel_ready {
+  bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
+    return guard_decode_parallel_lanes_ready{}(ev, ctx) &&
+           guard_decode_preselected_argmax_kernel_ready{}(ev, ctx);
+  }
+};
+
 struct valid_generate {
   bool operator()(const event::generate_run & ev, const action::context & ctx) const noexcept {
     return !ev.request.messages.empty() &&
diff --git a/src/emel/text/generator/prefill/actions.hpp b/src/emel/text/generator/prefill/actions.hpp
index c04856f7..8a2d170b 100644
--- a/src/emel/text/generator/prefill/actions.hpp
+++ b/src/emel/text/generator/prefill/actions.hpp
@@ -140,6 +140,14 @@ struct request_contract_flash_materialized_chunk8_q8_k {
   }
 };
 
+struct request_contract_flash_materialized_parallel_chunk8_q8_k {
+  void operator()(const event::run & ev, context & ctx) const noexcept {
+    detail::request_compute_contract<
+        emel::text::generator::prefill_compute_contract::flash_materialized_chunk8_q8_k,
+        emel::text::generator::detail::run_kernel_flash_prefill_parallel_chunk8_q8_k>(ev, ctx);
+  }
+};
+
 struct request_contract_flash_materialized_chunk4_packed_q8_0 {
   void operator()(const event::run & ev, context & ctx) const noexcept {
     detail::request_compute_contract<
@@ -148,6 +156,15 @@ struct request_contract_flash_materialized_chunk4_packed_q8_0 {
   }
 };
 
+struct request_contract_flash_materialized_parallel_chunk4_packed_q8_0 {
+  void operator()(const event::run & ev, context & ctx) const noexcept {
+    detail::request_compute_contract<
+        emel::text::generator::prefill_compute_contract::flash_materialized_chunk4_packed_q8_0,
+        emel::text::generator::detail::run_kernel_flash_prefill_parallel_chunk4_packed_q8_0>(
+        ev, ctx);
+  }
+};
+
 struct request_contract_flash_materialized_chunk4_q8_k {
   void operator()(const event::run & ev, context & ctx) const noexcept {
     detail::request_compute_contract<
@@ -156,6 +173,14 @@ struct request_contract_flash_materialized_chunk4_q8_k {
   }
 };
 
+struct request_contract_flash_materialized_parallel_chunk4_q8_k {
+  void operator()(const event::run & ev, context & ctx) const noexcept {
+    detail::request_compute_contract<
+        emel::text::generator::prefill_compute_contract::flash_materialized_chunk4_q8_k,
+        emel::text::generator::detail::run_kernel_flash_prefill_parallel_chunk4_q8_k>(ev, ctx);
+  }
+};
+
 struct request_contract_flash_preselected_scalar_q8_k {
   void operator()(const event::run & ev, context & ctx) const noexcept {
     detail::request_compute_contract_preselected_argmax<
@@ -203,6 +228,16 @@ struct request_contract_flash_preselected_chunk8_q8_k {
   }
 };
 
+struct request_contract_flash_preselected_parallel_chunk8_q8_k {
+  void operator()(const event::run & ev, context & ctx) const noexcept {
+    detail::request_compute_contract_preselected_argmax<
+        emel::text::generator::prefill_compute_contract::flash_preselected_chunk8_q8_k,
+        emel::text::generator::detail::
+            run_kernel_flash_prefill_parallel_chunk8_preselected_argmax_q8_k>(
+        ev, ctx);
+  }
+};
+
 struct request_contract_flash_preselected_chunk4_packed_q8_0 {
   void operator()(const event::run & ev, context & ctx) const noexcept {
     detail::request_compute_contract_preselected_argmax<
@@ -212,6 +247,16 @@ struct request_contract_flash_preselected_chunk4_packed_q8_0 {
   }
 };
 
+struct request_contract_flash_preselected_parallel_chunk4_packed_q8_0 {
+  void operator()(const event::run & ev, context & ctx) const noexcept {
+    detail::request_compute_contract_preselected_argmax<
+        emel::text::generator::prefill_compute_contract::flash_preselected_chunk4_packed_q8_0,
+        emel::text::generator::detail::
+            run_kernel_flash_prefill_parallel_chunk4_preselected_argmax_packed_q8_0>(
+        ev, ctx);
+  }
+};
+
 struct request_contract_flash_preselected_chunk4_q8_k {
   void operator()(const event::run & ev, context & ctx) const noexcept {
     detail::request_compute_contract_preselected_argmax<
@@ -221,6 +266,16 @@ struct request_contract_flash_preselected_chunk4_q8_k {
   }
 };
 
+struct request_contract_flash_preselected_parallel_chunk4_q8_k {
+  void operator()(const event::run & ev, context & ctx) const noexcept {
+    detail::request_compute_contract_preselected_argmax<
+        emel::text::generator::prefill_compute_contract::flash_preselected_chunk4_q8_k,
+        emel::text::generator::detail::
+            run_kernel_flash_prefill_parallel_chunk4_preselected_argmax_q8_k>(
+        ev, ctx);
+  }
+};
+
 struct request_contract_nonflash_materialized_scalar_packed_q8_0 {
   void operator()(const event::run & ev, context & ctx) const noexcept {
     detail::request_compute_contract<
@@ -366,10 +421,16 @@ inline constexpr request_contract_flash_materialized_scalar_kernel
     request_contract_flash_materialized_scalar_kernel{};
 inline constexpr request_contract_flash_materialized_chunk8_q8_k
     request_contract_flash_materialized_chunk8_q8_k{};
+inline constexpr request_contract_flash_materialized_parallel_chunk8_q8_k
+    request_contract_flash_materialized_parallel_chunk8_q8_k{};
 inline constexpr request_contract_flash_materialized_chunk4_packed_q8_0
     request_contract_flash_materialized_chunk4_packed_q8_0{};
+inline constexpr request_contract_flash_materialized_parallel_chunk4_packed_q8_0
+    request_contract_flash_materialized_parallel_chunk4_packed_q8_0{};
 inline constexpr request_contract_flash_materialized_chunk4_q8_k
     request_contract_flash_materialized_chunk4_q8_k{};
+inline constexpr request_contract_flash_materialized_parallel_chunk4_q8_k
+    request_contract_flash_materialized_parallel_chunk4_q8_k{};
 inline constexpr request_contract_flash_preselected_scalar_q8_k
     request_contract_flash_preselected_scalar_q8_k{};
 inline constexpr request_contract_flash_preselected_scalar_native_quantized_q8_k
@@ -380,10 +441,16 @@ inline constexpr request_contract_flash_preselected_scalar_kernel
     request_contract_flash_preselected_scalar_kernel{};
 inline constexpr request_contract_flash_preselected_chunk8_q8_k
     request_contract_flash_preselected_chunk8_q8_k{};
+inline constexpr request_contract_flash_preselected_parallel_chunk8_q8_k
+    request_contract_flash_preselected_parallel_chunk8_q8_k{};
 inline constexpr request_contract_flash_preselected_chunk4_packed_q8_0
     request_contract_flash_preselected_chunk4_packed_q8_0{};
+inline constexpr request_contract_flash_preselected_parallel_chunk4_packed_q8_0
+    request_contract_flash_preselected_parallel_chunk4_packed_q8_0{};
 inline constexpr request_contract_flash_preselected_chunk4_q8_k
     request_contract_flash_preselected_chunk4_q8_k{};
+inline constexpr request_contract_flash_preselected_parallel_chunk4_q8_k
+    request_contract_flash_preselected_parallel_chunk4_q8_k{};
 inline constexpr request_contract_nonflash_materialized_scalar_packed_q8_0
     request_contract_nonflash_materialized_scalar_packed_q8_0{};
 inline constexpr request_contract_nonflash_materialized_scalar_q8_k
diff --git a/src/emel/text/generator/prefill/guards.hpp b/src/emel/text/generator/prefill/guards.hpp
index ec635ca8..1ac2ebec 100644
--- a/src/emel/text/generator/prefill/guards.hpp
+++ b/src/emel/text/generator/prefill/guards.hpp
@@ -57,6 +57,13 @@ inline bool uses_preselected_argmax_direct(const action::context & ctx) noexcept
              ctx.generator.compute.backend);
 }
 
+inline bool uses_parallel_matmul_lanes(const event::run & ev,
+                                       const action::context & ctx) noexcept {
+  return ctx.generator.compute.backend.lane_pool.has_value() &&
+         ev.ctx.prompt_token_count >=
+             emel::text::generator::detail::k_parallel_min_prefill_tokens;
+}
+
 inline bool uses_prefill_chunk4_q8_gemm(const event::run & ev,
                                         const action::context & ctx) noexcept {
   return ev.ctx.prompt_token_count >= emel::text::generator::detail::k_prefill_q8_chunk_rows &&
@@ -353,6 +360,13 @@ struct guard_materialized_logits_with_chunk8_q8_k_ready {
   }
 };
 
+struct guard_materialized_logits_with_parallel_chunk8_q8_k_ready {
+  bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
+    return detail::uses_parallel_matmul_lanes(ev, ctx) &&
+           guard_materialized_logits_with_chunk8_q8_k_ready{}(ev, ctx);
+  }
+};
+
 struct guard_materialized_logits_with_chunk4_packed_q8_0_ready {
   bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
     return emel::text::generator::guard::detail::guard_prefill_materialized_compute_ready(
@@ -361,6 +375,13 @@ struct guard_materialized_logits_with_chunk4_packed_q8_0_ready {
   }
 };
 
+struct guard_materialized_logits_with_parallel_chunk4_packed_q8_0_ready {
+  bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
+    return detail::uses_parallel_matmul_lanes(ev, ctx) &&
+           guard_materialized_logits_with_chunk4_packed_q8_0_ready{}(ev, ctx);
+  }
+};
+
 struct guard_materialized_logits_with_chunk4_q8_k_ready {
   bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
     return emel::text::generator::guard::detail::guard_prefill_materialized_compute_ready(
@@ -369,6 +390,13 @@ struct guard_materialized_logits_with_chunk4_q8_k_ready {
   }
 };
 
+struct guard_materialized_logits_with_parallel_chunk4_q8_k_ready {
+  bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
+    return detail::uses_parallel_matmul_lanes(ev, ctx) &&
+           guard_materialized_logits_with_chunk4_q8_k_ready{}(ev, ctx);
+  }
+};
+
 struct guard_materialized_logits_with_scalar_packed_q8_0_ready {
   bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
     return emel::text::generator::guard::detail::guard_prefill_materialized_compute_ready(
@@ -417,6 +445,13 @@ struct guard_preselected_argmax_with_chunk8_q8_k_ready {
   }
 };
 
+struct guard_preselected_argmax_with_parallel_chunk8_q8_k_ready {
+  bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
+    return detail::uses_parallel_matmul_lanes(ev, ctx) &&
+           guard_preselected_argmax_with_chunk8_q8_k_ready{}(ev, ctx);
+  }
+};
+
 struct guard_preselected_argmax_with_chunk4_packed_q8_0_ready {
   bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
     return emel::text::generator::guard::detail::guard_prefill_preselected_compute_ready(
@@ -425,6 +460,13 @@ struct guard_preselected_argmax_with_chunk4_packed_q8_0_ready {
   }
 };
 
+struct guard_preselected_argmax_with_parallel_chunk4_packed_q8_0_ready {
+  bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
+    return detail::uses_parallel_matmul_lanes(ev, ctx) &&
+           guard_preselected_argmax_with_chunk4_packed_q8_0_ready{}(ev, ctx);
+  }
+};
+
 struct guard_preselected_argmax_with_chunk4_q8_k_ready {
   bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
     return emel::text::generator::guard::detail::guard_prefill_preselected_compute_ready(
@@ -433,6 +475,13 @@ struct guard_preselected_argmax_with_chunk4_q8_k_ready {
   }
 };
 
+struct guard_preselected_argmax_with_parallel_chunk4_q8_k_ready {
+  bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
+    return detail::uses_parallel_matmul_lanes(ev, ctx) &&
+           guard_preselected_argmax_with_chunk4_q8_k_ready{}(ev, ctx);
+  }
+};
+
 struct guard_preselected_argmax_with_scalar_q8_k_ready {
   bool operator()(const event::run & ev, const action::context & ctx) const noexcept {
     return emel::text::generator::guard::detail::guard_prefill_preselected_compute_ready(
diff --git a/src/emel/text/generator/prefill/sm.hpp b/src/emel/text/generator/prefill/sm.hpp
index 8e8b62db..67a086ed 100644
--- a/src/emel/text/generator/prefill/sm.hpp
+++ b/src/emel/text/generator/prefill/sm.hpp
@@ -73,16 +73,31 @@ struct model {
                  [ guard::guard_compute_backend_unavailable{} ]
                  / action::mark_backend_error
 
+      , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_materialized_logits_with_parallel_chunk8_q8_k_ready{} ]
+                 / action::request_contract_flash_materialized_parallel_chunk8_q8_k
+
       , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
                  + sml::completion<event::run>
                  [ guard::guard_materialized_logits_with_chunk8_q8_k_ready{} ]
                  / action::request_contract_flash_materialized_chunk8_q8_k
 
+      , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_materialized_logits_with_parallel_chunk4_packed_q8_0_ready{} ]
+                 / action::request_contract_flash_materialized_parallel_chunk4_packed_q8_0
+
       , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
                  + sml::completion<event::run>
                  [ guard::guard_materialized_logits_with_chunk4_packed_q8_0_ready{} ]
                  / action::request_contract_flash_materialized_chunk4_packed_q8_0
 
+      , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_materialized_logits_with_parallel_chunk4_q8_k_ready{} ]
+                 / action::request_contract_flash_materialized_parallel_chunk4_q8_k
+
       , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
                  + sml::completion<event::run>
                  [ guard::guard_materialized_logits_with_chunk4_q8_k_ready{} ]
@@ -113,16 +128,31 @@ struct model {
                  [ guard::guard_materialized_logits_with_scalar_kernel_ready{} ]
                  / action::request_contract_flash_materialized_scalar_kernel
 
+      , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_preselected_argmax_with_parallel_chunk8_q8_k_ready{} ]
+                 / action::request_contract_flash_preselected_parallel_chunk8_q8_k
+
       , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
                  + sml::completion<event::run>
                  [ guard::guard_preselected_argmax_with_chunk8_q8_k_ready{} ]
                  / action::request_contract_flash_preselected_chunk8_q8_k
 
+      , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_preselected_argmax_with_parallel_chunk4_packed_q8_0_ready{} ]
+                 / action::request_contract_flash_preselected_parallel_chunk4_packed_q8_0
+
       , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
                  + sml::completion<event::run>
                  [ guard::guard_preselected_argmax_with_chunk4_packed_q8_0_ready{} ]
                  / action::request_contract_flash_preselected_chunk4_packed_q8_0
 
+      , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
+                 + sml::completion<event::run>
+                 [ guard::guard_preselected_argmax_with_parallel_chunk4_q8_k_ready{} ]
+                 / action::request_contract_flash_preselected_parallel_chunk4_q8_k
+
       , sml::state<compute_result_decision> <= sml::state<contract_flash_decision>
                  + sml::completion<event::run>
                  [ guard::guard_preselected_argmax_with_chunk4_q8_k_ready{} ]
diff --git a/src/emel/text/generator/sm.hpp b/src/emel/text/generator/sm.hpp
index d68ac3a2..1ba592c3 100644
--- a/src/emel/text/generator/sm.hpp
+++ b/src/emel/text/generator/sm.hpp
@@ -497,26 +497,51 @@ struct model {
                  + sml::completion<event::generate_run>
                  [ guard::guard_decode_preselected_direct_ready{} ]
 
+      , sml::state<decode_compute_flash_decision> <= sml::state<decode_compute_flash>
+                 + sml::completion<event::generate_run>
+                 [ guard::guard_decode_materialized_parallel_scalar_packed_q8_0_ready{} ]
+                 / action::request_decode_compute_flash_parallel_packed_q8_0
+
       , sml::state<decode_compute_flash_decision> <= sml::state<decode_compute_flash>
                  + sml::completion<event::generate_run>
                  [ guard::guard_decode_materialized_scalar_packed_q8_0_ready{} ]
                  / action::request_decode_compute_flash_packed_q8_0
 
+      , sml::state<decode_compute_flash_decision> <= sml::state<decode_compute_flash>
+                 + sml::completion<event::generate_run>
+                 [ guard::guard_decode_materialized_parallel_scalar_q8_k_ready{} ]
+                 / action::request_decode_compute_flash_parallel_q8_k
+
       , sml::state<decode_compute_flash_decision> <= sml::state<decode_compute_flash>
                  + sml::completion<event::generate_run>
                  [ guard::guard_decode_materialized_scalar_q8_k_ready{} ]
                  / action::request_decode_compute_flash_q8_k
 
+      , sml::state<decode_compute_flash_decision> <= sml::state<decode_compute_flash>
+                 + sml::completion<event::generate_run>
+                 [ guard::guard_decode_materialized_parallel_scalar_native_quantized_q8_k_ready{} ]
+                 / action::request_decode_compute_flash_parallel_native_quantized_q8_k_logits
+
       , sml::state<decode_compute_flash_decision> <= sml::state<decode_compute_flash>
                  + sml::completion<event::generate_run>
                  [ guard::guard_decode_materialized_scalar_native_quantized_q8_k_ready{} ]
                  / action::request_decode_compute_flash_native_quantized_q8_k_logits
 
+      , sml::state<decode_compute_flash_decision> <= sml::state<decode_compute_flash>
+                 + sml::completion<event::generate_run>
+                 [ guard::guard_decode_materialized_parallel_scalar_native_quantized_kernel_ready{} ]
+                 / action::request_decode_compute_flash_parallel_native_quantized
+
       , sml::state<decode_compute_flash_decision> <= sml::state<decode_compute_flash>
                  + sml::completion<event::generate_run>
                  [ guard::guard_decode_materialized_scalar_native_quantized_kernel_ready{} ]
                  / action::request_decode_compute_flash_native_quantized
 
+      , sml::state<decode_compute_flash_decision> <= sml::state<decode_compute_flash>
+                 + sml::completion<event::generate_run>
+                 [ guard::guard_decode_materialized_parallel_scalar_kernel_ready{} ]
+                 / action::request_decode_compute_flash_parallel_kernel
+
       , sml::state<decode_compute_flash_decision> <= sml::state<decode_compute_flash>
                  + sml::completion<event::generate_run>
                  [ guard::guard_decode_materialized_scalar_kernel_ready{} ]
@@ -574,24 +599,50 @@ struct model {
                  [ guard::guard_decode_compute_backend_unavailable{} ]
                  / action::mark_backend_error
 
+      , sml::state<decode_compute_flash_preselected_argmax_decision> <=
+               sml::state<decode_compute_flash_preselected_argmax>
+                 + sml::completion<event::generate_run>
+                 [ guard::guard_decode_preselected_parallel_argmax_q8_k_ready{} ]
+                 / action::request_decode_compute_flash_parallel_preselected_argmax_q8_k
+
       , sml::state<decode_compute_flash_preselected_argmax_decision> <=
                sml::state<decode_compute_flash_preselected_argmax>
                  + sml::completion<event::generate_run>
                  [ guard::guard_decode_preselected_argmax_q8_k_ready{} ]
                  / action::request_decode_compute_flash_preselected_argmax_q8_k
 
+      , sml::state<decode_compute_flash_preselected_argmax_decision> <=
+               sml::state<decode_compute_flash_preselected_argmax>
+                 + sml::completion<event::generate_run>
+                 [ guard::guard_decode_preselected_parallel_argmax_native_quantized_q8_k_ready{} ]
+                 / action::
+                       request_decode_compute_flash_parallel_preselected_argmax_native_quantized_q8_k
+
       , sml::state<decode_compute_flash_preselected_argmax_decision> <=
                sml::state<decode_compute_flash_preselected_argmax>
                  + sml::completion<event::generate_run>
                  [ guard::guard_decode_preselected_argmax_native_quantized_q8_k_ready{} ]
                  / action::request_decode_compute_flash_preselected_argmax_native_quantized_q8_k
 
+      , sml::state<decode_compute_flash_preselected_argmax_decision> <=
+               sml::state<decode_compute_flash_preselected_argmax>
+                 + sml::completion<event::generate_run>
+                 [ guard::guard_decode_preselected_parallel_argmax_native_quantized_kernel_ready{} ]
+                 / action::
+                       request_decode_compute_flash_parallel_preselected_argmax_native_quantized_kernel
+
       , sml::state<decode_compute_flash_preselected_argmax_decision> <=
                sml::state<decode_compute_flash_preselected_argmax>
                  + sml::completion<event::generate_run>
                  [ guard::guard_decode_preselected_argmax_native_quantized_kernel_ready{} ]
                  / action::request_decode_compute_flash_preselected_argmax_native_quantized_kernel
 
+      , sml::state<decode_compute_flash_preselected_argmax_decision> <=
+               sml::state<decode_compute_flash_preselected_argmax>
+                 + sml::completion<event::generate_run>
+                 [ guard::guard_decode_preselected_parallel_argmax_kernel_ready{} ]
+                 / action::request_decode_compute_flash_parallel_preselected_argmax_kernel
+
       , sml::state<decode_compute_flash_preselected_argmax_decision> <=
                sml::state<decode_compute_flash_preselected_argmax>
                  + sml::completion<event::generate_run>
diff --git a/tests/diarization/request/lifecycle_tests.cpp b/tests/diarization/request/lifecycle_tests.cpp
index cdaecb55..29f1e436 100644
--- a/tests/diarization/request/lifecycle_tests.cpp
+++ b/tests/diarization/request/lifecycle_tests.cpp
@@ -15,6 +15,7 @@
 #include "emel/diarization/request/sm.hpp"
 #include "emel/error/error.hpp"
 #include "emel/model/sortformer/detail.hpp"
+#include "../../kernel/test_helpers.hpp"
 
 namespace {
 
@@ -91,7 +92,7 @@ struct request_model_fixture {
       std::vector<float>(static_cast<size_t>(feature_extractor_detail::k_window_length), 1.0f);
 
   request_model_fixture() {
-    std::memset(&model, 0, sizeof(model));
+    emel::tests::reset_model_data(model);
 
     for (int32_t mel = 0; mel < feature_extractor_detail::k_feature_bin_count; ++mel) {
       for (int32_t bin = 0; bin < feature_extractor_detail::k_fft_bin_count; ++bin) {
diff --git a/tests/diarization/sortformer/encoder/lifecycle_tests.cpp b/tests/diarization/sortformer/encoder/lifecycle_tests.cpp
index 97c4a0f1..e16df313 100644
--- a/tests/diarization/sortformer/encoder/lifecycle_tests.cpp
+++ b/tests/diarization/sortformer/encoder/lifecycle_tests.cpp
@@ -19,6 +19,7 @@
 #include "emel/kernel/aarch64/actions.hpp"
 #include "emel/model/data.hpp"
 #include "emel/model/sortformer/detail.hpp"
+#include "../../../kernel/test_helpers.hpp"
 
 namespace {
 
@@ -29,6 +30,11 @@ namespace modules_detail = emel::diarization::sortformer::modules::detail;
 namespace sortformer_detail = emel::diarization::sortformer::detail;
 namespace transformer_detail = emel::diarization::sortformer::transformer::detail;
 
+emel::kernel::sm & test_kernel() {
+  static emel::kernel::sm kernel{emel::kernel::detect_host_kind()};
+  return kernel;
+}
+
 struct tensor_spec {
   std::string_view name = {};
   int32_t n_dims = 0;
@@ -154,7 +160,7 @@ size_t tensor_value_count(const tensor_spec & spec) noexcept {
 void build_encoder_model(emel::model::data & model,
                          const bool include_all_tensors,
                          const bool valid_shapes) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
 
   for (const auto & spec : k_pre_specs) {
     append_tensor(model, spec);
@@ -203,7 +209,7 @@ struct pre_encoder_fixture {
   std::vector<std::vector<float>> layer_storage = {};
 
   pre_encoder_fixture() {
-    std::memset(&model, 0, sizeof(model));
+    emel::tests::reset_model_data(model);
     layer_storage.reserve(static_cast<size_t>(encoder_detail::k_layer_count *
                                               encoder_detail::k_layer_tensor_count));
 
@@ -326,7 +332,7 @@ TEST_CASE("sortformer encoder kernels are deterministic") {
   const std::array<float, 2> dense_bias{0.25f, -0.75f};
   std::array<float, 2> dense_output{};
 
-  REQUIRE(sortformer_detail::compute_dense(dense_input, weights, dense_bias, dense_output));
+  REQUIRE(sortformer_detail::compute_dense(test_kernel(), dense_input, weights, dense_bias, dense_output));
   CHECK(dense_output[0] == doctest::Approx(-1.25f));
   CHECK(dense_output[1] == doctest::Approx(-2.0f));
 }
@@ -337,7 +343,7 @@ TEST_CASE("sortformer encoder dense kernel rejects invalid shapes") {
   const std::array<float, 2> bias{0.0f, 0.0f};
   std::array<float, 2> output{};
 
-  CHECK_FALSE(sortformer_detail::compute_dense(input, weights, bias, output));
+  CHECK_FALSE(sortformer_detail::compute_dense(test_kernel(), input, weights, bias, output));
 }
 
 TEST_CASE("sortformer dense batch helpers cover transposed prepared residual paths") {
@@ -372,7 +378,7 @@ TEST_CASE("sortformer dense batch helpers cover transposed prepared residual pat
 
   REQUIRE(sortformer_detail::prepare_dense_weight_cache(weights, input_dim, output_dim, cache));
   CHECK(sortformer_detail::prepare_dense_weight_cache(weights, input_dim, output_dim, cache));
-  REQUIRE(sortformer_detail::compute_dense_batch_prepared(input_rows,
+  REQUIRE(sortformer_detail::compute_dense_batch_prepared(test_kernel(), input_rows,
                                                           row_count,
                                                           input_dim,
                                                           weights,
@@ -382,7 +388,7 @@ TEST_CASE("sortformer dense batch helpers cover transposed prepared residual pat
                                                           transposed_input,
                                                           transposed_output,
                                                           prepared_output));
-  REQUIRE(sortformer_detail::compute_dense_batch_residual_prepared(input_rows,
+  REQUIRE(sortformer_detail::compute_dense_batch_residual_prepared(test_kernel(), input_rows,
                                                                    row_count,
                                                                    input_dim,
                                                                    weights,
@@ -401,7 +407,7 @@ TEST_CASE("sortformer dense batch helpers cover transposed prepared residual pat
                                                    row_count,
                                                    input_dim,
                                                    transposed_input));
-  REQUIRE(sortformer_detail::compute_dense_batch_from_transposed_scaled_residual_prepared(
+  REQUIRE(sortformer_detail::compute_dense_batch_from_transposed_scaled_residual_prepared(test_kernel(), 
       transposed_input,
       row_count,
       input_dim,
@@ -422,7 +428,7 @@ TEST_CASE("sortformer dense batch helpers cover transposed prepared residual pat
       0u,
       output_dim,
       cache));
-  CHECK_FALSE(sortformer_detail::compute_dense_batch_from_transposed_scaled_residual_prepared(
+  CHECK_FALSE(sortformer_detail::compute_dense_batch_from_transposed_scaled_residual_prepared(test_kernel(), 
       transposed_input,
       row_count,
       input_dim,
@@ -472,7 +478,7 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
   }
 
   REQUIRE(sortformer_detail::prepare_dense_weight_cache(weights, input_dim, output_dim, cache));
-  REQUIRE(sortformer_detail::compute_dense_batch(input_rows,
+  REQUIRE(sortformer_detail::compute_dense_batch(test_kernel(), input_rows,
                                                  row_count,
                                                  input_dim,
                                                  weights,
@@ -485,7 +491,7 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
                                                    row_count,
                                                    input_dim,
                                                    transposed_input));
-  REQUIRE(sortformer_detail::compute_dense_batch_to_transposed(input_rows,
+  REQUIRE(sortformer_detail::compute_dense_batch_to_transposed(test_kernel(), input_rows,
                                                                row_count,
                                                                input_dim,
                                                                weights,
@@ -493,7 +499,7 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
                                                                output_dim,
                                                                transposed_input,
                                                                transposed_output));
-  REQUIRE(sortformer_detail::compute_dense_batch_to_transposed_prepared(input_rows,
+  REQUIRE(sortformer_detail::compute_dense_batch_to_transposed_prepared(test_kernel(), input_rows,
                                                                         row_count,
                                                                         input_dim,
                                                                         weights,
@@ -502,7 +508,7 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
                                                                         output_dim,
                                                                         transposed_input,
                                                                         prepared_transposed_output));
-  REQUIRE(sortformer_detail::compute_dense_batch_from_transposed(transposed_input,
+  REQUIRE(sortformer_detail::compute_dense_batch_from_transposed(test_kernel(), transposed_input,
                                                                  row_count,
                                                                  input_dim,
                                                                  weights,
@@ -510,7 +516,7 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
                                                                  output_dim,
                                                                  from_transposed_scratch,
                                                                  from_transposed_output));
-  REQUIRE(sortformer_detail::compute_dense_batch_from_transposed_prepared(
+  REQUIRE(sortformer_detail::compute_dense_batch_from_transposed_prepared(test_kernel(), 
       transposed_input,
       row_count,
       input_dim,
@@ -520,7 +526,7 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
       output_dim,
       prepared_from_transposed_scratch,
       prepared_from_transposed_output));
-  REQUIRE(sortformer_detail::compute_dense_batch_without_bias(input_rows,
+  REQUIRE(sortformer_detail::compute_dense_batch_without_bias(test_kernel(), input_rows,
                                                              row_count,
                                                              input_dim,
                                                              weights,
@@ -528,7 +534,7 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
                                                              transposed_input,
                                                              without_bias_scratch,
                                                              without_bias_output));
-  REQUIRE(sortformer_detail::compute_dense_batch_without_bias_prepared(input_rows,
+  REQUIRE(sortformer_detail::compute_dense_batch_without_bias_prepared(test_kernel(), input_rows,
                                                                       row_count,
                                                                       input_dim,
                                                                       weights,
@@ -537,7 +543,7 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
                                                                       transposed_input,
                                                                       without_bias_prepared_scratch,
                                                                       without_bias_prepared_output));
-  REQUIRE(sortformer_detail::compute_dense_without_bias(
+  REQUIRE(sortformer_detail::compute_dense_without_bias(test_kernel(), 
       std::span<const float>{input_rows.data(), input_dim},
       weights,
       first_row_without_bias));
@@ -563,11 +569,11 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
                                                        0u,
                                                        input_dim,
                                                        transposed_input));
-  CHECK_FALSE(sortformer_detail::compute_dense_without_bias(
+  CHECK_FALSE(sortformer_detail::compute_dense_without_bias(test_kernel(), 
       std::span<const float>{},
       weights,
       first_row_without_bias));
-  CHECK_FALSE(sortformer_detail::compute_dense_batch_without_bias(input_rows,
+  CHECK_FALSE(sortformer_detail::compute_dense_batch_without_bias(test_kernel(), input_rows,
                                                                  row_count,
                                                                  input_dim,
                                                                  weights,
@@ -577,7 +583,7 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
                                                                  std::span<float>{
                                                                      without_bias_output.data(),
                                                                      without_bias_output.size() - 1u}));
-  CHECK_FALSE(sortformer_detail::compute_dense_batch_to_transposed(input_rows,
+  CHECK_FALSE(sortformer_detail::compute_dense_batch_to_transposed(test_kernel(), input_rows,
                                                                   row_count,
                                                                   input_dim,
                                                                   weights,
@@ -587,7 +593,7 @@ TEST_CASE("sortformer dense batch helpers cover unprepared transposed variants")
                                                                   std::span<float>{
                                                                       transposed_output.data(),
                                                                       transposed_output.size() - 1u}));
-  CHECK_FALSE(sortformer_detail::compute_dense_batch_from_transposed(transposed_input,
+  CHECK_FALSE(sortformer_detail::compute_dense_batch_from_transposed(test_kernel(), transposed_input,
                                                                     row_count,
                                                                     input_dim,
                                                                     weights,
diff --git a/tests/diarization/sortformer/modules/lifecycle_tests.cpp b/tests/diarization/sortformer/modules/lifecycle_tests.cpp
index 6c40e5e0..f694e52a 100644
--- a/tests/diarization/sortformer/modules/lifecycle_tests.cpp
+++ b/tests/diarization/sortformer/modules/lifecycle_tests.cpp
@@ -9,12 +9,18 @@
 #include "emel/diarization/sortformer/cache/detail.hpp"
 #include "emel/diarization/sortformer/modules/detail.hpp"
 #include "emel/model/data.hpp"
+#include "../../../kernel/test_helpers.hpp"
 
 namespace {
 
 namespace cache_detail = emel::diarization::sortformer::cache::detail;
 namespace modules_detail = emel::diarization::sortformer::modules::detail;
 
+emel::kernel::sm & test_kernel() {
+  static emel::kernel::sm kernel{emel::kernel::detect_host_kind()};
+  return kernel;
+}
+
 struct tensor_spec {
   std::string_view name = {};
   int32_t n_dims = 0;
@@ -56,7 +62,7 @@ void append_tensor(emel::model::data & model, const tensor_spec & spec) {
 void build_modules_model(emel::model::data & model,
                          const bool include_all_tensors,
                          const bool valid_shapes) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
   for (size_t index = 0u; index < k_specs.size(); ++index) {
     if (!include_all_tensors && index == k_specs.size() - 1u) {
       continue;
@@ -131,7 +137,7 @@ TEST_CASE("sortformer modules compute projection and speaker logits") {
   weights[1] = 0.25f;
   bias[0] = 0.125f;
 
-  REQUIRE(modules_detail::compute_encoder_projection(encoder_frame, weights, bias, hidden));
+  REQUIRE(modules_detail::compute_encoder_projection(test_kernel(), encoder_frame, weights, bias, hidden));
   CHECK(hidden[0] == doctest::Approx(0.875f));
 
   std::array<float, modules_detail::k_hidden_dim> cached_hidden = {};
diff --git a/tests/diarization/sortformer/output/lifecycle_tests.cpp b/tests/diarization/sortformer/output/lifecycle_tests.cpp
index f0c5ccb3..81a8d4f0 100644
--- a/tests/diarization/sortformer/output/lifecycle_tests.cpp
+++ b/tests/diarization/sortformer/output/lifecycle_tests.cpp
@@ -11,12 +11,18 @@
 #include "emel/diarization/sortformer/modules/detail.hpp"
 #include "emel/diarization/sortformer/output/detail.hpp"
 #include "emel/model/data.hpp"
+#include "../../../kernel/test_helpers.hpp"
 
 namespace {
 
 namespace modules_detail = emel::diarization::sortformer::modules::detail;
 namespace output_detail = emel::diarization::sortformer::output::detail;
 
+emel::kernel::sm & test_kernel() {
+  static emel::kernel::sm kernel{emel::kernel::detect_host_kind()};
+  return kernel;
+}
+
 struct tensor_spec {
   std::string_view name = {};
   int32_t n_dims = 0;
@@ -58,7 +64,7 @@ struct modules_fixture {
                          0.0f);
 
   modules_fixture() {
-    std::memset(&model, 0, sizeof(model));
+    emel::tests::reset_model_data(model);
     for (int32_t index = 0; index < modules_detail::k_hidden_dim; ++index) {
       frame_hidden[(static_cast<size_t>(index) * static_cast<size_t>(modules_detail::k_hidden_dim)) +
                    static_cast<size_t>(index)] = 1.0f;
@@ -126,7 +132,7 @@ TEST_CASE("sortformer output computes deterministic speaker probabilities") {
   std::vector<float> probabilities(static_cast<size_t>(
       output_detail::k_required_probability_value_count));
 
-  REQUIRE(output_detail::compute_speaker_probabilities(hidden_frames,
+  REQUIRE(output_detail::compute_speaker_probabilities(test_kernel(), hidden_frames,
                                                        contract,
                                                        probabilities));
   CHECK(probabilities[0] == doctest::Approx(0.5f));
@@ -135,7 +141,7 @@ TEST_CASE("sortformer output computes deterministic speaker probabilities") {
   CHECK(probabilities[3] == doctest::Approx(1.0f / (1.0f + std::exp(-2.0f))));
 
   std::vector<float> second(probabilities.size());
-  REQUIRE(output_detail::compute_speaker_probabilities(hidden_frames, contract, second));
+  REQUIRE(output_detail::compute_speaker_probabilities(test_kernel(), hidden_frames, contract, second));
   CHECK(second == probabilities);
 }
 
@@ -162,7 +168,7 @@ TEST_CASE("sortformer output uses non-aliased frame-hidden projection") {
   std::vector<float> probabilities(static_cast<size_t>(
       output_detail::k_required_probability_value_count), 0.0f);
 
-  REQUIRE(output_detail::compute_speaker_probabilities(hidden_frames,
+  REQUIRE(output_detail::compute_speaker_probabilities(test_kernel(), hidden_frames,
                                                        contract,
                                                        probabilities));
   CHECK(probabilities[0] == doctest::Approx(1.0f / (1.0f + std::exp(-0.75f))));
@@ -178,19 +184,19 @@ TEST_CASE("sortformer output rejects invalid probability inputs") {
       output_detail::k_required_hidden_value_count - 1), 0.0f);
   std::vector<float> probabilities(static_cast<size_t>(
       output_detail::k_required_probability_value_count));
-  CHECK_FALSE(output_detail::compute_speaker_probabilities(hidden_frames,
+  CHECK_FALSE(output_detail::compute_speaker_probabilities(test_kernel(), hidden_frames,
                                                           contract,
                                                           probabilities));
 
   hidden_frames.resize(static_cast<size_t>(output_detail::k_required_hidden_value_count));
   probabilities.resize(static_cast<size_t>(output_detail::k_required_probability_value_count - 1));
-  CHECK_FALSE(output_detail::compute_speaker_probabilities(hidden_frames,
+  CHECK_FALSE(output_detail::compute_speaker_probabilities(test_kernel(), hidden_frames,
                                                           contract,
                                                           probabilities));
 
   probabilities.resize(static_cast<size_t>(output_detail::k_required_probability_value_count));
   modules_detail::contract empty_contract = {};
-  CHECK_FALSE(output_detail::compute_speaker_probabilities(hidden_frames,
+  CHECK_FALSE(output_detail::compute_speaker_probabilities(test_kernel(), hidden_frames,
                                                           empty_contract,
                                                           probabilities));
 }
diff --git a/tests/diarization/sortformer/transformer/lifecycle_tests.cpp b/tests/diarization/sortformer/transformer/lifecycle_tests.cpp
index 81ab5830..13037ee9 100644
--- a/tests/diarization/sortformer/transformer/lifecycle_tests.cpp
+++ b/tests/diarization/sortformer/transformer/lifecycle_tests.cpp
@@ -10,6 +10,7 @@
 
 #include "emel/diarization/sortformer/transformer/detail.hpp"
 #include "emel/model/data.hpp"
+#include "../../../kernel/test_helpers.hpp"
 
 namespace {
 
@@ -80,7 +81,7 @@ void append_layer_tensor(emel::model::data & model,
 void build_transformer_model(emel::model::data & model,
                              const bool include_all_tensors,
                              const bool valid_shapes) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
 
   for (int32_t layer = 0; layer < transformer_detail::k_layer_count; ++layer) {
     for (size_t index = 0u; index < k_layer_specs.size(); ++index) {
diff --git a/tests/embeddings/te_fixture_data.hpp b/tests/embeddings/te_fixture_data.hpp
index 49902ea7..4e472a96 100644
--- a/tests/embeddings/te_fixture_data.hpp
+++ b/tests/embeddings/te_fixture_data.hpp
@@ -175,7 +175,7 @@ inline bool load_te_vocab_from_file(const std::filesystem::path & path,
     return false;
   }
 
-  std::memset(&vocab_out, 0, sizeof(vocab_out));
+  vocab_out = {};
   vocab_out.tokenizer_model_id = emel::model::data::tokenizer_model::WPM;
   vocab_out.tokenizer_pre_id = emel::model::data::tokenizer_pre::DEFAULT;
   std::strncpy(vocab_out.tokenizer_model_name.data(),
diff --git a/tests/embeddings/text_embedding_lane_tests.cpp b/tests/embeddings/text_embedding_lane_tests.cpp
index 1c1ed220..f0e2ddb0 100644
--- a/tests/embeddings/text_embedding_lane_tests.cpp
+++ b/tests/embeddings/text_embedding_lane_tests.cpp
@@ -39,7 +39,7 @@ struct fake_tokenizer_dispatch_state {
   int32_t bind_error = emel::text::tokenizer::error_code(emel::text::tokenizer::error::none);
   bool tokenize_accept = true;
   int32_t tokenize_error = emel::text::tokenizer::error_code(emel::text::tokenizer::error::none);
-  std::array<int32_t, 4> token_ids = {};
+  std::array<int32_t, 32> token_ids = {};
   int32_t token_count = 1;
   bool saw_bind = false;
   bool saw_tokenize = false;
@@ -75,6 +75,53 @@ bool fake_tokenizer_tokenize_dispatch(
   return true;
 }
 
+void use_minimal_te_tokens(fake_tokenizer_dispatch_state & state,
+                           const emel::model::data::vocab & vocab) noexcept {
+  state.token_ids[0] = vocab.cls_id;
+  state.token_ids[1] = vocab.sep_id;
+  state.token_count = 2;
+}
+
+bool use_real_te_tokens(fake_tokenizer_dispatch_state & state,
+                        const emel::model::data::vocab & vocab,
+                        const std::string_view text) {
+  emel::text::tokenizer::sm tokenizer{};
+  int32_t bind_error =
+      emel::text::tokenizer::error_code(emel::text::tokenizer::error::none);
+  emel::text::tokenizer::event::bind bind{
+    &vocab,
+    emel::text::tokenizer::preprocessor::preprocessor_kind::wpm,
+    emel::text::encoders::encoder_kind::wpm,
+    &bind_error,
+  };
+  if (!tokenizer.process_event(bind) ||
+      bind_error !=
+          emel::text::tokenizer::error_code(emel::text::tokenizer::error::none)) {
+    return false;
+  }
+
+  int32_t tokenize_error =
+      emel::text::tokenizer::error_code(emel::text::tokenizer::error::none);
+  int32_t token_count = 0;
+  emel::text::tokenizer::event::tokenize tokenize{};
+  tokenize.vocab = &vocab;
+  tokenize.text = text;
+  tokenize.add_special = true;
+  tokenize.parse_special = false;
+  tokenize.token_ids_out = state.token_ids.data();
+  tokenize.token_capacity = static_cast<int32_t>(state.token_ids.size());
+  tokenize.token_count_out = &token_count;
+  tokenize.error_out = &tokenize_error;
+  if (!tokenizer.process_event(tokenize) ||
+      tokenize_error !=
+          emel::text::tokenizer::error_code(emel::text::tokenizer::error::none) ||
+      token_count <= 0) {
+    return false;
+  }
+  state.token_count = token_count;
+  return true;
+}
+
 enum class fake_formatter_mode : uint8_t {
   pass_through = 0u,
   invalid_request = 1u,
@@ -394,7 +441,10 @@ TEST_CASE("embeddings text lane supports TE matryoshka truncation when fixture p
     request.truncate_dimension = dimension;
     request.error_out = &embed_error;
 
-    REQUIRE(embedding_generator.process_event(request));
+    const bool embed_accepted = embedding_generator.process_event(request);
+    CAPTURE(static_cast<int>(embed_error));
+    CAPTURE(output_dimension);
+    REQUIRE(embed_accepted);
     CHECK(embed_error == emel::error::cast(emel::embeddings::generator::error::none));
     CHECK(output_dimension == dimension);
     CHECK(l2_norm(std::span<const float>{output.data(), static_cast<size_t>(dimension)}) ==
@@ -493,7 +543,7 @@ TEST_CASE("embeddings generator helper paths cover tensor binding callbacks and
   CHECK_FALSE(embedding_detail::publish_embedding(context, 0, truncated_output));
 
   fake_tokenizer_dispatch_state fake_tokenizer = {};
-  fake_tokenizer.token_ids[0] = fixture.model->vocab_data.cls_id;
+  use_minimal_te_tokens(fake_tokenizer, fixture.model->vocab_data);
   emel::error::type initialize_error =
       emel::error::cast(emel::embeddings::generator::error::none);
   initialize_callback_probe initialize_probe = {};
@@ -582,12 +632,13 @@ TEST_CASE("embeddings generator numeric helpers handle valid and invalid inputs"
   };
 
   std::array<block_q8_0, 1> q8_scratch = {};
-  CHECK(embedding_detail::matmul_f32(f32_matrix, input, output));
+  emel::kernel::sm matmul_kernel{emel::kernel::detect_host_kind()};
+  CHECK(embedding_detail::matmul_f32(matmul_kernel, f32_matrix, input, output));
   CHECK(output[0] == doctest::Approx(6.0f));
   CHECK(output[1] == doctest::Approx(15.0f));
-  CHECK(embedding_detail::matmul(f32_matrix, input, q8_scratch, output));
+  CHECK(embedding_detail::matmul(matmul_kernel, f32_matrix, input, q8_scratch, output));
   CHECK_FALSE(embedding_detail::matmul_f32(
-      f32_matrix, std::span<const float>{input.data(), 2u}, output));
+      matmul_kernel, f32_matrix, std::span<const float>{input.data(), 2u}, output));
 
   std::array<uint16_t, 6> f16_weights = {{
       fp32_to_fp16(1.0f),
@@ -606,7 +657,7 @@ TEST_CASE("embeddings generator numeric helpers handle valid and invalid inputs"
   };
   CHECK(embedding_detail::matmul_f16(f16_matrix, input, output));
   CHECK(output[0] == doctest::Approx(6.0f).epsilon(1.0e-3f));
-  CHECK(embedding_detail::matmul(f16_matrix, input, q8_scratch, output));
+  CHECK(embedding_detail::matmul(matmul_kernel, f16_matrix, input, q8_scratch, output));
   CHECK_FALSE(embedding_detail::matmul_f16(
       f16_matrix, input, std::span<float>{output.data(), 1u}));
 
@@ -622,9 +673,9 @@ TEST_CASE("embeddings generator numeric helpers handle valid and invalid inputs"
     .row_bytes = sizeof(block_q8_0),
   };
   std::array<float, 1> q8_output = {};
-  CHECK(embedding_detail::matmul_q8_0(q8_matrix, q8_row, q8_scratch, q8_output));
+  CHECK(embedding_detail::matmul_q8_0(matmul_kernel, q8_matrix, q8_row, q8_scratch, q8_output));
   CHECK(q8_output[0] == doctest::Approx(static_cast<float>(QK8_0)).epsilon(1.0e-1f));
-  CHECK(embedding_detail::matmul(q8_matrix, q8_row, q8_scratch, q8_output));
+  CHECK(embedding_detail::matmul(matmul_kernel, q8_matrix, q8_row, q8_scratch, q8_output));
   std::array<float, QK8_0> dequantized = {};
   CHECK(embedding_detail::copy_embedding_row(q8_matrix, 0, dequantized));
   CHECK(dequantized[0] == doctest::Approx(1.0f).epsilon(2.0e-1f));
@@ -643,14 +694,30 @@ TEST_CASE("embeddings generator numeric helpers handle valid and invalid inputs"
     .row_bytes = sizeof(block_q5_0),
   };
   std::array<float, 1> q5_output = {};
-  CHECK(embedding_detail::matmul_q5_0(q5_matrix, q5_row, q8_scratch, q5_output));
+  CHECK(embedding_detail::matmul_q5_0(matmul_kernel, q5_matrix, q5_row, q8_scratch, q5_output));
   CHECK(q5_output[0] == doctest::Approx(static_cast<float>(QK5_0)).epsilon(3.0e-1f));
-  CHECK(embedding_detail::matmul(q5_matrix, q5_row, q8_scratch, q5_output));
+  CHECK(embedding_detail::matmul(matmul_kernel, q5_matrix, q5_row, q8_scratch, q5_output));
   std::array<float, QK5_0> q5_dequantized = {};
   CHECK(embedding_detail::copy_embedding_row(q5_matrix, 0, q5_dequantized));
   CHECK(q5_dequantized[0] == doctest::Approx(1.0f).epsilon(2.0e-1f));
   CHECK_FALSE(embedding_detail::copy_embedding_row(q5_matrix, 1, q5_dequantized));
 
+  // Pointwise conv routes each pixel through the kernel machine; a q8_0
+  // matrix whose column count is not block-aligned must fail the matmul and
+  // surface as a pointwise failure.
+  std::array<float, 8> pointwise_input = {};
+  std::array<float, 1> pointwise_output = {};
+  embedding_action::matrix_view misaligned_q8_matrix = {
+    .data = q8_row_storage.data(),
+    .dtype = dtype_q8_0,
+    .rows = 1,
+    .cols = 8,
+    .row_bytes = sizeof(block_q8_0),
+  };
+  CHECK_FALSE(embedding_detail::pointwise_conv_hwc(
+      matmul_kernel, misaligned_q8_matrix, pointwise_input.data(), 1,
+      pointwise_output.data()));
+
   std::array<float, 2> bias_data = {{0.5f, -0.5f}};
   embedding_action::vector_view bias_view = {
     .data = bias_data.data(),
@@ -703,6 +770,37 @@ TEST_CASE("embeddings generator numeric helpers handle valid and invalid inputs"
   embedding_detail::apply_gelu(normalize_values);
 }
 
+TEST_CASE("embeddings generator dense matrix matmul routes multi-column input through kernel") {
+  using emel::kernel::detail::dtype_f32;
+
+  // Weights are 2 rows x 3 cols: {1 2 3} and {4 5 6}.
+  std::array<float, 6> f32_weights = {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}};
+  embedding_action::matrix_view f32_matrix = {
+    .data = f32_weights.data(),
+    .dtype = dtype_f32,
+    .rows = 2,
+    .cols = 3,
+    .row_bytes = 3u * sizeof(float),
+  };
+
+  // Two input columns packed k-major: column 0 = (1, 1, 1), column 1 = (0, 1, 2).
+  constexpr int32_t input_cols = 2;
+  std::array<float, 6> input = {{1.0f, 0.0f, 1.0f, 1.0f, 1.0f, 2.0f}};
+  std::array<float, 4> output = {};
+  emel::kernel::sm matmul_kernel{emel::kernel::detect_host_kind()};
+  CHECK(embedding_detail::matmul_f32_matrix(
+      matmul_kernel, f32_matrix, input.data(), input_cols, output.data()));
+  CHECK(output[0] == doctest::Approx(6.0f));
+  CHECK(output[1] == doctest::Approx(8.0f));
+  CHECK(output[2] == doctest::Approx(15.0f));
+  CHECK(output[3] == doctest::Approx(17.0f));
+
+  CHECK_FALSE(embedding_detail::matmul_f32_matrix(
+      matmul_kernel, f32_matrix, nullptr, input_cols, output.data()));
+  CHECK_FALSE(embedding_detail::matmul_f32_matrix(
+      matmul_kernel, f32_matrix, input.data(), 0, output.data()));
+}
+
 TEST_CASE("embeddings generator state machine covers callback and prepare error branches when fixture present") {
   if (!te_assets_present()) {
     MESSAGE("skipping embedding state-machine coverage test because maintained assets are not present");
@@ -716,8 +814,8 @@ TEST_CASE("embeddings generator state machine covers callback and prepare error
 
   SUBCASE("initialize and embed done callbacks fire") {
     fake_tokenizer_dispatch_state fake_tokenizer = {};
-    fake_tokenizer.token_ids[0] = fixture.model->vocab_data.cls_id;
-    fake_tokenizer.token_count = 1;
+    REQUIRE(use_real_te_tokens(
+        fake_tokenizer, fixture.model->vocab_data, "callback coverage"));
     emel::text::conditioner::sm conditioner{};
     emel::embeddings::generator::sm embedding_generator{
       *fixture.model,
@@ -774,7 +872,11 @@ TEST_CASE("embeddings generator state machine covers callback and prepare error
             embed_callback_probe,
             &embed_callback_probe::on_error>(&embed_probe);
 
-    REQUIRE(embedding_generator.process_event(request));
+    const bool callback_embed_accepted =
+        embedding_generator.process_event(request);
+    CAPTURE(static_cast<int>(embed_error));
+    CAPTURE(output_dimension);
+    REQUIRE(callback_embed_accepted);
     CHECK(embed_error == emel::error::cast(emel::embeddings::generator::error::none));
     CHECK(embed_probe.done_called);
     CHECK_FALSE(embed_probe.error_called);
@@ -816,8 +918,8 @@ TEST_CASE("embeddings generator state machine covers callback and prepare error
 
   SUBCASE("unexpected embed before initialize rejects without trapping later requests") {
     fake_tokenizer_dispatch_state fake_tokenizer = {};
-    fake_tokenizer.token_ids[0] = fixture.model->vocab_data.cls_id;
-    fake_tokenizer.token_count = 1;
+    REQUIRE(use_real_te_tokens(
+        fake_tokenizer, fixture.model->vocab_data, "callback coverage"));
     emel::text::conditioner::sm conditioner{};
     emel::embeddings::generator::sm embedding_generator{
       *fixture.model,
diff --git a/tests/embeddings/vision_embedding_lane_tests.cpp b/tests/embeddings/vision_embedding_lane_tests.cpp
index 5c9254da..b2bbbfcb 100644
--- a/tests/embeddings/vision_embedding_lane_tests.cpp
+++ b/tests/embeddings/vision_embedding_lane_tests.cpp
@@ -92,7 +92,10 @@ TEST_CASE("embeddings vision lane returns normalized TE embeddings when fixture
     red_dimension,
   };
   red_request.error_out = &red_error;
-  REQUIRE(embedding_generator.process_event(red_request));
+  const bool red_accepted = embedding_generator.process_event(red_request);
+  CAPTURE(static_cast<int>(red_error));
+  CAPTURE(red_dimension);
+  REQUIRE(red_accepted);
   CHECK(red_error == emel::error::cast(emel::embeddings::generator::error::none));
   CHECK(red_dimension == 1280);
   CHECK(l2_norm(std::span<const float>{
@@ -311,6 +314,32 @@ TEST_CASE("embeddings vision helper paths cover image request callbacks and vali
   CHECK(probe.request == &request);
 }
 
+TEST_CASE("embeddings vision pointwise direct path rejects missing lane buffers") {
+  // The direct pointwise path needs its platform lane buffer bound by
+  // bind_pointwise_f16 (packed rhs on aarch64, the scalar transpose
+  // elsewhere); an otherwise valid view without one must be rejected
+  // before any lane memory is read.
+  emel::embeddings::generator::action::matrix_view matrix = {};
+  matrix.dtype = static_cast<uint8_t>(emel::kernel::detail::dtype_f32);
+  matrix.rows = 4;
+  matrix.cols = 4;
+
+  std::array<float, 4> input = {};
+  std::array<float, 4> output = {};
+  CHECK_FALSE(embedding_detail::pointwise_conv_hwc_direct_f32(
+      matrix, input.data(), 1, output.data()));
+  CHECK_FALSE(embedding_detail::pointwise_conv_hwc_direct_f32(
+      matrix, nullptr, 1, output.data()));
+
+  // The lane-buffer check fires before any batch-norm validation, so the
+  // fused variants must reject the same view in every instantiation.
+  emel::embeddings::generator::action::batch_norm_view batch_norm = {};
+  CHECK_FALSE(embedding_detail::pointwise_conv_hwc_direct_f32_bn<false>(
+      matrix, input.data(), 1, batch_norm, output.data()));
+  CHECK_FALSE(embedding_detail::pointwise_conv_hwc_direct_f32_bn<true>(
+      matrix, input.data(), 1, batch_norm, output.data()));
+}
+
 TEST_CASE("embeddings vision pointwise direct path matches scalar pointwise reference") {
 #if !(defined(__aarch64__) || defined(__ARM_NEON))
   return;
diff --git a/tests/graph/assembler/assembler_tests.cpp b/tests/graph/assembler/assembler_tests.cpp
index 001dc038..973beb37 100644
--- a/tests/graph/assembler/assembler_tests.cpp
+++ b/tests/graph/assembler/assembler_tests.cpp
@@ -15,6 +15,7 @@
 #include "emel/model/detail.hpp"
 #include "emel/model/llama/detail.hpp"
 #include "emel/model/loader/errors.hpp"
+#include "../../kernel/test_helpers.hpp"
 
 namespace {
 
@@ -117,7 +118,7 @@ void append_tensor_name(emel::model::data & model,
 }
 
 void build_canonical_model(emel::model::data & model, const int32_t block_count) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
   copy_architecture(model.architecture_name, "llama");
   model.n_layers = block_count;
   model.params.n_embd = 64;
diff --git a/tests/graph/graph_tests.cpp b/tests/graph/graph_tests.cpp
index 83adbd24..df5294ab 100644
--- a/tests/graph/graph_tests.cpp
+++ b/tests/graph/graph_tests.cpp
@@ -22,6 +22,7 @@
 #include "emel/model/loader/errors.hpp"
 #include "emel/graph/tensor/errors.hpp"
 #include "emel/graph/tensor/events.hpp"
+#include "../kernel/test_helpers.hpp"
 
 void * operator new(const std::size_t size) {
   if (emel::test::allocation::g_track_allocations.load(std::memory_order_relaxed)) {
@@ -65,6 +66,14 @@ void operator delete[](void * ptr) noexcept {
   std::free(ptr);
 }
 
+void operator delete(void * ptr, std::size_t) noexcept {
+  std::free(ptr);
+}
+
+void operator delete[](void * ptr, std::size_t) noexcept {
+  std::free(ptr);
+}
+
 namespace {
 
 using execute_t = emel::graph::processor::event::execute;
@@ -269,7 +278,7 @@ void append_tensor_name(emel::model::data & model,
 }
 
 void build_canonical_model(emel::model::data & model, const int32_t block_count) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
   copy_architecture(model.architecture_name, "llama");
   model.n_layers = block_count;
   model.params.n_embd = 64;
@@ -374,6 +383,237 @@ TEST_CASE("graph_machine_reserve_then_compute_success_path") {
   CHECK(tensor_state.lifecycle_state == emel::graph::tensor::event::lifecycle::empty);
 }
 
+TEST_CASE("graph_machine_reserved_compute_uses_existing_reservation") {
+  emel::graph::sm machine{};
+  lifecycle_fixture lifecycle{};
+
+  emel::graph::event::reserve_output reserve_output{};
+  reserve_callbacks reserve_cb{};
+  REQUIRE(machine.process_event(emel::graph::event::reserve{
+    .model_topology = reinterpret_cast<const void *>(0xA5),
+    .output_out = &reserve_output,
+    .lifecycle = &lifecycle.reserve,
+    .max_node_count = 4u,
+    .max_tensor_count = 5u,
+    .bytes_per_tensor = 8u,
+    .workspace_capacity_bytes = 64u,
+    .dispatch_done = {&reserve_cb, reserve_callbacks::on_done},
+    .dispatch_error = {&reserve_cb, reserve_callbacks::on_error},
+  }));
+  REQUIRE(reserve_cb.done_called);
+
+  g_kernel_calls = 0;
+  emel::graph::event::compute_output compute_output{};
+  compute_callbacks compute_cb{};
+  const emel::graph::event::compute compute_request{
+    .step_plan = reinterpret_cast<const void *>(0xB8),
+    .output_out = &compute_output,
+    .lifecycle = &lifecycle.compute,
+    .node_count_hint = 0u,
+    .tensor_count_hint = 0u,
+    .bytes_per_tensor = 0u,
+    .workspace_capacity_bytes = 0u,
+    .step_index = 0,
+    .step_size = 1,
+    .kv_tokens = 1,
+    .expected_outputs = 1,
+    .validate = validate_ok,
+    .prepare_graph = prepare_graph_reuse,
+    .alloc_graph = alloc_graph_ok,
+    .bind_inputs = bind_inputs_ok,
+    .run_kernel = run_kernel_counting,
+    .extract_outputs = extract_outputs_ok,
+    .dispatch_done = {&compute_cb, compute_callbacks::on_done},
+    .dispatch_error = {&compute_cb, compute_callbacks::on_error},
+  };
+
+  CHECK(machine.process_event(emel::graph::event::compute_reserved{compute_request}));
+  CHECK(compute_cb.done_called);
+  CHECK_FALSE(compute_cb.error_called);
+  CHECK(compute_output.graph_topology == reserve_output.graph_topology);
+  CHECK(compute_output.node_count == reserve_output.node_count);
+  CHECK(compute_output.tensor_count == reserve_output.tensor_count);
+  CHECK(compute_output.required_buffer_bytes == reserve_output.required_buffer_bytes);
+  CHECK(compute_output.version == reserve_output.version);
+  CHECK(compute_output.reused_topology == 1u);
+  CHECK(compute_output.outputs_produced == 1);
+  CHECK(compute_output.graph_reused == 1u);
+  CHECK(compute_output.lifecycle == &lifecycle.compute);
+  CHECK(g_kernel_calls == 1);
+}
+
+TEST_CASE("graph_machine_rejects_reserved_compute_without_prepare_graph") {
+  emel::graph::sm machine{};
+  lifecycle_fixture lifecycle{};
+
+  emel::graph::event::reserve_output reserve_output{};
+  reserve_callbacks reserve_cb{};
+  REQUIRE(machine.process_event(emel::graph::event::reserve{
+    .model_topology = reinterpret_cast<const void *>(0xA5),
+    .output_out = &reserve_output,
+    .lifecycle = &lifecycle.reserve,
+    .max_node_count = 4u,
+    .max_tensor_count = 5u,
+    .bytes_per_tensor = 8u,
+    .workspace_capacity_bytes = 64u,
+    .dispatch_done = {&reserve_cb, reserve_callbacks::on_done},
+    .dispatch_error = {&reserve_cb, reserve_callbacks::on_error},
+  }));
+  REQUIRE(reserve_cb.done_called);
+
+  g_kernel_calls = 0;
+  emel::graph::event::compute_output compute_output{
+    .graph_topology = reinterpret_cast<const void *>(0xFE),
+    .node_count = 9u,
+    .tensor_count = 9u,
+    .required_buffer_bytes = 9u,
+    .version = 9u,
+    .reused_topology = 1u,
+    .outputs_produced = 9,
+    .graph_reused = 1u,
+  };
+  compute_callbacks compute_cb{};
+  const emel::graph::event::compute compute_request{
+    .step_plan = reinterpret_cast<const void *>(0xBA),
+    .output_out = &compute_output,
+    .lifecycle = &lifecycle.compute,
+    .node_count_hint = 0u,
+    .tensor_count_hint = 0u,
+    .bytes_per_tensor = 0u,
+    .workspace_capacity_bytes = 0u,
+    .step_index = 0,
+    .step_size = 1,
+    .kv_tokens = 1,
+    .expected_outputs = 1,
+    .validate = validate_ok,
+    .prepare_graph = nullptr,
+    .alloc_graph = alloc_graph_ok,
+    .bind_inputs = bind_inputs_ok,
+    .run_kernel = run_kernel_counting,
+    .extract_outputs = extract_outputs_ok,
+    .dispatch_done = {&compute_cb, compute_callbacks::on_done},
+    .dispatch_error = {&compute_cb, compute_callbacks::on_error},
+  };
+
+  CHECK_FALSE(machine.process_event(emel::graph::event::compute_reserved{compute_request}));
+  CHECK_FALSE(compute_cb.done_called);
+  CHECK(compute_cb.error_called);
+  CHECK(compute_cb.error_code ==
+        static_cast<int32_t>(emel::error::cast(emel::graph::error::invalid_request)));
+  CHECK(compute_output.graph_topology == nullptr);
+  CHECK(compute_output.node_count == 0u);
+  CHECK(compute_output.tensor_count == 0u);
+  CHECK(compute_output.required_buffer_bytes == 0u);
+  CHECK(compute_output.version == 0u);
+  CHECK(compute_output.reused_topology == 0u);
+  CHECK(compute_output.outputs_produced == 0);
+  CHECK(compute_output.graph_reused == 0u);
+  CHECK(g_kernel_calls == 0);
+}
+
+TEST_CASE("graph_machine_rejects_reserved_compute_with_mismatched_lifecycle") {
+  emel::graph::sm machine{};
+  lifecycle_fixture lifecycle{};
+  lifecycle_fixture other_lifecycle{};
+
+  emel::graph::event::reserve_output reserve_output{};
+  reserve_callbacks reserve_cb{};
+  REQUIRE(machine.process_event(emel::graph::event::reserve{
+    .model_topology = reinterpret_cast<const void *>(0xA5),
+    .output_out = &reserve_output,
+    .lifecycle = &lifecycle.reserve,
+    .max_node_count = 4u,
+    .max_tensor_count = 5u,
+    .bytes_per_tensor = 8u,
+    .workspace_capacity_bytes = 64u,
+    .dispatch_done = {&reserve_cb, reserve_callbacks::on_done},
+    .dispatch_error = {&reserve_cb, reserve_callbacks::on_error},
+  }));
+  REQUIRE(reserve_cb.done_called);
+
+  g_kernel_calls = 0;
+  emel::graph::event::compute_output compute_output{};
+  compute_callbacks compute_cb{};
+  const emel::graph::event::compute compute_request{
+    .step_plan = reinterpret_cast<const void *>(0xB8),
+    .output_out = &compute_output,
+    .lifecycle = &other_lifecycle.compute,
+    .node_count_hint = 0u,
+    .tensor_count_hint = 0u,
+    .bytes_per_tensor = 0u,
+    .workspace_capacity_bytes = 0u,
+    .step_index = 0,
+    .step_size = 1,
+    .kv_tokens = 1,
+    .expected_outputs = 1,
+    .validate = validate_ok,
+    .prepare_graph = prepare_graph_reuse,
+    .alloc_graph = alloc_graph_ok,
+    .bind_inputs = bind_inputs_ok,
+    .run_kernel = run_kernel_counting,
+    .extract_outputs = extract_outputs_ok,
+    .dispatch_done = {&compute_cb, compute_callbacks::on_done},
+    .dispatch_error = {&compute_cb, compute_callbacks::on_error},
+  };
+
+  CHECK_FALSE(machine.process_event(emel::graph::event::compute_reserved{compute_request}));
+  CHECK_FALSE(compute_cb.done_called);
+  CHECK(compute_cb.error_called);
+  CHECK(compute_cb.error_code ==
+        static_cast<int32_t>(emel::error::cast(emel::graph::error::invalid_request)));
+  CHECK(g_kernel_calls == 0);
+}
+
+TEST_CASE("graph_machine_rejects_reserved_compute_before_reserve") {
+  emel::graph::sm machine{};
+  lifecycle_fixture lifecycle{};
+
+  g_kernel_calls = 0;
+  emel::graph::event::compute_output compute_output{
+    .graph_topology = reinterpret_cast<const void *>(0xFE),
+    .node_count = 9u,
+    .tensor_count = 9u,
+    .required_buffer_bytes = 9u,
+    .version = 9u,
+    .reused_topology = 1u,
+    .outputs_produced = 9,
+    .graph_reused = 1u,
+  };
+  compute_callbacks compute_cb{};
+  const emel::graph::event::compute compute_request{
+    .step_plan = reinterpret_cast<const void *>(0xB9),
+    .output_out = &compute_output,
+    .lifecycle = &lifecycle.compute,
+    .step_index = 0,
+    .step_size = 1,
+    .kv_tokens = 1,
+    .expected_outputs = 1,
+    .validate = validate_ok,
+    .prepare_graph = prepare_graph_reuse,
+    .alloc_graph = alloc_graph_ok,
+    .bind_inputs = bind_inputs_ok,
+    .run_kernel = run_kernel_counting,
+    .extract_outputs = extract_outputs_ok,
+    .dispatch_done = {&compute_cb, compute_callbacks::on_done},
+    .dispatch_error = {&compute_cb, compute_callbacks::on_error},
+  };
+
+  CHECK_FALSE(machine.process_event(emel::graph::event::compute_reserved{compute_request}));
+  CHECK_FALSE(compute_cb.done_called);
+  CHECK(compute_cb.error_called);
+  CHECK(compute_cb.error_code ==
+        static_cast<int32_t>(emel::error::cast(emel::graph::error::invalid_request)));
+  CHECK(compute_output.graph_topology == nullptr);
+  CHECK(compute_output.node_count == 0u);
+  CHECK(compute_output.tensor_count == 0u);
+  CHECK(compute_output.required_buffer_bytes == 0u);
+  CHECK(compute_output.version == 0u);
+  CHECK(compute_output.reused_topology == 0u);
+  CHECK(compute_output.outputs_produced == 0);
+  CHECK(compute_output.graph_reused == 0u);
+  CHECK(g_kernel_calls == 0);
+}
+
 TEST_CASE("graph_machine_blocks_kernel_until_required_tensors_are_filled") {
   emel::graph::sm machine{};
   lifecycle_fixture lifecycle{};
diff --git a/tests/graph/processor/processor_action_branch_tests.cpp b/tests/graph/processor/processor_action_branch_tests.cpp
index a835a50d..6c55f4d6 100644
--- a/tests/graph/processor/processor_action_branch_tests.cpp
+++ b/tests/graph/processor/processor_action_branch_tests.cpp
@@ -470,6 +470,26 @@ TEST_CASE("graph_processor_releases_publish_targets_after_extract_failure") {
   CHECK(second_output.outputs_produced == 2);
 }
 
+TEST_CASE("graph_processor_process_event_async_execute_completes_in_rtc") {
+  namespace event = emel::graph::processor::event;
+
+  lifecycle_fixture lifecycle{};
+  emel::graph::processor::sm machine{};
+
+  dispatch_state dispatch{};
+  event::execution_output output{};
+  const auto request = make_valid_execute(&output, &dispatch, lifecycle);
+
+  emel::bool_task task = machine.process_event_async(request);
+
+  CHECK(task.result());
+  CHECK(dispatch.done_called);
+  CHECK_FALSE(dispatch.error_called);
+  CHECK(output.outputs_produced == 2);
+  CHECK(output.graph_reused == 1u);
+  CHECK(output.lifecycle == &lifecycle.manifest);
+}
+
 TEST_CASE("graph_processor_step_action_and_guard_branches") {
   namespace event = emel::graph::processor::event;
 
diff --git a/tests/kernel/aarch64_tests.cpp b/tests/kernel/aarch64_tests.cpp
index be2e2c00..c2ffcaef 100644
--- a/tests/kernel/aarch64_tests.cpp
+++ b/tests/kernel/aarch64_tests.cpp
@@ -58,13 +58,11 @@ TEST_CASE("kernel_aarch64_numeric_paths") {
       .src0 = make_src(lhs, dtype::f32, 4),
       .src1 = make_src(rhs, dtype::f32, 4),
       .dst = make_dst(out_add, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul mul_ev{
       .src0 = make_src(lhs, dtype::f32, 4),
       .src1 = make_src(rhs, dtype::f32, 4),
       .dst = make_dst(out_mul, dtype::f32, 4),
-      .nth = 1,
   };
 
   aarch64_sm machine{emel::kernel::aarch64::action::context{false, {}, 0}};
@@ -103,7 +101,6 @@ TEST_CASE("kernel_aarch64_scalar_path_honors_strides") {
       .src0 = lhs,
       .src1 = rhs,
       .dst = dst,
-      .nth = 1,
   };
 
   aarch64_sm machine{emel::kernel::aarch64::action::context{false, {}, 0}};
@@ -115,26 +112,6 @@ TEST_CASE("kernel_aarch64_scalar_path_honors_strides") {
   CHECK(dst_storage[6] == doctest::Approx(44.0f));
 }
 
-TEST_CASE("kernel_aarch64_rejects_non_single_thread_dispatch") {
-  float lhs[4] = {1.0f, 2.0f, 3.0f, 4.0f};
-  float rhs[4] = {5.0f, 6.0f, 7.0f, 8.0f};
-  float out[4] = {};
-
-  emel::kernel::event::op_add invalid_nth{
-      .src0 = make_src(lhs, dtype::f32, 4),
-      .src1 = make_src(rhs, dtype::f32, 4),
-      .dst = make_dst(out, dtype::f32, 4),
-      .nth = 2,
-  };
-  emel::kernel::event::op_add invalid_ith = invalid_nth;
-  invalid_ith.nth = 1;
-  invalid_ith.ith = 1;
-
-  aarch64_sm machine{};
-  CHECK_FALSE(machine.process_event(invalid_nth));
-  CHECK_FALSE(machine.process_event(invalid_ith));
-}
-
 TEST_CASE("kernel_aarch64_forced_neon_context_path") {
   float lhs[4] = {2.0f, 4.0f, 6.0f, 8.0f};
   float rhs[4] = {1.0f, 3.0f, 5.0f, 7.0f};
@@ -144,7 +121,6 @@ TEST_CASE("kernel_aarch64_forced_neon_context_path") {
       .src0 = make_src(lhs, dtype::f32, 4),
       .src1 = make_src(rhs, dtype::f32, 4),
       .dst = make_dst(out, dtype::f32, 4),
-      .nth = 1,
   };
 
   aarch64_sm machine{emel::kernel::aarch64::action::context{true, {}, 0}};
@@ -181,13 +157,11 @@ TEST_CASE("kernel_aarch64_mul_mat_simd_matches_scalar_tiled_edges") {
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_simd.data(), dtype::f32, n, m),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat scalar_ev{
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_scalar.data(), dtype::f32, n, m),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::aarch64::detail::execute_neon_mul_mat(simd_ev));
@@ -229,13 +203,11 @@ TEST_CASE("kernel_aarch64_mul_mat_tail_resets_nan_dst_on_first_depth_block") {
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_simd.data(), dtype::f32, n, m),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat scalar_ev{
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_scalar.data(), dtype::f32, n, m),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::aarch64::detail::execute_neon_mul_mat(simd_ev));
@@ -275,13 +247,11 @@ TEST_CASE("kernel_aarch64_mul_mat_simd_matches_scalar_when_matrix_has_eight_rows
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_simd.data(), dtype::f32, n, m),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat scalar_ev{
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_scalar.data(), dtype::f32, n, m),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::aarch64::detail::execute_neon_mul_mat(simd_ev));
@@ -332,13 +302,11 @@ TEST_CASE("kernel_aarch64_prepared_f32_lhs_4row_matches_scalar_with_depth_and_ta
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_prepared.data(), dtype::f32, n, m),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat scalar_ev{
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_scalar.data(), dtype::f32, n, m),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::aarch64::detail::execute_neon_mul_mat_prepared_f32_lhs_4row(
@@ -393,13 +361,11 @@ TEST_CASE("kernel_aarch64_prepared_f32_lhs_4row_accumulates_twelve_column_depth_
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_prepared.data(), dtype::f32, n, m),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat scalar_ev{
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_scalar.data(), dtype::f32, n, m),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::aarch64::detail::execute_neon_mul_mat_prepared_f32_lhs_4row(
@@ -449,7 +415,6 @@ TEST_CASE("kernel_aarch64_prepared_f32_lhs_4row_rejects_invalid_contract") {
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst.data(), dtype::f32, n, m),
-      .nth = 1,
   };
 
   CHECK_FALSE(emel::kernel::aarch64::detail::execute_neon_mul_mat_prepared_f32_lhs_4row(
@@ -711,13 +676,11 @@ TEST_CASE("kernel_aarch64_quantized_mul_mat_simd_matches_scalar") {
         .src0 = make_quantized_src(&block, type, QK_K, 1),
         .src1 = make_src(src1.data(), dtype::f32, 1, QK_K),
         .dst = make_dst(dst_simd.data(), dtype::f32, 1, 1),
-        .nth = 1,
     };
     const emel::kernel::event::op_mul_mat scalar_ev{
         .src0 = make_quantized_src(&block, type, QK_K, 1),
         .src1 = make_src(src1.data(), dtype::f32, 1, QK_K),
         .dst = make_dst(dst_scalar.data(), dtype::f32, 1, 1),
-        .nth = 1,
     };
 
     CHECK(emel::kernel::aarch64::detail::can_use_neon(simd_ev, true));
@@ -779,7 +742,6 @@ TEST_CASE("kernel_aarch64_q4_0_vector_route_is_explicit_and_numeric_match") {
       .src0 = make_quantized_src(q4_rows.data(), dtype::q4_0, QK4_0, row_count),
       .src1 = make_src(input.data(), dtype::f32, 1u, QK4_0),
       .dst = make_dst(simd_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::aarch64::detail::can_use_neon_mul_mat_q4_0_vector(ev, true));
@@ -843,7 +805,6 @@ TEST_CASE("kernel_aarch64_q4_1_vector_route_is_explicit_and_numeric_match") {
       .src0 = make_quantized_src(q4_rows.data(), dtype::q4_1, QK4_1, row_count),
       .src1 = make_src(input.data(), dtype::f32, 1u, QK4_1),
       .dst = make_dst(simd_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::aarch64::detail::can_use_neon_mul_mat_q4_1_vector(ev, true));
@@ -907,7 +868,6 @@ TEST_CASE("kernel_aarch64_q5_0_vector_route_is_explicit_and_numeric_match") {
       .src0 = make_quantized_src(q5_rows.data(), dtype::q5_0, QK5_0, row_count),
       .src1 = make_src(input.data(), dtype::f32, 1u, QK5_0),
       .dst = make_dst(simd_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::aarch64::detail::can_use_neon_mul_mat_q5_0_vector(ev, true));
@@ -981,7 +941,6 @@ TEST_CASE("kernel_aarch64_q4_q5_vector_unchecked_paths_match_scalar_with_tail_ro
                                    row_count),
         .src1 = make_src(input.data(), dtype::f32, 1u, QK4_0),
         .dst = make_dst(optimized.data(), dtype::f32, 1u, row_count),
-        .nth = 1,
     };
 
     emel::kernel::aarch64::detail::execute_neon_mul_mat_q4_0_vector_unchecked(
@@ -1024,7 +983,6 @@ TEST_CASE("kernel_aarch64_q4_q5_vector_unchecked_paths_match_scalar_with_tail_ro
                                    row_count),
         .src1 = make_src(input.data(), dtype::f32, 1u, QK4_1),
         .dst = make_dst(optimized.data(), dtype::f32, 1u, row_count),
-        .nth = 1,
     };
 
     emel::kernel::aarch64::detail::execute_neon_mul_mat_q4_1_vector_unchecked(
@@ -1067,7 +1025,6 @@ TEST_CASE("kernel_aarch64_q4_q5_vector_unchecked_paths_match_scalar_with_tail_ro
                                    row_count),
         .src1 = make_src(input.data(), dtype::f32, 1u, QK5_0),
         .dst = make_dst(optimized.data(), dtype::f32, 1u, row_count),
-        .nth = 1,
     };
 
     emel::kernel::aarch64::detail::execute_neon_mul_mat_q5_0_vector_unchecked(
@@ -1307,7 +1264,7 @@ TEST_CASE("kernel_aarch64_q6_4rows_neon_matches_scalar") {
 
 TEST_CASE("kernel_aarch64_q4_k_row_neon_matches_scalar") {
 #if !(defined(__aarch64__) || defined(__ARM_NEON))
-  SUCCEED();
+  CHECK(true);
   return;
 #else
   using emel::kernel::detail::quant::QK_K;
@@ -1346,7 +1303,7 @@ TEST_CASE("kernel_aarch64_q4_k_row_neon_matches_scalar") {
 
 TEST_CASE("kernel_aarch64_q4_k_2rows_neon_matches_scalar") {
 #if !(defined(__aarch64__) || defined(__ARM_NEON))
-  SUCCEED();
+  CHECK(true);
   return;
 #else
   using emel::kernel::detail::quant::QK_K;
@@ -1432,13 +1389,11 @@ TEST_CASE("kernel_aarch64_sm_reports_q2_vectorized_dispatch_at_kernel_seam") {
       .src0 = make_quantized_src(&q2, dtype::q2_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q2_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat q3_ev{
       .src0 = make_quantized_src(&q3, dtype::q3_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q3_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -1496,13 +1451,11 @@ TEST_CASE("kernel_aarch64_sm_reports_q3_vectorized_dispatch_at_kernel_seam") {
       .src0 = make_quantized_src(&q2, dtype::q2_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q2_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat q3_ev{
       .src0 = make_quantized_src(&q3, dtype::q3_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q3_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -1574,19 +1527,16 @@ TEST_CASE("kernel_aarch64_sm_reports_q6_vectorized_dispatch_at_kernel_seam") {
       .src0 = make_quantized_src(&q2, dtype::q2_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q2_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat q3_ev{
       .src0 = make_quantized_src(&q3, dtype::q3_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q3_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat q6_ev{
       .src0 = make_quantized_src(&q6, dtype::q6_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q6_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -1624,7 +1574,6 @@ TEST_CASE("kernel_aarch64_sm_reports_q8_0_vectorized_dispatch_at_kernel_seam") {
       .src0 = make_quantized_src(&q8, dtype::q8_0, QK8_0, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK8_0),
       .dst = make_dst(q8_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -1675,13 +1624,11 @@ TEST_CASE("kernel_aarch64_q4_k_uses_neon_dispatch_when_dotprod_is_available") {
       .src0 = make_quantized_src(&q4, dtype::q4_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(shared_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat scalar_ev{
       .src0 = make_quantized_src(&q4, dtype::q4_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(scalar_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
@@ -1747,19 +1694,16 @@ TEST_CASE("kernel_aarch64_supported_quantized_dispatch_is_alloc_free") {
       .src0 = make_quantized_src(&q2, dtype::q2_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q2_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat q3_ev{
       .src0 = make_quantized_src(&q3, dtype::q3_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q3_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat q6_ev{
       .src0 = make_quantized_src(&q6, dtype::q6_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q6_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const std::array<float, QK8_0> q8_input = [] {
     std::array<float, QK8_0> values = {};
@@ -1773,7 +1717,6 @@ TEST_CASE("kernel_aarch64_supported_quantized_dispatch_is_alloc_free") {
       .src0 = make_quantized_src(&q8, dtype::q8_0, QK8_0, 1),
       .src1 = make_src(q8_input.data(), dtype::f32, 1, QK8_0),
       .dst = make_dst(q8_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   std::array<block_q8_0, 4> q8_rows = {};
   for (size_t row = 0; row < q8_rows.size(); ++row) {
@@ -1791,7 +1734,6 @@ TEST_CASE("kernel_aarch64_supported_quantized_dispatch_is_alloc_free") {
       .src0 = make_packed_q8_0_x4_bl4_src(q8_packed_storage.data(), QK8_0, 4u),
       .src1 = make_q8_0_vector_src(q8_input.data(), QK8_0),
       .dst = make_dst(q8_out, dtype::f32, 1, 4u),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -1861,7 +1803,6 @@ TEST_CASE("kernel_aarch64_q6_matrix_dispatch_does_not_claim_vector_path") {
       .src0 = make_quantized_src(&q6, dtype::q6_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 2u, QK_K),
       .dst = make_dst(out.data(), dtype::f32, 2u, 1u),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -1879,6 +1820,9 @@ TEST_CASE("kernel_aarch64_q6_matrix_dispatch_does_not_claim_vector_path") {
 }
 
 TEST_CASE("kernel_aarch64_q6_packed_vector_route_is_explicit_and_numeric_match") {
+#if !(defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD))
+  CHECK(true);
+#else
   using emel::kernel::detail::quant::QK_K;
   using emel::kernel::detail::quant::block_q6_k;
 
@@ -1946,19 +1890,16 @@ TEST_CASE("kernel_aarch64_q6_packed_vector_route_is_explicit_and_numeric_match")
       .src0 = make_quantized_src(native_rows.data(), dtype::q6_k, QK_K, k_rows),
       .src1 = make_src(input.data(), dtype::f32, 1u, QK_K),
       .dst = make_dst(native_out.data(), dtype::f32, 1u, k_rows),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat packed_ev{
       .src0 = make_packed_q6_k_x8_src(packed_storage.data(), QK_K, k_rows),
       .src1 = make_q8_k_vector_src(q8_input.data(), QK_K),
       .dst = make_dst(packed_out.data(), dtype::f32, 1u, k_rows),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat prepared_ev{
       .src0 = make_prepared_q6_k_x8_q8_src(prepared_storage.data(), QK_K, k_rows),
       .src1 = make_q8_k_vector_src(q8_input.data(), QK_K),
       .dst = make_dst(prepared_out.data(), dtype::f32, 1u, k_rows),
-      .nth = 1,
   };
 
   aarch64_sm native_machine{};
@@ -2004,11 +1945,12 @@ TEST_CASE("kernel_aarch64_q6_packed_vector_route_is_explicit_and_numeric_match")
   CHECK(prepared_machine.optimized_q6_vector_prepared_q8_rhs_i8mm_dispatch_count() == 0u);
   CHECK(prepared_machine.shared_q6_dispatch_count() == 0u);
 #endif
+#endif
 }
 
 TEST_CASE("kernel_aarch64_q4_packed_vector_route_is_explicit_and_numeric_match") {
 #if !(defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD))
-  SUCCEED();
+  CHECK(true);
 #else
   using emel::kernel::detail::quant::Q4_K_X8_ROWS;
   using emel::kernel::detail::quant::QK_K;
@@ -2054,19 +1996,16 @@ TEST_CASE("kernel_aarch64_q4_packed_vector_route_is_explicit_and_numeric_match")
       .src0 = make_quantized_src(native_rows.data(), dtype::q4_k, QK_K, Q4_K_X8_ROWS),
       .src1 = make_src(input.data(), dtype::f32, 1u, QK_K),
       .dst = make_dst(native_out.data(), dtype::f32, 1u, Q4_K_X8_ROWS),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat packed_bl4_ev{
       .src0 = make_packed_q4_k_x8_bl4_src(packed_bl4_storage.data(), QK_K, Q4_K_X8_ROWS),
       .src1 = make_q8_k_vector_src(q8_input.data(), QK_K),
       .dst = make_dst(packed_bl4_out.data(), dtype::f32, 1u, Q4_K_X8_ROWS),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat packed_bl8_ev{
       .src0 = make_packed_q4_k_x8_bl8_src(packed_bl8_storage.data(), QK_K, Q4_K_X8_ROWS),
       .src1 = make_q8_k_vector_src(q8_input.data(), QK_K),
       .dst = make_dst(packed_bl8_out.data(), dtype::f32, 1u, Q4_K_X8_ROWS),
-      .nth = 1,
   };
 
   aarch64_sm native_machine{};
@@ -2142,7 +2081,6 @@ TEST_CASE("kernel_aarch64_sm_reports_q8_0_packed_dispatch_at_kernel_seam") {
       .src0 = make_packed_q8_0_x4_bl8_src(packed_bl8.data(), QK8_0, row_count),
       .src1 = make_q8_0_vector_src(q8_input.data(), QK8_0),
       .dst = make_dst(out, dtype::f32, 1u, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   CHECK(machine.optimized_q8_0_dispatch_count() == 1u);
@@ -2157,7 +2095,6 @@ TEST_CASE("kernel_aarch64_sm_reports_q8_0_packed_dispatch_at_kernel_seam") {
       .src0 = make_packed_q8_0_x4_bl4_src(packed_bl4.data(), QK8_0, row_count),
       .src1 = make_q8_0_vector_src(q8_input.data(), QK8_0),
       .dst = make_dst(out, dtype::f32, 1u, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   CHECK(machine.optimized_q8_0_dispatch_count() == 1u);
@@ -2172,7 +2109,6 @@ TEST_CASE("kernel_aarch64_sm_reports_q8_0_packed_dispatch_at_kernel_seam") {
       .src0 = make_packed_q8_0_x4_bl4_src(packed_bl4.data(), QK8_0, row_count),
       .src1 = make_q8_0_vector_src(q8_input.data(), QK8_0),
       .dst = make_dst(out, dtype::f32, 1u, row_count),
-      .nth = 1,
   };
   CHECK_FALSE(machine.process_event(ev));
   CHECK(machine.optimized_q8_0_dispatch_count() == 0u);
@@ -2239,7 +2175,6 @@ TEST_CASE("kernel_aarch64_q8_0_packed_route_is_explicit_and_numeric_match") {
       .src0 = make_packed_q8_0_x4_bl8_src(packed_bl8.data(), QK8_0, row_count),
       .src1 = make_q8_0_vector_src(q8_input.data(), QK8_0),
       .dst = make_dst(packed_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   for (size_t row = 0; row < packed_out.size(); ++row) {
@@ -2257,7 +2192,6 @@ TEST_CASE("kernel_aarch64_q8_0_packed_route_is_explicit_and_numeric_match") {
       .src0 = make_packed_q8_0_x4_bl4_src(packed_bl4.data(), QK8_0, row_count),
       .src1 = make_q8_0_vector_src(q8_input.data(), QK8_0),
       .dst = make_dst(packed_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   for (size_t row = 0; row < packed_out.size(); ++row) {
@@ -2275,7 +2209,6 @@ TEST_CASE("kernel_aarch64_q8_0_packed_route_is_explicit_and_numeric_match") {
       .src0 = make_packed_q8_0_x4_bl4_src(packed_bl4.data(), QK8_0, row_count),
       .src1 = make_q8_0_vector_src(q8_input.data(), QK8_0),
       .dst = make_dst(packed_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
   CHECK_FALSE(machine.process_event(ev));
   CHECK(machine.optimized_q8_0_dispatch_count() == 0u);
@@ -2348,7 +2281,6 @@ TEST_CASE("kernel_aarch64_q8_0_packed_route_matches_multi_block_native_reference
       .src0 = make_packed_q8_0_x4_bl8_src(packed_bl8.data(), col_count, row_count),
       .src1 = make_q8_0_vector_src(q8_input.data(), col_count),
       .dst = make_dst(packed_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   for (size_t row = 0; row < packed_out.size(); ++row) {
@@ -2363,7 +2295,6 @@ TEST_CASE("kernel_aarch64_q8_0_packed_route_matches_multi_block_native_reference
       .src0 = make_packed_q8_0_x4_bl4_src(packed_bl4.data(), col_count, row_count),
       .src1 = make_q8_0_vector_src(q8_input.data(), col_count),
       .dst = make_dst(packed_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   for (size_t row = 0; row < packed_out.size(); ++row) {
@@ -2374,7 +2305,6 @@ TEST_CASE("kernel_aarch64_q8_0_packed_route_matches_multi_block_native_reference
       .src0 = make_packed_q8_0_x4_bl4_src(packed_bl4.data(), col_count, row_count),
       .src1 = make_q8_0_vector_src(q8_input.data(), col_count),
       .dst = make_dst(packed_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
   CHECK_FALSE(machine.process_event(ev));
 #endif
@@ -2453,7 +2383,6 @@ TEST_CASE("kernel_aarch64_q8_0_packed_bl8_matrix_x4_route_is_explicit_and_numeri
       .src0 = make_packed_q8_0_x4_bl8_src(packed_bl8.data(), col_count, row_count),
       .src1 = make_packed_q8_0_x4_bl8_rhs_src(rhs_packed_bl8.data(), rhs_rows, col_count),
       .dst = make_batch_major_dst(packed_out.data(), dtype::f32, rhs_rows, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   for (size_t idx = 0; idx < packed_out.size(); ++idx) {
@@ -2469,7 +2398,6 @@ TEST_CASE("kernel_aarch64_q8_0_packed_bl8_matrix_x4_route_is_explicit_and_numeri
       .src0 = make_packed_q8_0_x4_bl8_src(packed_bl8.data(), col_count, row_count),
       .src1 = make_packed_q8_0_x4_bl8_rhs_src(rhs_packed_bl8.data(), rhs_rows, col_count),
       .dst = make_batch_major_dst(packed_out.data(), dtype::f32, rhs_rows, row_count),
-      .nth = 1,
   };
   CHECK_FALSE(machine.process_event(ev));
   CHECK(machine.optimized_q8_0_packed_bl8_matrix_x4_dispatch_count() == 0u);
@@ -2544,7 +2472,6 @@ TEST_CASE("kernel_aarch64_q4_k_packed_bl8_matrix_x4_route_is_explicit_and_numeri
       .src0 = make_packed_q4_k_x8_bl8_src(packed_bl8.data(), col_count, row_count),
       .src1 = make_q8_k_x4_rhs_src(rhs_q8.data(), rhs_rows, col_count),
       .dst = make_batch_major_dst(packed_out.data(), dtype::f32, rhs_rows, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   for (size_t idx = 0; idx < packed_out.size(); ++idx) {
@@ -2559,7 +2486,6 @@ TEST_CASE("kernel_aarch64_q4_k_packed_bl8_matrix_x4_route_is_explicit_and_numeri
       .src0 = make_packed_q4_k_x8_bl8_src(packed_bl8.data(), col_count, row_count),
       .src1 = make_q8_k_x4_rhs_src(rhs_q8.data(), rhs_rows, col_count),
       .dst = make_batch_major_dst(packed_out.data(), dtype::f32, rhs_rows, row_count),
-      .nth = 1,
   };
   CHECK_FALSE(machine.process_event(ev));
 #endif
@@ -2633,7 +2559,6 @@ TEST_CASE("kernel_aarch64_q4_k_packed_bl8_matrix_x8_route_is_explicit_and_numeri
       .src0 = make_packed_q4_k_x8_bl8_src(packed_bl8.data(), col_count, row_count),
       .src1 = make_q8_k_x8_rhs_src(rhs_q8.data(), rhs_rows, col_count),
       .dst = make_batch_major_dst(packed_out.data(), dtype::f32, rhs_rows, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   for (size_t idx = 0; idx < packed_out.size(); ++idx) {
@@ -2648,7 +2573,6 @@ TEST_CASE("kernel_aarch64_q4_k_packed_bl8_matrix_x8_route_is_explicit_and_numeri
       .src0 = make_packed_q4_k_x8_bl8_src(packed_bl8.data(), col_count, row_count),
       .src1 = make_q8_k_x8_rhs_src(rhs_q8.data(), rhs_rows, col_count),
       .dst = make_batch_major_dst(packed_out.data(), dtype::f32, rhs_rows, row_count),
-      .nth = 1,
   };
   CHECK_FALSE(machine.process_event(ev));
 #endif
@@ -2726,7 +2650,6 @@ TEST_CASE(
       .src0 = make_prepared_q6_k_x8_q8_src(prepared_storage.data(), col_count, row_count),
       .src1 = make_q8_k_x4_rhs_src(rhs_q8.data(), rhs_rows, col_count),
       .dst = make_batch_major_dst(prepared_out.data(), dtype::f32, rhs_rows, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   for (size_t idx = 0; idx < prepared_out.size(); ++idx) {
@@ -2743,7 +2666,6 @@ TEST_CASE(
       .src0 = make_prepared_q6_k_x8_q8_src(prepared_storage.data(), col_count, row_count),
       .src1 = make_q8_k_x4_rhs_src(rhs_q8.data(), rhs_rows, col_count),
       .dst = make_batch_major_dst(prepared_out.data(), dtype::f32, rhs_rows, row_count),
-      .nth = 1,
   };
   CHECK_FALSE(machine.process_event(ev));
 #endif
@@ -2821,7 +2743,6 @@ TEST_CASE(
       .src0 = make_prepared_q6_k_x8_q8_src(prepared_storage.data(), col_count, row_count),
       .src1 = make_q8_k_x8_rhs_src(rhs_q8.data(), rhs_rows, col_count),
       .dst = make_batch_major_dst(prepared_out.data(), dtype::f32, rhs_rows, row_count),
-      .nth = 1,
   };
   CHECK(machine.process_event(ev));
   for (size_t idx = 0; idx < prepared_out.size(); ++idx) {
@@ -2838,7 +2759,6 @@ TEST_CASE(
       .src0 = make_prepared_q6_k_x8_q8_src(prepared_storage.data(), col_count, row_count),
       .src1 = make_q8_k_x8_rhs_src(rhs_q8.data(), rhs_rows, col_count),
       .dst = make_batch_major_dst(prepared_out.data(), dtype::f32, rhs_rows, row_count),
-      .nth = 1,
   };
   CHECK_FALSE(machine.process_event(ev));
 #endif
@@ -2874,13 +2794,11 @@ TEST_CASE("kernel_aarch64_q8_0_vector_route_is_explicit_and_numeric_match") {
       .src0 = make_quantized_src(q8_rows.data(), dtype::q8_0, QK8_0, row_count),
       .src1 = make_src(input.data(), dtype::f32, 1u, QK8_0),
       .dst = make_dst(optimized_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat scalar_ev{
       .src0 = make_quantized_src(q8_rows.data(), dtype::q8_0, QK8_0, row_count),
       .src1 = make_src(input.data(), dtype::f32, 1u, QK8_0),
       .dst = make_dst(scalar_out.data(), dtype::f32, 1u, row_count),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -2903,6 +2821,9 @@ TEST_CASE("kernel_aarch64_q8_0_vector_route_is_explicit_and_numeric_match") {
 }
 
 TEST_CASE("kernel_aarch64_q6_packed_vector_argmax_route_is_explicit_and_numeric_match") {
+#if !(defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD))
+  CHECK(true);
+#else
   using emel::kernel::detail::quant::QK_K;
   using emel::kernel::detail::quant::block_q6_k;
 
@@ -2961,7 +2882,6 @@ TEST_CASE("kernel_aarch64_q6_packed_vector_argmax_route_is_explicit_and_numeric_
       .src0 = make_quantized_src(native_rows.data(), dtype::q6_k, QK_K, k_rows),
       .src1 = make_src(input.data(), dtype::f32, 1u, QK_K),
       .dst = make_dst(native_out.data(), dtype::f32, 1u, k_rows),
-      .nth = 1,
   };
   float argmax_score = 0.0f;
   int32_t argmax_index = -1;
@@ -2971,14 +2891,12 @@ TEST_CASE("kernel_aarch64_q6_packed_vector_argmax_route_is_explicit_and_numeric_
       .src0 = make_packed_q6_k_x8_src(packed_storage.data(), QK_K, k_rows),
       .src1 = make_q8_k_vector_src(q8_input.data(), QK_K),
       .dst = make_dst(&argmax_score, dtype::f32, 1u, 1u),
-      .nth = 1,
       .index_out = &argmax_index,
   };
   const emel::kernel::event::op_mul_mat_argmax argmax_prepared_ev{
       .src0 = make_argmax_prepared_q6_k_x8_q8_src(argmax_prepared_storage.data(), QK_K, k_rows),
       .src1 = make_q8_k_vector_src(q8_input.data(), QK_K),
       .dst = make_dst(&argmax_prepared_score, dtype::f32, 1u, 1u),
-      .nth = 1,
       .index_out = &argmax_prepared_index,
   };
 
@@ -3039,6 +2957,7 @@ TEST_CASE("kernel_aarch64_q6_packed_vector_argmax_route_is_explicit_and_numeric_
         0u);
   CHECK(argmax_prepared_machine.shared_q6_dispatch_count() == 0u);
 #endif
+#endif
 }
 
 TEST_CASE("kernel_aarch64_detail_branch_paths") {
@@ -3050,7 +2969,6 @@ TEST_CASE("kernel_aarch64_detail_branch_paths") {
       .src0 = make_src(lhs, dtype::f32, 4),
       .src1 = make_src(rhs, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
 
   CHECK_FALSE(emel::kernel::aarch64::detail::can_use_neon(add_ev, false));
@@ -3091,7 +3009,6 @@ TEST_CASE("kernel_aarch64_detail_branch_paths") {
       .src0 = make_quantized_src(&q2, dtype::q2_k, QK_K, 1),
       .src1 = make_src(quant_rhs, dtype::f32, 1, QK_K),
       .dst = make_dst(quant_dst, dtype::f32, 1, 1),
-      .nth = 1,
   };
 #if defined(__aarch64__) || defined(__ARM_NEON)
   CHECK(emel::kernel::aarch64::detail::can_use_neon(quant_mul_mat_ev, true));
@@ -3102,7 +3019,6 @@ TEST_CASE("kernel_aarch64_detail_branch_paths") {
   emel::kernel::event::op_unary unary_ev{
       .src0 = make_src(lhs, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
       .subop = emel::kernel::event::unary_subop::relu,
   };
 #if defined(__aarch64__) || defined(__ARM_NEON)
@@ -3132,41 +3048,34 @@ TEST_CASE("kernel_aarch64_detail_helper_edge_paths") {
   const emel::kernel::event::op_dup dup_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_add add_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul mul_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_div div_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_sqr sqr_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_sqrt sqrt_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_sub sub_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   float src_mm0[8] = {1.0f, 0.5f, -1.0f, 2.0f, 0.0f, -0.5f, 3.0f, 1.0f};
   float src_mm1[32] = {
@@ -3180,12 +3089,10 @@ TEST_CASE("kernel_aarch64_detail_helper_edge_paths") {
       .src0 = make_src(src_mm0, dtype::f32, 4, 2),
       .src1 = make_src(src_mm1, dtype::f32, 8, 4),
       .dst = make_dst(dst_mm, dtype::f32, 8, 2),
-      .nth = 1,
   };
   emel::kernel::event::op_unary unary_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
       .subop = emel::kernel::event::unary_subop::relu,
   };
 
@@ -3263,7 +3170,6 @@ TEST_CASE("kernel_aarch64_rejects_unimplemented_ops") {
   const emel::kernel::event::op_sum sum_ev{
       .src0 = make_src(src, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -3277,7 +3183,6 @@ TEST_CASE("kernel_aarch64_unary_subop_scalar_paths") {
   emel::kernel::event::op_unary unary_ev{
       .src0 = make_src(src, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
       .subop = emel::kernel::event::unary_subop::abs,
   };
 
@@ -3428,7 +3333,6 @@ TEST_CASE("kernel_aarch64_flash_attn_ext_matches_shared_workspace_on_long_kv_spa
   request.src1.nb[2] = sizeof(uint16_t) * head_dim;
   request.src2.nb[1] = sizeof(uint16_t) * kv_dim;
   request.src2.nb[2] = sizeof(uint16_t) * head_dim;
-  request.nth = 1;
 
   const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
   std::memcpy(request.op_params.data(), &scale, sizeof(scale));
@@ -3498,7 +3402,6 @@ TEST_CASE("kernel_aarch64_flash_attn_ext_matches_online_softmax_f16_reference_on
   request.src1.nb[2] = sizeof(uint16_t) * head_dim;
   request.src2.nb[1] = sizeof(uint16_t) * kv_dim;
   request.src2.nb[2] = sizeof(uint16_t) * head_dim;
-  request.nth = 1;
 
   const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
   std::memcpy(request.op_params.data(), &scale, sizeof(scale));
@@ -3587,7 +3490,6 @@ TEST_CASE("kernel_aarch64_flash_attn_ext_matches_masked_total_token_reference")
   request.src1 = make_src(k_fp16.data(), dtype::f16, head_dim, kv_tokens, 1u, 1u);
   request.src2 = make_src(v_fp16.data(), dtype::f16, head_dim, kv_tokens, 1u, 1u);
   request.dst = make_dst(neon_dst.data(), dtype::f32, head_dim, 1u, 1u, 1u);
-  request.nth = 1;
   std::memcpy(request.op_params.data(), &scale, sizeof(scale));
   std::memcpy(request.op_params.data() + sizeof(scale), &total_tokens, sizeof(total_tokens));
   request.op_params_size = sizeof(scale) + sizeof(total_tokens);
@@ -3672,7 +3574,6 @@ TEST_CASE("kernel_aarch64_flash_attn_ext_matches_masked_total_token_reference_on
   request.src1.nb[2] = sizeof(uint16_t) * head_dim;
   request.src2.nb[1] = sizeof(uint16_t) * kv_dim;
   request.src2.nb[2] = sizeof(uint16_t) * head_dim;
-  request.nth = 1;
   std::memcpy(request.op_params.data(), &scale, sizeof(scale));
   std::memcpy(request.op_params.data() + sizeof(scale), &total_tokens, sizeof(total_tokens));
   request.op_params_size = sizeof(scale) + sizeof(total_tokens);
@@ -3799,7 +3700,6 @@ TEST_CASE("kernel_aarch64_flash_attn_ext_matches_online_softmax_f16_reference")
   request.src1 = make_src(k_fp16.data(), dtype::f16, head_dim, kv_tokens, 1, 1);
   request.src2 = make_src(v_fp16.data(), dtype::f16, head_dim, kv_tokens, 1, 1);
   request.dst = make_dst(neon_dst.data(), dtype::f32, head_dim, 1, 1, 1);
-  request.nth = 1;
   std::memcpy(request.op_params.data(), &scale, sizeof(scale));
   request.op_params_size = sizeof(scale);
 
diff --git a/tests/kernel/lifecycle_tests.cpp b/tests/kernel/lifecycle_tests.cpp
index d9d933c4..9cb475ba 100644
--- a/tests/kernel/lifecycle_tests.cpp
+++ b/tests/kernel/lifecycle_tests.cpp
@@ -139,25 +139,21 @@ TEST_CASE("kernel_mul_mat_accepts_quantized_qk_weights") {
       .src0 = make_quantized_src(&q2, dtype::q2_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q2_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat q3_ev{
       .src0 = make_quantized_src(&q3, dtype::q3_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q3_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat q4_ev{
       .src0 = make_quantized_src(&q4, dtype::q4_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q4_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat q6_ev{
       .src0 = make_quantized_src(&q6, dtype::q6_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q6_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
 
   CHECK(machine.process_event(q2_ev));
@@ -206,7 +202,6 @@ TEST_CASE("kernel_mul_mat_argmax_accepts_q4_k_weights") {
       .src0 = make_quantized_src(q4_rows.data(), dtype::q4_k, QK_K, 2),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(&best_score, dtype::f32, 1, 1),
-      .nth = 1,
       .index_out = &best_index,
   };
 
@@ -238,7 +233,6 @@ TEST_CASE("kernel_mul_mat_accepts_q8_0_weights") {
       .src0 = make_quantized_src(q8_rows.data(), dtype::q8_0, QK8_0, 2),
       .src1 = make_src(input.data(), dtype::f32, 1, QK8_0),
       .dst = make_dst(out.data(), dtype::f32, 1, 2),
-      .nth = 1,
   };
 
   CHECK(machine.process_event(q8_ev));
@@ -268,7 +262,6 @@ TEST_CASE("kernel_mul_mat_argmax_accepts_q8_0_weights") {
       .src0 = make_quantized_src(q8_rows.data(), dtype::q8_0, QK8_0, 2),
       .src1 = make_src(input.data(), dtype::f32, 1, QK8_0),
       .dst = make_dst(&best_score, dtype::f32, 1, 1),
-      .nth = 1,
       .index_out = &best_index,
   };
 
@@ -294,13 +287,11 @@ TEST_CASE("kernel_mul_mat_rejects_packed_q6_q8_requests_without_explicit_simd_ro
       .src0 = make_packed_q6_k_x8_src(packed_storage.data(), QK_K, k_rows),
       .src1 = make_q8_k_vector_src(q8_input.data(), QK_K),
       .dst = make_dst(packed_out.data(), dtype::f32, 1u, k_rows),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat prepared_ev{
       .src0 = make_prepared_q6_k_x8_q8_src(prepared_storage.data(), QK_K, k_rows),
       .src1 = make_q8_k_vector_src(q8_input.data(), QK_K),
       .dst = make_dst(prepared_out.data(), dtype::f32, 1u, k_rows),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::detail::validate_dispatch_request(packed_ev));
@@ -330,7 +321,6 @@ TEST_CASE("kernel_aarch64_backend_reports_q2_vectorized_or_shared_dispatch") {
       .src0 = make_quantized_src(&q2, dtype::q2_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q2_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -366,7 +356,6 @@ TEST_CASE("kernel_aarch64_backend_reports_q3_vectorized_or_shared_dispatch") {
       .src0 = make_quantized_src(&q3, dtype::q3_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q3_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -402,7 +391,6 @@ TEST_CASE("kernel_aarch64_backend_reports_q6_vectorized_or_shared_dispatch") {
       .src0 = make_quantized_src(&q6, dtype::q6_k, QK_K, 1),
       .src1 = make_src(input.data(), dtype::f32, 1, QK_K),
       .dst = make_dst(q6_out, dtype::f32, 1, 1),
-      .nth = 1,
   };
 
   aarch64_sm machine{};
@@ -441,30 +429,25 @@ TEST_CASE("kernel_backends_expose_explicit_op_transitions") {
   const emel::kernel::event::op_dup op_dup_ok{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_add op_add_ok{
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src1, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul op_mul_ok{
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src1, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat op_mul_mat_ok{
       .src0 = make_src(src0, dtype::f32, 2, 2),
       .src1 = make_src(src1, dtype::f32, 2, 2),
       .dst = make_dst(dst, dtype::f32, 2, 2),
-      .nth = 1,
   };
   const emel::kernel::event::op_soft_max op_soft_max_ok{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
 
   emel::kernel::event::op_dup op_dup_invalid = op_dup_ok;
@@ -512,13 +495,11 @@ TEST_CASE("kernel_aggregate_actions_cover_maintained_lanes") {
       .src0 = make_src(lhs, dtype::f32, 4),
       .src1 = make_src(rhs, dtype::f32, 4),
       .dst = make_dst(primary_out, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_add secondary_ev{
       .src0 = make_src(lhs, dtype::f32, 4),
       .src1 = make_src(rhs, dtype::f32, 4),
       .dst = make_dst(secondary_out, dtype::f32, 4),
-      .nth = 1,
   };
 
   emel::kernel::action::context ctx{};
@@ -556,7 +537,6 @@ TEST_CASE("kernel_validation_branch_paths") {
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src1, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::detail::validate_dispatch_request(ev));
@@ -573,14 +553,6 @@ TEST_CASE("kernel_validation_branch_paths") {
   invalid.dst.data = nullptr;
   CHECK_FALSE(emel::kernel::detail::validate_dispatch_request(invalid));
 
-  invalid = ev;
-  invalid.nth = 0;
-  CHECK_FALSE(emel::kernel::detail::validate_dispatch_request(invalid));
-
-  invalid = ev;
-  invalid.ith = 1;
-  CHECK_FALSE(emel::kernel::detail::validate_dispatch_request(invalid));
-
   invalid = ev;
   invalid.op_params_size = static_cast<uint32_t>(invalid.op_params.size() + 1);
   CHECK_FALSE(emel::kernel::detail::validate_dispatch_request(invalid));
@@ -598,7 +570,6 @@ TEST_CASE("kernel_detail_negative_compute_paths") {
   emel::kernel::event::op_dup dup_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 3),
-      .nth = 1,
   };
   CHECK_FALSE(emel::kernel::detail::run_copy(dup_ev));
 
@@ -606,7 +577,6 @@ TEST_CASE("kernel_detail_negative_compute_paths") {
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src1, dtype::f32, 3),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
   CHECK_FALSE(emel::kernel::detail::run_binary(
       add_ev, [](const float lhs, const float rhs) { return lhs + rhs; }));
@@ -614,7 +584,6 @@ TEST_CASE("kernel_detail_negative_compute_paths") {
   emel::kernel::event::op_sqr sqr_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 3),
-      .nth = 1,
   };
   CHECK_FALSE(emel::kernel::detail::run_unary(
       sqr_ev, [](const float value) { return value * value; }));
@@ -623,7 +592,6 @@ TEST_CASE("kernel_detail_negative_compute_paths") {
       .src0 = make_src(src0, dtype::f32, 2, 2),
       .src1 = make_src(src1, dtype::f32, 2, 3),
       .dst = make_dst(dst, dtype::f32, 2, 2),
-      .nth = 1,
   };
   CHECK_FALSE(emel::kernel::detail::run_mul_mat(mul_mat_ev));
   mul_mat_ev.src0.ne[0] = 0;
@@ -632,7 +600,6 @@ TEST_CASE("kernel_detail_negative_compute_paths") {
   emel::kernel::event::op_soft_max soft_max_ev{
       .src0 = make_src(src0, dtype::f32, 0, 2),
       .dst = make_dst(dst, dtype::f32, 4, 2),
-      .nth = 1,
   };
   CHECK_FALSE(emel::kernel::detail::run_soft_max(soft_max_ev));
 }
@@ -684,7 +651,6 @@ TEST_CASE("kernel_detail_stride_paths_and_scalar_helpers") {
       .src0 = src0,
       .src1 = src1,
       .dst = dst,
-      .nth = 1,
   };
   CHECK(emel::kernel::detail::run_binary(
       add_ev, [](const float lhs, const float rhs) { return lhs + rhs; }));
@@ -697,7 +663,6 @@ TEST_CASE("kernel_detail_stride_paths_and_scalar_helpers") {
       .src0 = make_src(src0_storage, dtype::f32, 2, 2),
       .src1 = make_src(src1_storage, dtype::f32, 2, 2),
       .dst = make_dst(dst_storage, dtype::f32, 2, 2),
-      .nth = 1,
   };
   mul_mat_ev.src0.nb[0] = sizeof(float) * 2;
   mul_mat_ev.src0.nb[1] = mul_mat_ev.src0.nb[0] * mul_mat_ev.src0.ne[0];
@@ -715,7 +680,6 @@ TEST_CASE("kernel_detail_stride_paths_and_scalar_helpers") {
   emel::kernel::event::op_soft_max soft_max_ev{
       .src0 = make_src(src0_storage, dtype::f32, 2, 2),
       .dst = make_dst(dst_storage, dtype::f32, 2, 2),
-      .nth = 1,
   };
   soft_max_ev.src0.nb[0] = sizeof(float) * 2;
   soft_max_ev.src0.nb[1] = soft_max_ev.src0.nb[0] * soft_max_ev.src0.ne[0];
@@ -730,7 +694,6 @@ TEST_CASE("kernel_detail_stride_paths_and_scalar_helpers") {
       .src0 = make_src(src0_storage, dtype::f32, 4),
       .src1 = make_src(src1_storage, dtype::f32, 4),
       .dst = make_dst(dst_storage, dtype::f32, 4),
-      .nth = 1,
   };
   emel::kernel::detail::execute_scalar_unchecked(div_ev);
   CHECK(dst_storage[0] == doctest::Approx(0.5f));
@@ -738,7 +701,6 @@ TEST_CASE("kernel_detail_stride_paths_and_scalar_helpers") {
   const emel::kernel::event::op_sum unsupported_ev{
       .src0 = make_src(src0_storage, dtype::f32, 4),
       .dst = make_dst(dst_storage, dtype::f32, 4),
-      .nth = 1,
   };
   CHECK_FALSE(emel::kernel::detail::execute_scalar(unsupported_ev));
 }
@@ -752,7 +714,6 @@ TEST_CASE("kernel_backends_reject_quantized_dispatch_dtypes") {
       .src0 = make_src(src0, dtype::q4_0, 4),
       .src1 = make_src(src1, dtype::q4_0, 4),
       .dst = make_dst(dst, dtype::q4_0, 4),
-      .nth = 1,
   };
 
   x86_64_sm x86_64_machine{};
@@ -820,7 +781,6 @@ TEST_CASE("kernel_flash_attn_ext_matches_online_softmax_f16_reference_small") {
   request_x86.src1 = make_src(k_fp16.data(), dtype::f16, head_dim, kv_tokens, 1u, 1u);
   request_x86.src2 = make_src(v_fp16.data(), dtype::f16, head_dim, kv_tokens, 1u, 1u);
   request_x86.dst = make_dst(dst_x86.data(), dtype::f32, head_dim, 1u, 1u, 1u);
-  request_x86.nth = 1;
   const float scale = 1.0f;
   std::memcpy(request_x86.op_params.data(), &scale, sizeof(scale));
   request_x86.op_params_size = sizeof(scale);
@@ -883,7 +843,6 @@ TEST_CASE("kernel_flash_attn_ext_matches_online_softmax_f16_reference") {
   request_x86.src1 = make_src(k_fp16.data(), dtype::f16, head_dim, kv_tokens, 1u, 1u);
   request_x86.src2 = make_src(v_fp16.data(), dtype::f16, head_dim, kv_tokens, 1u, 1u);
   request_x86.dst = make_dst(dst_x86.data(), dtype::f32, head_dim, 1u, 1u, 1u);
-  request_x86.nth = 1;
   std::memcpy(request_x86.op_params.data(), &scale, sizeof(scale));
   request_x86.op_params_size = sizeof(scale);
 
@@ -962,7 +921,6 @@ TEST_CASE("kernel_flash_attn_ext_matches_online_softmax_f16_reference_on_long_mu
   request_x86.src1.nb[2] = sizeof(uint16_t) * head_dim;
   request_x86.src2.nb[1] = sizeof(uint16_t) * kv_dim;
   request_x86.src2.nb[2] = sizeof(uint16_t) * head_dim;
-  request_x86.nth = 1;
   std::memcpy(request_x86.op_params.data(), &scale, sizeof(scale));
   request_x86.op_params_size = sizeof(scale);
 
@@ -1045,7 +1003,6 @@ TEST_CASE("kernel_flash_attn_ext_matches_masked_total_token_reference") {
   request_x86.src1 = make_src(k_fp16.data(), dtype::f16, head_dim, kv_tokens, 1u, 1u);
   request_x86.src2 = make_src(v_fp16.data(), dtype::f16, head_dim, kv_tokens, 1u, 1u);
   request_x86.dst = make_dst(dst_x86.data(), dtype::f32, head_dim, 1u, 1u, 1u);
-  request_x86.nth = 1;
   std::memcpy(request_x86.op_params.data(), &scale, sizeof(scale));
   std::memcpy(
       request_x86.op_params.data() + sizeof(scale), &total_tokens, sizeof(total_tokens));
@@ -1117,7 +1074,6 @@ TEST_CASE("kernel_flash_attn_ext_matches_masked_total_token_reference_on_long_mu
   request_x86.src1.nb[2] = sizeof(uint16_t) * head_dim;
   request_x86.src2.nb[1] = sizeof(uint16_t) * kv_dim;
   request_x86.src2.nb[2] = sizeof(uint16_t) * head_dim;
-  request_x86.nth = 1;
   std::memcpy(request_x86.op_params.data(), &scale, sizeof(scale));
   std::memcpy(
       request_x86.op_params.data() + sizeof(scale), &total_tokens, sizeof(total_tokens));
diff --git a/tests/kernel/test_helpers.hpp b/tests/kernel/test_helpers.hpp
index 90ff1f78..5f030e1c 100644
--- a/tests/kernel/test_helpers.hpp
+++ b/tests/kernel/test_helpers.hpp
@@ -4,12 +4,14 @@
 #include <cmath>
 #include <cstdint>
 #include <cstring>
+#include <memory>
 #include <span>
 #include <type_traits>
 #include <vector>
 
 #include "emel/kernel/detail.hpp"
 #include "emel/kernel/events.hpp"
+#include "emel/model/data.hpp"
 
 namespace emel::kernel::test {
 
@@ -315,7 +317,6 @@ inline emel::kernel::event::op_flash_attn_ext make_flash_attn_ext_event(
   ev.src1 = make_src(fixture.k, dtype::f16, 4, 2, 1, 1);
   ev.src2 = make_src(fixture.v, dtype::f16, 4, 2, 1, 1);
   ev.dst = make_dst(fixture.dst, dtype::f32, 4, 1, 1, 1);
-  ev.nth = 1;
 
   const float scale = 1.0f;
   std::memcpy(ev.op_params.data(), &scale, sizeof(scale));
@@ -649,8 +650,6 @@ inline event_type make_smoke_op_event() {
   ev.src1 = make_src(src1, dtype::f32, 4);
   ev.src2 = make_src(src2, dtype::f32, 4);
   ev.dst = make_dst(dst, dtype::f32, 4);
-  ev.ith = 0;
-  ev.nth = 1;
 
   if constexpr (std::is_same_v<event_type, emel::kernel::event::op_mul_mat>) {
     ev.src0 = make_src(src0, dtype::f32, 2, 2);
@@ -678,3 +677,12 @@ inline event_type make_smoke_op_event() {
 }
 
 }  // namespace emel::kernel::test
+
+namespace emel::tests {
+
+inline void reset_model_data(emel::model::data & model) {
+  std::destroy_at(&model);
+  std::construct_at(&model);
+}
+
+}  // namespace emel::tests
diff --git a/tests/kernel/x86_64_tests.cpp b/tests/kernel/x86_64_tests.cpp
index d94a8823..645ecee6 100644
--- a/tests/kernel/x86_64_tests.cpp
+++ b/tests/kernel/x86_64_tests.cpp
@@ -1,23 +1,176 @@
 #include <doctest/doctest.h>
 
 #include <array>
-#include <cstdint>
 #include <cmath>
+#include <cstring>
+#include <cstdint>
 #include <limits>
+#include <span>
+#include <vector>
 
-#include "test_helpers.hpp"
+#include "../allocation_tracker.hpp"
 #include "emel/kernel/x86_64/actions.hpp"
 #include "emel/kernel/x86_64/detail.hpp"
 #include "emel/kernel/x86_64/sm.hpp"
+#include "test_helpers.hpp"
 
 namespace {
 
 using x86_64_sm = emel::kernel::x86_64::sm;
+using allocation_scope = emel::test::allocation::allocation_scope;
 using emel::kernel::test::dtype;
+using emel::kernel::test::flash_attn_ext_fixture;
+using emel::kernel::test::flash_attn_reference_f16_scores;
+using emel::kernel::test::flash_attn_reference_masked_total_tokens;
+using emel::kernel::test::k_flash_online_f16_abs_tolerance;
 using emel::kernel::test::make_dst;
+using emel::kernel::test::make_flash_attn_ext_event;
+using emel::kernel::test::make_quantized_src;
 using emel::kernel::test::make_src;
+using emel::kernel::test::to_fp16_storage;
+using emel::kernel::test::within_flash_online_f16_tolerance;
+
+using emel::kernel::detail::quant::QK_K;
+using emel::kernel::detail::quant::QK4_0;
+using emel::kernel::detail::quant::QK4_1;
+using emel::kernel::detail::quant::QK5_0;
+using emel::kernel::detail::quant::QK8_0;
+using emel::kernel::detail::quant::block_q2_k;
+using emel::kernel::detail::quant::block_q3_k;
+using emel::kernel::detail::quant::block_q4_0;
+using emel::kernel::detail::quant::block_q4_1;
+using emel::kernel::detail::quant::block_q4_k;
+using emel::kernel::detail::quant::block_q5_0;
+using emel::kernel::detail::quant::block_q6_k;
+using emel::kernel::detail::quant::block_q8_0;
+using emel::kernel::detail::quant::block_q8_k;
+
+template <size_t block_count>
+std::array<float, QK_K * block_count> make_quantized_rhs_values(
+    const uint32_t salt) {
+  std::array<float, QK_K * block_count> values = {};
+  for (size_t i = 0; i < values.size(); ++i) {
+    const int32_t centered =
+        static_cast<int32_t>(((i + salt) * (7u + salt)) % 41u) - 20;
+    values[i] = static_cast<float>(centered) * 0.03125f;
+  }
+  return values;
+}
+
+void fill_q2_block(block_q2_k & q2, const uint32_t salt) {
+  q2.d = static_cast<uint16_t>(0x3c00u + (salt % 17u));
+  q2.dmin = static_cast<uint16_t>(0x3800u + (salt % 11u));
+  for (size_t i = 0; i < q2.scales.size(); ++i) {
+    q2.scales[i] = static_cast<uint8_t>((((i + salt) % 13u) << 4u) |
+                                        (((i * 5u) + salt) % 15u));
+  }
+  for (size_t i = 0; i < q2.qs.size(); ++i) {
+    q2.qs[i] =
+        static_cast<uint8_t>((i * (23u + salt)) ^ ((i + salt) >> 1u));
+  }
+}
+
+void fill_q3_block(block_q3_k & q3, const uint32_t salt) {
+  q3.d = static_cast<uint16_t>(0x3c00u + (salt % 19u));
+  for (size_t i = 0; i < q3.scales.size(); ++i) {
+    q3.scales[i] =
+        static_cast<uint8_t>((i * (17u + salt)) ^ (0x5au + salt));
+  }
+  for (size_t i = 0; i < q3.hmask.size(); ++i) {
+    q3.hmask[i] =
+        static_cast<uint8_t>((i * (9u + salt)) ^ (0xa5u - salt));
+  }
+  for (size_t i = 0; i < q3.qs.size(); ++i) {
+    q3.qs[i] =
+        static_cast<uint8_t>((i * (13u + salt)) ^ (0x33u + salt * 7u));
+  }
+}
+
+void fill_q4_block(block_q4_k & q4, const uint32_t salt) {
+  q4.d = static_cast<uint16_t>(0x3c00u + (salt % 17u));
+  q4.dmin = static_cast<uint16_t>(0x3800u + (salt % 13u));
+  for (size_t i = 0; i < q4.scales.size(); ++i) {
+    q4.scales[i] =
+        static_cast<uint8_t>((i * (11u + salt)) ^ (0x47u + salt));
+  }
+  for (size_t i = 0; i < q4.qs.size(); ++i) {
+    q4.qs[i] =
+        static_cast<uint8_t>((i * (5u + salt)) ^ (0x9du - salt));
+  }
+}
+
+void fill_q6_block(block_q6_k & q6, const uint32_t salt) {
+  q6.d = static_cast<uint16_t>(0x3c00u + (salt % 23u));
+  for (size_t i = 0; i < q6.scales.size(); ++i) {
+    const int32_t scale_value =
+        static_cast<int32_t>(((i + salt) * 3u) % 31u) - 15;
+    q6.scales[i] = static_cast<int8_t>(scale_value);
+  }
+  for (size_t i = 0; i < q6.ql.size(); ++i) {
+    q6.ql[i] =
+        static_cast<uint8_t>((i * (19u + salt)) ^ (0x6cu + salt));
+  }
+  for (size_t i = 0; i < q6.qh.size(); ++i) {
+    q6.qh[i] =
+        static_cast<uint8_t>((i * (7u + salt)) ^ (0x95u - salt));
+  }
+}
+
+void fill_q4_0_block(block_q4_0 & q4, const uint32_t salt) {
+  q4.d = static_cast<uint16_t>(0x3c00u + (salt % 17u));
+  for (size_t i = 0; i < q4.qs.size(); ++i) {
+    q4.qs[i] = static_cast<uint8_t>((i * (7u + salt)) ^ (0x53u + salt));
+  }
+}
 
-}  // namespace
+void fill_q4_1_block(block_q4_1 & q4, const uint32_t salt) {
+  q4.d = static_cast<uint16_t>(0x3c00u + (salt % 19u));
+  q4.m = static_cast<uint16_t>(0x3800u + (salt % 11u));
+  for (size_t i = 0; i < q4.qs.size(); ++i) {
+    q4.qs[i] = static_cast<uint8_t>((i * (11u + salt)) ^ (0x2eu + salt));
+  }
+}
+
+void fill_q5_0_block(block_q5_0 & q5, const uint32_t salt) {
+  q5.d = static_cast<uint16_t>(0x3c00u + (salt % 13u));
+  for (size_t i = 0; i < q5.qh.size(); ++i) {
+    q5.qh[i] = static_cast<uint8_t>((i * (29u + salt)) ^ (0xb4u - salt));
+  }
+  for (size_t i = 0; i < q5.qs.size(); ++i) {
+    q5.qs[i] = static_cast<uint8_t>((i * (13u + salt)) ^ (0x71u + salt));
+  }
+}
+
+void fill_q8_0_block(block_q8_0 & q8, const uint32_t salt) {
+  q8.d = static_cast<uint16_t>(0x3c00u + (salt % 23u));
+  for (size_t i = 0; i < q8.qs.size(); ++i) {
+    const int32_t centered =
+        static_cast<int32_t>(((i + salt) * (5u + salt)) % 251u) - 125;
+    q8.qs[i] = static_cast<int8_t>(centered);
+  }
+}
+
+emel::kernel::x86_64::detail::host_feature_contract avx2_fma_contract(
+    const bool enabled) {
+  return emel::kernel::x86_64::detail::host_feature_contract{
+      .avx2_available = enabled,
+      .fma_available = enabled,
+      .f16c_available = enabled,
+  };
+}
+
+bool host_has_avx2_fma() {
+  return emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
+         emel::kernel::x86_64::detail::detect_avx2() &&
+         emel::kernel::x86_64::detail::detect_fma();
+}
+
+bool host_has_avx2_fma_f16c() {
+  return host_has_avx2_fma() &&
+         emel::kernel::x86_64::detail::detect_f16c();
+}
+
+} // namespace
 
 TEST_CASE("kernel_x86_64_numeric_paths") {
   float lhs[4] = {1.0f, 2.0f, 3.0f, 4.0f};
@@ -29,13 +182,11 @@ TEST_CASE("kernel_x86_64_numeric_paths") {
       .src0 = make_src(lhs, dtype::f32, 4),
       .src1 = make_src(rhs, dtype::f32, 4),
       .dst = make_dst(out_add, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul mul_ev{
       .src0 = make_src(lhs, dtype::f32, 4),
       .src1 = make_src(rhs, dtype::f32, 4),
       .dst = make_dst(out_mul, dtype::f32, 4),
-      .nth = 1,
   };
 
   x86_64_sm machine{emel::kernel::x86_64::action::context{false, {}, 0}};
@@ -56,7 +207,8 @@ TEST_CASE("kernel_x86_64_numeric_paths") {
 
 TEST_CASE("kernel_x86_64_scalar_path_honors_strides") {
   float lhs_storage[8] = {1.0f, 91.0f, 2.0f, 92.0f, 3.0f, 93.0f, 4.0f, 94.0f};
-  float rhs_storage[8] = {10.0f, 81.0f, 20.0f, 82.0f, 30.0f, 83.0f, 40.0f, 84.0f};
+  float rhs_storage[8] = {10.0f, 81.0f, 20.0f, 82.0f,
+                          30.0f, 83.0f, 40.0f, 84.0f};
   float dst_storage[8] = {};
 
   auto lhs = make_src(lhs_storage, dtype::f32, 4);
@@ -74,7 +226,6 @@ TEST_CASE("kernel_x86_64_scalar_path_honors_strides") {
       .src0 = lhs,
       .src1 = rhs,
       .dst = dst,
-      .nth = 1,
   };
 
   x86_64_sm machine{emel::kernel::x86_64::action::context{false, {}, 0}};
@@ -86,26 +237,6 @@ TEST_CASE("kernel_x86_64_scalar_path_honors_strides") {
   CHECK(dst_storage[6] == doctest::Approx(44.0f));
 }
 
-TEST_CASE("kernel_x86_64_rejects_non_single_thread_dispatch") {
-  float lhs[4] = {1.0f, 2.0f, 3.0f, 4.0f};
-  float rhs[4] = {5.0f, 6.0f, 7.0f, 8.0f};
-  float out[4] = {};
-
-  emel::kernel::event::op_add invalid_nth{
-      .src0 = make_src(lhs, dtype::f32, 4),
-      .src1 = make_src(rhs, dtype::f32, 4),
-      .dst = make_dst(out, dtype::f32, 4),
-      .nth = 2,
-  };
-  emel::kernel::event::op_add invalid_ith = invalid_nth;
-  invalid_ith.nth = 1;
-  invalid_ith.ith = 1;
-
-  x86_64_sm machine{};
-  CHECK_FALSE(machine.process_event(invalid_nth));
-  CHECK_FALSE(machine.process_event(invalid_ith));
-}
-
 TEST_CASE("kernel_x86_64_forced_avx2_context_path") {
   if (!emel::kernel::x86_64::detail::avx2_intrinsics_compiled ||
       !emel::kernel::x86_64::detail::detect_avx2()) {
@@ -120,7 +251,6 @@ TEST_CASE("kernel_x86_64_forced_avx2_context_path") {
       .src0 = make_src(lhs, dtype::f32, 4),
       .src1 = make_src(rhs, dtype::f32, 4),
       .dst = make_dst(out, dtype::f32, 4),
-      .nth = 1,
   };
 
   x86_64_sm machine{emel::kernel::x86_64::action::context{true, {}, 0}};
@@ -131,6 +261,1051 @@ TEST_CASE("kernel_x86_64_forced_avx2_context_path") {
   CHECK(out[3] == doctest::Approx(15.0f));
 }
 
+TEST_CASE("kernel_x86_64_host_feature_contract_is_published") {
+  const emel::kernel::x86_64::detail::host_feature_contract contract{
+      .avx2_available = true,
+      .fma_available = true,
+      .f16c_available = true,
+  };
+
+  const x86_64_sm machine{
+      emel::kernel::x86_64::action::context{contract, {}, 0}};
+
+  CHECK(machine.avx2_available());
+  CHECK(machine.fma_available());
+  CHECK(machine.f16c_available());
+  CHECK(machine.avx2_fma_f16c_available());
+  CHECK_FALSE(machine.avx512_claimed());
+  CHECK_FALSE(machine.avx_vnni_claimed());
+  CHECK_FALSE(machine.amx_claimed());
+  CHECK_FALSE(machine.bf16_claimed());
+  CHECK_FALSE(machine.native_fp16_claimed());
+}
+
+TEST_CASE("kernel_x86_64_q2_row_avx2_fma_matches_scalar") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr size_t block_count = 4u;
+  std::array<block_q2_k, block_count> q2_blocks = {};
+  for (size_t block = 0; block < block_count; ++block) {
+    fill_q2_block(q2_blocks[block], static_cast<uint32_t>(block + 1u));
+  }
+
+  const auto rhs_values = make_quantized_rhs_values<block_count>(3u);
+  std::array<block_q8_k, block_count> q8_blocks = {};
+  emel::kernel::detail::quant::quantize_row_q8_k_strided(
+      rhs_values.data(), 1u, q8_blocks.data(), static_cast<int64_t>(QK_K * block_count));
+
+  const float scalar = emel::kernel::detail::dot_q2_k_q8_k_row_scalar(
+      q2_blocks.data(), q8_blocks.data(), block_count);
+  const float optimized =
+      emel::kernel::x86_64::detail::dot_q2_k_q8_k_row_avx2_fma(
+          q2_blocks.data(), q8_blocks.data(), block_count);
+
+  CHECK(optimized == doctest::Approx(scalar).epsilon(1e-6f));
+}
+
+TEST_CASE("kernel_x86_64_q3_row_avx2_fma_matches_scalar") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr size_t block_count = 4u;
+  std::array<block_q3_k, block_count> q3_blocks = {};
+  for (size_t block = 0; block < block_count; ++block) {
+    fill_q3_block(q3_blocks[block], static_cast<uint32_t>(block + 5u));
+  }
+
+  const auto rhs_values = make_quantized_rhs_values<block_count>(7u);
+  std::array<block_q8_k, block_count> q8_blocks = {};
+  emel::kernel::detail::quant::quantize_row_q8_k_strided(
+      rhs_values.data(), 1u, q8_blocks.data(), static_cast<int64_t>(QK_K * block_count));
+
+  const float scalar = emel::kernel::detail::dot_q3_k_q8_k_row_scalar(
+      q3_blocks.data(), q8_blocks.data(), block_count);
+  const float optimized =
+      emel::kernel::x86_64::detail::dot_q3_k_q8_k_row_avx2_fma(
+          q3_blocks.data(), q8_blocks.data(), block_count);
+
+  CHECK(optimized == doctest::Approx(scalar).epsilon(1e-6f));
+}
+
+TEST_CASE("kernel_x86_64_q4_row_avx2_fma_matches_scalar") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr size_t block_count = 4u;
+  std::array<block_q4_k, block_count> q4_blocks = {};
+  for (size_t block = 0; block < block_count; ++block) {
+    fill_q4_block(q4_blocks[block], static_cast<uint32_t>(block + 9u));
+  }
+
+  const auto rhs_values = make_quantized_rhs_values<block_count>(13u);
+  std::array<block_q8_k, block_count> q8_blocks = {};
+  emel::kernel::detail::quant::quantize_row_q8_k_strided(
+      rhs_values.data(), 1u, q8_blocks.data(), static_cast<int64_t>(QK_K * block_count));
+
+  const float scalar = emel::kernel::detail::dot_q4_k_q8_k_row_scalar(
+      q4_blocks.data(), q8_blocks.data(), block_count);
+  const float optimized =
+      emel::kernel::x86_64::detail::dot_q4_k_q8_k_row_avx2_fma(
+          q4_blocks.data(), q8_blocks.data(), block_count);
+
+  CHECK(optimized == doctest::Approx(scalar).epsilon(1e-6f));
+}
+
+TEST_CASE("kernel_x86_64_q2_mul_mat_uses_optimized_and_shared_routes") {
+  constexpr size_t block_count = 2u;
+  constexpr uint64_t k = QK_K * block_count;
+  constexpr uint64_t rows = 2u;
+  constexpr uint64_t cols = 2u;
+
+  std::array<block_q2_k, rows * block_count> q2_rows = {};
+  for (size_t idx = 0; idx < q2_rows.size(); ++idx) {
+    fill_q2_block(q2_rows[idx], static_cast<uint32_t>(idx + 11u));
+  }
+
+  std::array<float, k * cols> rhs = {};
+  for (size_t i = 0; i < rhs.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 5u) % 31u) - 15;
+    rhs[i] = static_cast<float>(centered) * 0.0625f;
+  }
+
+  float expected[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_quantized_src(q2_rows.data(), dtype::q2_k, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(expected, dtype::f32, cols, rows),
+  };
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  if (host_has_avx2_fma()) {
+    float optimized_out[rows * cols] = {};
+    const emel::kernel::event::op_mul_mat optimized_ev{
+        .src0 = make_quantized_src(q2_rows.data(), dtype::q2_k, k, rows),
+        .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+        .dst = make_dst(optimized_out, dtype::f32, cols, rows),
+    };
+    x86_64_sm optimized_machine{
+        emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+
+    CHECK(optimized_machine.process_event(optimized_ev));
+    CHECK(optimized_machine.optimized_q2_dispatch_count() == 1u);
+    CHECK(optimized_machine.shared_q2_dispatch_count() == 0u);
+    for (size_t i = 0; i < std::size(optimized_out); ++i) {
+      CHECK(optimized_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+    }
+  }
+
+  float shared_out[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat shared_ev{
+      .src0 = make_quantized_src(q2_rows.data(), dtype::q2_k, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(shared_out, dtype::f32, cols, rows),
+  };
+  x86_64_sm shared_machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(false), {}, 0}};
+
+  CHECK(shared_machine.process_event(shared_ev));
+  CHECK(shared_machine.optimized_q2_dispatch_count() == 0u);
+  CHECK(shared_machine.shared_q2_dispatch_count() == 1u);
+  for (size_t i = 0; i < std::size(shared_out); ++i) {
+    CHECK(shared_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+  }
+}
+
+TEST_CASE("kernel_x86_64_q3_mul_mat_uses_optimized_and_shared_routes") {
+  constexpr size_t block_count = 2u;
+  constexpr uint64_t k = QK_K * block_count;
+  constexpr uint64_t rows = 2u;
+  constexpr uint64_t cols = 2u;
+
+  std::array<block_q3_k, rows * block_count> q3_rows = {};
+  for (size_t idx = 0; idx < q3_rows.size(); ++idx) {
+    fill_q3_block(q3_rows[idx], static_cast<uint32_t>(idx + 19u));
+  }
+
+  std::array<float, k * cols> rhs = {};
+  for (size_t i = 0; i < rhs.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 7u) % 37u) - 18;
+    rhs[i] = static_cast<float>(centered) * 0.0625f;
+  }
+
+  float expected[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_quantized_src(q3_rows.data(), dtype::q3_k, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(expected, dtype::f32, cols, rows),
+  };
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  if (host_has_avx2_fma()) {
+    float optimized_out[rows * cols] = {};
+    const emel::kernel::event::op_mul_mat optimized_ev{
+        .src0 = make_quantized_src(q3_rows.data(), dtype::q3_k, k, rows),
+        .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+        .dst = make_dst(optimized_out, dtype::f32, cols, rows),
+    };
+    x86_64_sm optimized_machine{
+        emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+
+    CHECK(optimized_machine.process_event(optimized_ev));
+    CHECK(optimized_machine.optimized_q3_dispatch_count() == 1u);
+    CHECK(optimized_machine.shared_q3_dispatch_count() == 0u);
+    for (size_t i = 0; i < std::size(optimized_out); ++i) {
+      CHECK(optimized_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+    }
+  }
+
+  float shared_out[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat shared_ev{
+      .src0 = make_quantized_src(q3_rows.data(), dtype::q3_k, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(shared_out, dtype::f32, cols, rows),
+  };
+  x86_64_sm shared_machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(false), {}, 0}};
+
+  CHECK(shared_machine.process_event(shared_ev));
+  CHECK(shared_machine.optimized_q3_dispatch_count() == 0u);
+  CHECK(shared_machine.shared_q3_dispatch_count() == 1u);
+  for (size_t i = 0; i < std::size(shared_out); ++i) {
+    CHECK(shared_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+  }
+}
+
+TEST_CASE("kernel_x86_64_q4_mul_mat_uses_optimized_and_shared_routes") {
+  constexpr size_t block_count = 2u;
+  constexpr uint64_t k = QK_K * block_count;
+  constexpr uint64_t rows = 2u;
+  constexpr uint64_t cols = 2u;
+
+  std::array<block_q4_k, rows * block_count> q4_rows = {};
+  for (size_t idx = 0; idx < q4_rows.size(); ++idx) {
+    fill_q4_block(q4_rows[idx], static_cast<uint32_t>(idx + 29u));
+  }
+
+  std::array<float, k * cols> rhs = {};
+  for (size_t i = 0; i < rhs.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 7u) % 37u) - 18;
+    rhs[i] = static_cast<float>(centered) * 0.0625f;
+  }
+
+  float expected[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_quantized_src(q4_rows.data(), dtype::q4_k, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(expected, dtype::f32, cols, rows),
+  };
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  if (host_has_avx2_fma()) {
+    float optimized_out[rows * cols] = {};
+    const emel::kernel::event::op_mul_mat optimized_ev{
+        .src0 = make_quantized_src(q4_rows.data(), dtype::q4_k, k, rows),
+        .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+        .dst = make_dst(optimized_out, dtype::f32, cols, rows),
+    };
+    x86_64_sm optimized_machine{
+        emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+
+    CHECK(optimized_machine.process_event(optimized_ev));
+    CHECK(optimized_machine.optimized_q4_dispatch_count() == 1u);
+    CHECK(optimized_machine.shared_q4_dispatch_count() == 0u);
+    for (size_t i = 0; i < std::size(optimized_out); ++i) {
+      CHECK(optimized_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+    }
+  }
+
+  float shared_out[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat shared_ev{
+      .src0 = make_quantized_src(q4_rows.data(), dtype::q4_k, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(shared_out, dtype::f32, cols, rows),
+  };
+  x86_64_sm shared_machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(false), {}, 0}};
+
+  CHECK(shared_machine.process_event(shared_ev));
+  CHECK(shared_machine.optimized_q4_dispatch_count() == 0u);
+  CHECK(shared_machine.shared_q4_dispatch_count() == 1u);
+  for (size_t i = 0; i < std::size(shared_out); ++i) {
+    CHECK(shared_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+  }
+}
+
+TEST_CASE("kernel_x86_64_q4_0_row_avx2_fma_matches_scalar") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr size_t block_count = 8u;
+  constexpr size_t k = QK4_0 * block_count;
+  std::array<block_q4_0, block_count> q4_blocks = {};
+  for (size_t block = 0; block < block_count; ++block) {
+    fill_q4_0_block(q4_blocks[block], static_cast<uint32_t>(block + 3u));
+  }
+
+  std::array<float, k> rhs_values = {};
+  for (size_t i = 0; i < rhs_values.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 5u) % 43u) - 21;
+    rhs_values[i] = static_cast<float>(centered) * 0.03125f;
+  }
+  std::array<block_q8_0, block_count> q8_blocks = {};
+  emel::kernel::detail::quant::quantize_row_q8_0_strided(
+      rhs_values.data(), 1u, q8_blocks.data(), static_cast<int64_t>(k));
+
+  const float scalar = emel::kernel::detail::dot_q4_0_q8_0_row_scalar(
+      q4_blocks.data(), q8_blocks.data(), block_count);
+  const float optimized =
+      emel::kernel::x86_64::detail::dot_q4_0_q8_0_row_avx2_fma(
+          q4_blocks.data(), q8_blocks.data(), block_count);
+
+  CHECK(optimized == doctest::Approx(scalar).epsilon(1e-6f));
+}
+
+TEST_CASE("kernel_x86_64_q4_1_row_avx2_fma_matches_scalar") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr size_t block_count = 8u;
+  constexpr size_t k = QK4_1 * block_count;
+  std::array<block_q4_1, block_count> q4_blocks = {};
+  for (size_t block = 0; block < block_count; ++block) {
+    fill_q4_1_block(q4_blocks[block], static_cast<uint32_t>(block + 5u));
+  }
+
+  std::array<float, k> rhs_values = {};
+  for (size_t i = 0; i < rhs_values.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 7u) % 39u) - 19;
+    rhs_values[i] = static_cast<float>(centered) * 0.03125f;
+  }
+  std::array<block_q8_0, block_count> q8_blocks = {};
+  emel::kernel::detail::quant::quantize_row_q8_0_strided(
+      rhs_values.data(), 1u, q8_blocks.data(), static_cast<int64_t>(k));
+
+  const float scalar = emel::kernel::detail::dot_q4_1_q8_0_row_scalar(
+      q4_blocks.data(), q8_blocks.data(), block_count);
+  const float optimized =
+      emel::kernel::x86_64::detail::dot_q4_1_q8_0_row_avx2_fma(
+          q4_blocks.data(), q8_blocks.data(), block_count);
+
+  CHECK(optimized == doctest::Approx(scalar).epsilon(1e-6f));
+}
+
+TEST_CASE("kernel_x86_64_q5_0_row_avx2_fma_matches_scalar") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr size_t block_count = 8u;
+  constexpr size_t k = QK5_0 * block_count;
+  std::array<block_q5_0, block_count> q5_blocks = {};
+  for (size_t block = 0; block < block_count; ++block) {
+    fill_q5_0_block(q5_blocks[block], static_cast<uint32_t>(block + 7u));
+  }
+
+  std::array<float, k> rhs_values = {};
+  for (size_t i = 0; i < rhs_values.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 11u) % 47u) - 23;
+    rhs_values[i] = static_cast<float>(centered) * 0.03125f;
+  }
+  std::array<block_q8_0, block_count> q8_blocks = {};
+  emel::kernel::detail::quant::quantize_row_q8_0_strided(
+      rhs_values.data(), 1u, q8_blocks.data(), static_cast<int64_t>(k));
+
+  const float scalar = emel::kernel::detail::dot_q5_0_q8_0_row_scalar(
+      q5_blocks.data(), q8_blocks.data(), block_count);
+  const float optimized =
+      emel::kernel::x86_64::detail::dot_q5_0_q8_0_row_avx2_fma(
+          q5_blocks.data(), q8_blocks.data(), block_count);
+
+  CHECK(optimized == doctest::Approx(scalar).epsilon(1e-6f));
+}
+
+TEST_CASE("kernel_x86_64_q8_0_row_avx2_fma_matches_scalar") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr size_t block_count = 8u;
+  constexpr size_t k = QK8_0 * block_count;
+  std::array<block_q8_0, block_count> lhs_blocks = {};
+  for (size_t block = 0; block < block_count; ++block) {
+    fill_q8_0_block(lhs_blocks[block], static_cast<uint32_t>(block + 9u));
+  }
+
+  std::array<float, k> rhs_values = {};
+  for (size_t i = 0; i < rhs_values.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 13u) % 51u) - 25;
+    rhs_values[i] = static_cast<float>(centered) * 0.03125f;
+  }
+  std::array<block_q8_0, block_count> q8_blocks = {};
+  emel::kernel::detail::quant::quantize_row_q8_0_strided(
+      rhs_values.data(), 1u, q8_blocks.data(), static_cast<int64_t>(k));
+
+  const float scalar = emel::kernel::detail::dot_q8_0_q8_0_row_scalar(
+      lhs_blocks.data(), q8_blocks.data(), block_count);
+  const float optimized =
+      emel::kernel::x86_64::detail::dot_q8_0_q8_0_row_avx2_fma(
+          lhs_blocks.data(), q8_blocks.data(), block_count);
+
+  CHECK(optimized == doctest::Approx(scalar).epsilon(1e-6f));
+}
+
+TEST_CASE("kernel_x86_64_q4_0_mul_mat_uses_optimized_and_shared_routes") {
+  constexpr size_t block_count = 4u;
+  constexpr uint64_t k = QK4_0 * block_count;
+  constexpr uint64_t rows = 2u;
+  constexpr uint64_t cols = 2u;
+
+  std::array<block_q4_0, rows * block_count> q4_rows = {};
+  for (size_t idx = 0; idx < q4_rows.size(); ++idx) {
+    fill_q4_0_block(q4_rows[idx], static_cast<uint32_t>(idx + 31u));
+  }
+
+  std::array<float, k * cols> rhs = {};
+  for (size_t i = 0; i < rhs.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 7u) % 37u) - 18;
+    rhs[i] = static_cast<float>(centered) * 0.0625f;
+  }
+
+  float expected[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_quantized_src(q4_rows.data(), dtype::q4_0, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(expected, dtype::f32, cols, rows),
+  };
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  if (host_has_avx2_fma()) {
+    float optimized_out[rows * cols] = {};
+    const emel::kernel::event::op_mul_mat optimized_ev{
+        .src0 = make_quantized_src(q4_rows.data(), dtype::q4_0, k, rows),
+        .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+        .dst = make_dst(optimized_out, dtype::f32, cols, rows),
+    };
+    x86_64_sm optimized_machine{
+        emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+
+    CHECK(optimized_machine.process_event(optimized_ev));
+    CHECK(optimized_machine.optimized_q4_0_dispatch_count() == 1u);
+    CHECK(optimized_machine.shared_q4_0_dispatch_count() == 0u);
+    for (size_t i = 0; i < std::size(optimized_out); ++i) {
+      CHECK(optimized_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+    }
+  }
+
+  float shared_out[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat shared_ev{
+      .src0 = make_quantized_src(q4_rows.data(), dtype::q4_0, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(shared_out, dtype::f32, cols, rows),
+  };
+  x86_64_sm shared_machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(false), {}, 0}};
+
+  CHECK(shared_machine.process_event(shared_ev));
+  CHECK(shared_machine.optimized_q4_0_dispatch_count() == 0u);
+  CHECK(shared_machine.shared_q4_0_dispatch_count() == 1u);
+  for (size_t i = 0; i < std::size(shared_out); ++i) {
+    CHECK(shared_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+  }
+}
+
+TEST_CASE("kernel_x86_64_q4_1_mul_mat_uses_optimized_and_shared_routes") {
+  constexpr size_t block_count = 4u;
+  constexpr uint64_t k = QK4_1 * block_count;
+  constexpr uint64_t rows = 2u;
+  constexpr uint64_t cols = 2u;
+
+  std::array<block_q4_1, rows * block_count> q4_rows = {};
+  for (size_t idx = 0; idx < q4_rows.size(); ++idx) {
+    fill_q4_1_block(q4_rows[idx], static_cast<uint32_t>(idx + 37u));
+  }
+
+  std::array<float, k * cols> rhs = {};
+  for (size_t i = 0; i < rhs.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 5u) % 33u) - 16;
+    rhs[i] = static_cast<float>(centered) * 0.0625f;
+  }
+
+  float expected[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_quantized_src(q4_rows.data(), dtype::q4_1, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(expected, dtype::f32, cols, rows),
+  };
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  if (host_has_avx2_fma()) {
+    float optimized_out[rows * cols] = {};
+    const emel::kernel::event::op_mul_mat optimized_ev{
+        .src0 = make_quantized_src(q4_rows.data(), dtype::q4_1, k, rows),
+        .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+        .dst = make_dst(optimized_out, dtype::f32, cols, rows),
+    };
+    x86_64_sm optimized_machine{
+        emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+
+    CHECK(optimized_machine.process_event(optimized_ev));
+    CHECK(optimized_machine.optimized_q4_1_dispatch_count() == 1u);
+    CHECK(optimized_machine.shared_q4_1_dispatch_count() == 0u);
+    for (size_t i = 0; i < std::size(optimized_out); ++i) {
+      CHECK(optimized_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+    }
+  }
+
+  float shared_out[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat shared_ev{
+      .src0 = make_quantized_src(q4_rows.data(), dtype::q4_1, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(shared_out, dtype::f32, cols, rows),
+  };
+  x86_64_sm shared_machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(false), {}, 0}};
+
+  CHECK(shared_machine.process_event(shared_ev));
+  CHECK(shared_machine.optimized_q4_1_dispatch_count() == 0u);
+  CHECK(shared_machine.shared_q4_1_dispatch_count() == 1u);
+  for (size_t i = 0; i < std::size(shared_out); ++i) {
+    CHECK(shared_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+  }
+}
+
+TEST_CASE("kernel_x86_64_q5_0_mul_mat_uses_optimized_and_shared_routes") {
+  constexpr size_t block_count = 4u;
+  constexpr uint64_t k = QK5_0 * block_count;
+  constexpr uint64_t rows = 2u;
+  constexpr uint64_t cols = 2u;
+
+  std::array<block_q5_0, rows * block_count> q5_rows = {};
+  for (size_t idx = 0; idx < q5_rows.size(); ++idx) {
+    fill_q5_0_block(q5_rows[idx], static_cast<uint32_t>(idx + 41u));
+  }
+
+  std::array<float, k * cols> rhs = {};
+  for (size_t i = 0; i < rhs.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 11u) % 41u) - 20;
+    rhs[i] = static_cast<float>(centered) * 0.0625f;
+  }
+
+  float expected[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_quantized_src(q5_rows.data(), dtype::q5_0, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(expected, dtype::f32, cols, rows),
+  };
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  if (host_has_avx2_fma()) {
+    float optimized_out[rows * cols] = {};
+    const emel::kernel::event::op_mul_mat optimized_ev{
+        .src0 = make_quantized_src(q5_rows.data(), dtype::q5_0, k, rows),
+        .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+        .dst = make_dst(optimized_out, dtype::f32, cols, rows),
+    };
+    x86_64_sm optimized_machine{
+        emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+
+    CHECK(optimized_machine.process_event(optimized_ev));
+    CHECK(optimized_machine.optimized_q5_0_dispatch_count() == 1u);
+    CHECK(optimized_machine.shared_q5_0_dispatch_count() == 0u);
+    for (size_t i = 0; i < std::size(optimized_out); ++i) {
+      CHECK(optimized_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+    }
+  }
+
+  float shared_out[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat shared_ev{
+      .src0 = make_quantized_src(q5_rows.data(), dtype::q5_0, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(shared_out, dtype::f32, cols, rows),
+  };
+  x86_64_sm shared_machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(false), {}, 0}};
+
+  CHECK(shared_machine.process_event(shared_ev));
+  CHECK(shared_machine.optimized_q5_0_dispatch_count() == 0u);
+  CHECK(shared_machine.shared_q5_0_dispatch_count() == 1u);
+  for (size_t i = 0; i < std::size(shared_out); ++i) {
+    CHECK(shared_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+  }
+}
+
+TEST_CASE("kernel_x86_64_q8_0_mul_mat_uses_optimized_and_shared_routes") {
+  constexpr size_t block_count = 4u;
+  constexpr uint64_t k = QK8_0 * block_count;
+  constexpr uint64_t rows = 2u;
+  constexpr uint64_t cols = 2u;
+
+  std::array<block_q8_0, rows * block_count> q8_rows = {};
+  for (size_t idx = 0; idx < q8_rows.size(); ++idx) {
+    fill_q8_0_block(q8_rows[idx], static_cast<uint32_t>(idx + 43u));
+  }
+
+  std::array<float, k * cols> rhs = {};
+  for (size_t i = 0; i < rhs.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 13u) % 45u) - 22;
+    rhs[i] = static_cast<float>(centered) * 0.0625f;
+  }
+
+  float expected[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_quantized_src(q8_rows.data(), dtype::q8_0, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(expected, dtype::f32, cols, rows),
+  };
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  if (host_has_avx2_fma()) {
+    float optimized_out[rows * cols] = {};
+    const emel::kernel::event::op_mul_mat optimized_ev{
+        .src0 = make_quantized_src(q8_rows.data(), dtype::q8_0, k, rows),
+        .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+        .dst = make_dst(optimized_out, dtype::f32, cols, rows),
+    };
+    x86_64_sm optimized_machine{
+        emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+
+    CHECK(optimized_machine.process_event(optimized_ev));
+    CHECK(optimized_machine.optimized_q8_0_dispatch_count() == 1u);
+    CHECK(optimized_machine.shared_q8_0_dispatch_count() == 0u);
+    for (size_t i = 0; i < std::size(optimized_out); ++i) {
+      CHECK(optimized_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+    }
+  }
+
+  float shared_out[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat shared_ev{
+      .src0 = make_quantized_src(q8_rows.data(), dtype::q8_0, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(shared_out, dtype::f32, cols, rows),
+  };
+  x86_64_sm shared_machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(false), {}, 0}};
+
+  CHECK(shared_machine.process_event(shared_ev));
+  CHECK(shared_machine.optimized_q8_0_dispatch_count() == 0u);
+  CHECK(shared_machine.shared_q8_0_dispatch_count() == 1u);
+  for (size_t i = 0; i < std::size(shared_out); ++i) {
+    CHECK(shared_out[i] == doctest::Approx(expected[i]).epsilon(1e-6f));
+  }
+}
+
+TEST_CASE("kernel_x86_64_q6_row_avx2_fma_matches_scalar") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr size_t block_count = 4u;
+  std::array<block_q6_k, block_count> q6_blocks = {};
+  for (size_t block = 0; block < block_count; ++block) {
+    fill_q6_block(q6_blocks[block], static_cast<uint32_t>(block + 29u));
+  }
+
+  const auto rhs_values = make_quantized_rhs_values<block_count>(11u);
+  std::array<block_q8_k, block_count> q8_blocks = {};
+  emel::kernel::detail::quant::quantize_row_q8_k_strided(
+      rhs_values.data(), 1u, q8_blocks.data(),
+      static_cast<int64_t>(QK_K * block_count));
+
+  const float scalar = emel::kernel::detail::dot_q6_k_q8_k_row_scalar(
+      q6_blocks.data(), q8_blocks.data(), block_count);
+  const float optimized =
+      emel::kernel::x86_64::detail::dot_q6_k_q8_k_row_avx2_fma(
+          q6_blocks.data(), q8_blocks.data(), block_count);
+
+  CHECK(optimized == doctest::Approx(scalar).epsilon(1e-5f));
+}
+
+TEST_CASE("kernel_x86_64_q6_mul_mat_uses_optimized_and_shared_routes") {
+  constexpr size_t block_count = 2u;
+  constexpr uint64_t k = QK_K * block_count;
+  constexpr uint64_t rows = 2u;
+  constexpr uint64_t cols = 2u;
+
+  std::array<block_q6_k, rows * block_count> q6_rows = {};
+  for (size_t idx = 0; idx < q6_rows.size(); ++idx) {
+    fill_q6_block(q6_rows[idx], static_cast<uint32_t>(idx + 37u));
+  }
+
+  std::array<float, k * cols> rhs = {};
+  for (size_t i = 0; i < rhs.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 11u) % 43u) - 21;
+    rhs[i] = static_cast<float>(centered) * 0.0625f;
+  }
+
+  float expected[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_quantized_src(q6_rows.data(), dtype::q6_k, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(expected, dtype::f32, cols, rows),
+  };
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  if (host_has_avx2_fma()) {
+    float optimized_out[rows * cols] = {};
+    const emel::kernel::event::op_mul_mat optimized_ev{
+        .src0 = make_quantized_src(q6_rows.data(), dtype::q6_k, k, rows),
+        .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+        .dst = make_dst(optimized_out, dtype::f32, cols, rows),
+    };
+    x86_64_sm optimized_machine{
+        emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+
+    CHECK(optimized_machine.process_event(optimized_ev));
+    CHECK(optimized_machine.optimized_q6_dispatch_count() == 1u);
+    CHECK(optimized_machine.shared_q6_dispatch_count() == 0u);
+    for (size_t i = 0; i < std::size(optimized_out); ++i) {
+      CHECK(optimized_out[i] == doctest::Approx(expected[i]).epsilon(1e-5f));
+    }
+  }
+
+  float shared_out[rows * cols] = {};
+  const emel::kernel::event::op_mul_mat shared_ev{
+      .src0 = make_quantized_src(q6_rows.data(), dtype::q6_k, k, rows),
+      .src1 = make_src(rhs.data(), dtype::f32, cols, k),
+      .dst = make_dst(shared_out, dtype::f32, cols, rows),
+  };
+  x86_64_sm shared_machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(false), {}, 0}};
+
+  CHECK(shared_machine.process_event(shared_ev));
+  CHECK(shared_machine.optimized_q6_dispatch_count() == 0u);
+  CHECK(shared_machine.shared_q6_dispatch_count() == 1u);
+  for (size_t i = 0; i < std::size(shared_out); ++i) {
+    CHECK(shared_out[i] == doctest::Approx(expected[i]).epsilon(1e-5f));
+  }
+}
+
+TEST_CASE("kernel_x86_64_quantized_hot_path_dispatches_without_allocation") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr size_t block_count = 1u;
+  constexpr uint64_t k = QK_K * block_count;
+  std::array<float, k> rhs = {};
+  for (size_t i = 0; i < rhs.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * 13u) % 47u) - 23;
+    rhs[i] = static_cast<float>(centered) * 0.03125f;
+  }
+
+  block_q2_k q2 = {};
+  block_q3_k q3 = {};
+  block_q6_k q6 = {};
+  fill_q2_block(q2, 43u);
+  fill_q3_block(q3, 47u);
+  fill_q6_block(q6, 53u);
+
+  float q2_out[1] = {};
+  float q3_out[1] = {};
+  float q6_out[1] = {};
+  const emel::kernel::event::op_mul_mat q2_ev{
+      .src0 = make_quantized_src(&q2, dtype::q2_k, k, 1u),
+      .src1 = make_src(rhs.data(), dtype::f32, 1u, k),
+      .dst = make_dst(q2_out, dtype::f32, 1u, 1u),
+  };
+  const emel::kernel::event::op_mul_mat q3_ev{
+      .src0 = make_quantized_src(&q3, dtype::q3_k, k, 1u),
+      .src1 = make_src(rhs.data(), dtype::f32, 1u, k),
+      .dst = make_dst(q3_out, dtype::f32, 1u, 1u),
+  };
+  const emel::kernel::event::op_mul_mat q6_ev{
+      .src0 = make_quantized_src(&q6, dtype::q6_k, k, 1u),
+      .src1 = make_src(rhs.data(), dtype::f32, 1u, k),
+      .dst = make_dst(q6_out, dtype::f32, 1u, 1u),
+  };
+
+  x86_64_sm machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+  allocation_scope allocations{};
+  CHECK(machine.process_event(q2_ev));
+  CHECK(machine.process_event(q3_ev));
+  CHECK(machine.process_event(q6_ev));
+  CHECK(allocations.allocations() == 0u);
+  CHECK(machine.optimized_q2_dispatch_count() == 1u);
+  CHECK(machine.shared_q2_dispatch_count() == 0u);
+  CHECK(machine.optimized_q3_dispatch_count() == 1u);
+  CHECK(machine.shared_q3_dispatch_count() == 0u);
+  CHECK(machine.optimized_q6_dispatch_count() == 1u);
+  CHECK(machine.shared_q6_dispatch_count() == 0u);
+}
+
+TEST_CASE("kernel_x86_64_flash_attn_ext_uses_optimized_backend_path") {
+  const bool host_flash =
+      emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
+      emel::kernel::x86_64::detail::detect_avx2() &&
+      emel::kernel::x86_64::detail::detect_fma() &&
+      emel::kernel::x86_64::detail::detect_f16c();
+  if (!host_flash ) {
+    return;
+  }
+
+  flash_attn_ext_fixture fixture{};
+  const auto request = make_flash_attn_ext_event(fixture);
+  const emel::kernel::x86_64::detail::host_feature_contract contract{
+      .avx2_available = true,
+      .fma_available = true,
+      .f16c_available = true,
+  };
+  x86_64_sm machine{emel::kernel::x86_64::action::context{contract, {}, 0}};
+
+  CHECK(machine.process_event(request));
+
+  const auto q = std::span<const float>(fixture.q, request.src0.ne[0]);
+  const auto k = std::span<const uint16_t>(fixture.k, request.src0.ne[0] *
+                                                          request.src1.ne[1]);
+  const auto v = std::span<const uint16_t>(fixture.v, request.src0.ne[0] *
+                                                          request.src2.ne[1]);
+  const std::vector<float> expected = flash_attn_reference_f16_scores(
+      q, k, v, request.src0.ne[0], request.src1.ne[1], 1.0f);
+  CHECK(machine.optimized_flash_dispatch_count() == 1u);
+  CHECK(machine.shared_flash_dispatch_count() == 0u);
+  CHECK(within_flash_online_f16_tolerance(fixture.dst[0], expected[0]));
+  CHECK(within_flash_online_f16_tolerance(fixture.dst[1], expected[1]));
+  CHECK(within_flash_online_f16_tolerance(fixture.dst[2], expected[2]));
+  CHECK(within_flash_online_f16_tolerance(fixture.dst[3], expected[3]));
+}
+
+TEST_CASE(
+    "kernel_x86_64_flash_attn_ext_falls_back_when_feature_contract_disabled") {
+  flash_attn_ext_fixture fixture{};
+  const auto request = make_flash_attn_ext_event(fixture);
+  const emel::kernel::x86_64::detail::host_feature_contract contract{
+      .avx2_available = false,
+      .fma_available = false,
+      .f16c_available = false,
+  };
+  x86_64_sm machine{emel::kernel::x86_64::action::context{contract, {}, 0}};
+
+  CHECK(machine.process_event(request));
+
+  const auto q = std::span<const float>(fixture.q, request.src0.ne[0]);
+  const auto k = std::span<const uint16_t>(fixture.k, request.src0.ne[0] *
+                                                          request.src1.ne[1]);
+  const auto v = std::span<const uint16_t>(fixture.v, request.src0.ne[0] *
+                                                          request.src2.ne[1]);
+  const std::vector<float> expected = flash_attn_reference_f16_scores(
+      q, k, v, request.src0.ne[0], request.src1.ne[1], 1.0f);
+  CHECK(machine.optimized_flash_dispatch_count() == 0u);
+  CHECK(machine.shared_flash_dispatch_count() == 1u);
+  CHECK(within_flash_online_f16_tolerance(fixture.dst[0], expected[0]));
+  CHECK(within_flash_online_f16_tolerance(fixture.dst[1], expected[1]));
+  CHECK(within_flash_online_f16_tolerance(fixture.dst[2], expected[2]));
+  CHECK(within_flash_online_f16_tolerance(fixture.dst[3], expected[3]));
+}
+
+TEST_CASE("kernel_x86_64_flash_attn_ext_reuses_persistent_workspace_on_"
+          "optimized_path") {
+  const bool host_flash =
+      emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
+      emel::kernel::x86_64::detail::detect_avx2() &&
+      emel::kernel::x86_64::detail::detect_fma() &&
+      emel::kernel::x86_64::detail::detect_f16c();
+  if (!host_flash) {
+    return;
+  }
+
+  flash_attn_ext_fixture fixture{};
+  const auto request = make_flash_attn_ext_event(fixture);
+  const emel::kernel::x86_64::detail::host_feature_contract contract{
+      .avx2_available = true,
+      .fma_available = true,
+      .f16c_available = true,
+  };
+  x86_64_sm machine{emel::kernel::x86_64::action::context{contract, {}, 0}};
+
+  CHECK(machine.process_event(request));
+  CHECK(machine.flash_attn_workspace_prepared_tokens() == 2u);
+  CHECK(machine.flash_attn_workspace_reuse_count() == 0u);
+
+  std::fill_n(fixture.dst, request.dst.ne[0], 0.0f);
+  CHECK(machine.process_event(request));
+  CHECK(machine.optimized_flash_dispatch_count() == 2u);
+  CHECK(machine.shared_flash_dispatch_count() == 0u);
+  CHECK(machine.flash_attn_workspace_prepared_tokens() == 2u);
+  CHECK(machine.flash_attn_workspace_reuse_count() == 1u);
+}
+
+TEST_CASE("kernel_x86_64_flash_attn_ext_matches_masked_total_token_reference") {
+  if (!host_has_avx2_fma_f16c()) {
+    return;
+  }
+
+  flash_attn_ext_fixture fixture{};
+  auto request = make_flash_attn_ext_event(fixture);
+  const uint32_t masked_total_tokens = 8u;
+  const float scale = 1.0f;
+  std::memcpy(request.op_params.data(), &scale, sizeof(scale));
+  std::memcpy(request.op_params.data() + sizeof(scale), &masked_total_tokens,
+              sizeof(masked_total_tokens));
+  request.op_params_size = sizeof(scale) + sizeof(masked_total_tokens);
+  const emel::kernel::x86_64::detail::host_feature_contract contract{
+      .avx2_available = true,
+      .fma_available = true,
+      .f16c_available = true,
+  };
+  x86_64_sm machine{emel::kernel::x86_64::action::context{contract, {}, 0}};
+
+  CHECK(machine.process_event(request));
+
+  const auto q = std::span<const float>(fixture.q, request.src0.ne[0]);
+  const auto k = std::span<const uint16_t>(fixture.k, request.src0.ne[0] *
+                                                          request.src1.ne[1]);
+  const auto v = std::span<const uint16_t>(fixture.v, request.src0.ne[0] *
+                                                          request.src2.ne[1]);
+  const std::vector<float> expected = flash_attn_reference_masked_total_tokens(
+      q, k, v, request.src0.ne[0], request.src1.ne[1], masked_total_tokens,
+      scale);
+  CHECK(machine.optimized_flash_dispatch_count() == 1u);
+  CHECK(machine.shared_flash_dispatch_count() == 0u);
+  CHECK(fixture.dst[0] ==
+        doctest::Approx(expected[0]).epsilon(k_flash_online_f16_abs_tolerance));
+  CHECK(fixture.dst[1] ==
+        doctest::Approx(expected[1]).epsilon(k_flash_online_f16_abs_tolerance));
+  CHECK(fixture.dst[2] ==
+        doctest::Approx(expected[2]).epsilon(k_flash_online_f16_abs_tolerance));
+  CHECK(fixture.dst[3] ==
+        doctest::Approx(expected[3]).epsilon(k_flash_online_f16_abs_tolerance));
+}
+
+TEST_CASE(
+    "kernel_x86_64_flash_attn_ext_matches_masked_total_token_reference_on_"
+    "long_multihead_kv") {
+  if (!host_has_avx2_fma_f16c()) {
+    return;
+  }
+
+  constexpr uint64_t head_dim = 64u;
+  constexpr uint64_t head_count = 12u;
+  constexpr uint64_t kv_head_count = 12u;
+  constexpr uint64_t kv_tokens = 769u;
+  constexpr uint32_t total_tokens = 2048u;
+  const uint64_t kv_dim = head_dim * kv_head_count;
+  const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
+
+  std::vector<float> q(head_dim * head_count);
+  std::vector<float> k(kv_dim * kv_tokens);
+  std::vector<float> v(kv_dim * kv_tokens);
+  std::vector<float> dst(head_dim * head_count, 0.0f);
+
+  for (uint64_t head = 0; head < head_count; ++head) {
+    for (uint64_t dim = 0; dim < head_dim; ++dim) {
+      const double angle = static_cast<double>((head + 1u) * (dim + 3u));
+      q[head * head_dim + dim] =
+          emel::kernel::detail::quant::fp16_to_fp32(
+              emel::kernel::detail::quant::fp32_to_fp16(
+                  static_cast<float>(std::sin(angle * 0.03125))));
+    }
+  }
+
+  for (uint64_t token = 0; token < kv_tokens; ++token) {
+    for (uint64_t head = 0; head < kv_head_count; ++head) {
+      for (uint64_t dim = 0; dim < head_dim; ++dim) {
+        const uint64_t offset = token * kv_dim + head * head_dim + dim;
+        const double base =
+            static_cast<double>((token + 1u) * (head + 3u) * (dim + 5u));
+        k[offset] = emel::kernel::detail::quant::fp16_to_fp32(
+            emel::kernel::detail::quant::fp32_to_fp16(
+                static_cast<float>(std::cos(base * 0.0078125))));
+        v[offset] = emel::kernel::detail::quant::fp16_to_fp32(
+            emel::kernel::detail::quant::fp32_to_fp16(
+                static_cast<float>(std::sin(base * 0.01171875))));
+      }
+    }
+  }
+  const auto k_fp16 = to_fp16_storage(k);
+  const auto v_fp16 = to_fp16_storage(v);
+
+  emel::kernel::event::op_flash_attn_ext request{};
+  request.src0 = make_src(q.data(), dtype::f32, head_dim, 1u, head_count);
+  request.src1 = make_src(k_fp16.data(), dtype::f16, head_dim, kv_tokens,
+                          kv_head_count);
+  request.src2 = make_src(v_fp16.data(), dtype::f16, head_dim, kv_tokens,
+                          kv_head_count);
+  request.dst = make_dst(dst.data(), dtype::f32, head_dim, 1u, head_count);
+  request.src1.nb[1] = sizeof(uint16_t) * kv_dim;
+  request.src1.nb[2] = sizeof(uint16_t) * head_dim;
+  request.src2.nb[1] = sizeof(uint16_t) * kv_dim;
+  request.src2.nb[2] = sizeof(uint16_t) * head_dim;
+  std::memcpy(request.op_params.data(), &scale, sizeof(scale));
+  std::memcpy(request.op_params.data() + sizeof(scale), &total_tokens,
+              sizeof(total_tokens));
+  request.op_params_size = sizeof(scale) + sizeof(total_tokens);
+
+  x86_64_sm machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+  REQUIRE(machine.process_event(request));
+  CHECK(machine.optimized_flash_dispatch_count() == 1u);
+  CHECK(machine.shared_flash_dispatch_count() == 0u);
+
+  std::vector<float> expected(head_dim * head_count, 0.0f);
+  for (uint64_t head = 0; head < head_count; ++head) {
+    std::vector<float> k_head(kv_tokens * head_dim);
+    std::vector<float> v_head(kv_tokens * head_dim);
+    for (uint64_t token = 0; token < kv_tokens; ++token) {
+      const uint64_t src_offset = token * kv_dim + head * head_dim;
+      const uint64_t dst_offset = token * head_dim;
+      std::memcpy(k_head.data() + dst_offset, k.data() + src_offset,
+                  sizeof(float) * head_dim);
+      std::memcpy(v_head.data() + dst_offset, v.data() + src_offset,
+                  sizeof(float) * head_dim);
+    }
+    const auto expected_head = flash_attn_reference_masked_total_tokens(
+        std::span<const float>(q.data() + head * head_dim, head_dim),
+        std::span<const float>(k_head.data(), k_head.size()),
+        std::span<const float>(v_head.data(), v_head.size()), head_dim,
+        kv_tokens, total_tokens, scale);
+    for (uint64_t dim = 0; dim < head_dim; ++dim) {
+      expected[head * head_dim + dim] =
+          expected_head[static_cast<size_t>(dim)];
+    }
+  }
+
+  for (size_t idx = 0; idx < dst.size(); ++idx) {
+    CHECK(within_flash_online_f16_tolerance(dst[idx], expected[idx]));
+  }
+}
+
+TEST_CASE("kernel_x86_64_host_feature_contract_can_fail_closed") {
+  const emel::kernel::x86_64::detail::host_feature_contract contract{};
+  const x86_64_sm machine{
+      emel::kernel::x86_64::action::context{contract, {}, 0}};
+
+  CHECK_FALSE(machine.avx2_available());
+  CHECK_FALSE(machine.fma_available());
+  CHECK_FALSE(machine.f16c_available());
+  CHECK_FALSE(machine.avx2_fma_f16c_available());
+  CHECK_FALSE(machine.avx512_claimed());
+  CHECK_FALSE(machine.avx_vnni_claimed());
+  CHECK_FALSE(machine.amx_claimed());
+  CHECK_FALSE(machine.bf16_claimed());
+  CHECK_FALSE(machine.native_fp16_claimed());
+}
+
+TEST_CASE("kernel_x86_64_detects_host_feature_contract") {
+  const auto contract =
+      emel::kernel::x86_64::detail::detect_host_feature_contract();
+
+  CHECK(contract.avx2_available == emel::kernel::x86_64::detail::detect_avx2());
+  CHECK(contract.fma_available == emel::kernel::x86_64::detail::detect_fma());
+  CHECK(contract.f16c_available == emel::kernel::x86_64::detail::detect_f16c());
+  CHECK(contract.avx2_fma_f16c_available() ==
+        (contract.avx2_available && contract.fma_available &&
+         contract.f16c_available));
+  CHECK_FALSE(contract.avx512_claimed);
+  CHECK_FALSE(contract.avx_vnni_claimed);
+  CHECK_FALSE(contract.amx_claimed);
+  CHECK_FALSE(contract.bf16_claimed);
+  CHECK_FALSE(contract.native_fp16_claimed);
+}
+
 TEST_CASE("kernel_x86_64_unary_subop_supported_and_unsupported_paths") {
   float src[4] = {-2.0f, -1.0f, 1.0f, 2.0f};
   float dst[4] = {};
@@ -138,7 +1313,6 @@ TEST_CASE("kernel_x86_64_unary_subop_supported_and_unsupported_paths") {
   emel::kernel::event::op_unary unary_ev{
       .src0 = make_src(src, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
       .subop = emel::kernel::event::unary_subop::neg,
   };
 
@@ -207,7 +1381,6 @@ TEST_CASE("kernel_x86_64_rejects_unimplemented_ops") {
   const emel::kernel::event::op_sum sum_ev{
       .src0 = make_src(src, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
 
   x86_64_sm machine{};
@@ -215,8 +1388,9 @@ TEST_CASE("kernel_x86_64_rejects_unimplemented_ops") {
 }
 
 TEST_CASE("kernel_x86_64_mul_mat_simd_matches_scalar_tiled_edges") {
-  const bool host_avx2 = emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
-                         emel::kernel::x86_64::detail::detect_avx2();
+  const bool host_avx2 =
+      emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
+      emel::kernel::x86_64::detail::detect_avx2();
   if (!host_avx2) {
     return;
   }
@@ -243,13 +1417,11 @@ TEST_CASE("kernel_x86_64_mul_mat_simd_matches_scalar_tiled_edges") {
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_simd.data(), dtype::f32, n, m),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat scalar_ev{
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_scalar.data(), dtype::f32, n, m),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::x86_64::detail::execute_avx2_mul_mat(simd_ev));
@@ -261,9 +1433,166 @@ TEST_CASE("kernel_x86_64_mul_mat_simd_matches_scalar_tiled_edges") {
   }
 }
 
+TEST_CASE("kernel_x86_64_f32_mul_mat_fma_simd_matches_scalar_tiled_edges") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr uint64_t k = 131;
+  constexpr uint64_t m = 7;
+  constexpr uint64_t n = 73;
+
+  std::array<float, k * m> src0{};
+  std::array<float, k * n> src1{};
+  std::array<float, n * m> dst_simd{};
+  std::array<float, n * m> dst_scalar{};
+
+  for (uint64_t i = 0; i < src0.size(); ++i) {
+    const int64_t centered = static_cast<int64_t>(i % 17u) - 8;
+    src0[static_cast<size_t>(i)] = static_cast<float>(centered) * 0.0625f;
+  }
+  for (uint64_t i = 0; i < src1.size(); ++i) {
+    const int64_t centered = static_cast<int64_t>(i % 23u) - 11;
+    src1[static_cast<size_t>(i)] = static_cast<float>(centered) * 0.03125f;
+  }
+
+  const emel::kernel::event::op_mul_mat simd_ev{
+      .src0 = make_src(src0.data(), dtype::f32, k, m),
+      .src1 = make_src(src1.data(), dtype::f32, n, k),
+      .dst = make_dst(dst_simd.data(), dtype::f32, n, m),
+  };
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_src(src0.data(), dtype::f32, k, m),
+      .src1 = make_src(src1.data(), dtype::f32, n, k),
+      .dst = make_dst(dst_scalar.data(), dtype::f32, n, m),
+  };
+
+  CHECK(emel::kernel::x86_64::detail::execute_avx2_fma_mul_mat(simd_ev));
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  for (uint64_t idx = 0; idx < dst_simd.size(); ++idx) {
+    CHECK(dst_simd[static_cast<size_t>(idx)] ==
+          doctest::Approx(dst_scalar[static_cast<size_t>(idx)]).epsilon(1e-5f));
+  }
+}
+
+TEST_CASE("kernel_x86_64_f32_mul_mat_uses_fma_and_avx2_only_routes") {
+  const bool host_avx2 =
+      emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
+      emel::kernel::x86_64::detail::detect_avx2();
+  if (!host_avx2) {
+    return;
+  }
+
+  constexpr uint64_t k = 16;
+  constexpr uint64_t m = 4;
+  constexpr uint64_t n = 8;
+
+  std::array<float, k * m> src0{};
+  std::array<float, k * n> src1{};
+  for (uint64_t i = 0; i < src0.size(); ++i) {
+    const int64_t centered = static_cast<int64_t>(i % 13u) - 6;
+    src0[static_cast<size_t>(i)] = static_cast<float>(centered) * 0.125f;
+  }
+  for (uint64_t i = 0; i < src1.size(); ++i) {
+    const int64_t centered = static_cast<int64_t>(i % 19u) - 9;
+    src1[static_cast<size_t>(i)] = static_cast<float>(centered) * 0.0625f;
+  }
+
+  std::array<float, n * m> expected{};
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_src(src0.data(), dtype::f32, k, m),
+      .src1 = make_src(src1.data(), dtype::f32, n, k),
+      .dst = make_dst(expected.data(), dtype::f32, n, m),
+  };
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  if (host_has_avx2_fma()) {
+    std::array<float, n * m> fma_out{};
+    const emel::kernel::event::op_mul_mat fma_ev{
+        .src0 = make_src(src0.data(), dtype::f32, k, m),
+        .src1 = make_src(src1.data(), dtype::f32, n, k),
+        .dst = make_dst(fma_out.data(), dtype::f32, n, m),
+    };
+    x86_64_sm fma_machine{
+        emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+
+    CHECK(fma_machine.process_event(fma_ev));
+    CHECK(fma_machine.optimized_f32_fma_dispatch_count() == 1u);
+    for (size_t i = 0; i < expected.size(); ++i) {
+      CHECK(fma_out[i] == doctest::Approx(expected[i]).epsilon(1e-5f));
+    }
+  }
+
+  std::array<float, n * m> avx2_out{};
+  const emel::kernel::event::op_mul_mat avx2_ev{
+      .src0 = make_src(src0.data(), dtype::f32, k, m),
+      .src1 = make_src(src1.data(), dtype::f32, n, k),
+      .dst = make_dst(avx2_out.data(), dtype::f32, n, m),
+  };
+  const emel::kernel::x86_64::detail::host_feature_contract avx2_only_contract{
+      .avx2_available = true,
+      .fma_available = false,
+      .f16c_available = false,
+  };
+  x86_64_sm avx2_machine{
+      emel::kernel::x86_64::action::context{avx2_only_contract, {}, 0}};
+
+  CHECK(avx2_machine.process_event(avx2_ev));
+  CHECK(avx2_machine.optimized_f32_fma_dispatch_count() == 0u);
+  for (size_t i = 0; i < expected.size(); ++i) {
+    CHECK(avx2_out[i] == doctest::Approx(expected[i]).epsilon(1e-5f));
+  }
+}
+
+TEST_CASE("kernel_x86_64_f32_mul_mat_vector_fma_route_matches_scalar") {
+  if (!host_has_avx2_fma()) {
+    return;
+  }
+
+  constexpr uint64_t k = 133;
+  constexpr uint64_t m = 5;
+
+  std::array<float, k * m> src0{};
+  std::array<float, k> src1{};
+  for (uint64_t i = 0; i < src0.size(); ++i) {
+    const int64_t centered = static_cast<int64_t>(i % 29u) - 14;
+    src0[static_cast<size_t>(i)] = static_cast<float>(centered) * 0.0625f;
+  }
+  for (uint64_t i = 0; i < src1.size(); ++i) {
+    const int64_t centered = static_cast<int64_t>(i % 31u) - 15;
+    src1[static_cast<size_t>(i)] = static_cast<float>(centered) * 0.03125f;
+  }
+
+  std::array<float, m> expected{};
+  const emel::kernel::event::op_mul_mat scalar_ev{
+      .src0 = make_src(src0.data(), dtype::f32, k, m),
+      .src1 = make_src(src1.data(), dtype::f32, 1u, k),
+      .dst = make_dst(expected.data(), dtype::f32, 1u, m),
+  };
+  CHECK(emel::kernel::detail::execute_scalar(scalar_ev));
+
+  std::array<float, m> vector_out{};
+  const emel::kernel::event::op_mul_mat vector_ev{
+      .src0 = make_src(src0.data(), dtype::f32, k, m),
+      .src1 = make_src(src1.data(), dtype::f32, 1u, k),
+      .dst = make_dst(vector_out.data(), dtype::f32, 1u, m),
+  };
+  x86_64_sm vector_machine{
+      emel::kernel::x86_64::action::context{avx2_fma_contract(true), {}, 0}};
+
+  CHECK(vector_machine.process_event(vector_ev));
+  CHECK(vector_machine.optimized_f32_fma_vector_dispatch_count() == 1u);
+  CHECK(vector_machine.optimized_f32_fma_dispatch_count() == 0u);
+  for (size_t i = 0; i < expected.size(); ++i) {
+    CHECK(vector_out[i] == doctest::Approx(expected[i]).epsilon(1e-5f));
+  }
+}
+
 TEST_CASE("kernel_x86_64_mul_mat_tail_resets_nan_dst_on_first_depth_block") {
-  const bool host_avx2 = emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
-                         emel::kernel::x86_64::detail::detect_avx2();
+  const bool host_avx2 =
+      emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
+      emel::kernel::x86_64::detail::detect_avx2();
   if (!host_avx2) {
     return;
   }
@@ -286,20 +1615,19 @@ TEST_CASE("kernel_x86_64_mul_mat_tail_resets_nan_dst_on_first_depth_block") {
     src1[static_cast<size_t>(i)] = static_cast<float>(centered) * 0.0625f;
   }
 
-  std::fill(dst_simd.begin(), dst_simd.end(), std::numeric_limits<float>::quiet_NaN());
+  std::fill(dst_simd.begin(), dst_simd.end(),
+            std::numeric_limits<float>::quiet_NaN());
   dst_scalar.fill(0.0f);
 
   const emel::kernel::event::op_mul_mat simd_ev{
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_simd.data(), dtype::f32, n, m),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul_mat scalar_ev{
       .src0 = make_src(src0.data(), dtype::f32, k, m),
       .src1 = make_src(src1.data(), dtype::f32, n, k),
       .dst = make_dst(dst_scalar.data(), dtype::f32, n, m),
-      .nth = 1,
   };
 
   CHECK(emel::kernel::x86_64::detail::execute_avx2_mul_mat(simd_ev));
@@ -313,8 +1641,9 @@ TEST_CASE("kernel_x86_64_mul_mat_tail_resets_nan_dst_on_first_depth_block") {
 }
 
 TEST_CASE("kernel_x86_64_detail_branch_paths") {
-  const bool host_avx2 = emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
-                         emel::kernel::x86_64::detail::detect_avx2();
+  const bool host_avx2 =
+      emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
+      emel::kernel::x86_64::detail::detect_avx2();
 
   float lhs[4] = {1.0f, 2.0f, 3.0f, 4.0f};
   float rhs[4] = {4.0f, 3.0f, 2.0f, 1.0f};
@@ -324,11 +1653,11 @@ TEST_CASE("kernel_x86_64_detail_branch_paths") {
       .src0 = make_src(lhs, dtype::f32, 4),
       .src1 = make_src(rhs, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
 
   CHECK_FALSE(emel::kernel::x86_64::detail::can_use_avx2(add_ev, false));
-  CHECK(emel::kernel::x86_64::detail::can_use_avx2(add_ev, host_avx2) == host_avx2);
+  CHECK(emel::kernel::x86_64::detail::can_use_avx2(add_ev, host_avx2) ==
+        host_avx2);
   CHECK(emel::kernel::x86_64::detail::execute_request(
       add_ev, emel::kernel::x86_64::action::context{host_avx2, {}, 0}));
 
@@ -351,17 +1680,18 @@ TEST_CASE("kernel_x86_64_detail_branch_paths") {
   emel::kernel::event::op_unary unary_ev{
       .src0 = make_src(lhs, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
       .subop = emel::kernel::event::unary_subop::relu,
   };
-  CHECK(emel::kernel::x86_64::detail::can_use_avx2(unary_ev, host_avx2) == host_avx2);
+  CHECK(emel::kernel::x86_64::detail::can_use_avx2(unary_ev, host_avx2) ==
+        host_avx2);
   unary_ev.subop = emel::kernel::event::unary_subop::exp;
   CHECK_FALSE(emel::kernel::x86_64::detail::can_use_avx2(unary_ev, host_avx2));
 }
 
 TEST_CASE("kernel_x86_64_detail_helper_edge_paths") {
-  const bool host_avx2 = emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
-                         emel::kernel::x86_64::detail::detect_avx2();
+  const bool host_avx2 =
+      emel::kernel::x86_64::detail::avx2_intrinsics_compiled &&
+      emel::kernel::x86_64::detail::detect_avx2();
 
   float src0[4] = {1.0f, 2.0f, 3.0f, 4.0f};
   float dst0[4] = {};
@@ -380,60 +1710,50 @@ TEST_CASE("kernel_x86_64_detail_helper_edge_paths") {
   const emel::kernel::event::op_dup dup_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_add add_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_mul mul_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_div div_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_sqr sqr_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_sqrt sqrt_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   const emel::kernel::event::op_sub sub_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
   };
   float src_mm0[8] = {1.0f, 0.5f, -1.0f, 2.0f, 0.0f, -0.5f, 3.0f, 1.0f};
   float src_mm1[32] = {
-      1.0f,  0.0f,  0.5f, -1.0f, 0.5f, 1.0f, -0.5f, 2.0f,
-      0.0f,  1.0f,  1.0f,  0.0f, 2.0f, 0.5f,  0.0f, 1.0f,
-      -1.0f, 2.0f,  0.0f,  1.0f, 1.5f, 0.0f,  2.0f, -0.5f,
-      2.0f,  -1.0f, 1.0f,  0.5f, 0.0f, 1.0f, -1.0f, 1.0f,
+      1.0f, 0.0f,  0.5f, -1.0f, 0.5f, 1.0f,  -0.5f, 2.0f, 0.0f,  1.0f, 1.0f,
+      0.0f, 2.0f,  0.5f, 0.0f,  1.0f, -1.0f, 2.0f,  0.0f, 1.0f,  1.5f, 0.0f,
+      2.0f, -0.5f, 2.0f, -1.0f, 1.0f, 0.5f,  0.0f,  1.0f, -1.0f, 1.0f,
   };
   float dst_mm[16] = {};
   const emel::kernel::event::op_mul_mat mul_mat_ev{
       .src0 = make_src(src_mm0, dtype::f32, 4, 2),
       .src1 = make_src(src_mm1, dtype::f32, 8, 4),
       .dst = make_dst(dst_mm, dtype::f32, 8, 2),
-      .nth = 1,
   };
   emel::kernel::event::op_unary unary_ev{
       .src0 = make_src(src0, dtype::f32, 4),
       .dst = make_dst(dst0, dtype::f32, 4),
-      .nth = 1,
       .subop = emel::kernel::event::unary_subop::relu,
   };
 
@@ -446,7 +1766,8 @@ TEST_CASE("kernel_x86_64_detail_helper_edge_paths") {
     CHECK(emel::kernel::x86_64::detail::execute_avx2_sqr(sqr_ev));
     CHECK(emel::kernel::x86_64::detail::execute_avx2_sqrt(sqrt_ev));
     CHECK(emel::kernel::x86_64::detail::execute_avx2_mul_mat(mul_mat_ev));
-    CHECK(emel::kernel::x86_64::detail::execute_avx2_unary(unary_ev));
+    emel::kernel::x86_64::detail::execute_simd_unary_subop_unchecked<
+        emel::kernel::event::unary_subop::relu>(unary_ev);
     CHECK(emel::kernel::x86_64::detail::execute_simd(dup_ev));
     CHECK(emel::kernel::x86_64::detail::execute_simd(add_ev));
     CHECK(emel::kernel::x86_64::detail::execute_simd(sub_ev));
@@ -455,7 +1776,7 @@ TEST_CASE("kernel_x86_64_detail_helper_edge_paths") {
     CHECK(emel::kernel::x86_64::detail::execute_simd(sqr_ev));
     CHECK(emel::kernel::x86_64::detail::execute_simd(sqrt_ev));
     CHECK(emel::kernel::x86_64::detail::execute_simd(mul_mat_ev));
-    CHECK(emel::kernel::x86_64::detail::execute_simd(unary_ev));
+    CHECK_FALSE(emel::kernel::x86_64::detail::execute_simd(unary_ev));
   }
 #if !(defined(__x86_64__) || defined(_M_X64))
   CHECK_FALSE(emel::kernel::x86_64::detail::execute_avx2_dup(dup_ev));
@@ -466,7 +1787,6 @@ TEST_CASE("kernel_x86_64_detail_helper_edge_paths") {
   CHECK_FALSE(emel::kernel::x86_64::detail::execute_avx2_sqr(sqr_ev));
   CHECK_FALSE(emel::kernel::x86_64::detail::execute_avx2_sqrt(sqrt_ev));
   CHECK_FALSE(emel::kernel::x86_64::detail::execute_avx2_mul_mat(mul_mat_ev));
-  CHECK_FALSE(emel::kernel::x86_64::detail::execute_avx2_unary(unary_ev));
   emel::kernel::x86_64::detail::execute_simd_unchecked(dup_ev);
   emel::kernel::x86_64::detail::execute_simd_unchecked(add_ev);
   emel::kernel::x86_64::detail::execute_simd_unchecked(sub_ev);
@@ -475,7 +1795,8 @@ TEST_CASE("kernel_x86_64_detail_helper_edge_paths") {
   emel::kernel::x86_64::detail::execute_simd_unchecked(sqr_ev);
   emel::kernel::x86_64::detail::execute_simd_unchecked(sqrt_ev);
   emel::kernel::x86_64::detail::execute_simd_unchecked(mul_mat_ev);
-  emel::kernel::x86_64::detail::execute_simd_unchecked(unary_ev);
+  emel::kernel::x86_64::detail::execute_simd_unary_subop_unchecked<
+      emel::kernel::event::unary_subop::relu>(unary_ev);
   CHECK_FALSE(emel::kernel::x86_64::detail::execute_simd(dup_ev));
   CHECK_FALSE(emel::kernel::x86_64::detail::execute_simd(add_ev));
   CHECK_FALSE(emel::kernel::x86_64::detail::execute_simd(sub_ev));
@@ -500,16 +1821,17 @@ TEST_CASE("kernel_x86_64_simd_action_exec_marks_done") {
       .src0 = make_src(src0, dtype::f32, 4),
       .src1 = make_src(src1, dtype::f32, 4),
       .dst = make_dst(dst, dtype::f32, 4),
-      .nth = 1,
   };
 
   emel::kernel::x86_64::event::dispatch_ctx dispatch_ctx{};
   emel::kernel::x86_64::action::context ctx{false, {}, 0};
-  const emel::kernel::x86_64::event::dispatch_op_add dispatch_ev{add_ev, dispatch_ctx};
+  const emel::kernel::x86_64::event::dispatch_op_add dispatch_ev{add_ev,
+                                                                 dispatch_ctx};
 
   emel::kernel::x86_64::action::exec_simd_op_add(dispatch_ev, ctx);
 
-  CHECK(dispatch_ctx.outcome == emel::kernel::x86_64::events::phase_outcome::done);
-  CHECK(dispatch_ctx.err ==
-        static_cast<int32_t>(emel::error::cast(emel::kernel::x86_64::error::none)));
+  CHECK(dispatch_ctx.outcome ==
+        emel::kernel::x86_64::events::phase_outcome::done);
+  CHECK(dispatch_ctx.err == static_cast<int32_t>(emel::error::cast(
+                                emel::kernel::x86_64::error::none)));
 }
diff --git a/tests/model/loader/lifecycle_tests.cpp b/tests/model/loader/lifecycle_tests.cpp
index 21317383..3a150cae 100644
--- a/tests/model/loader/lifecycle_tests.cpp
+++ b/tests/model/loader/lifecycle_tests.cpp
@@ -31,6 +31,7 @@
 #include "emel/model/sortformer/detail.hpp"
 #include "emel/model/tensor/sm.hpp"
 #include "emel/model/whisper/detail.hpp"
+#include "../../kernel/test_helpers.hpp"
 
 namespace {
 
@@ -284,7 +285,7 @@ void append_tensor_with_shape(emel::model::data &model,
 
 void build_canonical_model(emel::model::data &model,
                            const int32_t block_count) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
   copy_name(model.architecture_name, "llama");
   model.n_layers = block_count;
   model.params.n_embd = 64;
@@ -322,7 +323,7 @@ void build_canonical_model(emel::model::data &model,
 
 void build_qwen3_model(emel::model::data &model, const int32_t block_count,
                        const bool include_q_norm, const bool include_k_norm) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
   copy_name(model.architecture_name, "qwen3");
   model.n_layers = block_count;
   model.params.n_embd = 64;
@@ -380,7 +381,7 @@ bool is_lfm2_attention_layer(const int32_t block_index) {
 
 void build_lfm2_model(emel::model::data &model, const bool include_output_norm,
                       const bool corrupt_conv_block_contract) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
   copy_name(model.architecture_name, "lfm2");
   model.n_layers = 16;
   model.params.n_layer = 16;
@@ -440,9 +441,73 @@ void build_lfm2_model(emel::model::data &model, const bool include_output_norm,
   model.n_tensors = tensor_index;
 }
 
+// LFM2.5-230M geometry: 14 blocks, attention at even blocks >= 2, classified
+// via the per-layer kv-head pattern instead of the 1.2B fallback layout.
+void build_lfm2_230m_model(emel::model::data &model) {
+  emel::tests::reset_model_data(model);
+  copy_name(model.architecture_name, "lfm2");
+  model.n_layers = 14;
+  model.params.n_layer = 14;
+  model.params.n_ctx = 128000;
+  model.params.n_embd = 1024;
+  model.params.n_embd_out = 1024;
+  model.params.n_head = 16;
+  model.params.n_head_kv = 8;
+  model.params.n_vocab = 65536;
+  model.params.shortconv_l_cache = 3;
+  model.params.rope_freq_base = 1000000.0f;
+  model.params.attention_layer_pattern_count =
+      static_cast<uint32_t>(model.n_layers);
+  for (int32_t block = 0; block < model.n_layers; ++block) {
+    model.params.attention_layer_pattern_flags[static_cast<size_t>(block)] =
+        (block >= 2 && block % 2 == 0) ? 1u : 0u;
+  }
+  model.weights_data = model.tensors.data();
+  model.weights_size = 4096u;
+
+  uint32_t tensor_index = 0u;
+  const auto add = [&](const std::string_view name) {
+    append_tensor_name(model, model.tensors[tensor_index], name);
+    ++tensor_index;
+  };
+  const auto add_block = [&](const int32_t block,
+                             const std::string_view suffix) {
+    add(std::string{"blk."} + std::to_string(block) + "." +
+        std::string{suffix});
+  };
+
+  add("token_embd.weight");
+  add("token_embd_norm.weight");
+
+  for (int32_t block = 0; block < model.n_layers; ++block) {
+    add_block(block, "attn_norm.weight");
+    add_block(block, "ffn_norm.weight");
+    add_block(block, "ffn_gate.weight");
+    add_block(block, "ffn_down.weight");
+    add_block(block, "ffn_up.weight");
+
+    if (model.params.attention_layer_pattern_flags[static_cast<size_t>(
+            block)] != 0u) {
+      add_block(block, "attn_q.weight");
+      add_block(block, "attn_k.weight");
+      add_block(block, "attn_v.weight");
+      add_block(block, "attn_q_norm.weight");
+      add_block(block, "attn_k_norm.weight");
+      add_block(block, "attn_output.weight");
+      continue;
+    }
+
+    add_block(block, "shortconv.conv.weight");
+    add_block(block, "shortconv.in_proj.weight");
+    add_block(block, "shortconv.out_proj.weight");
+  }
+
+  model.n_tensors = tensor_index;
+}
+
 void build_gemma4_model(emel::model::data &model,
                         const bool include_output_weight) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
   copy_name(model.architecture_name, "gemma4");
   model.n_layers = 35;
   model.params.n_layer = 35;
@@ -513,7 +578,7 @@ void build_gemma4_model(emel::model::data &model,
 
 void build_omniembed_model(emel::model::data &model,
                            const bool include_audio_projection) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
   copy_name(model.architecture_name, "omniembed");
   model.params.n_embd = 1280;
   model.params.n_embd_out = 1280;
@@ -568,7 +633,7 @@ void build_omniembed_model(emel::model::data &model,
 
 void build_sortformer_model(emel::model::data &model,
                             const bool include_modules_family) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
   copy_name(model.architecture_name, "sortformer");
   model.params.n_features = 4;
   model.weights_data = model.tensors.data();
@@ -592,7 +657,7 @@ void build_sortformer_model(emel::model::data &model,
 
 void build_whisper_model(emel::model::data &model,
                          const bool include_decoder_cross_attn) {
-  std::memset(&model, 0, sizeof(model));
+  emel::tests::reset_model_data(model);
   copy_name(model.architecture_name, "whisper");
   model.params.n_features = 80;
   model.params.n_vocab = 51865;
@@ -2497,6 +2562,26 @@ TEST_CASE("model_execution_contract_rejects_lfm2_with_noncanonical_hybrid_"
         emel::error::cast(emel::model::loader::error::model_invalid));
 }
 
+TEST_CASE("model_execution_contract_accepts_lfm2_230m_pattern_layout") {
+  auto model = std::make_unique<emel::model::data>();
+  build_lfm2_230m_model(*model);
+
+  CHECK(emel::model::validate_execution_contract(*model) ==
+        emel::error::cast(emel::model::loader::error::none));
+}
+
+TEST_CASE("model_execution_contract_rejects_lfm2_230m_when_pattern_"
+          "contradicts_block_tensors") {
+  auto model = std::make_unique<emel::model::data>();
+  build_lfm2_230m_model(*model);
+  // Block 2 carries attention tensors; flipping its pattern flag must fail the
+  // hybrid tensor contract instead of silently reclassifying the block.
+  model->params.attention_layer_pattern_flags[2] = 0u;
+
+  CHECK(emel::model::validate_execution_contract(*model) ==
+        emel::error::cast(emel::model::loader::error::model_invalid));
+}
+
 TEST_CASE("model_execution_contract_rejects_lfm2_attention_block_with_"
           "shortconv_weights") {
   auto model = std::make_unique<emel::model::data>();
@@ -2573,6 +2658,10 @@ TEST_CASE("model_detail_loads_gemma4_hparams_from_gguf_binding") {
   CHECK(model->params.attention_shared_kv_layers == 20);
   CHECK(model->params.n_rot == 512);
   CHECK(model->params.n_rot_swa == 256);
+  CHECK(model->params.rope_pair_x0_stride == 1);
+  CHECK(model->params.rope_pair_x1_stride == 1);
+  CHECK(model->params.rope_pair_x1_offset == 0);
+  CHECK(model->params.rope_pair_x1_half_rot_offset == 1);
   CHECK(model->params.full_attention_interval == 5);
   CHECK(model->params.final_logit_softcapping == doctest::Approx(30.0f));
   CHECK(model->params.rope_freq_base == doctest::Approx(1000000.0f));
@@ -2703,6 +2792,10 @@ TEST_CASE(
   CHECK(model->params.attention_key_length == 64);
   CHECK(model->params.attention_value_length == 80);
   CHECK(model->params.n_rot == 64);
+  CHECK(model->params.rope_pair_x0_stride == 1);
+  CHECK(model->params.rope_pair_x1_stride == 1);
+  CHECK(model->params.rope_pair_x1_offset == 0);
+  CHECK(model->params.rope_pair_x1_half_rot_offset == 1);
   CHECK(model->params.n_layer == 24);
   CHECK(model->params.n_vocab == 4);
   CHECK(model->params.tie_word_embeddings);
@@ -2750,6 +2843,10 @@ TEST_CASE("model_detail_loads_lfm2_hparams_from_gguf_binding") {
   CHECK(model->params.n_layer == 16);
   CHECK(model->params.n_vocab == 65536);
   CHECK(model->params.shortconv_l_cache == 3);
+  CHECK(model->params.rope_pair_x0_stride == 1);
+  CHECK(model->params.rope_pair_x1_stride == 1);
+  CHECK(model->params.rope_pair_x1_offset == 0);
+  CHECK(model->params.rope_pair_x1_half_rot_offset == 1);
   CHECK(model->params.tie_word_embeddings);
   CHECK(model->params.attention_layer_norm_rms_epsilon ==
         doctest::Approx(1e-6f));
diff --git a/tests/models/LFM2.5-230M-Q8_0.gguf b/tests/models/LFM2.5-230M-Q8_0.gguf
new file mode 100644
index 00000000..d85b80d1
--- /dev/null
+++ b/tests/models/LFM2.5-230M-Q8_0.gguf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:855be85429300602eda72958547614703541b7d6dd965a8f8f6052b85a7aa935
+size 246598496
diff --git a/tests/sm/sm_policy_tests.cpp b/tests/sm/sm_policy_tests.cpp
index 8596c00e..96d6c2d0 100644
--- a/tests/sm/sm_policy_tests.cpp
+++ b/tests/sm/sm_policy_tests.cpp
@@ -1,6 +1,12 @@
 #include <doctest/doctest.h>
 #include <stateforward/sml/utility/dispatch_table.hpp>
 
+#include <array>
+#include <atomic>
+#include <semaphore>
+#include <thread>
+
+#include "../allocation_tracker.hpp"
 #include "emel/sm.hpp"
 
 namespace {
@@ -130,6 +136,119 @@ struct logger_surface_model {
   }
 };
 
+using co_inline_policy =
+    emel::policy::coroutine_scheduler<emel::policy::inline_scheduler>;
+using co_static_policy =
+    emel::policy::coroutine_scheduler<emel::policy::fifo_scheduler<8u, 64u>>;
+using co_thread_pool_pool =
+    emel::policy::thread_pool_scheduler<2u, 8u, 128u>;
+using co_thread_pool_scheduler =
+    emel::policy::thread_pool_scheduler_ref<co_thread_pool_pool>;
+using co_thread_pool_policy =
+    emel::policy::coroutine_scheduler<co_thread_pool_scheduler>;
+using allocation_scope = emel::test::allocation::allocation_scope;
+
+struct event_co_mark {
+  int32_t & marker;
+};
+
+struct event_co_error {
+  int32_t * error_out = nullptr;
+};
+
+struct event_co_wait {
+  std::atomic<bool> & release;
+  std::atomic<int32_t> & entered;
+};
+
+struct state_co_idle {};
+struct state_co_done {};
+struct state_co_error {};
+
+struct effect_co_mark {
+  void operator()(const event_co_mark & ev) const noexcept {
+    ev.marker = 11;
+  }
+};
+
+struct effect_co_error {
+  void operator()(const event_co_error & ev) const noexcept {
+    *ev.error_out = 3;
+  }
+};
+
+struct effect_co_wait {
+  void operator()(const event_co_wait & ev) const noexcept {
+    ev.entered.fetch_add(1, std::memory_order_release);
+    while (!ev.release.load(std::memory_order_acquire)) {
+      std::this_thread::yield();
+    }
+  }
+};
+
+struct co_surface_model {
+  auto operator()() const noexcept {
+    namespace sml = stateforward::sml;
+    // clang-format off
+    return sml::make_transition_table(
+        sml::state<state_co_done> <= *sml::state<state_co_idle>
+          + sml::event<event_co_mark> / effect_co_mark{}
+      , sml::state<state_co_done> <= sml::state<state_co_idle>
+          + sml::event<event_co_wait> / effect_co_wait{}
+      , sml::state<state_co_error> <= sml::state<state_co_idle>
+          + sml::event<event_co_error> / effect_co_error{}
+    );
+    // clang-format on
+  }
+};
+
+struct context_co_probe {
+  int32_t value = 0;
+};
+
+struct event_co_context_mark {
+  int32_t & marker;
+};
+
+struct state_co_context_idle {};
+struct state_co_context_done {};
+
+struct effect_co_context_mark {
+  void operator()(const event_co_context_mark & ev,
+                  context_co_probe & ctx) const noexcept {
+    ctx.value += 1;
+    ev.marker = ctx.value;
+  }
+};
+
+struct co_context_model {
+  auto operator()() const noexcept {
+    namespace sml = stateforward::sml;
+    // clang-format off
+    return sml::make_transition_table(
+        sml::state<state_co_context_done> <= *sml::state<state_co_context_idle>
+          + sml::event<event_co_context_mark> / effect_co_context_mark{}
+    );
+    // clang-format on
+  }
+};
+
+template <class predicate>
+void require_eventually(const char * label, predicate && pred) {
+  for (int32_t attempt = 0; attempt < 100000; ++attempt) {
+    if (pred()) {
+      return;
+    }
+    std::this_thread::yield();
+  }
+  FAIL(label);
+}
+
+template <class scheduler>
+concept has_schedule_method = requires(scheduler & scheduler_in) {
+  scheduler_in.schedule([]() noexcept {});
+};
+
 }  // namespace
 
 TEST_CASE("sm_normalize_event_result_handles_error_out") {
@@ -192,3 +311,597 @@ TEST_CASE("stateforward_sml_logger_policy_observes_dispatch") {
   CHECK(logger.state_changes >= 1);
   CHECK(machine.is(sml::state<state_logger_done>));
 }
+
+TEST_CASE("co_sm_process_event_uses_stateforward_utility_surface") {
+  using default_machine_type = emel::co_sm<co_surface_model>;
+  using machine_type = emel::co_sm<co_surface_model, void, co_inline_policy>;
+  static_assert(std::same_as<default_machine_type::scheduler_type,
+                             emel::policy::inline_scheduler>);
+  static_assert(std::same_as<machine_type::scheduler_type,
+                             emel::policy::inline_scheduler>);
+  static_assert(
+      emel::policy::strict_ordering_scheduler_contract<
+          machine_type::scheduler_type>);
+
+  namespace sml = stateforward::sml;
+  machine_type machine{};
+
+  int32_t marker = 0;
+  CHECK(machine.process_event(event_co_mark{.marker = marker}));
+  CHECK(marker == 11);
+  CHECK(machine.is(sml::state<state_co_done>));
+
+  bool scheduled = false;
+  machine.scheduler().schedule([&scheduled]() noexcept { scheduled = true; });
+  CHECK(scheduled);
+}
+
+TEST_CASE("co_sm_process_event_async_inline_completes_immediately") {
+  namespace sml = stateforward::sml;
+  emel::co_sm<co_surface_model, void, co_inline_policy> machine{};
+
+  int32_t marker = 0;
+  emel::bool_task task =
+      machine.process_event_async(event_co_mark{.marker = marker});
+
+  CHECK(task.result());
+  CHECK(marker == 11);
+  CHECK(machine.is(sml::state<state_co_done>));
+}
+
+TEST_CASE("co_sm_normalizes_error_out_for_sync_and_inline_async") {
+  emel::co_sm<co_surface_model, void, co_inline_policy> sync_machine{};
+  int32_t sync_err = 0;
+  CHECK_FALSE(sync_machine.process_event(event_co_error{.error_out = &sync_err}));
+  CHECK(sync_err == 3);
+
+  emel::co_sm<co_surface_model, void, co_inline_policy> async_machine{};
+  int32_t async_err = 0;
+  emel::bool_task task =
+      async_machine.process_event_async(event_co_error{.error_out = &async_err});
+  CHECK_FALSE(task.result());
+  CHECK(async_err == 3);
+}
+
+TEST_CASE("co_sm_normalizes_error_out_for_static_scheduler_immediate_async") {
+  emel::co_sm<co_surface_model, void, co_static_policy> machine{};
+  int32_t err = 0;
+  emel::bool_task task = machine.process_event_async(event_co_error{.error_out = &err});
+
+  CHECK(task.await_ready());
+  CHECK_FALSE(task.result());
+  CHECK(err == 3);
+}
+
+TEST_CASE("co_sm_static_scheduler_rejects_busy_async_without_escaping_rtc") {
+  namespace sml = stateforward::sml;
+  emel::co_sm<co_surface_model, void, co_static_policy> machine{};
+
+  int32_t marker = 0;
+  bool task_ready = false;
+  bool task_result = true;
+  const bool drained = machine.scheduler().try_run_immediate([&]() {
+    emel::bool_task task =
+        machine.process_event_async(event_co_mark{.marker = marker});
+    task_ready = task.await_ready();
+    task_result = task.result();
+  });
+
+  CHECK(drained);
+  CHECK(task_ready);
+  CHECK_FALSE(task_result);
+  CHECK(marker == 0);
+  CHECK_FALSE(machine.is(sml::state<state_co_done>));
+}
+
+TEST_CASE("thread_pool_scheduler_policy_is_static_multi_consumer") {
+  using scheduler_type = co_thread_pool_policy::scheduler_type;
+  static_assert(scheduler_type::multi_consumer);
+  static_assert(!scheduler_type::single_consumer);
+  static_assert(!scheduler_type::owns_workers);
+  static_assert(scheduler_type::run_to_completion);
+  static_assert(scheduler_type::static_worker_count == 2u);
+  static_assert(scheduler_type::static_capacity == 8u);
+  static_assert(std::is_copy_constructible_v<scheduler_type>);
+  static_assert(!co_thread_pool_pool::run_to_completion);
+  static_assert(!has_schedule_method<co_thread_pool_pool>);
+  static_assert(
+      !stateforward::sml::utility::policy::valid_coroutine_scheduler<
+          co_thread_pool_pool>);
+
+  co_thread_pool_pool pool{};
+  emel::co_sm<co_surface_model, void, co_thread_pool_policy> machine{
+      co_thread_pool_scheduler{pool}};
+  CHECK(machine.scheduler().immediate_run_count() == 0u);
+  CHECK(machine.scheduler().scheduled_run_count() == 0u);
+  CHECK(machine.scheduler().worker_run_count() == 0u);
+}
+
+TEST_CASE("co_sm_thread_pool_scheduler_runs_inline_when_idle") {
+  namespace sml = stateforward::sml;
+  co_thread_pool_pool pool{};
+  emel::co_sm<co_surface_model, void, co_thread_pool_policy> machine{
+      co_thread_pool_scheduler{pool}};
+
+  int32_t marker = 0;
+  emel::bool_task task =
+      machine.process_event_async(event_co_mark{.marker = marker});
+
+  CHECK(task.await_ready());
+  CHECK(task.result());
+  CHECK(marker == 11);
+  CHECK(machine.is(sml::state<state_co_done>));
+  CHECK(machine.scheduler().immediate_run_count() == 1u);
+  CHECK(machine.scheduler().scheduled_run_count() == 0u);
+  CHECK(machine.scheduler().worker_run_count() == 0u);
+}
+
+TEST_CASE("co_sm_thread_pool_scheduler_uses_worker_when_inline_busy") {
+  namespace sml = stateforward::sml;
+  co_thread_pool_pool pool{};
+  emel::co_sm<co_surface_model, void, co_thread_pool_policy> machine{
+      co_thread_pool_scheduler{pool}};
+
+  int32_t marker = 0;
+  bool task_ready = false;
+  bool task_result = false;
+  const bool outer_completed = pool.try_run_immediate([&]() {
+    emel::bool_task task =
+        machine.process_event_async(event_co_mark{.marker = marker});
+    task_ready = task.await_ready();
+    task_result = task.result();
+  });
+
+  CHECK(outer_completed);
+  CHECK(task_ready);
+  CHECK(task_result);
+  CHECK(marker == 11);
+  CHECK(machine.is(sml::state<state_co_done>));
+  CHECK(machine.scheduler().immediate_run_count() == 1u);
+  CHECK(machine.scheduler().scheduled_run_count() == 1u);
+  CHECK(machine.scheduler().worker_run_count() == 1u);
+}
+
+TEST_CASE("co_sm_thread_pool_scheduler_worker_result_waits_for_scheduler_quiescence") {
+  co_thread_pool_pool pool{};
+  emel::co_sm<co_surface_model, void, co_thread_pool_policy> first{
+      co_thread_pool_scheduler{pool}};
+  emel::co_sm<co_surface_model, void, co_thread_pool_policy> second{
+      co_thread_pool_scheduler{pool}};
+  std::binary_semaphore inline_lane_held{0};
+  std::binary_semaphore release_inline_lane{0};
+  std::atomic<bool> holder_result{false};
+  std::thread inline_lane_holder{[&]() {
+    const bool held = pool.try_run_immediate([&]() noexcept {
+      holder_result.store(true, std::memory_order_release);
+      inline_lane_held.release();
+      release_inline_lane.acquire();
+    });
+    if (!held) {
+      inline_lane_held.release();
+    }
+  }};
+  inline_lane_held.acquire();
+  CHECK(holder_result.load(std::memory_order_acquire));
+
+  int32_t first_marker = 0;
+  emel::bool_task first_task =
+      first.process_event_async(event_co_mark{.marker = first_marker});
+  CHECK(first_task.result());
+  CHECK(first_marker == 11);
+  CHECK(pool.worker_run_count() == 1u);
+  CHECK(pool.scheduled_run_count() == 1u);
+
+  release_inline_lane.release();
+  inline_lane_holder.join();
+
+  const uint64_t immediate_before_second = pool.immediate_run_count();
+  const uint64_t scheduled_before_second = pool.scheduled_run_count();
+  int32_t second_marker = 0;
+  emel::bool_task second_task =
+      second.process_event_async(event_co_mark{.marker = second_marker});
+  CHECK(second_task.result());
+  CHECK(second_marker == 11);
+  CHECK(pool.immediate_run_count() == immediate_before_second + 1u);
+  CHECK(pool.scheduled_run_count() == scheduled_before_second);
+}
+
+TEST_CASE("co_sm_thread_pool_scheduler_rejects_concurrent_actor_dispatch") {
+  co_thread_pool_pool pool{};
+  emel::co_sm<co_surface_model, void, co_thread_pool_policy> machine{
+      co_thread_pool_scheduler{pool}};
+  std::atomic<bool> release{false};
+  std::atomic<int32_t> entered{0};
+  bool first_result = false;
+
+  std::thread first_dispatch([&]() {
+    first_result = machine
+                       .process_event_async(event_co_wait{
+                           .release = release,
+                           .entered = entered,
+                       })
+                       .result();
+  });
+
+  require_eventually("first dispatch did not enter action", [&]() {
+    return entered.load(std::memory_order_acquire) == 1;
+  });
+
+  int32_t marker = 0;
+  emel::bool_task second_task =
+      machine.process_event_async(event_co_mark{.marker = marker});
+  CHECK(second_task.await_ready());
+  CHECK_FALSE(second_task.result());
+  CHECK(marker == 0);
+
+  release.store(true, std::memory_order_release);
+  first_dispatch.join();
+  CHECK(first_result);
+}
+
+TEST_CASE("co_sm_thread_pool_scheduler_rejects_concurrent_state_inspection") {
+  namespace sml = stateforward::sml;
+  co_thread_pool_pool pool{};
+  emel::co_sm<co_surface_model, void, co_thread_pool_policy> machine{
+      co_thread_pool_scheduler{pool}};
+  std::atomic<bool> release{false};
+  std::atomic<int32_t> entered{0};
+  bool first_result = false;
+  bool visited = false;
+
+  std::thread first_dispatch([&]() {
+    first_result = machine
+                       .process_event_async(event_co_wait{
+                           .release = release,
+                           .entered = entered,
+                       })
+                       .result();
+  });
+
+  require_eventually("first dispatch did not enter action", [&]() {
+    return entered.load(std::memory_order_acquire) == 1;
+  });
+
+  CHECK_FALSE(machine.is(sml::state<state_co_done>));
+  machine.visit_current_states([&](const auto &) { visited = true; });
+  CHECK_FALSE(visited);
+
+  release.store(true, std::memory_order_release);
+  first_dispatch.join();
+  CHECK(first_result);
+}
+
+TEST_CASE("co_sm_thread_pool_scheduler_allows_concurrent_different_actors") {
+  co_thread_pool_pool pool{};
+  emel::co_sm<co_surface_model, void, co_thread_pool_policy> first{
+      co_thread_pool_scheduler{pool}};
+  emel::co_sm<co_surface_model, void, co_thread_pool_policy> second{
+      co_thread_pool_scheduler{pool}};
+  int32_t first_marker = 0;
+  int32_t second_marker = 0;
+  bool first_result = false;
+  bool second_result = false;
+
+  std::thread first_thread([&]() {
+    first_result = first.process_event_async(
+                            event_co_mark{.marker = first_marker})
+                       .result();
+  });
+  std::thread second_thread([&]() {
+    second_result = second.process_event_async(
+                              event_co_mark{.marker = second_marker})
+                        .result();
+  });
+
+  first_thread.join();
+  second_thread.join();
+
+  CHECK(first_result);
+  CHECK(second_result);
+  CHECK(first_marker == 11);
+  CHECK(second_marker == 11);
+}
+
+TEST_CASE("thread_pool_scheduler_rejects_same_pool_nested_wait") {
+  using scheduler_type = emel::policy::thread_pool_scheduler<1u, 8u, 128u>;
+  scheduler_type scheduler{};
+  std::atomic<bool> outer_entered{false};
+  std::atomic<bool> nested_ran{false};
+  std::atomic<bool> nested_completed{true};
+  std::atomic<bool> done{false};
+
+  scheduler.submit([&]() noexcept {
+    outer_entered.store(true, std::memory_order_release);
+    nested_completed.store(
+        scheduler.run_or_schedule_and_wait([&]() noexcept {
+          nested_ran.store(true, std::memory_order_release);
+        }),
+        std::memory_order_release);
+    done.store(true, std::memory_order_release);
+  });
+
+  require_eventually("nested scheduler test did not finish", [&]() {
+    return done.load(std::memory_order_acquire);
+  });
+
+  CHECK(outer_entered.load(std::memory_order_acquire));
+  CHECK_FALSE(nested_completed.load(std::memory_order_acquire));
+  CHECK_FALSE(nested_ran.load(std::memory_order_acquire));
+}
+
+TEST_CASE("thread_pool_scheduler_ref_fork_join_runs_submitted_tasks_before_wait_returns") {
+  using scheduler_type = emel::policy::thread_pool_scheduler<2u, 8u, 128u>;
+  using scheduler_ref = emel::policy::thread_pool_scheduler_ref<scheduler_type>;
+  scheduler_type scheduler{};
+  scheduler_ref ref{scheduler};
+  scheduler_ref::join_group group{};
+  std::atomic<int32_t> entered{0};
+  std::atomic<int32_t> finished{0};
+  std::atomic<bool> release{false};
+
+  CHECK(ref.try_submit(group, [&]() noexcept {
+    entered.fetch_add(1, std::memory_order_release);
+    while (!release.load(std::memory_order_acquire)) {
+      std::this_thread::yield();
+    }
+    finished.fetch_add(1, std::memory_order_release);
+  }));
+  CHECK(ref.try_submit(group, [&]() noexcept {
+    entered.fetch_add(1, std::memory_order_release);
+    while (!release.load(std::memory_order_acquire)) {
+      std::this_thread::yield();
+    }
+    finished.fetch_add(1, std::memory_order_release);
+  }));
+
+  require_eventually("fork/join tasks did not enter concurrently", [&]() {
+    return entered.load(std::memory_order_acquire) == 2;
+  });
+
+  release.store(true, std::memory_order_release);
+  CHECK(group.wait());
+  CHECK(finished.load(std::memory_order_acquire) == 2);
+  CHECK(scheduler.worker_run_count() == 2u);
+}
+
+TEST_CASE("thread_pool_scheduler_ref_fork_join_ignores_pre_wait_completion_tokens") {
+  using scheduler_type = emel::policy::thread_pool_scheduler<1u, 8u, 128u>;
+  using scheduler_ref = emel::policy::thread_pool_scheduler_ref<scheduler_type>;
+  scheduler_type scheduler{};
+  scheduler_ref ref{scheduler};
+  scheduler_ref::join_group group{};
+  std::atomic<bool> fast_done{false};
+  std::atomic<bool> blocking_entered{false};
+  std::atomic<bool> release_blocking{false};
+  std::atomic<bool> wait_returned{false};
+  bool wait_result = false;
+
+  CHECK(ref.try_submit(group, [&]() noexcept {
+    fast_done.store(true, std::memory_order_release);
+  }));
+  require_eventually("fast fork/join task did not finish", [&]() {
+    return fast_done.load(std::memory_order_acquire);
+  });
+
+  CHECK(ref.try_submit(group, [&]() noexcept {
+    blocking_entered.store(true, std::memory_order_release);
+    while (!release_blocking.load(std::memory_order_acquire)) {
+      std::this_thread::yield();
+    }
+  }));
+  require_eventually("blocking fork/join task did not enter", [&]() {
+    return blocking_entered.load(std::memory_order_acquire);
+  });
+
+  std::thread waiter{[&]() {
+    wait_result = group.wait();
+    wait_returned.store(true, std::memory_order_release);
+  }};
+
+  for (int32_t attempt = 0; attempt < 1000; ++attempt) {
+    if (wait_returned.load(std::memory_order_acquire)) {
+      break;
+    }
+    std::this_thread::yield();
+  }
+  CHECK_FALSE(wait_returned.load(std::memory_order_acquire));
+
+  release_blocking.store(true, std::memory_order_release);
+  waiter.join();
+  CHECK(wait_result);
+  CHECK(wait_returned.load(std::memory_order_acquire));
+}
+
+TEST_CASE("thread_pool_scheduler_ref_fork_join_rejects_same_pool_worker_submit") {
+  using scheduler_type = emel::policy::thread_pool_scheduler<1u, 8u, 128u>;
+  using scheduler_ref = emel::policy::thread_pool_scheduler_ref<scheduler_type>;
+  scheduler_type scheduler{};
+  scheduler_ref ref{scheduler};
+  std::atomic<bool> nested_submitted{true};
+  std::atomic<bool> nested_joined{true};
+  std::atomic<bool> done{false};
+
+  scheduler.submit([&]() noexcept {
+    scheduler_ref::join_group group{};
+    nested_submitted.store(
+        ref.try_submit(group, []() noexcept {}), std::memory_order_release);
+    nested_joined.store(group.wait(), std::memory_order_release);
+    done.store(true, std::memory_order_release);
+  });
+
+  require_eventually("same-pool fork/join rejection did not finish", [&]() {
+    return done.load(std::memory_order_acquire);
+  });
+
+  CHECK_FALSE(nested_submitted.load(std::memory_order_acquire));
+  CHECK_FALSE(nested_joined.load(std::memory_order_acquire));
+}
+
+TEST_CASE("thread_pool_scheduler_try_submit_reports_full_queue") {
+  using scheduler_type = emel::policy::thread_pool_scheduler<1u, 2u, 128u>;
+  scheduler_type scheduler{};
+  std::atomic<bool> worker_entered{false};
+  std::atomic<bool> release_worker{false};
+  std::atomic<bool> queued_ran{false};
+
+  CHECK(scheduler.try_submit([&]() noexcept {
+    worker_entered.store(true, std::memory_order_release);
+    while (!release_worker.load(std::memory_order_acquire)) {
+      std::this_thread::yield();
+    }
+  }));
+
+  require_eventually("worker did not enter first task", [&]() {
+    return worker_entered.load(std::memory_order_acquire);
+  });
+
+  CHECK(scheduler.try_submit([&]() noexcept {
+    queued_ran.store(true, std::memory_order_release);
+  }));
+  CHECK_FALSE(scheduler.try_submit([]() noexcept {}));
+
+  release_worker.store(true, std::memory_order_release);
+  require_eventually("queued task did not run", [&]() {
+    return queued_ran.load(std::memory_order_acquire);
+  });
+}
+
+TEST_CASE("thread_pool_scheduler_accepts_multiple_producers") {
+  using scheduler_type = emel::policy::thread_pool_scheduler<2u, 32u, 128u>;
+  scheduler_type scheduler{};
+  std::atomic<int32_t> completed{0};
+  std::array<std::thread, 4> producers{};
+
+  for (auto & producer : producers) {
+    producer = std::thread([&scheduler, &completed]() {
+      for (int32_t idx = 0; idx < 4; ++idx) {
+        scheduler.submit([&completed]() noexcept {
+          completed.fetch_add(1, std::memory_order_release);
+        });
+      }
+    });
+  }
+
+  for (auto & producer : producers) {
+    producer.join();
+  }
+
+  require_eventually("multi-producer tasks did not finish", [&]() {
+    return completed.load(std::memory_order_acquire) == 16;
+  });
+
+  CHECK(scheduler.scheduled_run_count() == 16u);
+  CHECK(scheduler.worker_run_count() == 16u);
+}
+
+TEST_CASE("thread_pool_scheduler_ref_fork_join_survives_rapid_repeated_rounds") {
+  // Regression: the join latch previously used a closed_/pending_ handshake plus
+  // a per-group binary_semaphore. The handshake had a Dekker-style StoreLoad
+  // race (wait() could miss the final completion while the last completer missed
+  // the close), and the semaphore could be destroyed by the returning waiter
+  // while the completer was still inside release()/notify. Either could strand a
+  // wakeup and deadlock wait() after many back-to-back rounds, most readily when
+  // the lane count equals the worker count. Drive enough rounds that a
+  // reintroduced race surfaces (a regression makes wait() hang here).
+  using scheduler_type = emel::policy::thread_pool_scheduler<8u, 64u, 128u>;
+  using scheduler_ref = emel::policy::thread_pool_scheduler_ref<scheduler_type>;
+  scheduler_type scheduler{};
+  constexpr int32_t k_rounds = 20000;
+  constexpr int32_t k_lanes = 8;
+  std::atomic<int64_t> ran{0};
+  bool all_submitted = true;
+  bool all_joined = true;
+
+  for (int32_t round = 0; round < k_rounds; ++round) {
+    scheduler_ref ref{scheduler};
+    scheduler_ref::join_group group{};
+    for (int32_t lane = 0; lane < k_lanes; ++lane) {
+      all_submitted &= ref.try_submit(group, [&ran]() noexcept {
+        ran.fetch_add(1, std::memory_order_relaxed);
+      });
+    }
+    all_joined &= group.wait();
+  }
+
+  CHECK(all_submitted);
+  CHECK(all_joined);
+  CHECK(ran.load(std::memory_order_acquire) ==
+        static_cast<int64_t>(k_rounds) * k_lanes);
+  CHECK(scheduler.worker_run_count() ==
+        static_cast<uint64_t>(k_rounds) * k_lanes);
+}
+
+TEST_CASE("co_sm_contextful_wrapper_injects_context") {
+  namespace sml = stateforward::sml;
+  context_co_probe ctx{.value = 40};
+  emel::co_sm<co_context_model, context_co_probe, co_inline_policy> machine{ctx};
+
+  int32_t marker = 0;
+  CHECK(machine.process_event(event_co_context_mark{.marker = marker}));
+  CHECK(marker == 41);
+  CHECK(machine.is(sml::state<state_co_context_done>));
+}
+
+TEST_CASE("fixed_coroutine_allocator_has_no_heap_fallback") {
+  emel::policy::fixed_coroutine_allocator<64, 1> allocator{};
+  void * first = allocator.allocate(16, alignof(std::max_align_t));
+  CHECK(first != nullptr);
+  CHECK(allocator.allocate(16, alignof(std::max_align_t)) == nullptr);
+
+  allocator.deallocate(first, 16, alignof(std::max_align_t));
+  void * reused = allocator.allocate(16, alignof(std::max_align_t));
+  CHECK(reused == first);
+  allocator.deallocate(reused, 16, alignof(std::max_align_t));
+}
+
+TEST_CASE("co_sm_thread_pool_scheduler_async_dispatch_does_not_allocate") {
+  {
+    co_thread_pool_pool pool{};
+    emel::co_sm<co_surface_model, void, co_thread_pool_policy> machine{
+        co_thread_pool_scheduler{pool}};
+    int32_t marker = 0;
+    allocation_scope allocations{};
+
+    emel::bool_task task =
+        machine.process_event_async(event_co_mark{.marker = marker});
+
+    CHECK(task.result());
+    CHECK(marker == 11);
+    CHECK(allocations.allocations() == 0u);
+  }
+
+  {
+    co_thread_pool_pool pool{};
+    emel::co_sm<co_surface_model, void, co_thread_pool_policy> machine{
+        co_thread_pool_scheduler{pool}};
+    std::binary_semaphore inline_lane_held{0};
+    std::binary_semaphore release_inline_lane{0};
+    std::atomic<bool> holder_result{false};
+    std::thread inline_lane_holder{[&]() {
+      const bool held = pool.try_run_immediate([&]() noexcept {
+        holder_result.store(true, std::memory_order_release);
+        inline_lane_held.release();
+        release_inline_lane.acquire();
+      });
+      if (!held) {
+        inline_lane_held.release();
+      }
+    }};
+    inline_lane_held.acquire();
+    CHECK(holder_result.load(std::memory_order_acquire));
+
+    int32_t marker = 0;
+    {
+      allocation_scope allocations{};
+      emel::bool_task task =
+          machine.process_event_async(event_co_mark{.marker = marker});
+
+      CHECK(task.result());
+      CHECK(marker == 11);
+      CHECK(allocations.allocations() == 0u);
+    }
+
+    release_inline_lane.release();
+    inline_lane_holder.join();
+  }
+}
diff --git a/tests/speech/decoder/whisper/lifecycle_tests.cpp b/tests/speech/decoder/whisper/lifecycle_tests.cpp
index 808f680e..d4c6e371 100644
--- a/tests/speech/decoder/whisper/lifecycle_tests.cpp
+++ b/tests/speech/decoder/whisper/lifecycle_tests.cpp
@@ -181,6 +181,46 @@ void mark_decoder_aux_tensors_f32(emel::model::data & model) {
   }
 }
 
+void mark_decoder_linear_weights(emel::model::data & model,
+                                 const int32_t dtype) {
+  for (uint32_t index = 0; index < model.n_tensors; ++index) {
+    auto & tensor = model.tensors[index];
+    const auto name = emel::model::tensor_name_view(model, tensor);
+    if (name.starts_with("model.decoder.layers.") && tensor.n_dims == 2 &&
+        name.ends_with(".weight")) {
+      tensor.type = dtype;
+    }
+  }
+}
+
+// Rebinds every decoder aux tensor (biases, layer norms, positions) onto a
+// shared zero-filled f32 arena so a q8_0+f32-aux decode can run end to end
+// with in-bounds aux reads. The caller must keep the returned arena alive
+// for the lifetime of the mutated model.
+std::vector<float> repoint_decoder_aux_tensors_to_f32(emel::model::data & model) {
+  namespace whisper = emel::speech::decoder::whisper::detail;
+  std::vector<float> arena(
+      static_cast<size_t>(whisper::k_embedding_length) *
+          static_cast<size_t>(whisper::k_decoder_sequence_token_count),
+      0.0f);
+  for (uint32_t index = 0; index < model.n_tensors; ++index) {
+    auto & tensor = model.tensors[index];
+    const auto name = emel::model::tensor_name_view(model, tensor);
+    if (name.starts_with("model.decoder.") &&
+        (tensor.n_dims == 1 || name == "model.decoder.embed_positions.weight")) {
+      uint64_t count = 1u;
+      for (int32_t dim = 0; dim < tensor.n_dims; ++dim) {
+        count *= static_cast<uint64_t>(tensor.dims[static_cast<size_t>(dim)]);
+      }
+      REQUIRE(count <= arena.size());
+      tensor.type = static_cast<int32_t>(emel::kernel::detail::dtype_f32);
+      tensor.data = arena.data();
+      tensor.data_size = count * sizeof(float);
+    }
+  }
+  return arena;
+}
+
 loaded_whisper_fixture load_fixture_or_skip() {
   const auto fixture_path = whisper_fixture_path();
   if (!std::filesystem::exists(fixture_path)) {
@@ -428,24 +468,25 @@ void exercise_q4_decoder_linear_shape() {
 
   std::vector<float> input(static_cast<size_t>(In), 1.0f);
   std::vector<float> output(static_cast<size_t>(Out), -1.0f);
+  emel::kernel::sm linear_kernel{emel::kernel::detect_host_kind()};
   whisper::linear<whisper::linear_weight_variant::q4_0, In, Out>(
-      q4_0_tensor, bias_tensor, input.data(), output.data());
+      linear_kernel, q4_0_tensor, bias_tensor, input.data(), output.data());
   CHECK(output[0] == doctest::Approx(0.0f));
 
   std::fill(output.begin(), output.end(), -1.0f);
   whisper::linear<whisper::linear_weight_variant::q4_1, In, Out>(
-      q4_1_tensor, bias_tensor, input.data(), output.data());
+      linear_kernel, q4_1_tensor, bias_tensor, input.data(), output.data());
   CHECK(output[0] == doctest::Approx(0.0f));
 
   if constexpr (In == 384u && Out == 384u) {
     std::fill(output.begin(), output.end(), -1.0f);
     whisper::linear_no_bias<whisper::linear_weight_variant::q4_0, In, Out>(
-        q4_0_tensor, input.data(), output.data());
+        linear_kernel, q4_0_tensor, input.data(), output.data());
     CHECK(output[0] == doctest::Approx(0.0f));
 
     std::fill(output.begin(), output.end(), -1.0f);
     whisper::linear_no_bias<whisper::linear_weight_variant::q4_1, In, Out>(
-        q4_1_tensor, input.data(), output.data());
+        linear_kernel, q4_1_tensor, input.data(), output.data());
     CHECK(output[0] == doctest::Approx(0.0f));
   }
 }
@@ -594,9 +635,10 @@ TEST_CASE("whisper_decoder_detail_reads_f32_aux_and_q4_linear_rows") {
   CHECK(whisper::dot_linear_row<whisper::linear_weight_variant::q4_0>(
             q4_0_tensor, 0u, input.data(), input.size()) ==
         doctest::Approx(8.0f));
+  emel::kernel::sm linear_kernel{emel::kernel::detect_host_kind()};
   whisper::linear_no_bias<whisper::linear_weight_variant::q4_0,
                           quant::QK4_0, 1u>(
-      q4_0_tensor, input.data(), output.data());
+      linear_kernel, q4_0_tensor, input.data(), output.data());
   CHECK(output[0] == doctest::Approx(8.0f));
 
   CHECK(whisper::read_matrix_q4_1_value(q4_1_tensor, 0u, 0u) ==
@@ -608,7 +650,7 @@ TEST_CASE("whisper_decoder_detail_reads_f32_aux_and_q4_linear_rows") {
         doctest::Approx(72.0f));
   whisper::linear<whisper::linear_weight_variant::q4_1, quant::QK4_1, 1u,
                   whisper::aux_weight_variant::f32>(
-      q4_1_tensor, aux_tensor, input.data(), output.data());
+      linear_kernel, q4_1_tensor, aux_tensor, input.data(), output.data());
   CHECK(output[0] == doctest::Approx(73.25f));
 
   std::array<float, static_cast<size_t>(whisper::k_embedding_length)> norm_in{};
@@ -642,6 +684,50 @@ TEST_CASE("whisper_decoder_detail_exercises_compiled_q4_linear_shapes") {
   exercise_q4_decoder_linear_shape<1536u, 384u>();
 }
 
+TEST_CASE("whisper_decoder_detail_q8_linear_routes_through_kernel_machine") {
+  namespace whisper = emel::speech::decoder::whisper::detail;
+  namespace quant = emel::kernel::detail::quant;
+
+  std::array<quant::block_q8_0, 2> weight_blocks{};
+  for (size_t row = 0; row < weight_blocks.size(); ++row) {
+    weight_blocks[row].d = quant::fp32_to_fp16(0.25f + static_cast<float>(row));
+    for (size_t lane = 0; lane < quant::QK8_0; ++lane) {
+      weight_blocks[row].qs[lane] = static_cast<int8_t>((lane % 5u) - 2u);
+    }
+  }
+  emel::model::data::tensor_record weight{};
+  weight.type = static_cast<int32_t>(emel::kernel::detail::dtype_q8_0);
+  weight.n_dims = 2;
+  weight.dims = {static_cast<int64_t>(quant::QK8_0), 2, 1, 1};
+  weight.data = weight_blocks.data();
+  weight.data_size = sizeof(weight_blocks);
+
+  std::array<quant::block_q8_0, 1> bias_blocks{};
+  bias_blocks[0].d = quant::fp32_to_fp16(0.5f);
+  bias_blocks[0].qs[0] = 3;
+  bias_blocks[0].qs[1] = -4;
+  emel::model::data::tensor_record bias{};
+  bias.type = static_cast<int32_t>(emel::kernel::detail::dtype_q8_0);
+  bias.n_dims = 1;
+  bias.dims = {static_cast<int64_t>(quant::QK8_0), 1, 1, 1};
+  bias.data = bias_blocks.data();
+  bias.data_size = sizeof(bias_blocks);
+
+  std::array<float, quant::QK8_0> input{};
+  input.fill(1.0f);
+  std::array<float, 2> output{};
+  emel::kernel::sm linear_kernel{emel::kernel::detect_host_kind()};
+
+  whisper::linear<whisper::linear_weight_variant::q8_0, quant::QK8_0, 2>(
+      linear_kernel, weight, bias, input.data(), output.data());
+  CHECK(output[0] == doctest::Approx(0.75f).epsilon(0.001));
+
+  whisper::linear_no_bias<whisper::linear_weight_variant::q8_0, quant::QK8_0,
+                          2>(linear_kernel, weight, input.data(),
+                             output.data());
+  CHECK(output[1] == doctest::Approx(-3.75f).epsilon(0.001));
+}
+
 TEST_CASE("whisper_decoder_detail_exercises_q4_logits_path") {
   namespace whisper = emel::speech::decoder::whisper::detail;
 
@@ -660,11 +746,12 @@ TEST_CASE("whisper_decoder_detail_exercises_q4_logits_path") {
   float confidence = -1.0f;
   uint64_t digest = 0u;
 
+  emel::kernel::sm logits_kernel{emel::kernel::detect_host_kind()};
   whisper::compute_decoder_logits_for_tokens<
       whisper::linear_weight_variant::q4_0>(
-      *fixture.model, encoder_frames, cross_k.data(), cross_v.data(),
-      tokens.data(), tokens.size(), workspace.data(), logits.data(),
-      confidence, digest);
+      logits_kernel, *fixture.model, encoder_frames, cross_k.data(),
+      cross_v.data(), tokens.data(), tokens.size(), workspace.data(),
+      logits.data(), confidence, digest);
 
   CHECK(confidence == doctest::Approx(0.0f));
   CHECK(logits[0] == doctest::Approx(0.0f));
@@ -962,9 +1049,11 @@ TEST_CASE("whisper_decoder_runs_first_q8_token_from_public_event") {
   stop_policy.notimestamps = -6;
   stop_policy.timestamp_begin = 0;
   stop_policy.space = -7;
+  emel::kernel::sm stop_kernel{emel::kernel::detect_host_kind()};
   const uint64_t stop_digest =
       decoder::detail::run_decoder_sequence<
           decoder::detail::linear_weight_variant::q8_0>(
+          stop_kernel,
           *loaded.decoder_contract.model, encoded.encoder_state.data(),
           static_cast<uint64_t>(encoded.frames), stop_policy,
           policy.prompt_tokens.data(), policy.prompt_tokens.size(),
@@ -976,6 +1065,144 @@ TEST_CASE("whisper_decoder_runs_first_q8_token_from_public_event") {
   CHECK(stop_digest != 0u);
 }
 
+TEST_CASE("whisper_decoder_runs_q4_0_variant_from_public_event") {
+  auto loaded = load_fixture_or_skip();
+  if (loaded.model == nullptr) {
+    return;
+  }
+
+  mark_decoder_linear_weights(
+      *loaded.model, static_cast<int32_t>(emel::kernel::detail::dtype_q4_0));
+  std::vector<float> encoder_state(
+      static_cast<size_t>(decoder::detail::k_embedding_length), 0.0f);
+  std::vector<float> workspace(static_cast<size_t>(
+      decoder::detail::required_decoder_workspace_floats(1u)));
+  std::vector<float> logits(static_cast<size_t>(decoder::detail::k_vocab_size));
+  const auto &policy =
+      emel::speech::tokenizer::whisper::tiny_asr_decode_policy();
+  std::array<int32_t, 1> generated_tokens = {};
+  int32_t generated_token_count = 0;
+  int32_t token = 0;
+  float confidence = 0.0f;
+  uint64_t digest = 0;
+  emel::error::type err = emel::error::cast(decoder::error::none);
+  decoder::sm machine{};
+
+  decoder::event::decode request{loaded.decoder_contract,
+                                 encoder_state,
+                                 1,
+                                 policy,
+                                 generated_tokens,
+                                 generated_token_count,
+                                 workspace,
+                                 logits,
+                                 token,
+                                 confidence,
+                                 digest};
+  request.error_out = &err;
+  CHECK(machine.process_event(request));
+  CHECK(err == emel::error::cast(decoder::error::none));
+  CHECK(generated_token_count == 1);
+  CHECK(token >= 0);
+  CHECK(token < decoder::detail::k_vocab_size);
+  CHECK(generated_tokens[0] == token);
+  CHECK(machine.q4_0_dispatch_count() == 1u);
+  CHECK(machine.q8_0_dispatch_count() == 0u);
+  CHECK(machine.q4_1_dispatch_count() == 0u);
+}
+
+TEST_CASE("whisper_decoder_runs_q4_1_variant_from_public_event") {
+  auto loaded = load_fixture_or_skip();
+  if (loaded.model == nullptr) {
+    return;
+  }
+
+  mark_decoder_linear_weights(
+      *loaded.model, static_cast<int32_t>(emel::kernel::detail::dtype_q4_1));
+  std::vector<float> encoder_state(
+      static_cast<size_t>(decoder::detail::k_embedding_length), 0.0f);
+  std::vector<float> workspace(static_cast<size_t>(
+      decoder::detail::required_decoder_workspace_floats(1u)));
+  std::vector<float> logits(static_cast<size_t>(decoder::detail::k_vocab_size));
+  const auto &policy =
+      emel::speech::tokenizer::whisper::tiny_asr_decode_policy();
+  std::array<int32_t, 1> generated_tokens = {};
+  int32_t generated_token_count = 0;
+  int32_t token = 0;
+  float confidence = 0.0f;
+  uint64_t digest = 0;
+  emel::error::type err = emel::error::cast(decoder::error::none);
+  decoder::sm machine{};
+
+  decoder::event::decode request{loaded.decoder_contract,
+                                 encoder_state,
+                                 1,
+                                 policy,
+                                 generated_tokens,
+                                 generated_token_count,
+                                 workspace,
+                                 logits,
+                                 token,
+                                 confidence,
+                                 digest};
+  request.error_out = &err;
+  CHECK(machine.process_event(request));
+  CHECK(err == emel::error::cast(decoder::error::none));
+  CHECK(generated_token_count == 1);
+  CHECK(token >= 0);
+  CHECK(token < decoder::detail::k_vocab_size);
+  CHECK(generated_tokens[0] == token);
+  CHECK(machine.q4_1_dispatch_count() == 1u);
+  CHECK(machine.q8_0_dispatch_count() == 0u);
+  CHECK(machine.q4_0_dispatch_count() == 0u);
+}
+
+TEST_CASE("whisper_decoder_runs_q8_f32_aux_variant_from_public_event") {
+  auto loaded = load_fixture_or_skip();
+  if (loaded.model == nullptr) {
+    return;
+  }
+
+  const std::vector<float> aux_arena =
+      repoint_decoder_aux_tensors_to_f32(*loaded.model);
+  std::vector<float> encoder_state(
+      static_cast<size_t>(decoder::detail::k_embedding_length), 0.0f);
+  std::vector<float> workspace(static_cast<size_t>(
+      decoder::detail::required_decoder_workspace_floats(1u)));
+  std::vector<float> logits(static_cast<size_t>(decoder::detail::k_vocab_size));
+  const auto &policy =
+      emel::speech::tokenizer::whisper::tiny_asr_decode_policy();
+  std::array<int32_t, 1> generated_tokens = {};
+  int32_t generated_token_count = 0;
+  int32_t token = 0;
+  float confidence = 0.0f;
+  uint64_t digest = 0;
+  emel::error::type err = emel::error::cast(decoder::error::none);
+  decoder::sm machine{};
+
+  decoder::event::decode request{loaded.decoder_contract,
+                                 encoder_state,
+                                 1,
+                                 policy,
+                                 generated_tokens,
+                                 generated_token_count,
+                                 workspace,
+                                 logits,
+                                 token,
+                                 confidence,
+                                 digest};
+  request.error_out = &err;
+  CHECK(machine.process_event(request));
+  CHECK(err == emel::error::cast(decoder::error::none));
+  CHECK(generated_token_count == 1);
+  CHECK(token >= 0);
+  CHECK(token < decoder::detail::k_vocab_size);
+  CHECK(generated_tokens[0] == token);
+  CHECK(machine.q8_0_dispatch_count() == 1u);
+  CHECK(machine.q4_0_dispatch_count() == 0u);
+  CHECK(machine.q4_1_dispatch_count() == 0u);
+}
+
 TEST_CASE("whisper_decoder_routes_q8_linear_f32_aux_variant") {
   auto loaded = load_fixture_or_skip();
   if (loaded.model == nullptr) {
diff --git a/tests/speech/encoder/whisper/detail_tests.cpp b/tests/speech/encoder/whisper/detail_tests.cpp
index 9691a435..5529123a 100644
--- a/tests/speech/encoder/whisper/detail_tests.cpp
+++ b/tests/speech/encoder/whisper/detail_tests.cpp
@@ -150,14 +150,17 @@ TEST_CASE("whisper detail q4 helpers decode both lanes and dot rows") {
                                  static_cast<int64_t>(kernel::quant::QK8_0),
                                  2);
   std::array<float, 2> linear_out{};
+  emel::kernel::sm linear_kernel{emel::kernel::detect_host_kind()};
   whisper::linear<whisper::linear_weight_variant::q4_0,
                   kernel::quant::QK4_0,
-                  2>(q4_0_tensor, bias_tensor, input.data(), linear_out.data());
+                  2>(linear_kernel, q4_0_tensor, bias_tensor, input.data(),
+                     linear_out.data());
   CHECK(linear_out[0] == doctest::Approx(-6.0f));
 
   whisper::linear_no_bias<whisper::linear_weight_variant::q4_1,
                           kernel::quant::QK4_1,
-                          2>(q4_1_tensor, input.data(), linear_out.data());
+                          2>(linear_kernel, q4_1_tensor, input.data(),
+                             linear_out.data());
   CHECK(linear_out[0] == doctest::Approx(28.0f));
 }
 
@@ -216,14 +219,16 @@ TEST_CASE("whisper detail q8 helpers decode rows and linear outputs") {
                           1);
 
   std::array<float, 2> output{};
+  emel::kernel::sm linear_kernel{emel::kernel::detect_host_kind()};
   whisper::linear<whisper::linear_weight_variant::q8_0,
                   kernel::quant::QK8_0,
-                  2>(weight, bias, input.data(), output.data());
+                  2>(linear_kernel, weight, bias, input.data(), output.data());
   CHECK(output[0] == doctest::Approx(0.75f).epsilon(0.001));
 
   whisper::linear_no_bias<whisper::linear_weight_variant::q8_0,
                           kernel::quant::QK8_0,
-                          2>(weight, input.data(), output.data());
+                          2>(linear_kernel, weight, input.data(),
+                             output.data());
   CHECK(output[1] == doctest::Approx(-3.75f).epsilon(0.001));
 }
 
diff --git a/tests/speech/encoder/whisper/lifecycle_tests.cpp b/tests/speech/encoder/whisper/lifecycle_tests.cpp
index 737002ff..bef8811a 100644
--- a/tests/speech/encoder/whisper/lifecycle_tests.cpp
+++ b/tests/speech/encoder/whisper/lifecycle_tests.cpp
@@ -188,6 +188,47 @@ void mark_encoder_aux_tensors_f32(emel::model::data &model) {
   }
 }
 
+void mark_encoder_linear_weights(emel::model::data &model,
+                                 const int32_t dtype) {
+  for (uint32_t index = 0; index < model.n_tensors; ++index) {
+    auto &tensor = model.tensors[index];
+    const auto name = emel::model::tensor_name_view(model, tensor);
+    if (name.starts_with("model.encoder.layers.") && tensor.n_dims == 2 &&
+        name.ends_with(".weight")) {
+      tensor.type = dtype;
+    }
+  }
+}
+
+// Rebinds every encoder aux tensor (biases, layer norms, positions) onto a
+// shared zero-filled f32 arena so a q8_0+f32-aux encode can run end to end
+// with in-bounds aux reads. The caller must keep the returned arena alive
+// for the lifetime of the mutated model.
+std::vector<float>
+repoint_encoder_aux_tensors_to_f32(emel::model::data &model) {
+  std::vector<float> arena(
+      static_cast<size_t>(encoder::detail::k_embedding_length) *
+          static_cast<size_t>(encoder::detail::k_max_encoder_frame_count),
+      0.0f);
+  for (uint32_t index = 0; index < model.n_tensors; ++index) {
+    auto &tensor = model.tensors[index];
+    const auto name = emel::model::tensor_name_view(model, tensor);
+    if (name.starts_with("model.encoder.") &&
+        (tensor.n_dims == 1 ||
+         name == "model.encoder.embed_positions.weight")) {
+      uint64_t count = 1u;
+      for (int32_t dim = 0; dim < tensor.n_dims; ++dim) {
+        count *= static_cast<uint64_t>(tensor.dims[static_cast<size_t>(dim)]);
+      }
+      REQUIRE(count <= arena.size());
+      tensor.type = static_cast<int32_t>(emel::kernel::detail::dtype_f32);
+      tensor.data = arena.data();
+      tensor.data_size = count * sizeof(float);
+    }
+  }
+  return arena;
+}
+
 loaded_whisper_fixture load_fixture_or_skip() {
   const auto fixture_path = whisper_fixture_path();
   if (!std::filesystem::exists(fixture_path)) {
@@ -405,6 +446,96 @@ TEST_CASE("whisper_encoder_runs_full_q8_encoder_from_public_event") {
   CHECK(digest_again == digest);
 }
 
+TEST_CASE("whisper_encoder_runs_q4_0_variant_from_public_event") {
+  auto loaded = load_fixture_or_skip();
+  if (loaded.model == nullptr) {
+    return;
+  }
+
+  mark_encoder_linear_weights(
+      *loaded.model, static_cast<int32_t>(emel::kernel::detail::dtype_q4_0));
+  const std::vector<float> pcm = deterministic_pcm(320);
+  std::vector<float> workspace(static_cast<size_t>(
+      encoder::detail::required_workspace_floats(pcm.size())));
+  std::vector<float> output(static_cast<size_t>(
+      encoder::detail::required_encoder_output_floats(pcm.size())));
+  int32_t frames = 0;
+  int32_t width = 0;
+  uint64_t digest = 0;
+  emel::error::type err = emel::error::cast(encoder::error::none);
+  encoder::sm machine{};
+  encoder::event::encode request{
+      loaded.contract, pcm, 16000, 1, workspace, output, frames, width, digest};
+  request.error_out = &err;
+  CHECK(machine.process_event(request));
+  CHECK(err == emel::error::cast(encoder::error::none));
+  CHECK(frames == 1);
+  CHECK(width == encoder::detail::k_embedding_length);
+  CHECK(machine.q4_0_dispatch_count() == 1u);
+  CHECK(machine.q8_0_dispatch_count() == 0u);
+  CHECK(machine.q4_1_dispatch_count() == 0u);
+}
+
+TEST_CASE("whisper_encoder_runs_q4_1_variant_from_public_event") {
+  auto loaded = load_fixture_or_skip();
+  if (loaded.model == nullptr) {
+    return;
+  }
+
+  mark_encoder_linear_weights(
+      *loaded.model, static_cast<int32_t>(emel::kernel::detail::dtype_q4_1));
+  const std::vector<float> pcm = deterministic_pcm(320);
+  std::vector<float> workspace(static_cast<size_t>(
+      encoder::detail::required_workspace_floats(pcm.size())));
+  std::vector<float> output(static_cast<size_t>(
+      encoder::detail::required_encoder_output_floats(pcm.size())));
+  int32_t frames = 0;
+  int32_t width = 0;
+  uint64_t digest = 0;
+  emel::error::type err = emel::error::cast(encoder::error::none);
+  encoder::sm machine{};
+  encoder::event::encode request{
+      loaded.contract, pcm, 16000, 1, workspace, output, frames, width, digest};
+  request.error_out = &err;
+  CHECK(machine.process_event(request));
+  CHECK(err == emel::error::cast(encoder::error::none));
+  CHECK(frames == 1);
+  CHECK(width == encoder::detail::k_embedding_length);
+  CHECK(machine.q4_1_dispatch_count() == 1u);
+  CHECK(machine.q8_0_dispatch_count() == 0u);
+  CHECK(machine.q4_0_dispatch_count() == 0u);
+}
+
+TEST_CASE("whisper_encoder_runs_q8_f32_aux_variant_from_public_event") {
+  auto loaded = load_fixture_or_skip();
+  if (loaded.model == nullptr) {
+    return;
+  }
+
+  const std::vector<float> aux_arena =
+      repoint_encoder_aux_tensors_to_f32(*loaded.model);
+  const std::vector<float> pcm = deterministic_pcm(320);
+  std::vector<float> workspace(static_cast<size_t>(
+      encoder::detail::required_workspace_floats(pcm.size())));
+  std::vector<float> output(static_cast<size_t>(
+      encoder::detail::required_encoder_output_floats(pcm.size())));
+  int32_t frames = 0;
+  int32_t width = 0;
+  uint64_t digest = 0;
+  emel::error::type err = emel::error::cast(encoder::error::none);
+  encoder::sm machine{};
+  encoder::event::encode request{
+      loaded.contract, pcm, 16000, 1, workspace, output, frames, width, digest};
+  request.error_out = &err;
+  CHECK(machine.process_event(request));
+  CHECK(err == emel::error::cast(encoder::error::none));
+  CHECK(frames == 1);
+  CHECK(width == encoder::detail::k_embedding_length);
+  CHECK(machine.q8_0_dispatch_count() == 1u);
+  CHECK(machine.q4_0_dispatch_count() == 0u);
+  CHECK(machine.q4_1_dispatch_count() == 0u);
+}
+
 TEST_CASE("whisper_recognizer_runs_fixture_through_public_actor") {
   auto loaded = load_fixture_or_skip();
   if (loaded.model == nullptr) {
diff --git a/tests/text/generator/decode_wavefront/lifecycle_tests.cpp b/tests/text/generator/decode_wavefront/lifecycle_tests.cpp
new file mode 100644
index 00000000..0bc35fd0
--- /dev/null
+++ b/tests/text/generator/decode_wavefront/lifecycle_tests.cpp
@@ -0,0 +1,577 @@
+#include <array>
+#include <atomic>
+#include <cstdint>
+#include <span>
+#include <thread>
+
+#include <doctest/doctest.h>
+
+#include "emel/error/error.hpp"
+#include "emel/graph/events.hpp"
+#include "emel/graph/sm.hpp"
+#include "emel/text/generator/decode_wavefront/sm.hpp"
+
+namespace {
+
+namespace wavefront = emel::text::generator::decode_wavefront;
+using execute_t = emel::graph::processor::event::execute;
+
+bool validate_ok(const execute_t &, int32_t * err_out) {
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool prepare_graph_reuse(const execute_t &, bool * reused_out, int32_t * err_out) {
+  if (reused_out != nullptr) {
+    *reused_out = true;
+  }
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool alloc_graph_ok(const execute_t &, int32_t * err_out) {
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool bind_inputs_ok(const execute_t &, int32_t * err_out) {
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool run_kernel_counting(const execute_t & request, int32_t * err_out) {
+  auto * calls = static_cast<int32_t *>(request.compute_ctx);
+  *calls += 1;
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool run_kernel_rejected(const execute_t & request, int32_t * err_out) {
+  auto * calls = static_cast<int32_t *>(request.compute_ctx);
+  *calls += 1;
+  if (err_out != nullptr) {
+    *err_out = 1;
+  }
+  return false;
+}
+
+struct parallel_kernel_context {
+  std::atomic<int32_t> entered{0};
+  std::atomic<bool> release{false};
+};
+
+bool run_kernel_wait_for_release(const execute_t & request, int32_t * err_out) {
+  auto * ctx = static_cast<parallel_kernel_context *>(request.compute_ctx);
+  ctx->entered.fetch_add(1, std::memory_order_release);
+  while (!ctx->release.load(std::memory_order_acquire)) {
+    std::this_thread::yield();
+  }
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool extract_outputs_ok(const execute_t &, int32_t * outputs_out, int32_t * err_out) {
+  if (outputs_out != nullptr) {
+    *outputs_out = 1;
+  }
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+struct reserve_callbacks {
+  bool done_called = false;
+  bool error_called = false;
+
+  static bool on_done(void * owner, const emel::graph::events::reserve_done &) noexcept {
+    auto * self = static_cast<reserve_callbacks *>(owner);
+    self->done_called = true;
+    return true;
+  }
+
+  static bool on_error(void * owner, const emel::graph::events::reserve_error &) noexcept {
+    auto * self = static_cast<reserve_callbacks *>(owner);
+    self->error_called = true;
+    return true;
+  }
+};
+
+struct compute_callbacks {
+  bool done_called = false;
+  bool error_called = false;
+  int32_t error_code = 0;
+
+  static bool on_done(void * owner, const emel::graph::events::compute_done &) noexcept {
+    auto * self = static_cast<compute_callbacks *>(owner);
+    self->done_called = true;
+    return true;
+  }
+
+  static bool on_error(void * owner, const emel::graph::events::compute_error & ev) noexcept {
+    auto * self = static_cast<compute_callbacks *>(owner);
+    self->error_called = true;
+    self->error_code = ev.err;
+    return true;
+  }
+};
+
+struct lifecycle_fixture {
+  int32_t leaf_tensor = 11;
+  int32_t compute_tensor = 29;
+  std::array<emel::graph::processor::event::lifecycle_tensor_binding, 2> tensors{{
+      {
+          .tensor_id = 0,
+          .buffer = &leaf_tensor,
+          .buffer_bytes = sizeof(leaf_tensor),
+          .consumer_refs = 0,
+          .is_leaf = true,
+      },
+      {
+          .tensor_id = 1,
+          .buffer = &compute_tensor,
+          .buffer_bytes = sizeof(compute_tensor),
+          .consumer_refs = 1,
+          .is_leaf = false,
+      },
+  }};
+  std::array<int32_t, 1> required_ids = {0};
+  std::array<int32_t, 1> publish_ids = {1};
+  std::array<int32_t, 1> release_ids = {1};
+  emel::graph::processor::event::lifecycle_phase phase{
+      .required_filled_ids = required_ids.data(),
+      .required_filled_count = static_cast<int32_t>(required_ids.size()),
+      .publish_ids = publish_ids.data(),
+      .publish_count = static_cast<int32_t>(publish_ids.size()),
+      .release_ids = release_ids.data(),
+      .release_count = static_cast<int32_t>(release_ids.size()),
+  };
+  emel::graph::processor::event::lifecycle_manifest reserve{
+      .tensors = tensors.data(),
+      .tensor_count = static_cast<int32_t>(tensors.size()),
+      .phase = nullptr,
+  };
+  emel::graph::processor::event::lifecycle_manifest compute{
+      .tensors = tensors.data(),
+      .tensor_count = static_cast<int32_t>(tensors.size()),
+      .phase = &phase,
+  };
+};
+
+struct graph_lane_fixture {
+  emel::graph::sm graph{};
+  lifecycle_fixture lifecycle{};
+  reserve_callbacks reserve_cb{};
+  compute_callbacks compute_cb{};
+  emel::graph::event::reserve_output reserve_output{};
+  emel::graph::event::compute_output compute_output{};
+  emel::graph::event::compute compute_request{};
+  int32_t kernel_calls = 0;
+  bool lane_accepted = false;
+
+  void reserve_graph() {
+    const emel::graph::event::reserve reserve_request{
+        .model_topology = reinterpret_cast<const void *>(0xA5),
+        .output_out = &reserve_output,
+        .lifecycle = &lifecycle.reserve,
+        .max_node_count = 4u,
+        .max_tensor_count = 5u,
+        .bytes_per_tensor = 8u,
+        .workspace_capacity_bytes = 64u,
+        .dispatch_done = {&reserve_cb, reserve_callbacks::on_done},
+        .dispatch_error = {&reserve_cb, reserve_callbacks::on_error},
+    };
+    REQUIRE(graph.process_event(reserve_request));
+    REQUIRE(reserve_cb.done_called);
+    REQUIRE_FALSE(reserve_cb.error_called);
+  }
+
+  void bind_compute(emel::graph::event::run_kernel_fn kernel_fn = run_kernel_counting,
+                    void * compute_ctx = nullptr) {
+    compute_request = emel::graph::event::compute{
+        .step_plan = reinterpret_cast<const void *>(0xB6),
+        .output_out = &compute_output,
+        .lifecycle = &lifecycle.compute,
+        .node_count_hint = reserve_output.node_count,
+        .tensor_count_hint = reserve_output.tensor_count,
+        .bytes_per_tensor = 8u,
+        .workspace_capacity_bytes = 64u,
+        .step_index = 0,
+        .step_size = 1,
+        .kv_tokens = 1,
+        .expected_outputs = 1,
+        .compute_ctx = compute_ctx == nullptr ? static_cast<void *>(&kernel_calls)
+                                              : compute_ctx,
+        .validate = validate_ok,
+        .prepare_graph = prepare_graph_reuse,
+        .alloc_graph = alloc_graph_ok,
+        .bind_inputs = bind_inputs_ok,
+        .run_kernel = kernel_fn,
+        .extract_outputs = extract_outputs_ok,
+        .dispatch_done = {&compute_cb, compute_callbacks::on_done},
+        .dispatch_error = {&compute_cb, compute_callbacks::on_error},
+    };
+  }
+};
+
+wavefront::event::compatibility_key make_key(
+    const void * model_identity,
+    const void * backend_identity,
+    const wavefront::event::kernel_route route =
+        wavefront::event::kernel_route::q8_k,
+    const wavefront::event::output_contract output =
+        wavefront::event::output_contract::preselected_argmax) {
+  return wavefront::event::compatibility_key{
+      .model_identity = model_identity,
+      .backend_identity = backend_identity,
+      .kernel_kind = emel::kernel::kernel_kind::x86_64,
+      .attention = emel::text::generator::attention_mode::flash,
+      .route = route,
+      .output = output,
+      .dtype_layout_contract = static_cast<uint32_t>(route),
+      .quantized_contract = static_cast<uint32_t>(route),
+      .step_size = 1,
+      .token_count = 1,
+  };
+}
+
+void prepare_lane(graph_lane_fixture & fixture,
+                  emel::graph::event::run_kernel_fn kernel_fn = run_kernel_counting,
+                  void * compute_ctx = nullptr) {
+  fixture.reserve_graph();
+  fixture.bind_compute(kernel_fn, compute_ctx);
+}
+
+template <class predicate>
+bool eventually(predicate && pred) {
+  for (int32_t attempt = 0; attempt < 100000; ++attempt) {
+    if (pred()) {
+      return true;
+    }
+    std::this_thread::yield();
+  }
+  return false;
+}
+
+}  // namespace
+
+TEST_CASE("decode wavefront dispatches one lane inline without grouping") {
+  int model_tag = 1;
+  int backend_tag = 2;
+  graph_lane_fixture fixture{};
+  prepare_lane(fixture);
+
+  const auto key = make_key(&model_tag, &backend_tag);
+  wavefront::event::lane lane{fixture.graph, fixture.compute_request, key,
+                              fixture.lane_accepted};
+  wavefront::event::dispatch_summary summary{};
+  wavefront::event::run request{std::span<wavefront::event::lane>{&lane, 1u},
+                                summary};
+  wavefront::sm machine{};
+
+  CHECK(machine.process_event(request));
+  CHECK(machine.is(stateforward::sml::state<wavefront::state_idle>));
+  CHECK_FALSE(summary.grouped);
+  CHECK(summary.dispatched_lanes == 1);
+  CHECK(summary.failed_lane == wavefront::event::k_no_failed_lane);
+  CHECK(fixture.lane_accepted);
+  CHECK(fixture.compute_cb.done_called);
+  CHECK_FALSE(fixture.compute_cb.error_called);
+  CHECK(fixture.compute_output.reused_topology == 1u);
+  CHECK(fixture.compute_output.node_count == fixture.reserve_output.node_count);
+  CHECK(fixture.kernel_calls == 1);
+}
+
+TEST_CASE("decode wavefront groups compatible lanes with bounded explicit stages") {
+  int model_tag = 1;
+  int backend_tag = 2;
+  std::array<graph_lane_fixture, 4> fixtures{};
+  for (auto & fixture : fixtures) {
+    prepare_lane(fixture);
+  }
+
+  const auto key = make_key(&model_tag, &backend_tag);
+  std::array<wavefront::event::lane, 4> lanes{{
+      {fixtures[0].graph, fixtures[0].compute_request, key, fixtures[0].lane_accepted},
+      {fixtures[1].graph, fixtures[1].compute_request, key, fixtures[1].lane_accepted},
+      {fixtures[2].graph, fixtures[2].compute_request, key, fixtures[2].lane_accepted},
+      {fixtures[3].graph, fixtures[3].compute_request, key, fixtures[3].lane_accepted},
+  }};
+  wavefront::event::dispatch_summary summary{};
+  wavefront::event::run request{std::span<wavefront::event::lane>{lanes},
+                                summary};
+  wavefront::sm machine{};
+
+  CHECK(machine.process_event(request));
+  CHECK(machine.is(stateforward::sml::state<wavefront::state_idle>));
+  CHECK(summary.grouped);
+  CHECK(summary.dispatched_lanes == 4);
+  CHECK(summary.failed_lane == wavefront::event::k_no_failed_lane);
+  for (const auto & fixture : fixtures) {
+    CHECK(fixture.lane_accepted);
+    CHECK(fixture.compute_cb.done_called);
+    CHECK_FALSE(fixture.compute_cb.error_called);
+    CHECK(fixture.compute_output.reused_topology == 1u);
+    CHECK(fixture.compute_output.node_count == fixture.reserve_output.node_count);
+    CHECK(fixture.kernel_calls == 1);
+  }
+}
+
+TEST_CASE("decode wavefront routes duplicate graph actors through serial path") {
+  int model_tag = 1;
+  int backend_tag = 2;
+  std::array<graph_lane_fixture, 2> fixtures{};
+  prepare_lane(fixtures[0]);
+
+  const auto key = make_key(&model_tag, &backend_tag);
+  std::array<wavefront::event::lane, 2> lanes{{
+      {fixtures[0].graph, fixtures[0].compute_request, key, fixtures[0].lane_accepted},
+      {fixtures[0].graph, fixtures[0].compute_request, key, fixtures[1].lane_accepted},
+  }};
+  wavefront::event::dispatch_summary summary{};
+  wavefront::event::run request{std::span<wavefront::event::lane>{lanes},
+                                summary};
+  wavefront::action::lane_pool pool{};
+  wavefront::sm machine{pool};
+
+  CHECK(machine.process_event(request));
+  CHECK(machine.is(stateforward::sml::state<wavefront::state_idle>));
+  CHECK(summary.grouped);
+  CHECK(summary.dispatched_lanes == 2);
+  CHECK(summary.failed_lane == wavefront::event::k_no_failed_lane);
+  CHECK(pool.scheduled_run_count() == 0u);
+  CHECK(fixtures[0].lane_accepted);
+  CHECK(fixtures[1].lane_accepted);
+  CHECK(fixtures[0].kernel_calls == 2);
+}
+
+TEST_CASE("decode wavefront lane pool dispatches compatible lanes concurrently") {
+  int model_tag = 1;
+  int backend_tag = 2;
+  std::array<graph_lane_fixture, 2> fixtures{};
+  parallel_kernel_context kernel_ctx{};
+  for (auto & fixture : fixtures) {
+    prepare_lane(fixture, run_kernel_wait_for_release, &kernel_ctx);
+  }
+
+  const auto key = make_key(&model_tag, &backend_tag);
+  std::array<wavefront::event::lane, 2> lanes{{
+      {fixtures[0].graph, fixtures[0].compute_request, key, fixtures[0].lane_accepted},
+      {fixtures[1].graph, fixtures[1].compute_request, key, fixtures[1].lane_accepted},
+  }};
+  wavefront::event::dispatch_summary summary{};
+  wavefront::event::run request{std::span<wavefront::event::lane>{lanes},
+                                summary};
+  wavefront::action::lane_pool pool{};
+  wavefront::sm machine{pool};
+  std::atomic<bool> dispatch_returned{false};
+  bool accepted = false;
+
+  std::thread dispatch_thread{[&]() {
+    accepted = machine.process_event(request);
+    dispatch_returned.store(true, std::memory_order_release);
+  }};
+
+  const bool both_lanes_entered = eventually([&]() {
+    return kernel_ctx.entered.load(std::memory_order_acquire) == 2;
+  });
+  CHECK(both_lanes_entered);
+  CHECK_FALSE(dispatch_returned.load(std::memory_order_acquire));
+
+  kernel_ctx.release.store(true, std::memory_order_release);
+  dispatch_thread.join();
+
+  CHECK(accepted);
+  CHECK(machine.is(stateforward::sml::state<wavefront::state_idle>));
+  CHECK(summary.grouped);
+  CHECK(summary.dispatched_lanes == 2);
+  CHECK(summary.failed_lane == wavefront::event::k_no_failed_lane);
+  CHECK(pool.scheduled_run_count() == 2u);
+  CHECK(pool.worker_run_count() == 2u);
+  for (const auto & fixture : fixtures) {
+    CHECK(fixture.lane_accepted);
+    CHECK(fixture.compute_cb.done_called);
+    CHECK_FALSE(fixture.compute_cb.error_called);
+    CHECK(fixture.compute_output.reused_topology == 1u);
+  }
+}
+
+TEST_CASE("decode wavefront rejects incompatible multi-lane groups before dispatch") {
+  int model_tag = 1;
+  int backend_tag = 2;
+  std::array<graph_lane_fixture, 2> fixtures{};
+  for (auto & fixture : fixtures) {
+    prepare_lane(fixture);
+  }
+
+  const auto first_key = make_key(&model_tag, &backend_tag);
+  const auto second_key =
+      make_key(&model_tag, &backend_tag, wavefront::event::kernel_route::kernel);
+  std::array<wavefront::event::lane, 2> lanes{{
+      {fixtures[0].graph, fixtures[0].compute_request, first_key,
+       fixtures[0].lane_accepted},
+      {fixtures[1].graph, fixtures[1].compute_request, second_key,
+       fixtures[1].lane_accepted},
+  }};
+  wavefront::event::dispatch_summary summary{};
+  wavefront::event::run request{std::span<wavefront::event::lane>{lanes},
+                                summary};
+  wavefront::sm machine{};
+
+  CHECK_FALSE(machine.process_event(request));
+  CHECK(machine.is(stateforward::sml::state<wavefront::state_idle>));
+  CHECK(summary.err == emel::error::cast(wavefront::error::incompatible_lanes));
+  CHECK_FALSE(summary.grouped);
+  CHECK(summary.dispatched_lanes == 0);
+  CHECK_FALSE(fixtures[0].lane_accepted);
+  CHECK_FALSE(fixtures[1].lane_accepted);
+  CHECK(fixtures[0].kernel_calls == 0);
+  CHECK(fixtures[1].kernel_calls == 0);
+}
+
+TEST_CASE("decode wavefront reports the first rejected lane and stops") {
+  int model_tag = 1;
+  int backend_tag = 2;
+  std::array<graph_lane_fixture, 3> fixtures{};
+  prepare_lane(fixtures[0]);
+  prepare_lane(fixtures[1], run_kernel_rejected);
+  prepare_lane(fixtures[2]);
+
+  const auto key = make_key(&model_tag, &backend_tag);
+  std::array<wavefront::event::lane, 3> lanes{{
+      {fixtures[0].graph, fixtures[0].compute_request, key, fixtures[0].lane_accepted},
+      {fixtures[1].graph, fixtures[1].compute_request, key, fixtures[1].lane_accepted},
+      {fixtures[2].graph, fixtures[2].compute_request, key, fixtures[2].lane_accepted},
+  }};
+  wavefront::event::dispatch_summary summary{};
+  wavefront::event::run request{std::span<wavefront::event::lane>{lanes},
+                                summary};
+  wavefront::sm machine{};
+
+  CHECK_FALSE(machine.process_event(request));
+  CHECK(machine.is(stateforward::sml::state<wavefront::state_idle>));
+  CHECK(summary.err == emel::error::cast(wavefront::error::lane_rejected));
+  CHECK(summary.grouped);
+  CHECK(summary.dispatched_lanes == 2);
+  CHECK(summary.failed_lane == 1);
+  CHECK(fixtures[0].lane_accepted);
+  CHECK_FALSE(fixtures[1].lane_accepted);
+  CHECK_FALSE(fixtures[2].lane_accepted);
+  CHECK(fixtures[0].kernel_calls == 1);
+  CHECK(fixtures[1].kernel_calls == 1);
+  CHECK(fixtures[2].kernel_calls == 0);
+}
+
+TEST_CASE("decode wavefront parallel dispatch reports first rejected lane after join") {
+  int model_tag = 1;
+  int backend_tag = 2;
+  std::array<graph_lane_fixture, 3> fixtures{};
+  prepare_lane(fixtures[0]);
+  prepare_lane(fixtures[1], run_kernel_rejected);
+  prepare_lane(fixtures[2]);
+
+  const auto key = make_key(&model_tag, &backend_tag);
+  std::array<wavefront::event::lane, 3> lanes{{
+      {fixtures[0].graph, fixtures[0].compute_request, key, fixtures[0].lane_accepted},
+      {fixtures[1].graph, fixtures[1].compute_request, key, fixtures[1].lane_accepted},
+      {fixtures[2].graph, fixtures[2].compute_request, key, fixtures[2].lane_accepted},
+  }};
+  wavefront::event::dispatch_summary summary{};
+  wavefront::event::run request{std::span<wavefront::event::lane>{lanes},
+                                summary};
+  wavefront::action::lane_pool pool{};
+  wavefront::sm machine{pool};
+
+  CHECK_FALSE(machine.process_event(request));
+  CHECK(machine.is(stateforward::sml::state<wavefront::state_idle>));
+  CHECK(summary.err == emel::error::cast(wavefront::error::lane_rejected));
+  CHECK(summary.grouped);
+  CHECK(summary.dispatched_lanes == 3);
+  CHECK(summary.failed_lane == 1);
+  CHECK(fixtures[0].lane_accepted);
+  CHECK_FALSE(fixtures[1].lane_accepted);
+  CHECK(fixtures[2].lane_accepted);
+  CHECK(fixtures[0].kernel_calls == 1);
+  CHECK(fixtures[1].kernel_calls == 1);
+  CHECK(fixtures[2].kernel_calls == 1);
+}
+
+TEST_CASE("decode wavefront rejects requests beyond the fixed lane bound") {
+  int model_tag = 1;
+  int backend_tag = 2;
+  emel::graph::sm graph{};
+  emel::graph::event::compute compute{};
+  std::array<bool, wavefront::event::k_max_lanes + 1u> accepted{};
+  const auto key = make_key(&model_tag, &backend_tag);
+  std::array<wavefront::event::lane, wavefront::event::k_max_lanes + 1u> lanes{{
+      {graph, compute, key, accepted[0]},
+      {graph, compute, key, accepted[1]},
+      {graph, compute, key, accepted[2]},
+      {graph, compute, key, accepted[3]},
+      {graph, compute, key, accepted[4]},
+      {graph, compute, key, accepted[5]},
+      {graph, compute, key, accepted[6]},
+      {graph, compute, key, accepted[7]},
+      {graph, compute, key, accepted[8]},
+  }};
+  wavefront::event::dispatch_summary summary{};
+  wavefront::event::run request{std::span<wavefront::event::lane>{lanes},
+                                summary};
+  wavefront::sm machine{};
+
+  CHECK_FALSE(machine.process_event(request));
+  CHECK(machine.is(stateforward::sml::state<wavefront::state_idle>));
+  CHECK(summary.err == emel::error::cast(wavefront::error::invalid_request));
+  CHECK(summary.dispatched_lanes == 0);
+  for (const bool lane_accepted : accepted) {
+    CHECK_FALSE(lane_accepted);
+  }
+}
+
+TEST_CASE("decode wavefront async surface completes within the RTC call") {
+  int model_tag = 1;
+  int backend_tag = 2;
+  graph_lane_fixture fixture{};
+  prepare_lane(fixture);
+
+  const auto key = make_key(&model_tag, &backend_tag);
+  wavefront::event::lane lane{fixture.graph, fixture.compute_request, key,
+                              fixture.lane_accepted};
+  wavefront::event::dispatch_summary summary{};
+  wavefront::event::run request{std::span<wavefront::event::lane>{&lane, 1u},
+                                summary};
+  wavefront::sm machine{};
+
+  emel::bool_task task = machine.process_event_async(request);
+  CHECK(task.result());
+  CHECK(machine.is(stateforward::sml::state<wavefront::state_idle>));
+  CHECK(summary.dispatched_lanes == 1);
+  CHECK(fixture.lane_accepted);
+  CHECK(fixture.compute_cb.done_called);
+  CHECK(fixture.kernel_calls == 1);
+}
+
+TEST_CASE("decode wavefront async surface normalizes invalid requests") {
+  wavefront::event::dispatch_summary summary{};
+  wavefront::event::run request{std::span<wavefront::event::lane>{}, summary};
+  wavefront::sm machine{};
+
+  emel::bool_task task = machine.process_event_async(request);
+  CHECK(task.await_ready());
+  CHECK_FALSE(task.result());
+  CHECK(machine.is(stateforward::sml::state<wavefront::state_idle>));
+  CHECK(summary.err == emel::error::cast(wavefront::error::invalid_request));
+  CHECK(summary.dispatched_lanes == 0);
+}
diff --git a/tests/text/generator/detail_tests.cpp b/tests/text/generator/detail_tests.cpp
index 34699d59..91bfbea3 100644
--- a/tests/text/generator/detail_tests.cpp
+++ b/tests/text/generator/detail_tests.cpp
@@ -152,6 +152,13 @@ struct runtime_request_fixture {
   }
 };
 
+void bind_neox_rope_pairing(emel::model::data & model) {
+  model.params.rope_pair_x0_stride = 1;
+  model.params.rope_pair_x1_stride = 1;
+  model.params.rope_pair_x1_offset = 0;
+  model.params.rope_pair_x1_half_rot_offset = 1;
+}
+
 struct qwen3_runtime_fixture {
   emel::model::data model = {};
   std::vector<std::vector<float>> tensor_storage = {};
@@ -167,6 +174,7 @@ struct qwen3_runtime_fixture {
     model.params.n_layer = 1;
     model.params.attention_layer_norm_rms_epsilon = 1.0e-5f;
     model.params.rope_freq_base = 10000.0f;
+    bind_neox_rope_pairing(model);
     model.n_layers = 1;
     model.weights_data = model.tensors.data();
     model.weights_size = 1u;
@@ -278,6 +286,7 @@ struct gemma4_runtime_fixture {
     model.params.attention_layer_norm_rms_epsilon = 1.0e-6f;
     model.params.rope_freq_base = 10000.0f;
     model.params.tie_word_embeddings = true;
+    bind_neox_rope_pairing(model);
     model.n_layers = 1;
     model.weights_data = model.tensors.data();
     model.weights_size = 1u;
@@ -556,6 +565,7 @@ template <int32_t prompt_tokens> struct hybrid_chunked_q8_runtime_fixture {
 
   hybrid_chunked_q8_runtime_fixture() {
     std::memcpy(model.architecture_name.data(), "lfm2", 4u);
+    bind_neox_rope_pairing(model);
     for (int32_t token = 0; token < k_prompt_tokens; ++token) {
       token_ids[static_cast<size_t>(token)] = token;
       positions[static_cast<size_t>(token)] = token;
@@ -661,6 +671,8 @@ template <int32_t prompt_tokens> struct hybrid_chunked_q8_runtime_fixture {
     attention_block.attention_output = attention_block.attention_q;
     attention_block.attention_q_norm = attention_q_norm_storage;
     attention_block.attention_k_norm = attention_k_norm_storage;
+    attention_block.attention_rope_pairing =
+        emel::text::generator::detail::neox_rope_pairing();
     attention_block.feed_forward_norm = ffn_norm_storage;
     attention_block.feed_forward_gate = shortconv_block.feed_forward_gate;
     attention_block.feed_forward_down = shortconv_block.feed_forward_gate;
@@ -822,6 +834,36 @@ void apply_rope_reference(std::span<float> vector, const int32_t head_count,
   }
 }
 
+void apply_rope_neox_reference(std::span<float> vector,
+                               const int32_t head_count,
+                               const int32_t head_dim, const int32_t n_rot,
+                               const int32_t position,
+                               const float rope_freq_base) {
+  const int32_t rot_dim = std::min(n_rot, head_dim);
+  if (head_count <= 0 || head_dim <= 1 || rot_dim <= 1) {
+    return;
+  }
+
+  const float theta_scale =
+      ::powf(rope_freq_base, -2.0f / static_cast<float>(rot_dim));
+  const int32_t pair_stride = rot_dim / 2;
+  for (int32_t head = 0; head < head_count; ++head) {
+    float *head_ptr = vector.data() + (static_cast<size_t>(head) *
+                                       static_cast<size_t>(head_dim));
+    float theta = static_cast<float>(position);
+    for (int32_t dim = 0; dim < pair_stride; ++dim) {
+      const float cos_theta = ::cosf(theta);
+      const float sin_theta = ::sinf(theta);
+      const int32_t dim1 = dim + pair_stride;
+      const float x0 = head_ptr[dim];
+      const float x1 = head_ptr[dim1];
+      head_ptr[dim] = x0 * cos_theta - x1 * sin_theta;
+      head_ptr[dim1] = x0 * sin_theta + x1 * cos_theta;
+      theta *= theta_scale;
+    }
+  }
+}
+
 std::vector<float> flash_attention_online_reference(
     const emel::text::generator::detail::native_backend &backend,
     const int32_t layer_index, const int32_t position,
@@ -929,6 +971,126 @@ TEST_CASE("generator_detail_apply_rope_matches_ggml_float_recurrence") {
   }
 }
 
+TEST_CASE("generator_detail_lfm2_attention_uses_neox_rope_layout") {
+  constexpr int32_t k_embd = 64;
+  constexpr int32_t k_position = 3;
+  auto model = std::make_unique<emel::model::data>();
+  std::memcpy(model->architecture_name.data(), "lfm2", 4u);
+
+  std::vector<float> identity(static_cast<size_t>(k_embd) *
+                                  static_cast<size_t>(k_embd),
+                              0.0f);
+  std::vector<float> zero_matrix(static_cast<size_t>(k_embd) *
+                                     static_cast<size_t>(k_embd),
+                                 0.0f);
+  for (int32_t idx = 0; idx < k_embd; ++idx) {
+    identity[static_cast<size_t>(idx) * static_cast<size_t>(k_embd) +
+             static_cast<size_t>(idx)] = 1.0f;
+  }
+
+  auto identity_tensor =
+      make_tensor_record(identity.data(), emel::kernel::detail::dtype_f32,
+                         k_embd, k_embd);
+  auto zero_tensor =
+      make_tensor_record(zero_matrix.data(), emel::kernel::detail::dtype_f32,
+                         k_embd, k_embd);
+
+  auto backend =
+      std::make_unique<emel::text::generator::detail::native_backend>();
+  backend->model = model.get();
+  backend->kernel_kind = emel::kernel::kernel_kind::x86_64;
+  backend->n_embd = k_embd;
+  backend->n_head = 1;
+  backend->n_head_kv = 1;
+  backend->n_layer = 1;
+  backend->n_ctx = 8;
+  backend->n_rot = k_embd;
+  backend->head_dim = k_embd;
+  backend->head_dim_kv = k_embd;
+  backend->max_q_dim = k_embd;
+  backend->max_kv_dim = k_embd;
+  backend->n_rep = 1;
+  backend->rms_epsilon = 1.0e-6f;
+  backend->rope_freq_base = 1000000.0f;
+  backend->blocks.resize(1u);
+  backend->layer_cache_offsets = {0u};
+  backend->flash_layer_cache_offsets = {0u};
+  backend->hidden.resize(k_embd);
+  backend->norm.resize(k_embd);
+  backend->q.resize(k_embd);
+  backend->q_attn.resize(k_embd);
+  backend->k.resize(k_embd);
+  backend->v.resize(k_embd);
+  backend->attn_scores.resize(backend->n_ctx);
+  backend->attn_probs.resize(backend->n_ctx);
+  backend->attn_probs_rounded.resize(backend->n_ctx);
+  backend->attn_value_column.resize(backend->n_ctx);
+  backend->attn_ctx.resize(k_embd);
+  backend->projected.resize(k_embd);
+  backend->gate.resize(k_embd);
+  backend->up.resize(k_embd);
+  backend->ffn_hidden.resize(k_embd);
+  backend->key_cache.resize(static_cast<size_t>(backend->n_ctx) *
+                            static_cast<size_t>(k_embd));
+  backend->value_cache.resize(static_cast<size_t>(backend->n_ctx) *
+                              static_cast<size_t>(k_embd));
+  backend->flash_key_cache.resize(static_cast<size_t>(backend->n_ctx) *
+                                  static_cast<size_t>(k_embd));
+  backend->flash_value_cache.resize(static_cast<size_t>(backend->n_ctx) *
+                                    static_cast<size_t>(k_embd));
+
+  for (int32_t idx = 0; idx < k_embd; ++idx) {
+    backend->hidden[static_cast<size_t>(idx)] =
+        std::sin(static_cast<float>(idx + 1) * 0.03125f);
+  }
+
+  auto &block = backend->blocks.front();
+  block.uses_attention = true;
+  block.attention_norm.assign(k_embd, 1.0f);
+  block.attention_q.tensor = &identity_tensor;
+  block.attention_q.rows = k_embd;
+  block.attention_q.cols = k_embd;
+  block.attention_k = block.attention_q;
+  block.attention_v = block.attention_q;
+  block.attention_output = block.attention_q;
+  block.attention_q_norm.assign(k_embd, 1.0f);
+  block.attention_k_norm.assign(k_embd, 1.0f);
+  block.feed_forward_norm.assign(k_embd, 1.0f);
+  block.feed_forward_gate.tensor = &zero_tensor;
+  block.feed_forward_gate.rows = k_embd;
+  block.feed_forward_gate.cols = k_embd;
+  block.feed_forward_down = block.feed_forward_gate;
+  block.feed_forward_up = block.feed_forward_gate;
+  block.attention_q_dim = k_embd;
+  block.attention_kv_dim = k_embd;
+  block.attention_head_dim = k_embd;
+  block.attention_head_dim_kv = k_embd;
+  block.attention_rope_dim = k_embd;
+  block.attention_rope_freq_base = backend->rope_freq_base;
+  block.attention_rope_pairing =
+      emel::text::generator::detail::neox_rope_pairing();
+
+  std::array<float, k_embd> expected_k = {};
+  REQUIRE(emel::text::generator::detail::rms_norm(
+      backend->hidden, block.attention_norm, backend->rms_epsilon,
+      std::span<float>(expected_k.data(), expected_k.size())));
+  apply_qwen3_headwise_rms_norm(expected_k, block.attention_k_norm, 1, k_embd,
+                                backend->rms_epsilon);
+  apply_rope_neox_reference(expected_k, 1, k_embd, k_embd, k_position,
+                            backend->rope_freq_base);
+
+  REQUIRE(emel::text::generator::detail::run_layer_nonflash(
+      *backend, 0, k_position));
+  const size_t cache_offset = emel::text::generator::detail::layer_cache_offset(
+      *backend, block, 0, k_position);
+  for (size_t idx = 0; idx < expected_k.size(); ++idx) {
+    CHECK(backend->k[idx] == doctest::Approx(expected_k[idx]).epsilon(1.0e-5));
+    CHECK(emel::text::generator::detail::quant::fp16_to_fp32(
+              backend->key_cache[cache_offset + idx]) ==
+          doctest::Approx(round_fp16_value(expected_k[idx])).epsilon(1.0e-5));
+  }
+}
+
 TEST_CASE("generator_detail_dequantizes_q2_k_blocks") {
   block_q2_k block = {};
   block.d = 0x3c00u;
@@ -1013,8 +1175,14 @@ TEST_CASE(
   backend.output = backend.output_native;
   backend.output_argmax = backend.output_native;
 
+#if defined(__aarch64__) && defined(__ARM_NEON) &&                             \
+    defined(__ARM_FEATURE_DOTPROD)
   CHECK(
       emel::text::generator::detail::packed_q6_k_x8_logits_supported(backend));
+#else
+  CHECK_FALSE(
+      emel::text::generator::detail::packed_q6_k_x8_logits_supported(backend));
+#endif
 #if defined(__aarch64__) && defined(__ARM_NEON) &&                             \
     defined(__ARM_FEATURE_MATMUL_INT8)
   CHECK(emel::text::generator::detail::prepared_q6_k_x8_q8_logits_supported(
@@ -1027,11 +1195,21 @@ TEST_CASE(
 
   REQUIRE(emel::text::generator::detail::prepare_output_logits(backend));
   REQUIRE(emel::text::generator::detail::prepare_q8_input_workspace(backend));
+#if defined(__aarch64__) && defined(__ARM_NEON) &&                             \
+    (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
   CHECK(backend.q8_input_storage.size() == 1u);
   CHECK(emel::text::generator::guard::detail::q8_input_path_supported(
       backend, backend.output));
   CHECK(emel::text::generator::guard::detail::q8_input_argmax_path_supported(
       backend, backend.output_argmax));
+#else
+  CHECK(backend.q8_input_storage.empty());
+  CHECK_FALSE(emel::text::generator::guard::detail::q8_input_path_supported(
+      backend, backend.output));
+  CHECK_FALSE(
+      emel::text::generator::guard::detail::q8_input_argmax_path_supported(
+          backend, backend.output_argmax));
+#endif
 
 #if defined(__aarch64__) && defined(__ARM_NEON) &&                             \
     defined(__ARM_FEATURE_MATMUL_INT8)
@@ -1269,7 +1447,7 @@ TEST_CASE("generator_detail_routes_static_q4_block_matrices_through_generic_q8_"
 
 TEST_CASE("generator_detail_q6_logits_paths_slice_oversized_q8_workspace") {
 #if !(defined(__aarch64__) && defined(__ARM_NEON))
-  SUCCEED();
+  CHECK(true);
 #else
   auto q6_rows = make_q6_rows();
   auto q6_tensor = make_tensor_record(
@@ -2088,9 +2266,16 @@ TEST_CASE(
   emel::model::data::tensor_record packed_tensor = q6_tensor;
   packed_tensor.type = emel::kernel::detail::dtype_q6_k_x8;
   backend.output_argmax.tensor = &packed_tensor;
+#if defined(__aarch64__) && defined(__ARM_NEON) &&                             \
+    defined(__ARM_FEATURE_DOTPROD)
   CHECK(
       emel::text::generator::guard::detail::preselected_argmax_direct_supported(
           backend));
+#else
+  CHECK_FALSE(
+      emel::text::generator::guard::detail::preselected_argmax_direct_supported(
+          backend));
+#endif
 
   emel::model::data::tensor_record unsupported_tensor = q6_tensor;
   unsupported_tensor.type = emel::kernel::detail::dtype_f32;
@@ -2398,7 +2583,7 @@ TEST_CASE("generator_detail_chunk4_packed_q8_0_helpers_are_explicit_and_"
   CHECK(backend.kernel_dispatch_calls == 1u);
   CHECK(backend.packed_q8_0_dispatch_calls == 1u);
 #else
-  CHECK_FALSE(emel::text::generator::detail::prepare_packed_q8_0_chunk4_input(
+  REQUIRE(emel::text::generator::detail::prepare_packed_q8_0_chunk4_input(
       backend, rhs_dense, col_count));
   std::vector<float> output(
       static_cast<size_t>(rhs_rows) * static_cast<size_t>(row_count), 0.0f);
diff --git a/tests/text/generator/lifecycle_tests.cpp b/tests/text/generator/lifecycle_tests.cpp
index 5b887c1e..c4c1ef6f 100644
--- a/tests/text/generator/lifecycle_tests.cpp
+++ b/tests/text/generator/lifecycle_tests.cpp
@@ -47,6 +47,14 @@ constexpr bool host_is_aarch64() noexcept {
 #endif
 }
 
+constexpr bool host_is_x86_64() noexcept {
+#if defined(__x86_64__) || defined(_M_X64)
+  return true;
+#else
+  return false;
+#endif
+}
+
 struct callback_tracker {
   bool initialize_done_called = false;
   bool initialize_error_called = false;
@@ -1058,7 +1066,7 @@ TEST_CASE("generator_rejects_invalid_initialize_request") {
 }
 
 TEST_CASE("generator_initialize_rejects_missing_injected_dependencies_through_sml") {
-  emel::text::generator::sm generator{};
+  auto generator = std::make_unique<emel::text::generator::sm>();
   emel::text::tokenizer::sm tokenizer{};
   std::array<emel::logits::sampler::fn, 1> samplers = {
       emel::logits::sampler::fn::from<sampler_select_argmax>(),
@@ -1081,8 +1089,8 @@ TEST_CASE("generator_initialize_rejects_missing_injected_dependencies_through_sm
           &tracker,
           on_initialize_error);
 
-  CHECK_FALSE(generator.process_event(request));
-  CHECK(generator.is(stateforward::sml::state<emel::text::generator::uninitialized>));
+  CHECK_FALSE(generator->process_event(request));
+  CHECK(generator->is(stateforward::sml::state<emel::text::generator::uninitialized>));
   CHECK_FALSE(tracker.initialize_done_called);
   CHECK(tracker.initialize_error_called);
   CHECK(error == emel::error::cast(emel::text::generator::error::invalid_request));
@@ -1155,7 +1163,7 @@ TEST_CASE("generator_generate_runs_native_generator_contract") {
   const auto diagnostics = capture_generator_diagnostics(*fixture->generator);
   CHECK(diagnostics.kernel_dispatch_calls > 0u);
   CHECK(diagnostics.flash_attention_dispatch_calls > 0u);
-  if (host_is_aarch64()) {
+  if (host_is_aarch64() || host_is_x86_64()) {
     CHECK(diagnostics.optimized_flash_dispatch_calls > 0u);
     CHECK(diagnostics.shared_flash_dispatch_calls == 0u);
   } else {
@@ -1529,6 +1537,18 @@ TEST_CASE("generator_generate_quantized_contract_fixture_preserves_zero_disallow
     CHECK(diagnostics.optimized_q6_vector_prepared_q8_rhs_i8mm_dispatch_calls == 0u);
     CHECK(diagnostics.optimized_q6_vector_prepared_q8_rhs_dispatch_calls == 0u);
 #endif
+  } else if (host_is_x86_64()) {
+    CHECK(diagnostics.optimized_q2_dispatch_calls > 0u);
+    CHECK(diagnostics.shared_q2_dispatch_calls == 0u);
+    CHECK(diagnostics.optimized_q3_dispatch_calls > 0u);
+    CHECK(diagnostics.shared_q3_dispatch_calls == 0u);
+    CHECK(diagnostics.optimized_q6_dispatch_calls > 0u);
+    CHECK(diagnostics.shared_q6_dispatch_calls == 0u);
+    CHECK(diagnostics.optimized_q6_vector_dispatch_calls == 0u);
+    CHECK(diagnostics.optimized_q6_vector_packed_dispatch_calls == 0u);
+    CHECK(diagnostics.optimized_q6_vector_packed_q8_rhs_dispatch_calls == 0u);
+    CHECK(diagnostics.optimized_q6_vector_prepared_q8_rhs_dispatch_calls == 0u);
+    CHECK(diagnostics.optimized_q6_vector_prepared_q8_rhs_i8mm_dispatch_calls == 0u);
   } else {
     CHECK(diagnostics.optimized_q6_vector_dispatch_calls == 0u);
     CHECK(diagnostics.optimized_q6_vector_packed_dispatch_calls == 0u);
diff --git a/tests/text/generator/parallel_matmul_tests.cpp b/tests/text/generator/parallel_matmul_tests.cpp
new file mode 100644
index 00000000..e358ff3b
--- /dev/null
+++ b/tests/text/generator/parallel_matmul_tests.cpp
@@ -0,0 +1,245 @@
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include <doctest/doctest.h>
+
+// View-sliced parallel matmul behavior: pack-group-aligned slice arithmetic
+// and bit-exact parity between serial and parallel lane dispatch. Route-level
+// proof against maintained model fixtures lives in the generator lifecycle
+// tests; this file covers the slicing and fork/join detail surface.
+
+#include "emel/text/generator/detail.hpp"
+
+namespace {
+
+namespace gen_detail = emel::text::generator::detail;
+using emel::kernel::event::dtype;
+using gen_detail::k_matmul_lanes;
+using gen_detail::matmul_lane_mode;
+using gen_detail::matmul_row_slice;
+
+emel::model::data::tensor_record make_tensor_record(void *data,
+                                                    const int32_t type,
+                                                    const int32_t cols,
+                                                    const int32_t rows) {
+  emel::model::data::tensor_record tensor = {};
+  tensor.data = data;
+  tensor.type = type;
+  tensor.n_dims = 2;
+  tensor.dims[0] = static_cast<int64_t>(cols);
+  tensor.dims[1] = static_cast<int64_t>(rows);
+  tensor.data_size =
+      gen_detail::row_storage_bytes(tensor, cols) * static_cast<uint64_t>(rows);
+  return tensor;
+}
+
+void check_slices_tile_rows(const uint64_t rows, const uint64_t group_rows) {
+  CAPTURE(rows);
+  CAPTURE(group_rows);
+  std::array<matmul_row_slice, k_matmul_lanes> slices = {};
+  const size_t count =
+      gen_detail::compute_matmul_row_slices(rows, group_rows, slices);
+  REQUIRE(count >= 1u);
+  REQUIRE(count <= k_matmul_lanes);
+  uint64_t expected_begin = 0u;
+  for (size_t lane = 0; lane < count; ++lane) {
+    CHECK(static_cast<uint64_t>(slices[lane].row_begin) == expected_begin);
+    CHECK(slices[lane].row_count > 0);
+    CHECK(static_cast<uint64_t>(slices[lane].row_begin) % group_rows == 0u);
+    expected_begin += static_cast<uint64_t>(slices[lane].row_count);
+  }
+  CHECK(expected_begin == rows);
+}
+
+TEST_CASE("parallel matmul slice group rows match pack formats") {
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::f32) == 1u);
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::f16) == 1u);
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::q8_0) == 1u);
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::q4_k) == 1u);
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::q6_k) == 1u);
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::q8_0_x4_bl4) == 4u);
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::q8_0_x4_bl8) == 4u);
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::q4_k_x8_bl4) == 8u);
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::q4_k_x8_bl8) == 8u);
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::q6_k_x8) == 8u);
+  CHECK(gen_detail::matmul_slice_group_rows(dtype::q6_k_x8_q8_prepared) == 8u);
+  CHECK(gen_detail::matmul_slice_group_rows(
+            dtype::q6_k_x8_q8_argmax_prepared) == 8u);
+}
+
+TEST_CASE("parallel matmul slices tile rows contiguously and group aligned") {
+  check_slices_tile_rows(13u, 1u);
+  check_slices_tile_rows(3u, 1u);
+  check_slices_tile_rows(1u, 1u);
+  check_slices_tile_rows(256u, 1u);
+  check_slices_tile_rows(64u, 4u);
+  check_slices_tile_rows(20u, 4u);
+  check_slices_tile_rows(100u, 8u);
+  check_slices_tile_rows(8u, 8u);
+  check_slices_tile_rows(4096u, 8u);
+}
+
+TEST_CASE("parallel matmul ragged tail lands in final slice") {
+  std::array<matmul_row_slice, k_matmul_lanes> slices = {};
+  const size_t count = gen_detail::compute_matmul_row_slices(100u, 8u, slices);
+  REQUIRE(count == k_matmul_lanes);
+  const auto &tail = slices[count - 1u];
+  CHECK(static_cast<uint64_t>(tail.row_begin) % 8u == 0u);
+  CHECK(static_cast<uint64_t>(tail.row_begin) +
+            static_cast<uint64_t>(tail.row_count) ==
+        100u);
+}
+
+TEST_CASE("parallel matmul sliced event offsets views by storage groups") {
+  std::array<uint8_t, 1024> src0_storage = {};
+  std::array<float, 64> dst_storage = {};
+  emel::kernel::event::op_mul_mat ev = {};
+  ev.src0.data = src0_storage.data();
+  ev.src0.type = dtype::q4_k_x8_bl8;
+  ev.src0.ne = {256u, 64u, 1u, 1u};
+  ev.src0.nb = {1u, 128u, 128u * 8u, 128u * 8u};
+  ev.dst.data = dst_storage.data();
+  ev.dst.type = dtype::f32;
+  ev.dst.ne = {1u, 64u, 1u, 1u};
+  ev.dst.nb = {sizeof(float), sizeof(float), sizeof(float) * 64u,
+               sizeof(float) * 64u};
+
+  const matmul_row_slice slice{16, 24};
+  const auto sliced = gen_detail::compute_sliced_mul_mat_event(ev, 8u, slice);
+  CHECK(sliced.src0.data == src0_storage.data() + (16u / 8u) * 128u);
+  CHECK(sliced.src0.ne[1] == 24u);
+  CHECK(sliced.src0.nb[1] == ev.src0.nb[1]);
+  CHECK(sliced.src0.nb[2] == 128u * 3u);
+  CHECK(sliced.dst.data ==
+        reinterpret_cast<uint8_t *>(dst_storage.data()) + 16u * sizeof(float));
+  CHECK(sliced.dst.ne[1] == 24u);
+  CHECK(sliced.src1.data == ev.src1.data);
+}
+
+struct parallel_backend_fixture {
+  gen_detail::native_backend backend = {};
+
+  parallel_backend_fixture() {
+    backend.kernel_kind = gen_detail::detect_host_kernel_kind();
+    backend.kernel.set_kind(backend.kernel_kind);
+    backend.lane_pool.emplace();
+  }
+};
+
+TEST_CASE("parallel matmul f32 gemv matches serial dispatch bit exact") {
+  constexpr int32_t rows = 61;
+  constexpr int32_t cols = 32;
+  std::vector<float> weights(static_cast<size_t>(rows) *
+                             static_cast<size_t>(cols));
+  for (size_t idx = 0; idx < weights.size(); ++idx) {
+    weights[idx] = 0.25f * static_cast<float>((idx * 31u + 7u) % 17u) - 2.0f;
+  }
+  std::vector<float> input(static_cast<size_t>(cols));
+  for (size_t idx = 0; idx < input.size(); ++idx) {
+    input[idx] = 0.5f * static_cast<float>((idx * 13u + 3u) % 11u) - 2.5f;
+  }
+
+  auto record = make_tensor_record(weights.data(),
+                                   emel::kernel::detail::dtype_f32, cols, rows);
+  gen_detail::tensor_matrix matrix{&record, rows, cols};
+
+  parallel_backend_fixture fixture;
+  std::vector<float> out_serial(static_cast<size_t>(rows), -1.0f);
+  std::vector<float> out_parallel(static_cast<size_t>(rows), -2.0f);
+
+  emel::kernel::event::op_mul_mat serial_ev{
+      .src0 = gen_detail::make_src_view(matrix),
+      .src1 = gen_detail::make_src_view(input.data(), static_cast<uint64_t>(1u),
+                                        static_cast<uint64_t>(input.size())),
+      .dst = gen_detail::make_dst_view(
+          out_serial.data(), static_cast<uint64_t>(1u),
+          static_cast<uint64_t>(out_serial.size())),
+  };
+  emel::kernel::event::op_mul_mat parallel_ev = serial_ev;
+  parallel_ev.dst =
+      gen_detail::make_dst_view(out_parallel.data(), static_cast<uint64_t>(1u),
+                                static_cast<uint64_t>(out_parallel.size()));
+
+  CHECK(gen_detail::compute_mul_mat<matmul_lane_mode::serial>(fixture.backend,
+                                                              serial_ev));
+  CHECK(gen_detail::compute_mul_mat<matmul_lane_mode::parallel>(fixture.backend,
+                                                                parallel_ev));
+  CHECK(std::memcmp(out_serial.data(), out_parallel.data(),
+                    out_serial.size() * sizeof(float)) == 0);
+}
+
+TEST_CASE("parallel matmul q8_0 gemv matches serial dispatch bit exact") {
+  constexpr int32_t rows = 61;
+  constexpr int32_t cols = 64;
+  constexpr size_t blocks_per_row =
+      static_cast<size_t>(cols) / static_cast<size_t>(gen_detail::quant::QK8_0);
+  std::vector<gen_detail::quant::block_q8_0> weights(static_cast<size_t>(rows) *
+                                                     blocks_per_row);
+  for (size_t block = 0; block < weights.size(); ++block) {
+    weights[block].d = gen_detail::quant::fp32_to_fp16(
+        0.01f + 0.001f * static_cast<float>(block % 7u));
+    for (size_t idx = 0; idx < weights[block].qs.size(); ++idx) {
+      weights[block].qs[idx] = static_cast<int8_t>(
+          static_cast<int32_t>((block * 37u + idx * 5u) % 255u) - 127);
+    }
+  }
+  std::vector<float> input(static_cast<size_t>(cols));
+  for (size_t idx = 0; idx < input.size(); ++idx) {
+    input[idx] = 0.125f * static_cast<float>((idx * 7u + 1u) % 19u) - 1.0f;
+  }
+
+  auto record = make_tensor_record(
+      weights.data(), emel::kernel::detail::dtype_q8_0, cols, rows);
+  gen_detail::tensor_matrix matrix{&record, rows, cols};
+
+  parallel_backend_fixture fixture;
+  std::vector<float> out_serial(static_cast<size_t>(rows), -1.0f);
+  std::vector<float> out_parallel(static_cast<size_t>(rows), -2.0f);
+
+  emel::kernel::event::op_mul_mat serial_ev{
+      .src0 = gen_detail::make_src_view(matrix),
+      .src1 = gen_detail::make_src_view(input.data(), static_cast<uint64_t>(1u),
+                                        static_cast<uint64_t>(input.size())),
+      .dst = gen_detail::make_dst_view(
+          out_serial.data(), static_cast<uint64_t>(1u),
+          static_cast<uint64_t>(out_serial.size())),
+  };
+  emel::kernel::event::op_mul_mat parallel_ev = serial_ev;
+  parallel_ev.dst =
+      gen_detail::make_dst_view(out_parallel.data(), static_cast<uint64_t>(1u),
+                                static_cast<uint64_t>(out_parallel.size()));
+
+  CHECK(gen_detail::compute_mul_mat<matmul_lane_mode::serial>(fixture.backend,
+                                                              serial_ev));
+  CHECK(gen_detail::compute_mul_mat<matmul_lane_mode::parallel>(fixture.backend,
+                                                                parallel_ev));
+  CHECK(std::memcmp(out_serial.data(), out_parallel.data(),
+                    out_serial.size() * sizeof(float)) == 0);
+}
+
+TEST_CASE("parallel matmul rejects disengaged lane pool") {
+  gen_detail::native_backend backend = {};
+  backend.kernel_kind = gen_detail::detect_host_kernel_kind();
+
+  std::array<float, 8> weights = {1.0f, 2.0f, 3.0f, 4.0f,
+                                  5.0f, 6.0f, 7.0f, 8.0f};
+  std::array<float, 4> input = {1.0f, 1.0f, 1.0f, 1.0f};
+  std::array<float, 2> output = {};
+  auto record =
+      make_tensor_record(weights.data(), emel::kernel::detail::dtype_f32, 4, 2);
+  gen_detail::tensor_matrix matrix{&record, 2, 4};
+
+  const emel::kernel::event::op_mul_mat ev{
+      .src0 = gen_detail::make_src_view(matrix),
+      .src1 = gen_detail::make_src_view(input.data(), static_cast<uint64_t>(1u),
+                                        static_cast<uint64_t>(input.size())),
+      .dst = gen_detail::make_dst_view(output.data(), static_cast<uint64_t>(1u),
+                                       static_cast<uint64_t>(output.size())),
+  };
+  CHECK_FALSE(
+      gen_detail::compute_mul_mat<matmul_lane_mode::parallel>(backend, ev));
+}
+
+} // namespace
diff --git a/tools/bench/CMakeLists.txt b/tools/bench/CMakeLists.txt
index 34baa604..0da9970b 100644
--- a/tools/bench/CMakeLists.txt
+++ b/tools/bench/CMakeLists.txt
@@ -25,7 +25,9 @@ if(EMEL_BENCH_SUITE_FILTER STREQUAL "batch_planner" OR
    EMEL_BENCH_SUITE_FILTER STREQUAL "gbnf_rule_parser" OR
    EMEL_BENCH_SUITE_FILTER STREQUAL "flash_attention" OR
    EMEL_BENCH_SUITE_FILTER STREQUAL "kernel_x86_64" OR
-   EMEL_BENCH_SUITE_FILTER STREQUAL "kernel_aarch64")
+   EMEL_BENCH_SUITE_FILTER STREQUAL "kernel_aarch64" OR
+   EMEL_BENCH_SUITE_FILTER STREQUAL "decode_wavefront" OR
+   EMEL_BENCH_SUITE_FILTER STREQUAL "parallel_matmul")
   set(EMEL_BENCH_NEEDS_LLAMA ON)
 endif()
 
@@ -84,12 +86,62 @@ if(EMEL_BENCH_NEEDS_LLAMA)
   if(BENCH_REFERENCE_REF_VALUE STREQUAL "")
     set(BENCH_REFERENCE_REF_VALUE "${REF_IMPL_REF}")
   endif()
+
+  function(emel_patch_reference_metadata_logging reference_source_dir)
+    set(loader_path "${reference_source_dir}/src/llama-model-loader.cpp")
+    if(NOT EXISTS "${loader_path}")
+      message(FATAL_ERROR "Missing reference loader source at ${loader_path}")
+    endif()
+    file(READ "${loader_path}" loader_source)
+    set(patched_marker "            std::string value          = type == GGUF_TYPE_ARRAY")
+    string(FIND "${loader_source}" "${patched_marker}" patch_pos)
+    if(patch_pos EQUAL -1)
+      set(original_line "            std::string value          = gguf_kv_to_str(meta.get(), i);\n            const size_t MAX_VALUE_LEN = 40;")
+      set(patched_line "            std::string value          = type == GGUF_TYPE_ARRAY\n                ? \"<array>\"\n                : gguf_kv_to_str(meta.get(), i);\n            const size_t MAX_VALUE_LEN = 40;")
+      string(FIND "${loader_source}" "${original_line}" original_pos)
+      if(original_pos EQUAL -1)
+        set(original_line "            std::string value          = gguf_kv_to_str(metadata, i);\n            const size_t MAX_VALUE_LEN = 40;")
+        set(patched_line "            std::string value          = type == GGUF_TYPE_ARRAY\n                ? \"<array>\"\n                : gguf_kv_to_str(metadata, i);\n            const size_t MAX_VALUE_LEN = 40;")
+        string(FIND "${loader_source}" "${original_line}" original_pos)
+        if(original_pos EQUAL -1)
+          message(FATAL_ERROR
+            "Unable to patch reference loader metadata logging in ${loader_path}")
+        endif()
+      endif()
+      string(REPLACE "${original_line}" "${patched_line}" loader_source "${loader_source}")
+      file(WRITE "${loader_path}" "${loader_source}")
+    endif()
+
+    set(llama_path "${reference_source_dir}/src/llama.cpp")
+    if(NOT EXISTS "${llama_path}")
+      message(FATAL_ERROR "Missing reference llama source at ${llama_path}")
+    endif()
+    file(READ "${llama_path}" llama_source)
+    set(original_print "        model.load_stats(ml);\n        model.print_info();")
+    set(patched_print "        model.load_stats(ml);\n        // EMEL bench disables reference metadata-only logging here; decode remains unchanged.")
+    string(FIND "${llama_source}" "${patched_print}" print_patch_pos)
+    if(print_patch_pos EQUAL -1)
+      string(FIND "${llama_source}" "${original_print}" print_original_pos)
+      if(print_original_pos EQUAL -1)
+        message(FATAL_ERROR
+          "Unable to patch reference model info logging in ${llama_path}")
+      endif()
+      string(REPLACE "${original_print}" "${patched_print}" llama_source "${llama_source}")
+      file(WRITE "${llama_path}" "${llama_source}")
+    endif()
+  endfunction()
+
+  emel_patch_reference_metadata_logging("${reference_impl_SOURCE_DIR}")
 endif()
 
 set(EMEL_ENABLE_TESTS OFF CACHE BOOL "" FORCE)
 add_subdirectory(${EMEL_ROOT} emel)
 
 find_path(NLOHMANN_JSON_INCLUDE_DIR nlohmann/json.hpp)
+if(NOT NLOHMANN_JSON_INCLUDE_DIR AND EMEL_BENCH_NEEDS_LLAMA AND
+   EXISTS "${reference_impl_SOURCE_DIR}/vendor/nlohmann/json.hpp")
+  set(NLOHMANN_JSON_INCLUDE_DIR "${reference_impl_SOURCE_DIR}/vendor")
+endif()
 
 if(EMEL_BENCH_NEEDS_LLAMA)
   set(LLAMA_ALL_WARNINGS OFF CACHE BOOL "" FORCE)
@@ -202,8 +254,13 @@ function(configure_bench_runner_common_target target_name)
       ${CMAKE_CURRENT_SOURCE_DIR}
       ${EMEL_ROOT}/src
       ${EMEL_ROOT}/include
-      ${NLOHMANN_JSON_INCLUDE_DIR}
   )
+  if(NLOHMANN_JSON_INCLUDE_DIR)
+    target_include_directories(${target_name}
+      PRIVATE
+        ${NLOHMANN_JSON_INCLUDE_DIR}
+    )
+  endif()
   if(EMEL_BENCH_NEEDS_LLAMA)
     target_include_directories(${target_name}
       PRIVATE
@@ -211,6 +268,7 @@ function(configure_bench_runner_common_target target_name)
         ${reference_impl_SOURCE_DIR}/src
         ${reference_impl_SOURCE_DIR}/ggml/include
         ${reference_impl_SOURCE_DIR}/include
+        ${reference_impl_SOURCE_DIR}/vendor
     )
   endif()
 endfunction()
@@ -276,6 +334,13 @@ add_bench_runner_suite(logits_sampler logits/sampler_bench.cpp EMEL_BENCH_ENABLE
 add_bench_runner_suite(kernel_x86_64 kernel/x86_64_bench.cpp EMEL_BENCH_ENABLE_KERNEL_X86_64)
 add_bench_runner_suite(kernel_aarch64 kernel/aarch64_bench.cpp EMEL_BENCH_ENABLE_KERNEL_AARCH64)
 add_bench_runner_suite(sm_any sm_any_bench.cpp EMEL_BENCH_ENABLE_SM_ANY)
+add_bench_runner_suite(sm_scheduler sm_scheduler_bench.cpp EMEL_BENCH_ENABLE_SM_SCHEDULER)
+add_bench_runner_suite(graph_processor graph/processor_bench.cpp
+                       EMEL_BENCH_ENABLE_GRAPH_PROCESSOR)
+add_bench_runner_suite(decode_wavefront text/generator/decode_wavefront_bench.cpp
+                       EMEL_BENCH_ENABLE_DECODE_WAVEFRONT)
+add_bench_runner_suite(parallel_matmul text/generator/parallel_matmul_bench.cpp
+                       EMEL_BENCH_ENABLE_PARALLEL_MATMUL)
 add_bench_runner_suite(tokenizer_preprocessor_bpe text/tokenizer/preprocessor/bpe_bench.cpp
                        EMEL_BENCH_ENABLE_TOKENIZER_PREPROCESSOR_BPE)
 add_bench_runner_suite(tokenizer_preprocessor_spm text/tokenizer/preprocessor/spm_bench.cpp
diff --git a/tools/bench/bench_cases.hpp b/tools/bench/bench_cases.hpp
index e5e05f01..020a3234 100644
--- a/tools/bench/bench_cases.hpp
+++ b/tools/bench/bench_cases.hpp
@@ -74,6 +74,14 @@ void append_emel_kernel_aarch64_cases(std::vector<result> & results, const confi
 void append_reference_kernel_aarch64_cases(std::vector<result> & results, const config & cfg);
 void append_emel_sm_any_cases(std::vector<result> & results, const config & cfg);
 void append_reference_sm_any_cases(std::vector<result> & results, const config & cfg);
+void append_emel_sm_scheduler_cases(std::vector<result> & results, const config & cfg);
+void append_reference_sm_scheduler_cases(std::vector<result> & results, const config & cfg);
+void append_emel_graph_processor_cases(std::vector<result> & results, const config & cfg);
+void append_reference_graph_processor_cases(std::vector<result> & results, const config & cfg);
+void append_emel_decode_wavefront_cases(std::vector<result> & results, const config & cfg);
+void append_reference_decode_wavefront_cases(std::vector<result> & results, const config & cfg);
+void append_emel_parallel_matmul_cases(std::vector<result> & results, const config & cfg);
+void append_reference_parallel_matmul_cases(std::vector<result> & results, const config & cfg);
 void append_emel_tokenizer_preprocessor_bpe_cases(std::vector<result> & results,
                                                   const config & cfg);
 void append_reference_tokenizer_preprocessor_bpe_cases(std::vector<result> & results,
diff --git a/tools/bench/bench_dependency_manifest.cpp b/tools/bench/bench_dependency_manifest.cpp
index 1e4af779..011dd0a8 100644
--- a/tools/bench/bench_dependency_manifest.cpp
+++ b/tools/bench/bench_dependency_manifest.cpp
@@ -196,6 +196,31 @@ constexpr std::array k_records{
     dependency_record{"kernel_aarch64", kind::source, "src/emel/kernel", "kernel_inputs"},
 
     dependency_record{"sm_any", kind::source, "tools/bench/sm_any_bench.cpp", "suite_source"},
+    dependency_record{"sm_scheduler",
+                      kind::source,
+                      "tools/bench/sm_scheduler_bench.cpp",
+                      "suite_source"},
+    dependency_record{"sm_scheduler", kind::source, "src/emel/sm.hpp", "scheduler_policy"},
+    dependency_record{"graph_processor",
+                      kind::source,
+                      "tools/bench/graph/processor_bench.cpp",
+                      "suite_source"},
+    dependency_record{"decode_wavefront",
+                      kind::source,
+                      "tools/bench/text/generator/decode_wavefront_bench.cpp",
+                      "suite_source"},
+    dependency_record{"decode_wavefront",
+                      kind::source,
+                      "src/emel/text/generator/decode_wavefront",
+                      "wavefront_actor"},
+    dependency_record{"parallel_matmul",
+                      kind::source,
+                      "tools/bench/text/generator/parallel_matmul_bench.cpp",
+                      "suite_source"},
+    dependency_record{"parallel_matmul",
+                      kind::source,
+                      "src/emel/text/generator/detail.hpp",
+                      "parallel_matmul_lanes"},
 
     dependency_record{"tokenizer_preprocessor_bpe",
                       kind::source,
diff --git a/tools/bench/bench_disabled_cases.cpp b/tools/bench/bench_disabled_cases.cpp
index c89809f3..9bd6af42 100644
--- a/tools/bench/bench_disabled_cases.cpp
+++ b/tools/bench/bench_disabled_cases.cpp
@@ -190,6 +190,26 @@ void append_emel_sm_any_cases(std::vector<result> &, const config &) {}
 void append_reference_sm_any_cases(std::vector<result> &, const config &) {}
 #endif
 
+#ifndef EMEL_BENCH_ENABLE_SM_SCHEDULER
+void append_emel_sm_scheduler_cases(std::vector<result> &, const config &) {}
+void append_reference_sm_scheduler_cases(std::vector<result> &, const config &) {}
+#endif
+
+#ifndef EMEL_BENCH_ENABLE_GRAPH_PROCESSOR
+void append_emel_graph_processor_cases(std::vector<result> &, const config &) {}
+void append_reference_graph_processor_cases(std::vector<result> &, const config &) {}
+#endif
+
+#ifndef EMEL_BENCH_ENABLE_DECODE_WAVEFRONT
+void append_emel_decode_wavefront_cases(std::vector<result> &, const config &) {}
+void append_reference_decode_wavefront_cases(std::vector<result> &, const config &) {}
+#endif
+
+#ifndef EMEL_BENCH_ENABLE_PARALLEL_MATMUL
+void append_emel_parallel_matmul_cases(std::vector<result> &, const config &) {}
+void append_reference_parallel_matmul_cases(std::vector<result> &, const config &) {}
+#endif
+
 #ifndef EMEL_BENCH_ENABLE_TOKENIZER_PREPROCESSOR_BPE
 void append_emel_tokenizer_preprocessor_bpe_cases(std::vector<result> &, const config &) {}
 void append_reference_tokenizer_preprocessor_bpe_cases(std::vector<result> &, const config &) {}
diff --git a/tools/bench/bench_runner.cpp b/tools/bench/bench_runner.cpp
index a08a06b0..b67a66cc 100644
--- a/tools/bench/bench_runner.cpp
+++ b/tools/bench/bench_runner.cpp
@@ -191,6 +191,22 @@ bool is_diarization_sortformer_case_name(const std::string & name) {
   return name.rfind("diarization/sortformer/", 0u) == 0u;
 }
 
+bool is_graph_processor_case_name(const std::string & name) {
+  return name.rfind("graph/processor_", 0u) == 0u;
+}
+
+bool is_decode_wavefront_case_name(const std::string & name) {
+  return name.rfind("decode_wavefront/", 0u) == 0u;
+}
+
+bool is_parallel_matmul_case_name(const std::string & name) {
+  return name.rfind("parallel_matmul/", 0u) == 0u;
+}
+
+bool is_sm_scheduler_case_name(const std::string & name) {
+  return name.rfind("sm_scheduler/", 0u) == 0u;
+}
+
 bool case_supported_on_host(const bench::test_case & tc) {
   if (tc.append_emel == bench::append_emel_kernel_x86_64_cases ||
       tc.append_reference == bench::append_reference_kernel_x86_64_cases) {
@@ -331,6 +347,13 @@ std::vector<bench::result> run_benchmarks(const bench::config & cfg,
                  selected_suite.data());
     std::exit(1);
   }
+  if (filter_by_suite && results.empty()) {
+    std::fprintf(stderr,
+                 "error: no benchmark entries matched selected suite '%.*s'\n",
+                 static_cast<int>(selected_suite.size()),
+                 selected_suite.data());
+    std::exit(1);
+  }
   return results;
 }
 
@@ -528,7 +551,16 @@ void print_compare(const std::vector<bench::result> & emel_results,
   }
   const bool generation_present = generation_emel != emel_sorted.end() ||
       generation_ref != ref_sorted.end();
-  if ((any_generation_emel || any_generation_ref) && !generation_present) {
+  // A workload-scoped diagnostic run (EMEL_GENERATION_WORKLOAD_ID naming a
+  // non-maintained workload) legitimately omits the maintained publication
+  // case; unfiltered and maintained-workload runs must still carry it.
+  const char * workload_selector = std::getenv("EMEL_GENERATION_WORKLOAD_ID");
+  const bool maintained_case_expected = workload_selector == nullptr ||
+      workload_selector[0] == '\0' ||
+      std::string_view{workload_selector} == "lfm2_single_user_hello_max_tokens_1_v1" ||
+      std::string_view{workload_selector} == bench::k_generation_case_name;
+  if ((any_generation_emel || any_generation_ref) && !generation_present &&
+      maintained_case_expected) {
     std::fprintf(stderr,
                  "error: missing current maintained generation case %.*s\n",
                  static_cast<int>(bench::k_generation_case_name.size()),
@@ -652,36 +684,40 @@ void print_compare(const std::vector<bench::result> & emel_results,
                    shared_flash_dispatch_calls);
       std::exit(1);
     }
-    if (k_host_is_aarch64 && flash_dispatch_calls == 0) {
+    const bool host_uses_optimized_flash = k_host_is_aarch64 || k_host_is_x86_64;
+    if (host_uses_optimized_flash && flash_dispatch_calls == 0) {
       std::fprintf(stderr,
-                   "error: missing ARM flash attribution flash_dispatch_calls=%" PRIu64 "\n",
+                   "error: missing optimized flash attribution flash_dispatch_calls=%" PRIu64 "\n",
                    flash_dispatch_calls);
       std::exit(1);
     }
-    if (flash_dispatch_calls != 0 && k_host_is_aarch64 &&
+    if (flash_dispatch_calls != 0 && host_uses_optimized_flash &&
         (optimized_flash_dispatch_calls == 0 || shared_flash_dispatch_calls != 0)) {
       std::fprintf(stderr,
-                   "error: invalid ARM flash attribution optimized_flash_dispatch_calls=%" PRIu64
+                   "error: invalid optimized flash attribution "
+                   "optimized_flash_dispatch_calls=%" PRIu64
                    " shared_flash_dispatch_calls=%" PRIu64 "\n",
                    optimized_flash_dispatch_calls,
                    shared_flash_dispatch_calls);
       std::exit(1);
     }
-    if (flash_dispatch_calls != 0 && !k_host_is_aarch64 &&
+    if (flash_dispatch_calls != 0 && !host_uses_optimized_flash &&
         (optimized_flash_dispatch_calls != 0 || shared_flash_dispatch_calls != 0)) {
       std::fprintf(stderr,
-                   "error: invalid non-ARM flash attribution optimized_flash_dispatch_calls=%" PRIu64
+                   "error: invalid non-optimized flash attribution "
+                   "optimized_flash_dispatch_calls=%" PRIu64
                    " shared_flash_dispatch_calls=%" PRIu64 "\n",
                    optimized_flash_dispatch_calls,
                    shared_flash_dispatch_calls);
       std::exit(1);
     }
-    const bool invalid_lfm2_quantized_evidence =
+    const bool invalid_lfm2_quantized_common =
         native_q8_0_dispatch_calls != 0 || packed_q8_0_dispatch_calls != 0 ||
         optimized_q2_dispatch_calls != 0 || shared_q2_dispatch_calls != 0 ||
         optimized_q3_dispatch_calls != 0 || shared_q3_dispatch_calls != 0 ||
         optimized_q4_dispatch_calls == 0 || shared_q4_dispatch_calls != 0 ||
         optimized_q6_dispatch_calls == 0 || shared_q6_dispatch_calls != 0;
+    const bool invalid_lfm2_quantized_evidence = invalid_lfm2_quantized_common;
     const bool invalid_default_quantized_evidence =
         (native_q8_0_dispatch_calls + packed_q8_0_dispatch_calls) == 0 ||
         optimized_q2_dispatch_calls != 0 || shared_q2_dispatch_calls != 0 ||
@@ -892,6 +928,49 @@ void print_compare(const std::vector<bench::result> & emel_results,
                   emel_entry.comparable ? "baseline_matched" : "measurement_only");
       continue;
     }
+    if (is_graph_processor_case_name(emel_entry.name)) {
+      std::printf("%s emel.cpp %.3f ns/op, reference-baseline %.3f ns/op, "
+                  "ratio=%.3fx\n",
+                  emel_entry.name.c_str(),
+                  emel_entry.ns_per_op,
+                  ref_entry.ns_per_op,
+                  emel_entry.ns_per_op / ref_entry.ns_per_op);
+      continue;
+    }
+    if (is_parallel_matmul_case_name(emel_entry.name)) {
+      const char * baseline_label =
+          emel_entry.name.find("ggml") != std::string::npos ? "llama.cpp"
+                                                            : "reference-baseline";
+      std::printf("%s emel.cpp %.3f ns/op, %s %.3f ns/op, ratio=%.3fx\n",
+                  emel_entry.name.c_str(),
+                  emel_entry.ns_per_op,
+                  baseline_label,
+                  ref_entry.ns_per_op,
+                  emel_entry.ns_per_op / ref_entry.ns_per_op);
+      continue;
+    }
+    if (is_decode_wavefront_case_name(emel_entry.name)) {
+      const char * baseline_label =
+          emel_entry.name.find("ggml") != std::string::npos
+              ? "llama.cpp"
+              : "reserved-scalar-baseline";
+      std::printf("%s emel.cpp %.3f ns/op, %s %.3f ns/op, ratio=%.3fx\n",
+                  emel_entry.name.c_str(),
+                  emel_entry.ns_per_op,
+                  baseline_label,
+                  ref_entry.ns_per_op,
+                  emel_entry.ns_per_op / ref_entry.ns_per_op);
+      continue;
+    }
+    if (is_sm_scheduler_case_name(emel_entry.name)) {
+      std::printf("%s thread_pool %.3f ns/op, inline_co_sm %.3f ns/op, "
+                  "ratio=%.3fx\n",
+                  emel_entry.name.c_str(),
+                  emel_entry.ns_per_op,
+                  ref_entry.ns_per_op,
+                  emel_entry.ns_per_op / ref_entry.ns_per_op);
+      continue;
+    }
     const double ratio = emel_entry.ns_per_op / ref_entry.ns_per_op;
     std::printf("%s emel.cpp %.3f ns/op, llama.cpp %.3f ns/op, ratio=%.3fx\n",
                 emel_entry.name.c_str(),
diff --git a/tools/bench/bench_runner_registry.cpp b/tools/bench/bench_runner_registry.cpp
index 97237cd0..710d5ae8 100644
--- a/tools/bench/bench_runner_registry.cpp
+++ b/tools/bench/bench_runner_registry.cpp
@@ -18,8 +18,8 @@ constexpr test_case make_test_case(const append_case_fn emel_fn,
   };
 }
 
-const std::array<test_case, 29> & all_runner_cases() {
-  static const std::array<test_case, 29> cases = {{
+const std::array<test_case, 33> & all_runner_cases() {
+  static const std::array<test_case, 33> cases = {{
     make_test_case(append_emel_batch_planner_cases,
                    append_reference_batch_planner_cases,
                    "batch_planner"),
@@ -59,6 +59,18 @@ const std::array<test_case, 29> & all_runner_cases() {
                    append_reference_kernel_aarch64_cases,
                    "kernel_aarch64"),
     make_test_case(append_emel_sm_any_cases, append_reference_sm_any_cases, "sm_any"),
+    make_test_case(append_emel_sm_scheduler_cases,
+                   append_reference_sm_scheduler_cases,
+                   "sm_scheduler"),
+    make_test_case(append_emel_graph_processor_cases,
+                   append_reference_graph_processor_cases,
+                   "graph_processor"),
+    make_test_case(append_emel_decode_wavefront_cases,
+                   append_reference_decode_wavefront_cases,
+                   "decode_wavefront"),
+    make_test_case(append_emel_parallel_matmul_cases,
+                   append_reference_parallel_matmul_cases,
+                   "parallel_matmul"),
     make_test_case(append_emel_tokenizer_preprocessor_bpe_cases,
                    append_reference_tokenizer_preprocessor_bpe_cases,
                    "tokenizer_preprocessor_bpe"),
diff --git a/tools/bench/bench_runner_tests.cpp b/tools/bench/bench_runner_tests.cpp
index e27d309d..0353833f 100644
--- a/tools/bench/bench_runner_tests.cpp
+++ b/tools/bench/bench_runner_tests.cpp
@@ -315,6 +315,71 @@ process_capture run_generation_bench_capture(const std::string & mode,
   return capture;
 }
 
+process_capture run_suite_bench_capture(const std::string & suite,
+                                        const std::string & mode,
+                                        const std::string & tag,
+                                        const bool enable_internal = false) {
+  const std::filesystem::path tmp_dir =
+      std::filesystem::temp_directory_path() / "emel-bench-runner-tests" / tag;
+  std::filesystem::create_directories(tmp_dir);
+  const std::filesystem::path stdout_path = tmp_dir / "stdout.txt";
+  const std::filesystem::path stderr_path = tmp_dir / "stderr.txt";
+
+  std::string command;
+#if defined(_WIN32)
+  command = "set EMEL_BENCH_SUITE=" + suite + " && ";
+  command += "set EMEL_BENCH_ITERS=1 && ";
+  command += "set EMEL_BENCH_RUNS=1 && ";
+  command += "set EMEL_BENCH_WARMUP_ITERS=0 && ";
+  command += "set EMEL_BENCH_WARMUP_RUNS=0 && ";
+  if (enable_internal) {
+    command += "set EMEL_BENCH_INTERNAL=1 && ";
+  }
+  command += quote_arg_windows(bench_runner_binary_path().string());
+  command += " --mode=" + mode + " > ";
+  command += quote_arg_windows(stdout_path.string());
+  command += " 2> ";
+  command += quote_arg_windows(stderr_path.string());
+#else
+  command = "ulimit -s 8192; ";
+  command += "EMEL_BENCH_SUITE=" + quote_arg_posix(suite) + " ";
+  command += "EMEL_BENCH_ITERS=1 ";
+  command += "EMEL_BENCH_RUNS=1 ";
+  command += "EMEL_BENCH_WARMUP_ITERS=0 ";
+  command += "EMEL_BENCH_WARMUP_RUNS=0 ";
+  if (enable_internal) {
+    command += "EMEL_BENCH_INTERNAL=1 ";
+  }
+  command += quote_arg_posix(bench_runner_binary_path().string());
+  command += " --mode=" + mode + " > ";
+  command += quote_arg_posix(stdout_path.string());
+  command += " 2> ";
+  command += quote_arg_posix(stderr_path.string());
+#endif
+
+  const int status = std::system(command.c_str());
+  process_capture capture{};
+  capture.stdout_text = read_file(stdout_path);
+  capture.stderr_text = read_file(stderr_path);
+
+  std::error_code ec;
+  std::filesystem::remove(stdout_path, ec);
+  std::filesystem::remove(stderr_path, ec);
+
+  if (status == -1) {
+    return capture;
+  }
+#if defined(_WIN32)
+  capture.exit_code = status;
+#else
+  if (!WIFEXITED(status)) {
+    return capture;
+  }
+  capture.exit_code = WEXITSTATUS(status);
+#endif
+  return capture;
+}
+
 process_capture run_diarization_bench_capture(const std::string & mode,
                                               const bool emit_jsonl = false) {
   const std::filesystem::path tmp_dir =
@@ -806,18 +871,23 @@ TEST_CASE("benchmark runner registration is localized outside the orchestrator")
   CHECK(emel::bench::registered_runner_count() >= 29u);
   CHECK(emel::bench::find_registered_runner("generation") != nullptr);
   CHECK(emel::bench::find_registered_runner("diarization_sortformer") != nullptr);
+  CHECK(emel::bench::find_registered_runner("sm_scheduler") != nullptr);
   CHECK(emel::bench::find_registered_runner("tokenizer") != nullptr);
   CHECK(emel::bench::find_registered_runner("missing_suite") == nullptr);
 
   bool saw_generation = false;
+  bool saw_sm_scheduler = false;
   bool saw_tokenizer = false;
   for (std::size_t i = 0; i < emel::bench::registered_runner_count(); ++i) {
     saw_generation = saw_generation ||
       emel::bench::registered_runner_suite_at(i) == std::string_view{"generation"};
+    saw_sm_scheduler = saw_sm_scheduler ||
+      emel::bench::registered_runner_suite_at(i) == std::string_view{"sm_scheduler"};
     saw_tokenizer = saw_tokenizer ||
       emel::bench::registered_runner_suite_at(i) == std::string_view{"tokenizer"};
   }
   CHECK(saw_generation);
+  CHECK(saw_sm_scheduler);
   CHECK(saw_tokenizer);
 }
 
@@ -848,6 +918,7 @@ TEST_CASE("bench runner suites build through independent object targets") {
   CHECK(cmake_source.find("add_bench_runner_suite(generation generation_bench.cpp") !=
         std::string::npos);
   CHECK(cmake_source.find("add_bench_runner_suite(diarization_sortformer") != std::string::npos);
+  CHECK(cmake_source.find("add_bench_runner_suite(sm_scheduler") != std::string::npos);
   CHECK(cmake_source.find("EMEL_BENCH_SUITE_FILTER STREQUAL \"memory_kv\"") !=
         std::string::npos);
   CHECK(cmake_source.find("EMEL_BENCH_SUITE_FILTER STREQUAL \"memory_recurrent\"") !=
@@ -857,6 +928,27 @@ TEST_CASE("bench runner suites build through independent object targets") {
   CHECK(cmake_source.find("BENCH_RUNNER_SUITE_TARGETS") != std::string::npos);
 }
 
+TEST_CASE("bench runner emits internal sm scheduler cases") {
+  const process_capture capture =
+      run_suite_bench_capture("sm_scheduler", "compare", "sm-scheduler-compare", true);
+
+  CHECK(capture.exit_code == 0);
+  CHECK(capture.stderr_text.find("error:") == std::string::npos);
+  CHECK(capture.stdout_text.find("sm_scheduler/idle_async") != std::string::npos);
+  CHECK(capture.stdout_text.find("sm_scheduler/busy_worker_async") != std::string::npos);
+  CHECK(capture.stdout_text.find("thread_pool") != std::string::npos);
+  CHECK(capture.stdout_text.find("inline_co_sm") != std::string::npos);
+}
+
+TEST_CASE("bench runner rejects internal sm scheduler suite without explicit enable") {
+  const process_capture capture =
+      run_suite_bench_capture("sm_scheduler", "compare", "sm-scheduler-disabled");
+
+  CHECK(capture.exit_code != 0);
+  CHECK(capture.stderr_text.find("no benchmark entries matched selected suite 'sm_scheduler'") !=
+        std::string::npos);
+}
+
 TEST_CASE("benchmark dependency manifest covers registered runners conservatively") {
   namespace manifest = emel::bench::dependency_manifest;
 
diff --git a/tools/bench/dependency_manifest.txt b/tools/bench/dependency_manifest.txt
index ae43ed23..b7d56579 100644
--- a/tools/bench/dependency_manifest.txt
+++ b/tools/bench/dependency_manifest.txt
@@ -56,6 +56,13 @@ record runner=kernel_aarch64 kind=source path=tools/bench/kernel/aarch64_bench.c
 record runner=kernel_aarch64 kind=source path=tools/bench/kernel/bench_common.hpp reason=common
 record runner=kernel_aarch64 kind=source path=src/emel/kernel reason=kernel_inputs
 record runner=sm_any kind=source path=tools/bench/sm_any_bench.cpp reason=suite_source
+record runner=sm_scheduler kind=source path=tools/bench/sm_scheduler_bench.cpp reason=suite_source
+record runner=sm_scheduler kind=source path=src/emel/sm.hpp reason=scheduler_policy
+record runner=graph_processor kind=source path=tools/bench/graph/processor_bench.cpp reason=suite_source
+record runner=decode_wavefront kind=source path=tools/bench/text/generator/decode_wavefront_bench.cpp reason=suite_source
+record runner=decode_wavefront kind=source path=src/emel/text/generator/decode_wavefront reason=wavefront_actor
+record runner=parallel_matmul kind=source path=tools/bench/text/generator/parallel_matmul_bench.cpp reason=suite_source
+record runner=parallel_matmul kind=source path=src/emel/text/generator/detail.hpp reason=parallel_matmul_lanes
 record runner=tokenizer_preprocessor_bpe kind=source path=tools/bench/text/tokenizer/preprocessor/bpe_bench.cpp reason=suite_source
 record runner=tokenizer_preprocessor_spm kind=source path=tools/bench/text/tokenizer/preprocessor/spm_bench.cpp reason=suite_source
 record runner=tokenizer_preprocessor_ugm kind=source path=tools/bench/text/tokenizer/preprocessor/ugm_bench.cpp reason=suite_source
diff --git a/tools/bench/diarization/sortformer_bench.cpp b/tools/bench/diarization/sortformer_bench.cpp
index 7fb1dfb6..d7475326 100644
--- a/tools/bench/diarization/sortformer_bench.cpp
+++ b/tools/bench/diarization/sortformer_bench.cpp
@@ -16,6 +16,7 @@
 #include "emel/diarization/sortformer/encoder/feature_extractor/detail.hpp"
 #include "emel/diarization/sortformer/executor/sm.hpp"
 #include "emel/diarization/sortformer/output/detail.hpp"
+#include "emel/kernel/sm.hpp"
 
 namespace emel::bench {
 
@@ -373,9 +374,10 @@ make_stage_profile_result(const fixture::model_fixture &model,
     fail_sortformer_setup("stage_profile_executor");
   }
 
+  static emel::kernel::sm probabilities_kernel{emel::kernel::detect_host_kind()};
   const double probabilities_ns = measure_once_ns([&]() {
     g_stage_ok_sink = output_detail::compute_speaker_probabilities(
-        hidden_frames, modules_contract, probabilities);
+        probabilities_kernel, hidden_frames, modules_contract, probabilities);
   });
   if (!g_stage_ok_sink) {
     fail_sortformer_setup("stage_profile_probabilities");
diff --git a/tools/bench/generation_bench.cpp b/tools/bench/generation_bench.cpp
index 523de1ef..c1ffb45f 100644
--- a/tools/bench/generation_bench.cpp
+++ b/tools/bench/generation_bench.cpp
@@ -92,16 +92,23 @@ constexpr generation_fixture_spec k_gemma4_generation_fixture = {
     .fixture = &k_gemma4_emel_generation_fixture,
 };
 
-constexpr std::array<generation_fixture_spec, 2> k_compare_generation_fixtures =
+constexpr generation_fixture_spec k_lfm2_230m_generation_fixture = {
+    .fixture = &emel::tools::generation_fixture_registry::
+        k_lfm2_230m_generation_fixture,
+};
+
+constexpr std::array<generation_fixture_spec, 3> k_compare_generation_fixtures =
     {
         k_qwen3_generation_fixture,
         k_lfm2_generation_fixture,
+        k_lfm2_230m_generation_fixture,
 };
 
-constexpr std::array<generation_fixture_spec, 3> k_emel_generation_fixtures = {
+constexpr std::array<generation_fixture_spec, 4> k_emel_generation_fixtures = {
     k_qwen3_generation_fixture,
     k_lfm2_generation_fixture,
     k_gemma4_generation_fixture,
+    k_lfm2_230m_generation_fixture,
 };
 
 using llama_model_ptr =
@@ -1625,8 +1632,15 @@ make_reference_context(llama_model *model,
   params.n_batch = 512;
   params.n_ubatch = 512;
   params.n_seq_max = 1;
-  params.n_threads = 1;
-  params.n_threads_batch = 1;
+  // Matched-core comparisons set EMEL_BENCH_REFERENCE_THREADS to EMEL's lane
+  // count; the maintained publication rows stay at the 1-thread default.
+  int32_t reference_threads = 1;
+  if (const char * threads_env = std::getenv("EMEL_BENCH_REFERENCE_THREADS");
+      threads_env != nullptr && threads_env[0] != '\0') {
+    reference_threads = std::max<int32_t>(1, std::atoi(threads_env));
+  }
+  params.n_threads = reference_threads;
+  params.n_threads_batch = reference_threads;
   params.embeddings = false;
   params.cb_eval = eval_callback;
   params.cb_eval_user_data = eval_user_data;
diff --git a/tools/bench/generation_variants/lfm2_230m/single_user_hello/parity/max_tokens_1.json b/tools/bench/generation_variants/lfm2_230m/single_user_hello/parity/max_tokens_1.json
new file mode 100644
index 00000000..8f1e6b9b
--- /dev/null
+++ b/tools/bench/generation_variants/lfm2_230m/single_user_hello/parity/max_tokens_1.json
@@ -0,0 +1,21 @@
+{
+  "schema": "generation_workload/v1",
+  "id": "lfm2_230m_single_user_hello_max_tokens_1_v1",
+  "case_name": "generation/preloaded_request/lfm2_5_230m_q8_0_prompt_hello_max_tokens_1",
+  "compare_group": "generation/preloaded_request/lfm2_5_230m_q8_0_prompt_hello_max_tokens_1",
+  "fixture_name": "LFM2.5-230M-Q8_0.gguf",
+  "fixture_rel": "tests/models/LFM2.5-230M-Q8_0.gguf",
+  "fixture_slug": "lfm2_5_230m_q8_0",
+  "prompt_fixture_id": "single_user_hello_v1",
+  "prompt_fixture_path": "tools/bench/generation_prompts/single_user_hello.json",
+  "formatter_mode": "chat_template_supported_v1",
+  "formatter_contract": "source=tokenizer.chat_template support=supported_contract shape=structured_chat_messages_v1 roles=system,user tools=none add_generation_prompt=true enable_thinking=false preserve_thinking=false bos=<|startoftext|>",
+  "sampling_id": "argmax_v1",
+  "stop_id": "model_stop_or_max_tokens_v1",
+  "seed": 0,
+  "max_output_tokens": 1,
+  "comparable": true,
+  "comparison_mode": "parity",
+  "comparability_note": "matched_emel_and_reference_workload",
+  "current_publication": false
+}
diff --git a/tools/bench/generation_variants/lfm2_230m/single_user_hello/parity/max_tokens_100.json b/tools/bench/generation_variants/lfm2_230m/single_user_hello/parity/max_tokens_100.json
new file mode 100644
index 00000000..0f106c5e
--- /dev/null
+++ b/tools/bench/generation_variants/lfm2_230m/single_user_hello/parity/max_tokens_100.json
@@ -0,0 +1,21 @@
+{
+  "schema": "generation_workload/v1",
+  "id": "lfm2_230m_single_user_hello_max_tokens_100_v1",
+  "case_name": "generation/preloaded_request/lfm2_5_230m_q8_0_prompt_hello_max_tokens_100",
+  "compare_group": "generation/preloaded_request/lfm2_5_230m_q8_0_prompt_hello_max_tokens_100",
+  "fixture_name": "LFM2.5-230M-Q8_0.gguf",
+  "fixture_rel": "tests/models/LFM2.5-230M-Q8_0.gguf",
+  "fixture_slug": "lfm2_5_230m_q8_0",
+  "prompt_fixture_id": "single_user_hello_v1",
+  "prompt_fixture_path": "tools/bench/generation_prompts/single_user_hello.json",
+  "formatter_mode": "chat_template_supported_v1",
+  "formatter_contract": "source=tokenizer.chat_template support=supported_contract shape=structured_chat_messages_v1 roles=system,user tools=none add_generation_prompt=true enable_thinking=false preserve_thinking=false bos=<|startoftext|>",
+  "sampling_id": "argmax_v1",
+  "stop_id": "model_stop_or_max_tokens_v1",
+  "seed": 0,
+  "max_output_tokens": 100,
+  "comparable": true,
+  "comparison_mode": "parity",
+  "comparability_note": "matched_emel_and_reference_workload",
+  "current_publication": false
+}
diff --git a/tools/bench/graph/processor_bench.cpp b/tools/bench/graph/processor_bench.cpp
new file mode 100644
index 00000000..b4de215f
--- /dev/null
+++ b/tools/bench/graph/processor_bench.cpp
@@ -0,0 +1,331 @@
+#include "bench_cases.hpp"
+
+#include <array>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+#include "emel/error/error.hpp"
+#include "emel/graph/processor/context.hpp"
+#include "emel/graph/processor/errors.hpp"
+#include "emel/graph/processor/events.hpp"
+#include "emel/graph/processor/sm.hpp"
+#include "emel/graph/tensor/errors.hpp"
+#include "emel/graph/tensor/events.hpp"
+#include "emel/graph/tensor/sm.hpp"
+#include "emel/sm.hpp"
+
+namespace {
+
+using execute_t = emel::graph::processor::event::execute;
+namespace processor = emel::graph::processor;
+namespace processor_action = emel::graph::processor::action;
+using processor_context = processor_action::context;
+using processor_error = emel::graph::processor::error;
+
+struct lifecycle_fixture {
+  int32_t leaf_tensor = 11;
+  int32_t compute_tensor = 29;
+  emel::graph::tensor::sm tensor_machine{};
+  std::array<emel::graph::processor::event::lifecycle_tensor_binding, 2> tensors{{
+      {
+          .tensor_id = 0,
+          .buffer = &leaf_tensor,
+          .buffer_bytes = sizeof(leaf_tensor),
+          .consumer_refs = 0,
+          .is_leaf = true,
+      },
+      {
+          .tensor_id = 1,
+          .buffer = &compute_tensor,
+          .buffer_bytes = sizeof(compute_tensor),
+          .consumer_refs = 1,
+          .is_leaf = false,
+      },
+  }};
+  std::array<int32_t, 1> required_ids = {0};
+  std::array<int32_t, 1> publish_ids = {1};
+  std::array<int32_t, 1> release_ids = {1};
+  emel::graph::processor::event::lifecycle_phase phase{
+      .required_filled_ids = required_ids.data(),
+      .required_filled_count = static_cast<int32_t>(required_ids.size()),
+      .publish_ids = publish_ids.data(),
+      .publish_count = static_cast<int32_t>(publish_ids.size()),
+      .release_ids = release_ids.data(),
+      .release_count = static_cast<int32_t>(release_ids.size()),
+  };
+  emel::graph::processor::event::lifecycle_manifest manifest{
+      .tensors = tensors.data(),
+      .tensor_count = static_cast<int32_t>(tensors.size()),
+      .phase = &phase,
+  };
+};
+
+struct dispatch_state {
+  bool done_called = false;
+  bool error_called = false;
+  int32_t output_count = 0;
+  int32_t error_code = 0;
+
+  void reset() noexcept {
+    done_called = false;
+    error_called = false;
+    output_count = 0;
+    error_code = 0;
+  }
+
+  static bool on_done(
+      void * owner,
+      const emel::graph::processor::events::execution_done & ev) noexcept {
+    auto * self = static_cast<dispatch_state *>(owner);
+    self->done_called = true;
+    self->output_count = ev.output.outputs_produced;
+    return true;
+  }
+
+  static bool on_error(
+      void * owner,
+      const emel::graph::processor::events::execution_error & ev) noexcept {
+    auto * self = static_cast<dispatch_state *>(owner);
+    self->error_called = true;
+    self->error_code = ev.err;
+    return true;
+  }
+};
+
+struct baseline_processor_sm : public emel::sm<processor::model, processor_context> {
+  using base_type = emel::sm<processor::model, processor_context>;
+  using base_type::base_type;
+
+  bool process_event(const emel::graph::processor::event::execute & ev) {
+    emel::graph::processor::event::execute_ctx ctx{};
+    emel::graph::processor::event::execute_step evt{ev, ctx};
+    const bool accepted = base_type::process_event(evt);
+    return accepted && ctx.err == emel::error::cast(emel::graph::processor::error::none);
+  }
+};
+
+template <class machine_type>
+struct bench_fixture {
+  lifecycle_fixture lifecycle{};
+  machine_type machine{};
+  dispatch_state dispatch{};
+  emel::graph::processor::event::execution_output output{};
+  execute_t request = {};
+  volatile int32_t sink = 0;
+};
+
+[[noreturn]] void bench_abort(const char * message) {
+  std::fprintf(stderr, "error: graph processor benchmark setup failed: %s\n", message);
+  std::abort();
+}
+
+void reserve_lifecycle(lifecycle_fixture & lifecycle) {
+  int32_t err =
+      static_cast<int32_t>(emel::error::cast(emel::graph::tensor::error::none));
+  const bool leaf_ok = lifecycle.tensor_machine.process_event(
+      emel::graph::tensor::event::reserve_tensor{
+          .tensor_id = lifecycle.tensors[0].tensor_id,
+          .buffer = lifecycle.tensors[0].buffer,
+          .buffer_bytes = lifecycle.tensors[0].buffer_bytes,
+          .consumer_refs = lifecycle.tensors[0].consumer_refs,
+          .is_leaf = lifecycle.tensors[0].is_leaf,
+          .error_out = &err,
+      });
+  const bool compute_ok = lifecycle.tensor_machine.process_event(
+      emel::graph::tensor::event::reserve_tensor{
+          .tensor_id = lifecycle.tensors[1].tensor_id,
+          .buffer = lifecycle.tensors[1].buffer,
+          .buffer_bytes = lifecycle.tensors[1].buffer_bytes,
+          .consumer_refs = lifecycle.tensors[1].consumer_refs,
+          .is_leaf = lifecycle.tensors[1].is_leaf,
+          .error_out = &err,
+      });
+  if (!leaf_ok || !compute_ok) {
+    bench_abort("tensor reservation failed");
+  }
+}
+
+bool validate_ok(const execute_t &, int32_t * err_out) {
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool prepare_reused(const execute_t &, bool * reused_out, int32_t * err_out) {
+  if (reused_out != nullptr) {
+    *reused_out = true;
+  }
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool prepare_needs_alloc(const execute_t &, bool * reused_out, int32_t * err_out) {
+  if (reused_out != nullptr) {
+    *reused_out = false;
+  }
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool alloc_ok(const execute_t &, int32_t * err_out) {
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool bind_ok(const execute_t &, int32_t * err_out) {
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool kernel_ok(const execute_t &, int32_t * err_out) {
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool extract_ok(const execute_t &, int32_t * outputs_out, int32_t * err_out) {
+  if (outputs_out != nullptr) {
+    *outputs_out = 2;
+  }
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+execute_t make_execute(emel::graph::processor::event::execution_output & output,
+                       dispatch_state & state,
+                       lifecycle_fixture & lifecycle,
+                       emel::graph::processor::event::prepare_graph_fn prepare_fn) {
+  return execute_t{
+      .step_plan = reinterpret_cast<const void *>(0xCC11),
+      .output_out = &output,
+      .lifecycle = &lifecycle.manifest,
+      .tensor_machine = &lifecycle.tensor_machine,
+      .step_index = 0,
+      .step_size = 1,
+      .kv_tokens = 1,
+      .expected_outputs = 1,
+      .positions_count = 0,
+      .seq_mask_words = 1,
+      .seq_masks_count = 0,
+      .seq_primary_ids_count = 0,
+      .validate = validate_ok,
+      .prepare_graph = prepare_fn,
+      .alloc_graph = alloc_ok,
+      .bind_inputs = bind_ok,
+      .run_kernel = kernel_ok,
+      .extract_outputs = extract_ok,
+      .dispatch_done = {&state, dispatch_state::on_done},
+      .dispatch_error = {&state, dispatch_state::on_error},
+  };
+}
+
+template <class machine_type>
+bench_fixture<machine_type> make_happy_fixture(
+    emel::graph::processor::event::prepare_graph_fn prepare_fn) {
+  bench_fixture<machine_type> fixture{};
+  reserve_lifecycle(fixture.lifecycle);
+  fixture.request =
+      make_execute(fixture.output, fixture.dispatch, fixture.lifecycle, prepare_fn);
+  if (!fixture.machine.process_event(fixture.request) ||
+      !fixture.dispatch.done_called ||
+      fixture.output.outputs_produced != 2) {
+    bench_abort("happy path validation failed");
+  }
+  fixture.dispatch.reset();
+  fixture.output = {};
+  return fixture;
+}
+
+template <class machine_type>
+bench_fixture<machine_type> make_invalid_fixture() {
+  bench_fixture<machine_type> fixture{};
+  reserve_lifecycle(fixture.lifecycle);
+  fixture.request =
+      make_execute(fixture.output, fixture.dispatch, fixture.lifecycle, prepare_reused);
+  fixture.request.step_size = 0;
+  if (fixture.machine.process_event(fixture.request) ||
+      !fixture.dispatch.error_called ||
+      fixture.dispatch.error_code !=
+          static_cast<int32_t>(emel::error::cast(processor_error::invalid_request))) {
+    bench_abort("invalid path validation failed");
+  }
+  fixture.dispatch.reset();
+  fixture.output = {};
+  return fixture;
+}
+
+template <class machine_type>
+void append_reused_case(std::vector<emel::bench::result> & results,
+                        const emel::bench::config & cfg) {
+  auto fixture = make_happy_fixture<machine_type>(prepare_reused);
+  auto fn = [&fixture]() {
+    fixture.dispatch.reset();
+    fixture.output = {};
+    const bool ok = fixture.machine.process_event(fixture.request);
+    fixture.sink += ok ? fixture.output.outputs_produced : -1;
+  };
+  results.push_back(emel::bench::measure_case("graph/processor_reused", cfg, fn));
+}
+
+template <class machine_type>
+void append_alloc_case(std::vector<emel::bench::result> & results,
+                       const emel::bench::config & cfg) {
+  auto fixture = make_happy_fixture<machine_type>(prepare_needs_alloc);
+  auto fn = [&fixture]() {
+    fixture.dispatch.reset();
+    fixture.output = {};
+    const bool ok = fixture.machine.process_event(fixture.request);
+    fixture.sink += ok ? fixture.output.outputs_produced : -1;
+  };
+  results.push_back(emel::bench::measure_case("graph/processor_alloc", cfg, fn));
+}
+
+template <class machine_type>
+void append_invalid_case(std::vector<emel::bench::result> & results,
+                         const emel::bench::config & cfg) {
+  auto fixture = make_invalid_fixture<machine_type>();
+  auto fn = [&fixture]() {
+    fixture.dispatch.reset();
+    fixture.output = {};
+    const bool ok = fixture.machine.process_event(fixture.request);
+    fixture.sink += ok ? 1 : fixture.dispatch.error_code;
+  };
+  results.push_back(emel::bench::measure_case("graph/processor_invalid", cfg, fn));
+}
+
+template <class machine_type>
+void append_processor_cases(std::vector<emel::bench::result> & results,
+                            const emel::bench::config & cfg) {
+  append_reused_case<machine_type>(results, cfg);
+  append_alloc_case<machine_type>(results, cfg);
+  append_invalid_case<machine_type>(results, cfg);
+}
+
+}  // namespace
+
+namespace emel::bench {
+
+void append_emel_graph_processor_cases(std::vector<result> & results,
+                                       const config & cfg) {
+  append_processor_cases<emel::graph::processor::sm>(results, cfg);
+}
+
+void append_reference_graph_processor_cases(std::vector<result> & results,
+                                            const config & cfg) {
+  append_processor_cases<baseline_processor_sm>(results, cfg);
+}
+
+}  // namespace emel::bench
diff --git a/tools/bench/kernel/bench_common.hpp b/tools/bench/kernel/bench_common.hpp
index 120674c6..97e32979 100644
--- a/tools/bench/kernel/bench_common.hpp
+++ b/tools/bench/kernel/bench_common.hpp
@@ -141,7 +141,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
     emel::kernel::event::op_dup ev{
       .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -159,7 +158,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
       .src0 = make_src_view(lhs.data(), static_cast<uint64_t>(k_vec_len)),
       .src1 = make_src_view(rhs.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -177,7 +175,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
       .src0 = make_src_view(lhs.data(), static_cast<uint64_t>(k_vec_len)),
       .src1 = make_src_view(rhs.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -195,7 +192,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
       .src0 = make_src_view(lhs.data(), static_cast<uint64_t>(k_vec_len)),
       .src1 = make_src_view(rhs.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -213,7 +209,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
       .src0 = make_src_view(lhs.data(), static_cast<uint64_t>(k_vec_len)),
       .src1 = make_src_view(rhs.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -229,7 +224,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
     emel::kernel::event::op_sqr ev{
       .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -245,7 +239,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
     emel::kernel::event::op_sqrt ev{
       .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -261,7 +254,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
     emel::kernel::event::op_log ev{
       .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -277,7 +269,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
     emel::kernel::event::op_sin ev{
       .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -293,7 +284,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
     emel::kernel::event::op_cos ev{
       .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -313,7 +303,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
       .dst = make_dst_view(dst.data(),
                            static_cast<uint64_t>(k_softmax_width),
                            static_cast<uint64_t>(k_softmax_rows)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -344,7 +333,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
       .dst = make_dst_view(dst.data(),
                            static_cast<uint64_t>(k_mm_n),
                            static_cast<uint64_t>(k_mm_m)),
-      .nth = 1,
     };
 
     auto fn = [&]() {
@@ -360,7 +348,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
     emel::kernel::event::op_unary ev{
       .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
       .subop = emel::kernel::event::unary_subop::neg,
     };
 
@@ -378,7 +365,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
     emel::kernel::event::op_unary ev{
       .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
       .subop = emel::kernel::event::unary_subop::relu,
     };
 
@@ -396,7 +382,6 @@ void append_emel_backend_cases(std::vector<emel::bench::result> & results,
     emel::kernel::event::op_unary ev{
       .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
       .dst = make_dst_view(dst.data(), static_cast<uint64_t>(k_vec_len)),
-      .nth = 1,
       .subop = emel::kernel::event::unary_subop::exp,
     };
 
diff --git a/tools/bench/kernel/flash_attention_bench.cpp b/tools/bench/kernel/flash_attention_bench.cpp
index d95fea2e..eecf86eb 100644
--- a/tools/bench/kernel/flash_attention_bench.cpp
+++ b/tools/bench/kernel/flash_attention_bench.cpp
@@ -230,7 +230,6 @@ emel::kernel::event::op_flash_attn_ext make_flash_attention_event(
       sizeof(uint16_t) * spec.head_dim,
       sizeof(uint16_t) * spec.kv_tokens * spec.head_dim);
   ev.dst = make_dst_view_3d(dst, spec.head_dim, 1u, spec.head_count);
-  ev.nth = 1;
   std::memcpy(ev.op_params.data(), &scale, sizeof(scale));
   std::memcpy(ev.op_params.data() + sizeof(scale),
               &masked_total_tokens,
diff --git a/tools/bench/kernel/x86_64_bench.cpp b/tools/bench/kernel/x86_64_bench.cpp
index 90d1d246..eff407d2 100644
--- a/tools/bench/kernel/x86_64_bench.cpp
+++ b/tools/bench/kernel/x86_64_bench.cpp
@@ -1,9 +1,358 @@
 #include "bench_cases.hpp"
 
+#include <array>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "emel/kernel/detail.hpp"
 #include "emel/kernel/x86_64/sm.hpp"
 
 #include "kernel/bench_common.hpp"
 
+namespace {
+
+using dtype = emel::kernel::event::dtype;
+using emel::kernel::detail::quant::QK_K;
+using emel::kernel::detail::quant::block_q2_k;
+using emel::kernel::detail::quant::block_q3_k;
+using emel::kernel::detail::quant::block_q6_k;
+
+constexpr size_t k_x86_quantized_block_count = 2u;
+constexpr uint64_t k_x86_quantized_k = QK_K * k_x86_quantized_block_count;
+constexpr uint64_t k_x86_quantized_rows = 2u;
+constexpr uint64_t k_x86_quantized_cols = 2u;
+
+template <class tensor_type>
+void fill_x86_dense_nb(tensor_type & tensor, const uint64_t elem_size) {
+  tensor.nb[0] = elem_size;
+  tensor.nb[1] = tensor.nb[0] * tensor.ne[0];
+  tensor.nb[2] = tensor.nb[1] * tensor.ne[1];
+  tensor.nb[3] = tensor.nb[2] * tensor.ne[2];
+}
+
+emel::kernel::event::tensor_view make_x86_quantized_src(
+    const void * data,
+    const dtype type,
+    const uint64_t ne0,
+    const uint64_t ne1) {
+  emel::kernel::event::tensor_view out{};
+  const size_t row_bytes = emel::kernel::detail::quantized_row_storage_bytes(
+      emel::kernel::detail::dtype_code(type), ne0);
+  out.data = data;
+  out.type = type;
+  out.ne = {ne0, ne1, 1u, 1u};
+  out.nb[0] = 1u;
+  out.nb[1] = row_bytes;
+  out.nb[2] = row_bytes * ne1;
+  out.nb[3] = out.nb[2];
+  return out;
+}
+
+emel::kernel::event::tensor_view make_x86_src_view(
+    const void * data,
+    const dtype type,
+    const uint64_t ne0,
+    const uint64_t ne1 = 1u,
+    const uint64_t ne2 = 1u) {
+  emel::kernel::event::tensor_view out{};
+  out.data = data;
+  out.type = type;
+  out.ne = {ne0, ne1, ne2, 1u};
+  fill_x86_dense_nb(out, emel::kernel::detail::dtype_size_bytes(
+                             emel::kernel::detail::dtype_code(type)));
+  return out;
+}
+
+emel::kernel::event::tensor_view_mut make_x86_dst_view(
+    void * data,
+    const dtype type,
+    const uint64_t ne0,
+    const uint64_t ne1 = 1u,
+    const uint64_t ne2 = 1u) {
+  emel::kernel::event::tensor_view_mut out{};
+  out.data = data;
+  out.type = type;
+  out.ne = {ne0, ne1, ne2, 1u};
+  fill_x86_dense_nb(out, emel::kernel::detail::dtype_size_bytes(
+                             emel::kernel::detail::dtype_code(type)));
+  return out;
+}
+
+void fill_x86_q2_block(block_q2_k & q2, const uint32_t salt) {
+  q2.d = static_cast<uint16_t>(0x3c00u + (salt % 17u));
+  q2.dmin = static_cast<uint16_t>(0x3800u + (salt % 11u));
+  for (size_t i = 0; i < q2.scales.size(); ++i) {
+    q2.scales[i] = static_cast<uint8_t>((((i + salt) % 13u) << 4u) |
+                                        (((i * 5u) + salt) % 15u));
+  }
+  for (size_t i = 0; i < q2.qs.size(); ++i) {
+    q2.qs[i] = static_cast<uint8_t>((i * (23u + salt)) ^ ((i + salt) >> 1u));
+  }
+}
+
+void fill_x86_q3_block(block_q3_k & q3, const uint32_t salt) {
+  q3.d = static_cast<uint16_t>(0x3c00u + (salt % 19u));
+  for (size_t i = 0; i < q3.scales.size(); ++i) {
+    q3.scales[i] = static_cast<uint8_t>((i * (17u + salt)) ^ (0x5au + salt));
+  }
+  for (size_t i = 0; i < q3.hmask.size(); ++i) {
+    q3.hmask[i] = static_cast<uint8_t>((i * (9u + salt)) ^ (0xa5u - salt));
+  }
+  for (size_t i = 0; i < q3.qs.size(); ++i) {
+    q3.qs[i] = static_cast<uint8_t>((i * (13u + salt)) ^ (0x33u + salt * 7u));
+  }
+}
+
+void fill_x86_q6_block(block_q6_k & q6, const uint32_t salt) {
+  q6.d = static_cast<uint16_t>(0x3c00u + (salt % 23u));
+  for (size_t i = 0; i < q6.scales.size(); ++i) {
+    const int32_t scale_value =
+        static_cast<int32_t>(((i + salt) * 3u) % 31u) - 15;
+    q6.scales[i] = static_cast<int8_t>(scale_value);
+  }
+  for (size_t i = 0; i < q6.ql.size(); ++i) {
+    q6.ql[i] = static_cast<uint8_t>((i * (19u + salt)) ^ (0x6cu + salt));
+  }
+  for (size_t i = 0; i < q6.qh.size(); ++i) {
+    q6.qh[i] = static_cast<uint8_t>((i * (7u + salt)) ^ (0x95u - salt));
+  }
+}
+
+std::array<float, k_x86_quantized_k * k_x86_quantized_cols>
+make_x86_quantized_rhs_values(const uint32_t salt) {
+  std::array<float, k_x86_quantized_k * k_x86_quantized_cols> rhs{};
+  for (size_t i = 0; i < rhs.size(); ++i) {
+    const int32_t centered = static_cast<int32_t>((i * (5u + salt)) % 43u) - 21;
+    rhs[i] = static_cast<float>(centered) * 0.0625f;
+  }
+  return rhs;
+}
+
+template <class block_type, class fill_block_fn>
+std::array<block_type, k_x86_quantized_rows * k_x86_quantized_block_count>
+make_x86_quantized_rows(fill_block_fn fill_block, const uint32_t salt) {
+  std::array<block_type, k_x86_quantized_rows * k_x86_quantized_block_count>
+      rows{};
+  for (size_t idx = 0; idx < rows.size(); ++idx) {
+    fill_block(rows[idx], static_cast<uint32_t>(idx) + salt);
+  }
+  return rows;
+}
+
+template <class block_type, class optimized_counter_fn,
+          class shared_counter_fn>
+void append_emel_x86_quantized_case(std::vector<emel::bench::result> & results,
+                                    const emel::bench::config & cfg,
+                                    const char * case_name,
+                                    const dtype block_type_code,
+                                    const block_type * blocks,
+                                    const float * rhs,
+                                    optimized_counter_fn optimized_counter,
+                                    shared_counter_fn shared_counter) {
+  std::array<float, k_x86_quantized_rows * k_x86_quantized_cols> dst{};
+  const emel::kernel::event::op_mul_mat ev{
+      .src0 = make_x86_quantized_src(blocks,
+                                     block_type_code,
+                                     k_x86_quantized_k,
+                                     k_x86_quantized_rows),
+      .src1 = make_x86_src_view(rhs, dtype::f32, k_x86_quantized_cols,
+                                k_x86_quantized_k),
+      .dst = make_x86_dst_view(dst.data(), dtype::f32, k_x86_quantized_cols,
+                               k_x86_quantized_rows),
+  };
+  emel::kernel::x86_64::sm machine{};
+  volatile float sink = 0.0f;
+
+  auto fn = [&]() {
+    const uint64_t optimized_before = optimized_counter(machine);
+    const uint64_t shared_before = shared_counter(machine);
+    const bool ok = machine.process_event(ev);
+    if (!ok || optimized_counter(machine) != optimized_before + 1u ||
+        shared_counter(machine) != shared_before) {
+      std::abort();
+    }
+    sink += dst[0];
+  };
+  results.push_back(emel::bench::measure_case(case_name, cfg, fn));
+  (void)sink;
+}
+
+template <class block_type>
+void append_reference_x86_quantized_case(
+    std::vector<emel::bench::result> & results,
+    const emel::bench::config & cfg,
+    const char * case_name,
+    const dtype block_type_code,
+    const block_type * blocks,
+    const float * rhs) {
+  std::array<float, k_x86_quantized_rows * k_x86_quantized_cols> dst{};
+  const emel::kernel::event::op_mul_mat ev{
+      .src0 = make_x86_quantized_src(blocks,
+                                     block_type_code,
+                                     k_x86_quantized_k,
+                                     k_x86_quantized_rows),
+      .src1 = make_x86_src_view(rhs, dtype::f32, k_x86_quantized_cols,
+                                k_x86_quantized_k),
+      .dst = make_x86_dst_view(dst.data(), dtype::f32, k_x86_quantized_cols,
+                               k_x86_quantized_rows),
+  };
+  volatile float sink = 0.0f;
+
+  auto fn = [&]() {
+    const bool ok = emel::kernel::detail::execute_scalar(ev);
+    if (!ok) {
+      std::abort();
+    }
+    sink += dst[0];
+  };
+  results.push_back(emel::bench::measure_case(case_name, cfg, fn));
+  (void)sink;
+}
+
+struct x86_flash_fixture {
+  float q[4] = {1.0f, 0.0f, 0.0f, 0.0f};
+  uint16_t k[8] = {
+      0x3c00u, 0x0000u, 0x0000u, 0x0000u,
+      0x0000u, 0x3c00u, 0x0000u, 0x0000u,
+  };
+  uint16_t v[8] = {
+      0x4000u, 0x0000u, 0x0000u, 0x0000u,
+      0x0000u, 0x4400u, 0x0000u, 0x0000u,
+  };
+  float dst[4] = {};
+};
+
+emel::kernel::event::op_flash_attn_ext make_x86_flash_event(
+    x86_flash_fixture & fixture) {
+  emel::kernel::event::op_flash_attn_ext ev{};
+  ev.src0 = make_x86_src_view(fixture.q, dtype::f32, 4u, 1u, 1u);
+  ev.src1 = make_x86_src_view(fixture.k, dtype::f16, 4u, 2u, 1u);
+  ev.src2 = make_x86_src_view(fixture.v, dtype::f16, 4u, 2u, 1u);
+  ev.dst = make_x86_dst_view(fixture.dst, dtype::f32, 4u, 1u, 1u);
+  const float scale = 1.0f;
+  std::memcpy(ev.op_params.data(), &scale, sizeof(scale));
+  ev.op_params_size = sizeof(scale);
+  return ev;
+}
+
+void append_emel_x86_flash_case(std::vector<emel::bench::result> & results,
+                                const emel::bench::config & cfg) {
+  x86_flash_fixture fixture{};
+  const emel::kernel::event::op_flash_attn_ext ev =
+      make_x86_flash_event(fixture);
+  emel::kernel::x86_64::sm machine{};
+  volatile float sink = 0.0f;
+
+  auto fn = [&]() {
+    const uint64_t optimized_before = machine.optimized_flash_dispatch_count();
+    const uint64_t shared_before = machine.shared_flash_dispatch_count();
+    const bool ok = machine.process_event(ev);
+    if (!ok || machine.optimized_flash_dispatch_count() !=
+                   optimized_before + 1u ||
+        machine.shared_flash_dispatch_count() != shared_before) {
+      std::abort();
+    }
+    sink += fixture.dst[0];
+  };
+  results.push_back(emel::bench::measure_case(
+      "kernel/x86_64/op_flash_attn_ext_decode_like", cfg, fn));
+  (void)sink;
+}
+
+void append_reference_x86_flash_case(std::vector<emel::bench::result> & results,
+                                     const emel::bench::config & cfg) {
+  x86_flash_fixture fixture{};
+  const emel::kernel::event::op_flash_attn_ext ev =
+      make_x86_flash_event(fixture);
+  const emel::kernel::x86_64::detail::host_feature_contract contract{
+      .avx2_available = false,
+      .fma_available = false,
+      .f16c_available = false,
+  };
+  emel::kernel::x86_64::sm machine{
+      emel::kernel::x86_64::action::context{contract, {}, 0}};
+  volatile float sink = 0.0f;
+
+  auto fn = [&]() {
+    const uint64_t shared_before = machine.shared_flash_dispatch_count();
+    const bool ok = machine.process_event(ev);
+    if (!ok || machine.optimized_flash_dispatch_count() != 0u ||
+        machine.shared_flash_dispatch_count() != shared_before + 1u) {
+      std::abort();
+    }
+    sink += fixture.dst[0];
+  };
+  results.push_back(emel::bench::measure_case(
+      "kernel/x86_64/op_flash_attn_ext_decode_like", cfg, fn));
+  (void)sink;
+}
+
+void append_emel_x86_optimized_cases(std::vector<emel::bench::result> & results,
+                                     const emel::bench::config & cfg) {
+  const auto q2_rows =
+      make_x86_quantized_rows<block_q2_k>(fill_x86_q2_block, 11u);
+  const auto q3_rows =
+      make_x86_quantized_rows<block_q3_k>(fill_x86_q3_block, 19u);
+  const auto q6_rows =
+      make_x86_quantized_rows<block_q6_k>(fill_x86_q6_block, 37u);
+  const auto rhs = make_x86_quantized_rhs_values(3u);
+
+  append_emel_x86_flash_case(results, cfg);
+  append_emel_x86_quantized_case(
+      results, cfg, "kernel/x86_64/op_mul_mat_q2_k_q8_k", dtype::q2_k,
+      q2_rows.data(), rhs.data(),
+      [](const emel::kernel::x86_64::sm & machine) {
+        return machine.optimized_q2_dispatch_count();
+      },
+      [](const emel::kernel::x86_64::sm & machine) {
+        return machine.shared_q2_dispatch_count();
+      });
+  append_emel_x86_quantized_case(
+      results, cfg, "kernel/x86_64/op_mul_mat_q3_k_q8_k", dtype::q3_k,
+      q3_rows.data(), rhs.data(),
+      [](const emel::kernel::x86_64::sm & machine) {
+        return machine.optimized_q3_dispatch_count();
+      },
+      [](const emel::kernel::x86_64::sm & machine) {
+        return machine.shared_q3_dispatch_count();
+      });
+  append_emel_x86_quantized_case(
+      results, cfg, "kernel/x86_64/op_mul_mat_q6_k_q8_k", dtype::q6_k,
+      q6_rows.data(), rhs.data(),
+      [](const emel::kernel::x86_64::sm & machine) {
+        return machine.optimized_q6_dispatch_count();
+      },
+      [](const emel::kernel::x86_64::sm & machine) {
+        return machine.shared_q6_dispatch_count();
+      });
+}
+
+void append_reference_x86_optimized_cases(
+    std::vector<emel::bench::result> & results,
+    const emel::bench::config & cfg) {
+  const auto q2_rows =
+      make_x86_quantized_rows<block_q2_k>(fill_x86_q2_block, 11u);
+  const auto q3_rows =
+      make_x86_quantized_rows<block_q3_k>(fill_x86_q3_block, 19u);
+  const auto q6_rows =
+      make_x86_quantized_rows<block_q6_k>(fill_x86_q6_block, 37u);
+  const auto rhs = make_x86_quantized_rhs_values(3u);
+
+  append_reference_x86_flash_case(results, cfg);
+  append_reference_x86_quantized_case(results, cfg,
+                                      "kernel/x86_64/op_mul_mat_q2_k_q8_k",
+                                      dtype::q2_k, q2_rows.data(), rhs.data());
+  append_reference_x86_quantized_case(results, cfg,
+                                      "kernel/x86_64/op_mul_mat_q3_k_q8_k",
+                                      dtype::q3_k, q3_rows.data(), rhs.data());
+  append_reference_x86_quantized_case(results, cfg,
+                                      "kernel/x86_64/op_mul_mat_q6_k_q8_k",
+                                      dtype::q6_k, q6_rows.data(), rhs.data());
+}
+
+}  // namespace
+
 namespace emel::bench {
 
 void append_emel_kernel_x86_64_cases(std::vector<result> & results, const config & cfg) {
@@ -12,10 +361,12 @@ void append_emel_kernel_x86_64_cases(std::vector<result> & results, const config
     return x86_machine.process_event(ev);
   };
   append_emel_backend_cases(results, cfg, "x86_64", exec);
+  append_emel_x86_optimized_cases(results, cfg);
 }
 
 void append_reference_kernel_x86_64_cases(std::vector<result> & results, const config & cfg) {
   append_reference_backend_cases(results, cfg, "x86_64");
+  append_reference_x86_optimized_cases(results, cfg);
 }
 
 }  // namespace emel::bench
diff --git a/tools/bench/quality_gates_tests.cpp b/tools/bench/quality_gates_tests.cpp
index 9fae1268..53f397b3 100644
--- a/tools/bench/quality_gates_tests.cpp
+++ b/tools/bench/quality_gates_tests.cpp
@@ -54,6 +54,19 @@ TEST_CASE("quality gates exclude nested sml machine headers from coverage source
   CHECK(helper.find("src/emel/**/*/sm.hpp") != std::string::npos);
 }
 
+TEST_CASE("coverage script enforces thresholds on changed executable lines") {
+  const std::string script = read_file(repo_root() / "scripts" / "test_with_coverage.sh");
+
+  CHECK(script.find("COVERAGE_CHANGED_LINE_ONLY") != std::string::npos);
+  CHECK(script.find("required_tools+=(python3)") != std::string::npos);
+  CHECK(script.find("collect_changed_lines()") != std::string::npos);
+  CHECK(script.find("enforce_changed_line_coverage()") != std::string::npos);
+  CHECK(script.find("--json \"$coverage_json\"") != std::string::npos);
+  CHECK(script.find("changed-line coverage:") != std::string::npos);
+  CHECK(script.find("--fail-under-line \"$LINE_COVERAGE_MIN\"") != std::string::npos);
+  CHECK(script.find("--fail-under-branch \"$BRANCH_COVERAGE_MIN\"") != std::string::npos);
+}
+
 TEST_CASE("quality gates consume benchmark dependency manifest conservatively") {
   const std::string script = read_file(repo_root() / "scripts" / "quality_gates.sh");
 
@@ -336,10 +349,24 @@ TEST_CASE("quality gates skip host-incompatible benchmark suites during full exp
   CHECK(helper.find("kernel_aarch64)") != std::string::npos);
   CHECK(helper.find("\"aarch64\"") != std::string::npos);
   CHECK(helper.find("\"arm64\"") != std::string::npos);
-  CHECK(helper.find("sm_any)") != std::string::npos);
+  CHECK(helper.find("sm_any|sm_scheduler)") != std::string::npos);
   CHECK(helper.find("EMEL_BENCH_INTERNAL") != std::string::npos);
 }
 
+TEST_CASE("quality gates enable internal env for selected internal benchmark suites") {
+  const std::string script = read_file(repo_root() / "scripts" / "quality_gates.sh");
+  const std::size_t run_start = script.find("run_benchmark_gates()");
+  REQUIRE(run_start != std::string::npos);
+
+  const std::size_t run_end = script.find("run_coverage_gate()", run_start);
+  REQUIRE(run_end != std::string::npos);
+
+  const std::string run_body = script.substr(run_start, run_end - run_start);
+  CHECK(run_body.find("sm_any|sm_scheduler)") != std::string::npos);
+  CHECK(run_body.find("bench_extra_env+=(EMEL_BENCH_INTERNAL=1)") !=
+        std::string::npos);
+}
+
 TEST_CASE("quality gates check benchmark manifest before deciding benchmark branch") {
   const std::string script = read_file(repo_root() / "scripts" / "quality_gates.sh");
   const std::size_t run_start = script.find("run_benchmark_gates()");
diff --git a/tools/bench/sm_scheduler_bench.cpp b/tools/bench/sm_scheduler_bench.cpp
new file mode 100644
index 00000000..a8a69f51
--- /dev/null
+++ b/tools/bench/sm_scheduler_bench.cpp
@@ -0,0 +1,152 @@
+#include "bench_cases.hpp"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <semaphore>
+#include <thread>
+
+#include "emel/sm.hpp"
+
+namespace {
+
+bool bench_internal_enabled() {
+  const char * value = std::getenv("EMEL_BENCH_INTERNAL");
+  if (value == nullptr || value[0] == '\0') {
+    return false;
+  }
+  return value[0] != '0';
+}
+
+struct event_tick {
+  volatile int32_t & sink;
+};
+
+struct state_idle {};
+
+struct effect_tick {
+  void operator()(const event_tick & ev) const noexcept {
+    ev.sink += 1;
+  }
+};
+
+struct scheduler_model {
+  auto operator()() const noexcept {
+    namespace sml = stateforward::sml;
+    // clang-format off
+    return sml::make_transition_table(
+        sml::state<state_idle> <= *sml::state<state_idle>
+          + sml::event<event_tick> / effect_tick{}
+    );
+    // clang-format on
+  }
+};
+
+using inline_co_policy =
+    emel::policy::coroutine_scheduler<emel::policy::inline_scheduler>;
+using thread_pool_pool =
+    emel::policy::thread_pool_scheduler<2u, 1024u, 128u>;
+using thread_pool_scheduler =
+    emel::policy::thread_pool_scheduler_ref<thread_pool_pool>;
+using thread_pool_co_policy =
+    emel::policy::coroutine_scheduler<thread_pool_scheduler>;
+
+using inline_machine = emel::co_sm<scheduler_model, void, inline_co_policy>;
+using thread_pool_machine =
+    emel::co_sm<scheduler_model, void, thread_pool_co_policy>;
+
+emel::bench::result annotate_result(emel::bench::result result,
+                                    const char * lane,
+                                    const char * backend_id) {
+  result.lane = lane;
+  result.backend_id = backend_id;
+  result.workload_id = "single_transition_event";
+  result.comparison_mode = "thread_pool_scheduler_vs_inline_co_sm";
+  result.comparable = true;
+  result.note = "internal scheduler overhead microbenchmark";
+  return result;
+}
+
+void append_thread_pool_idle_case(std::vector<emel::bench::result> & results,
+                                  const emel::bench::config & cfg) {
+  volatile int32_t sink = 0;
+  event_tick ev{sink};
+  thread_pool_pool pool{};
+  thread_pool_machine machine{thread_pool_scheduler{pool}};
+  auto fn = [&]() { (void)machine.process_event_async(ev).result(); };
+  results.push_back(annotate_result(
+      emel::bench::measure_case("sm_scheduler/idle_async", cfg, fn),
+      "thread_pool_idle",
+      "emel_thread_pool_scheduler"));
+}
+
+void append_thread_pool_busy_case(std::vector<emel::bench::result> & results,
+                                  const emel::bench::config & cfg) {
+  volatile int32_t sink = 0;
+  event_tick ev{sink};
+  thread_pool_pool pool{};
+  thread_pool_machine machine{thread_pool_scheduler{pool}};
+  std::binary_semaphore inline_lane_held{0};
+  std::binary_semaphore release_inline_lane{0};
+  std::thread inline_lane_holder{[&]() noexcept {
+    const bool held = pool.try_run_immediate([&]() noexcept {
+      inline_lane_held.release();
+      release_inline_lane.acquire();
+    });
+    if (!held) {
+      std::fprintf(stderr,
+                   "error: sm_scheduler busy benchmark could not hold inline lane\n");
+      std::abort();
+    }
+  }};
+  inline_lane_held.acquire();
+
+  auto fn = [&]() {
+    (void)machine.process_event_async(ev).result();
+  };
+  results.push_back(annotate_result(
+      emel::bench::measure_case("sm_scheduler/busy_worker_async", cfg, fn),
+      "thread_pool_worker",
+      "emel_thread_pool_scheduler"));
+  release_inline_lane.release();
+  inline_lane_holder.join();
+}
+
+void append_inline_idle_case(std::vector<emel::bench::result> & results,
+                             const emel::bench::config & cfg,
+                             const char * name,
+                             const char * lane) {
+  volatile int32_t sink = 0;
+  event_tick ev{sink};
+  inline_machine machine{};
+  auto fn = [&]() { (void)machine.process_event_async(ev).result(); };
+  results.push_back(annotate_result(
+      emel::bench::measure_case(name, cfg, fn), lane, "emel_inline_scheduler"));
+}
+
+}  // namespace
+
+namespace emel::bench {
+
+void append_emel_sm_scheduler_cases(std::vector<result> & results,
+                                    const config & cfg) {
+  if (!bench_internal_enabled()) {
+    return;
+  }
+
+  append_thread_pool_idle_case(results, cfg);
+  append_thread_pool_busy_case(results, cfg);
+}
+
+void append_reference_sm_scheduler_cases(std::vector<result> & results,
+                                         const config & cfg) {
+  if (!bench_internal_enabled()) {
+    return;
+  }
+
+  append_inline_idle_case(results, cfg, "sm_scheduler/idle_async", "inline_idle");
+  append_inline_idle_case(
+      results, cfg, "sm_scheduler/busy_worker_async", "inline_idle_baseline");
+}
+
+}  // namespace emel::bench
diff --git a/tools/bench/text/generator/decode_wavefront_bench.cpp b/tools/bench/text/generator/decode_wavefront_bench.cpp
new file mode 100644
index 00000000..951cbbb6
--- /dev/null
+++ b/tools/bench/text/generator/decode_wavefront_bench.cpp
@@ -0,0 +1,654 @@
+#include "bench_cases.hpp"
+
+#include <array>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <span>
+#include <string_view>
+#include <vector>
+
+#include "emel/error/error.hpp"
+#include "emel/graph/events.hpp"
+#include "emel/graph/sm.hpp"
+#include "emel/text/generator/decode_wavefront/sm.hpp"
+
+// Reference lane only: ggml/llama.cpp drives the comparison result and never
+// touches the EMEL lane, per the split-lane benchmark contract.
+extern "C" {
+#include "ggml-cpu.h"
+#include "ggml.h"
+}
+
+namespace {
+
+namespace wavefront = emel::text::generator::decode_wavefront;
+using execute_t = emel::graph::processor::event::execute;
+
+bool validate_ok(const execute_t &, int32_t * err_out) {
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool prepare_graph_reuse(const execute_t &, bool * reused_out, int32_t * err_out) {
+  if (reused_out != nullptr) {
+    *reused_out = true;
+  }
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool alloc_graph_ok(const execute_t &, int32_t * err_out) {
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool bind_inputs_ok(const execute_t &, int32_t * err_out) {
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+bool run_kernel_counting(const execute_t & request, int32_t * err_out) {
+  auto * calls = static_cast<int32_t *>(request.compute_ctx);
+  *calls += 1;
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+// Realistic per-lane decode compute: one f32 GEMV  y[dim] = W[dim x dim] @ x[dim],
+// representative of an autoregressive decode-step projection. Each lane owns its
+// own weights/activation buffers, so lanes share no bottleneck and parallel
+// dispatch is the right strategy. Both the scalar and wavefront lanes run the
+// identical kernel, so the comparison isolates sequential vs parallel dispatch.
+struct gemv_work {
+  int32_t dim = 0;
+  int32_t calls = 0;
+  std::vector<float> weights;  // dim*dim
+  std::vector<float> input;    // dim
+  std::vector<float> output;   // dim
+  volatile float sink = 0.0f;
+
+  void init(int32_t dim_in, uint32_t seed) {
+    dim = dim_in;
+    weights.resize(static_cast<size_t>(dim) * static_cast<size_t>(dim));
+    input.resize(static_cast<size_t>(dim));
+    output.resize(static_cast<size_t>(dim));
+    uint32_t state = seed * 2654435761u + 1u;
+    const auto next = [&state]() {
+      state ^= state << 13;
+      state ^= state >> 17;
+      state ^= state << 5;
+      return static_cast<float>((state >> 8) & 0xffffu) / 65536.0f - 0.5f;
+    };
+    for (auto & value : weights) {
+      value = next();
+    }
+    for (auto & value : input) {
+      value = next();
+    }
+  }
+
+  void compute() noexcept {
+    const float * __restrict w = weights.data();
+    const float * __restrict x = input.data();
+    float * __restrict y = output.data();
+    constexpr int32_t k_lanes = 8;  // independent accumulators -> SIMD FMA
+    for (int32_t row = 0; row < dim; ++row) {
+      const float * __restrict w_row = w + static_cast<size_t>(row) * dim;
+      float acc[k_lanes] = {0};
+      int32_t col = 0;
+      for (; col + k_lanes <= dim; col += k_lanes) {
+        for (int32_t k = 0; k < k_lanes; ++k) {
+          acc[k] += w_row[col + k] * x[col + k];
+        }
+      }
+      float tail = 0.0f;
+      for (; col < dim; ++col) {
+        tail += w_row[col] * x[col];
+      }
+      for (int32_t k = 0; k < k_lanes; ++k) {
+        tail += acc[k];
+      }
+      y[row] = tail;
+    }
+    sink += y[0];
+    calls += 1;
+  }
+};
+
+bool run_kernel_gemv(const execute_t & request, int32_t * err_out) {
+  static_cast<gemv_work *>(request.compute_ctx)->compute();
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+int32_t decode_gemv_dim() {
+  const char * value = std::getenv("EMEL_BENCH_DECODE_GEMV_DIM");
+  if (value == nullptr || value[0] == '\0') {
+    return 256;
+  }
+  const long parsed = std::strtol(value, nullptr, 10);
+  if (parsed < 1 || parsed > 8192) {
+    return 256;
+  }
+  return static_cast<int32_t>(parsed);
+}
+
+bool extract_outputs_ok(const execute_t &, int32_t * outputs_out, int32_t * err_out) {
+  if (outputs_out != nullptr) {
+    *outputs_out = 1;
+  }
+  if (err_out != nullptr) {
+    *err_out = 0;
+  }
+  return true;
+}
+
+struct reserve_callbacks {
+  bool done_called = false;
+  bool error_called = false;
+
+  static bool on_done(void * owner, const emel::graph::events::reserve_done &) noexcept {
+    auto * self = static_cast<reserve_callbacks *>(owner);
+    self->done_called = true;
+    return true;
+  }
+
+  static bool on_error(void * owner, const emel::graph::events::reserve_error &) noexcept {
+    auto * self = static_cast<reserve_callbacks *>(owner);
+    self->error_called = true;
+    return true;
+  }
+};
+
+struct compute_callbacks {
+  bool done_called = false;
+  bool error_called = false;
+
+  void reset() noexcept {
+    done_called = false;
+    error_called = false;
+  }
+
+  static bool on_done(void * owner, const emel::graph::events::compute_done &) noexcept {
+    auto * self = static_cast<compute_callbacks *>(owner);
+    self->done_called = true;
+    return true;
+  }
+
+  static bool on_error(void * owner, const emel::graph::events::compute_error &) noexcept {
+    auto * self = static_cast<compute_callbacks *>(owner);
+    self->error_called = true;
+    return true;
+  }
+};
+
+struct lifecycle_fixture {
+  int32_t leaf_tensor = 11;
+  int32_t compute_tensor = 29;
+  std::array<emel::graph::processor::event::lifecycle_tensor_binding, 2> tensors{{
+      {
+          .tensor_id = 0,
+          .buffer = &leaf_tensor,
+          .buffer_bytes = sizeof(leaf_tensor),
+          .consumer_refs = 0,
+          .is_leaf = true,
+      },
+      {
+          .tensor_id = 1,
+          .buffer = &compute_tensor,
+          .buffer_bytes = sizeof(compute_tensor),
+          .consumer_refs = 1,
+          .is_leaf = false,
+      },
+  }};
+  std::array<int32_t, 1> required_ids = {0};
+  std::array<int32_t, 1> publish_ids = {1};
+  std::array<int32_t, 1> release_ids = {1};
+  emel::graph::processor::event::lifecycle_phase phase{
+      .required_filled_ids = required_ids.data(),
+      .required_filled_count = static_cast<int32_t>(required_ids.size()),
+      .publish_ids = publish_ids.data(),
+      .publish_count = static_cast<int32_t>(publish_ids.size()),
+      .release_ids = release_ids.data(),
+      .release_count = static_cast<int32_t>(release_ids.size()),
+  };
+  emel::graph::processor::event::lifecycle_manifest reserve{
+      .tensors = tensors.data(),
+      .tensor_count = static_cast<int32_t>(tensors.size()),
+      .phase = nullptr,
+  };
+  emel::graph::processor::event::lifecycle_manifest compute{
+      .tensors = tensors.data(),
+      .tensor_count = static_cast<int32_t>(tensors.size()),
+      .phase = &phase,
+  };
+};
+
+struct graph_lane_fixture {
+  emel::graph::sm graph{};
+  lifecycle_fixture lifecycle{};
+  reserve_callbacks reserve_cb{};
+  compute_callbacks compute_cb{};
+  emel::graph::event::reserve_output reserve_output{};
+  emel::graph::event::compute_output compute_output{};
+  emel::graph::event::compute compute_request{};
+  int32_t kernel_calls = 0;
+  int32_t gemv_dim = 0;
+  uint32_t gemv_seed = 0;
+  gemv_work kernel_work{};
+  bool lane_accepted = false;
+
+  void reserve_graph() {
+    const emel::graph::event::reserve reserve_request{
+        .model_topology = reinterpret_cast<const void *>(0xA5),
+        .output_out = &reserve_output,
+        .lifecycle = &lifecycle.reserve,
+        .max_node_count = 4u,
+        .max_tensor_count = 5u,
+        .bytes_per_tensor = 8u,
+        .workspace_capacity_bytes = 64u,
+        .dispatch_done = {&reserve_cb, reserve_callbacks::on_done},
+        .dispatch_error = {&reserve_cb, reserve_callbacks::on_error},
+    };
+    if (!graph.process_event(reserve_request) || !reserve_cb.done_called ||
+        reserve_cb.error_called) {
+      std::fprintf(stderr, "error: decode wavefront bench graph reserve failed\n");
+      std::abort();
+    }
+  }
+
+  void bind_compute() {
+    void * kernel_ctx = &kernel_calls;
+    bool (*kernel)(const execute_t &, int32_t *) = run_kernel_counting;
+    if (gemv_dim > 0) {
+      kernel_work.init(gemv_dim, gemv_seed);
+      kernel_ctx = &kernel_work;
+      kernel = run_kernel_gemv;
+    }
+    compute_request = emel::graph::event::compute{
+        .step_plan = reinterpret_cast<const void *>(0xB6),
+        .output_out = &compute_output,
+        .lifecycle = &lifecycle.compute,
+        .node_count_hint = reserve_output.node_count,
+        .tensor_count_hint = reserve_output.tensor_count,
+        .bytes_per_tensor = 8u,
+        .workspace_capacity_bytes = 64u,
+        .step_index = 0,
+        .step_size = 1,
+        .kv_tokens = 1,
+        .expected_outputs = 1,
+        .compute_ctx = kernel_ctx,
+        .validate = validate_ok,
+        .prepare_graph = prepare_graph_reuse,
+        .alloc_graph = alloc_graph_ok,
+        .bind_inputs = bind_inputs_ok,
+        .run_kernel = kernel,
+        .extract_outputs = extract_outputs_ok,
+        .dispatch_done = {&compute_cb, compute_callbacks::on_done},
+        .dispatch_error = {&compute_cb, compute_callbacks::on_error},
+    };
+  }
+
+  void reset_iteration() noexcept {
+    compute_output = {};
+    compute_cb.reset();
+    lane_accepted = false;
+  }
+};
+
+wavefront::event::compatibility_key make_key(const void * model_identity,
+                                             const void * backend_identity) {
+  return wavefront::event::compatibility_key{
+      .model_identity = model_identity,
+      .backend_identity = backend_identity,
+      .kernel_kind = emel::kernel::kernel_kind::x86_64,
+      .attention = emel::text::generator::attention_mode::flash,
+      .route = wavefront::event::kernel_route::q8_k,
+      .output = wavefront::event::output_contract::preselected_argmax,
+      .dtype_layout_contract =
+          static_cast<uint32_t>(wavefront::event::kernel_route::q8_k),
+      .quantized_contract =
+          static_cast<uint32_t>(wavefront::event::kernel_route::q8_k),
+      .step_size = 1,
+      .token_count = 1,
+  };
+}
+
+template <size_t lane_count>
+struct decode_wavefront_fixture {
+  int model_tag = 1;
+  int backend_tag = 2;
+  std::array<std::unique_ptr<graph_lane_fixture>, lane_count> lanes{};
+  std::vector<wavefront::event::lane> wavefront_lanes{};
+  wavefront::lane_pool pool{};
+  wavefront::sm machine{pool};
+  volatile int32_t sink = 0;
+
+  explicit decode_wavefront_fixture(int32_t gemv_dim = 0) {
+    const auto key = make_key(&model_tag, &backend_tag);
+    wavefront_lanes.reserve(lane_count);
+    for (size_t lane_index = 0u; lane_index < lane_count; ++lane_index) {
+      lanes[lane_index] = std::make_unique<graph_lane_fixture>();
+      lanes[lane_index]->gemv_dim = gemv_dim;
+      lanes[lane_index]->gemv_seed =
+          static_cast<uint32_t>(lane_index * 131u + 17u);
+      lanes[lane_index]->reserve_graph();
+      lanes[lane_index]->bind_compute();
+      wavefront_lanes.emplace_back(lanes[lane_index]->graph,
+                                   lanes[lane_index]->compute_request,
+                                   key,
+                                   lanes[lane_index]->lane_accepted);
+    }
+  }
+
+  void run_scalar() noexcept {
+    int32_t accepted_count = 0;
+    for (auto & lane : lanes) {
+      lane->reset_iteration();
+      const emel::graph::event::compute_reserved reserved_compute{lane->compute_request};
+      const bool accepted = lane->graph.process_event(reserved_compute);
+      if (!accepted || !lane->compute_cb.done_called || lane->compute_cb.error_called) {
+        std::fprintf(stderr, "error: decode wavefront bench reserved scalar failed\n");
+        std::abort();
+      }
+      accepted_count += static_cast<int32_t>(accepted);
+    }
+    sink += accepted_count;
+  }
+
+  void run_wavefront() noexcept {
+    for (auto & lane : lanes) {
+      lane->reset_iteration();
+    }
+    wavefront::event::dispatch_summary summary{};
+    wavefront::event::run request{std::span<wavefront::event::lane>{wavefront_lanes},
+                                  summary};
+    const bool accepted = machine.process_event(request);
+    if (!accepted || summary.dispatched_lanes != static_cast<int32_t>(lane_count)) {
+      std::fprintf(stderr,
+                   "error: decode wavefront bench wavefront dispatch failed "
+                   "lanes=%zu accepted=%d err=%d dispatched=%d failed_lane=%d\n",
+                   lane_count,
+                   accepted ? 1 : 0,
+                   summary.err,
+                   summary.dispatched_lanes,
+                   summary.failed_lane);
+      std::abort();
+    }
+    for (const auto & lane : lanes) {
+      if (!lane->lane_accepted || !lane->compute_cb.done_called ||
+          lane->compute_cb.error_called) {
+        std::fprintf(stderr, "error: decode wavefront bench wavefront lane failed\n");
+        std::abort();
+      }
+    }
+    sink += accepted ? summary.dispatched_lanes : -1;
+  }
+};
+
+template <size_t lane_count>
+constexpr const char * case_name() noexcept;
+
+template <>
+constexpr const char * case_name<1>() noexcept {
+  return "decode_wavefront/batch1";
+}
+
+template <>
+constexpr const char * case_name<4>() noexcept {
+  return "decode_wavefront/batch4";
+}
+
+template <>
+constexpr const char * case_name<8>() noexcept {
+  return "decode_wavefront/batch8";
+}
+
+template <size_t lane_count>
+emel::bench::result annotate_result(emel::bench::result result,
+                                    const std::string_view lane_name) {
+  result.lane = std::string{lane_name};
+  result.backend_id = lane_name == "wavefront" ? "emel_decode_wavefront_sm"
+                                               : "emel_graph_sm_reserved_scalar";
+  result.workload_id = "reserved_decode_graph_dispatch";
+  result.comparison_mode = "reserved_scalar_vs_wavefront";
+  result.output_tokens = lane_count;
+  result.max_output_tokens = lane_count;
+  result.comparable = true;
+  result.note = "reserved graph compute fixture; scalar lane is direct per-lane reserved compute";
+  return result;
+}
+
+template <size_t lane_count>
+void append_wavefront_case(std::vector<emel::bench::result> & results,
+                           const emel::bench::config & cfg) {
+  auto fixture = std::make_unique<decode_wavefront_fixture<lane_count>>();
+  auto fn = [&fixture]() { fixture->run_wavefront(); };
+  results.push_back(annotate_result<lane_count>(
+      emel::bench::measure_case(case_name<lane_count>(), cfg, fn), "wavefront"));
+}
+
+template <size_t lane_count>
+void append_scalar_case(std::vector<emel::bench::result> & results,
+                        const emel::bench::config & cfg) {
+  auto fixture = std::make_unique<decode_wavefront_fixture<lane_count>>();
+  auto fn = [&fixture]() { fixture->run_scalar(); };
+  results.push_back(annotate_result<lane_count>(
+      emel::bench::measure_case(case_name<lane_count>(), cfg, fn), "reserved_scalar"));
+}
+
+template <size_t lane_count>
+constexpr const char * gemv_case_name() noexcept;
+
+template <>
+constexpr const char * gemv_case_name<1>() noexcept {
+  return "decode_wavefront/gemv_batch1";
+}
+
+template <>
+constexpr const char * gemv_case_name<4>() noexcept {
+  return "decode_wavefront/gemv_batch4";
+}
+
+template <>
+constexpr const char * gemv_case_name<8>() noexcept {
+  return "decode_wavefront/gemv_batch8";
+}
+
+// Realistic-compute variant: each lane runs a decode-representative GEMV, so the
+// comparison reflects real per-token decode cost. The wavefront lane forks the
+// lanes across the thread-pool co_sm; the reserved-scalar lane runs them
+// sequentially through one graph sm. Both run the identical kernel.
+template <size_t lane_count>
+void append_gemv_wavefront_case(std::vector<emel::bench::result> & results,
+                                const emel::bench::config & cfg) {
+  auto fixture =
+      std::make_unique<decode_wavefront_fixture<lane_count>>(decode_gemv_dim());
+  auto fn = [&fixture]() { fixture->run_wavefront(); };
+  results.push_back(annotate_result<lane_count>(
+      emel::bench::measure_case(gemv_case_name<lane_count>(), cfg, fn), "wavefront"));
+}
+
+template <size_t lane_count>
+void append_gemv_scalar_case(std::vector<emel::bench::result> & results,
+                             const emel::bench::config & cfg) {
+  auto fixture =
+      std::make_unique<decode_wavefront_fixture<lane_count>>(decode_gemv_dim());
+  auto fn = [&fixture]() { fixture->run_scalar(); };
+  results.push_back(annotate_result<lane_count>(
+      emel::bench::measure_case(gemv_case_name<lane_count>(), cfg, fn), "reserved_scalar"));
+}
+
+template <size_t lane_count>
+constexpr const char * ggml_case_name() noexcept;
+
+template <>
+constexpr const char * ggml_case_name<1>() noexcept {
+  return "decode_wavefront/ggml_batch1";
+}
+
+template <>
+constexpr const char * ggml_case_name<4>() noexcept {
+  return "decode_wavefront/ggml_batch4";
+}
+
+template <>
+constexpr const char * ggml_case_name<8>() noexcept {
+  return "decode_wavefront/ggml_batch8";
+}
+
+// Reference lane: ggml/llama.cpp computes lane_count independent f32 GEMVs as one
+// graph of mul_mat nodes, parallelized by its own warm threadpool over the same
+// core budget (n_threads = lane_count). This mirrors the EMEL wavefront's work
+// (lane_count independent f32 [dim x dim] @ [dim] projections) but with ggml's
+// intra-op threading instead of EMEL's inter-op fork/join. EMEL-owned code is
+// never invoked here.
+struct ggml_decode_reference {
+  int32_t dim = 0;
+  int32_t n_threads = 0;
+  ggml_context * ctx = nullptr;
+  ggml_cgraph * graph = nullptr;
+  ggml_threadpool * threadpool = nullptr;
+  std::vector<uint8_t> work{};
+  ggml_cplan plan{};
+  volatile float sink = 0.0f;
+
+  ggml_decode_reference(int32_t lanes, int32_t dim_in)
+      : dim(dim_in), n_threads(lanes) {
+    const size_t arena =
+        static_cast<size_t>(lanes) *
+            (static_cast<size_t>(dim) * dim + 2u * dim) * sizeof(float) +
+        32u * 1024u * 1024u;
+    ggml_init_params init{};
+    init.mem_size = arena;
+    init.mem_buffer = nullptr;
+    init.no_alloc = false;
+    ctx = ggml_init(init);
+    if (ctx == nullptr) {
+      std::fprintf(stderr, "error: decode wavefront ggml reference init failed\n");
+      std::abort();
+    }
+    graph = ggml_new_graph(ctx);
+    for (int32_t lane = 0; lane < lanes; ++lane) {
+      ggml_tensor * weights = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, dim);
+      ggml_tensor * activation = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, 1);
+      uint32_t state = static_cast<uint32_t>(lane) * 131u + 17u;
+      state = state * 2654435761u + 1u;
+      const auto next = [&state]() {
+        state ^= state << 13;
+        state ^= state >> 17;
+        state ^= state << 5;
+        return static_cast<float>((state >> 8) & 0xffffu) / 65536.0f - 0.5f;
+      };
+      float * wd = static_cast<float *>(weights->data);
+      for (size_t i = 0; i < static_cast<size_t>(dim) * dim; ++i) {
+        wd[i] = next();
+      }
+      float * xd = static_cast<float *>(activation->data);
+      for (int32_t i = 0; i < dim; ++i) {
+        xd[i] = next();
+      }
+      ggml_build_forward_expand(graph, ggml_mul_mat(ctx, weights, activation));
+    }
+    ggml_threadpool_params tp = ggml_threadpool_params_default(n_threads);
+    tp.poll = 100;  // warm polling, matching EMEL's warm worker pool
+    threadpool = ggml_threadpool_new(&tp);
+    plan = ggml_graph_plan(graph, n_threads, threadpool);
+    work.resize(plan.work_size != 0u ? plan.work_size : 1u);
+    plan.work_data = work.data();
+  }
+
+  ~ggml_decode_reference() {
+    if (threadpool != nullptr) {
+      ggml_threadpool_free(threadpool);
+    }
+    if (ctx != nullptr) {
+      ggml_free(ctx);
+    }
+  }
+
+  ggml_decode_reference(const ggml_decode_reference &) = delete;
+  ggml_decode_reference & operator=(const ggml_decode_reference &) = delete;
+
+  void run() noexcept {
+    if (ggml_graph_compute(graph, &plan) != GGML_STATUS_SUCCESS) {
+      std::fprintf(stderr, "error: decode wavefront ggml reference compute failed\n");
+      std::abort();
+    }
+    sink += 1.0f;
+  }
+};
+
+template <size_t lane_count>
+void append_ggml_wavefront_case(std::vector<emel::bench::result> & results,
+                                const emel::bench::config & cfg) {
+  auto fixture =
+      std::make_unique<decode_wavefront_fixture<lane_count>>(decode_gemv_dim());
+  auto fn = [&fixture]() { fixture->run_wavefront(); };
+  results.push_back(annotate_result<lane_count>(
+      emel::bench::measure_case(ggml_case_name<lane_count>(), cfg, fn), "wavefront"));
+}
+
+template <size_t lane_count>
+void append_ggml_reference_case(std::vector<emel::bench::result> & results,
+                                const emel::bench::config & cfg) {
+  auto fixture = std::make_unique<ggml_decode_reference>(
+      static_cast<int32_t>(lane_count), decode_gemv_dim());
+  auto fn = [&fixture]() { fixture->run(); };
+  auto result = emel::bench::measure_case(ggml_case_name<lane_count>(), cfg, fn);
+  result.lane = "ggml";
+  result.backend_id = "ggml_threadpool_mul_mat";
+  result.workload_id = "independent_decode_gemv_lanes";
+  result.comparison_mode = "wavefront_vs_ggml";
+  result.output_tokens = lane_count;
+  result.max_output_tokens = lane_count;
+  result.comparable = true;
+  result.note = "ggml reference: lane_count independent f32 GEMV mul_mat, warm threadpool";
+  results.push_back(std::move(result));
+}
+
+}  // namespace
+
+namespace emel::bench {
+
+void append_emel_decode_wavefront_cases(std::vector<result> & results,
+                                        const config & cfg) {
+  append_wavefront_case<1>(results, cfg);
+  append_wavefront_case<4>(results, cfg);
+  append_wavefront_case<8>(results, cfg);
+  append_gemv_wavefront_case<1>(results, cfg);
+  append_gemv_wavefront_case<4>(results, cfg);
+  append_gemv_wavefront_case<8>(results, cfg);
+  append_ggml_wavefront_case<1>(results, cfg);
+  append_ggml_wavefront_case<4>(results, cfg);
+  append_ggml_wavefront_case<8>(results, cfg);
+}
+
+void append_reference_decode_wavefront_cases(std::vector<result> & results,
+                                             const config & cfg) {
+  append_scalar_case<1>(results, cfg);
+  append_scalar_case<4>(results, cfg);
+  append_scalar_case<8>(results, cfg);
+  append_gemv_scalar_case<1>(results, cfg);
+  append_gemv_scalar_case<4>(results, cfg);
+  append_gemv_scalar_case<8>(results, cfg);
+  append_ggml_reference_case<1>(results, cfg);
+  append_ggml_reference_case<4>(results, cfg);
+  append_ggml_reference_case<8>(results, cfg);
+}
+
+}  // namespace emel::bench
diff --git a/tools/bench/text/generator/parallel_matmul_bench.cpp b/tools/bench/text/generator/parallel_matmul_bench.cpp
new file mode 100644
index 00000000..d8da518b
--- /dev/null
+++ b/tools/bench/text/generator/parallel_matmul_bench.cpp
@@ -0,0 +1,358 @@
+#include "bench_cases.hpp"
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+
+#include "emel/kernel/detail.hpp"
+#include "emel/kernel/events.hpp"
+#include "emel/kernel/sm.hpp"
+#include "emel/sm.hpp"
+
+// Reference lane only: ggml/llama.cpp drives the ggml_* comparison results and
+// never touches the EMEL lane, per the split-lane benchmark contract.
+extern "C" {
+#include "ggml-cpu.h"
+#include "ggml.h"
+}
+
+// Focused parallel-matmul suite: the EMEL lane forks one logical mul_mat into
+// per-lane row-slice events across kernel actors on the warm thread pool (the
+// production view-sliced decode/prefill parallel route shape). The reference
+// lane carries two baselines: the plain-named cases run the identical full
+// event through one EMEL kernel actor serially (fork/join speedup proof), and
+// the ggml_* cases run the same logical matmul through ggml's warm threadpool
+// at the same core budget (n_threads = k_lanes), so the compare row pairs
+// EMEL's inter-actor row slicing against ggml's intra-op thread chunking.
+// Operand class: plain GGUF-native blocks on BOTH sides. This exercises
+// EMEL's shared (non-repacked) kernels, not the repacked x4/x8 kernels the
+// production decode routes dispatch after prepare(); the production-class
+// cross-engine number comes from the end-to-end generation compare.
+
+namespace {
+
+using emel::kernel::event::dtype;
+using emel::kernel::event::op_mul_mat;
+using emel::kernel::event::tensor_view;
+using emel::kernel::event::tensor_view_mut;
+
+constexpr size_t k_lanes = 8u;
+constexpr int32_t k_dim = 2048;
+constexpr int32_t k_gemm_tokens = 8;
+
+using lane_pool = emel::policy::thread_pool_scheduler<k_lanes, 16u, 128u>;
+using lane_scheduler = emel::policy::thread_pool_scheduler_ref<lane_pool>;
+
+emel::kernel::kernel_kind host_kernel_kind() {
+#if defined(__aarch64__) || defined(_M_ARM64)
+  return emel::kernel::kernel_kind::aarch64;
+#else
+  return emel::kernel::kernel_kind::x86_64;
+#endif
+}
+
+tensor_view make_weight_view(const void * data,
+                             const dtype type,
+                             const uint64_t row_bytes,
+                             const int32_t cols,
+                             const int32_t rows) {
+  tensor_view view{};
+  view.data = data;
+  view.type = type;
+  view.ne = {static_cast<uint64_t>(cols), static_cast<uint64_t>(rows), 1u, 1u};
+  view.nb[0] = type == dtype::f32 ? sizeof(float) : 1u;
+  view.nb[1] = row_bytes;
+  view.nb[2] = row_bytes * static_cast<uint64_t>(rows);
+  view.nb[3] = view.nb[2];
+  return view;
+}
+
+tensor_view make_input_view(const float * data, const int32_t tokens, const int32_t cols) {
+  tensor_view view{};
+  view.data = data;
+  view.type = dtype::f32;
+  view.ne = {static_cast<uint64_t>(tokens), static_cast<uint64_t>(cols), 1u, 1u};
+  view.nb[0] = sizeof(float);
+  view.nb[1] = sizeof(float) * static_cast<uint64_t>(tokens);
+  view.nb[2] = view.nb[1] * static_cast<uint64_t>(cols);
+  view.nb[3] = view.nb[2];
+  return view;
+}
+
+tensor_view_mut make_output_view(float * data, const int32_t tokens, const int32_t rows) {
+  tensor_view_mut view{};
+  view.data = data;
+  view.type = dtype::f32;
+  view.ne = {static_cast<uint64_t>(tokens), static_cast<uint64_t>(rows), 1u, 1u};
+  view.nb[0] = sizeof(float);
+  view.nb[1] = sizeof(float) * static_cast<uint64_t>(tokens);
+  view.nb[2] = view.nb[1] * static_cast<uint64_t>(rows);
+  view.nb[3] = view.nb[2];
+  return view;
+}
+
+// Mirror the production group-aligned contiguous row split; k_dim rows divide
+// evenly by lane count for every case in this suite.
+op_mul_mat make_sliced_event(const op_mul_mat & ev,
+                             const uint64_t row_begin,
+                             const uint64_t row_count) {
+  op_mul_mat sliced = ev;
+  sliced.src0.data =
+      static_cast<const uint8_t *>(ev.src0.data) + row_begin * ev.src0.nb[1];
+  sliced.src0.ne[1] = row_count;
+  sliced.src0.nb[2] = ev.src0.nb[1] * row_count;
+  sliced.src0.nb[3] = sliced.src0.nb[2];
+  sliced.dst.data = static_cast<uint8_t *>(ev.dst.data) + row_begin * ev.dst.nb[1];
+  sliced.dst.ne[1] = row_count;
+  sliced.dst.nb[2] = ev.dst.nb[1] * row_count;
+  sliced.dst.nb[3] = sliced.dst.nb[2];
+  return sliced;
+}
+
+struct lane_fixture {
+  std::array<emel::kernel::sm, k_lanes> kernels = {};
+  lane_pool pool = {};
+
+  lane_fixture() {
+    for (auto & kernel : kernels) {
+      kernel.set_kind(host_kernel_kind());
+    }
+  }
+};
+
+struct case_buffers {
+  std::vector<uint8_t> weights = {};
+  std::vector<float> input = {};
+  std::vector<float> output = {};
+  op_mul_mat ev = {};
+};
+
+case_buffers make_case(const dtype type, const int32_t tokens) {
+  case_buffers buffers;
+  const uint8_t code = emel::kernel::detail::dtype_code(type);
+  const uint64_t row_bytes =
+      type == dtype::f32
+          ? sizeof(float) * static_cast<uint64_t>(k_dim)
+          : emel::kernel::detail::quantized_row_storage_bytes(
+                code, static_cast<uint64_t>(k_dim));
+  buffers.weights.assign(row_bytes * static_cast<size_t>(k_dim), 0u);
+  for (size_t idx = 0; idx < buffers.weights.size(); ++idx) {
+    buffers.weights[idx] = static_cast<uint8_t>((idx * 31u + 7u) & 0x3fu);
+  }
+  if (type == dtype::f32) {
+    auto * values = reinterpret_cast<float *>(buffers.weights.data());
+    const size_t count = buffers.weights.size() / sizeof(float);
+    for (size_t idx = 0; idx < count; ++idx) {
+      values[idx] = 0.25f * static_cast<float>((idx * 13u + 5u) % 17u) - 2.0f;
+    }
+  }
+
+  buffers.input.assign(
+      static_cast<size_t>(tokens) * static_cast<size_t>(k_dim), 0.0f);
+  for (size_t idx = 0; idx < buffers.input.size(); ++idx) {
+    buffers.input[idx] = 0.125f * static_cast<float>((idx * 7u + 3u) % 19u) - 1.0f;
+  }
+  buffers.output.assign(
+      static_cast<size_t>(tokens) * static_cast<size_t>(k_dim), 0.0f);
+
+  buffers.ev.src0 = make_weight_view(buffers.weights.data(), type, row_bytes, k_dim, k_dim);
+  buffers.ev.src1 = make_input_view(buffers.input.data(), tokens, k_dim);
+  buffers.ev.dst = make_output_view(buffers.output.data(), tokens, k_dim);
+  return buffers;
+}
+
+struct bench_case_spec {
+  const char * name;
+  dtype type;
+  int32_t tokens;
+};
+
+constexpr std::array<bench_case_spec, 5> k_cases = {{
+    {"parallel_matmul/gemv_f32", dtype::f32, 1},
+    {"parallel_matmul/gemv_q8_0", dtype::q8_0, 1},
+    {"parallel_matmul/gemv_q4_k", dtype::q4_k, 1},
+    {"parallel_matmul/gemv_q6_k", dtype::q6_k, 1},
+    {"parallel_matmul/gemm8_f32", dtype::f32, k_gemm_tokens},
+}};
+
+// Same EMEL fork/join work measured under ggml_* names; the reference lane
+// answers these with ggml's threadpool so compare rows read EMEL vs llama.cpp.
+constexpr std::array<bench_case_spec, 5> k_ggml_cases = {{
+    {"parallel_matmul/ggml_gemv_f32", dtype::f32, 1},
+    {"parallel_matmul/ggml_gemv_q8_0", dtype::q8_0, 1},
+    {"parallel_matmul/ggml_gemv_q4_k", dtype::q4_k, 1},
+    {"parallel_matmul/ggml_gemv_q6_k", dtype::q6_k, 1},
+    {"parallel_matmul/ggml_gemm8_f32", dtype::f32, k_gemm_tokens},
+}};
+
+ggml_type ggml_type_for(const dtype type) {
+  switch (type) {
+    case dtype::q8_0:
+      return GGML_TYPE_Q8_0;
+    case dtype::q4_k:
+      return GGML_TYPE_Q4_K;
+    case dtype::q6_k:
+      return GGML_TYPE_Q6_K;
+    default:
+      return GGML_TYPE_F32;
+  }
+}
+
+// Reference lane: ggml computes the same logical [k_dim x k_dim] @ [k_dim x
+// tokens] matmul as one mul_mat node, parallelized by its own warm threadpool
+// over the same core budget as the EMEL lane (n_threads = k_lanes). Weight
+// bytes use the identical deterministic pattern as make_case; block layouts
+// (q8_0/q4_K/q6_K) are byte-compatible between the two implementations.
+struct ggml_matmul_reference {
+  ggml_context * ctx = nullptr;
+  ggml_cgraph * graph = nullptr;
+  ggml_threadpool * threadpool = nullptr;
+  std::vector<uint8_t> work = {};
+  ggml_cplan plan = {};
+  volatile float sink = 0.0f;
+
+  ggml_matmul_reference(const ggml_type type, const int32_t tokens) {
+    const size_t row_bytes = ggml_row_size(type, k_dim);
+    const size_t weight_bytes = row_bytes * static_cast<size_t>(k_dim);
+    ggml_init_params init{};
+    init.mem_size = weight_bytes +
+                    static_cast<size_t>(tokens) * static_cast<size_t>(k_dim) *
+                        2u * sizeof(float) +
+                    32u * 1024u * 1024u;
+    init.mem_buffer = nullptr;
+    init.no_alloc = false;
+    ctx = ggml_init(init);
+    if (ctx == nullptr) {
+      std::fprintf(stderr, "error: parallel matmul ggml reference init failed\n");
+      std::abort();
+    }
+    graph = ggml_new_graph(ctx);
+    ggml_tensor * weights = ggml_new_tensor_2d(ctx, type, k_dim, k_dim);
+    ggml_tensor * activation = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k_dim, tokens);
+    if (type == GGML_TYPE_F32) {
+      auto * values = static_cast<float *>(weights->data);
+      const size_t count = weight_bytes / sizeof(float);
+      for (size_t idx = 0; idx < count; ++idx) {
+        values[idx] = 0.25f * static_cast<float>((idx * 13u + 5u) % 17u) - 2.0f;
+      }
+    } else {
+      auto * bytes = static_cast<uint8_t *>(weights->data);
+      for (size_t idx = 0; idx < weight_bytes; ++idx) {
+        bytes[idx] = static_cast<uint8_t>((idx * 31u + 7u) & 0x3fu);
+      }
+    }
+    auto * input = static_cast<float *>(activation->data);
+    const size_t input_count = static_cast<size_t>(tokens) * static_cast<size_t>(k_dim);
+    for (size_t idx = 0; idx < input_count; ++idx) {
+      input[idx] = 0.125f * static_cast<float>((idx * 7u + 3u) % 19u) - 1.0f;
+    }
+    ggml_build_forward_expand(graph, ggml_mul_mat(ctx, weights, activation));
+    ggml_threadpool_params tp =
+        ggml_threadpool_params_default(static_cast<int32_t>(k_lanes));
+    tp.poll = 100;  // warm polling, matching EMEL's warm lane pool
+    threadpool = ggml_threadpool_new(&tp);
+    plan = ggml_graph_plan(graph, static_cast<int32_t>(k_lanes), threadpool);
+    work.resize(plan.work_size != 0u ? plan.work_size : 1u);
+    plan.work_data = work.data();
+  }
+
+  ~ggml_matmul_reference() {
+    if (threadpool != nullptr) {
+      ggml_threadpool_free(threadpool);
+    }
+    if (ctx != nullptr) {
+      ggml_free(ctx);
+    }
+  }
+
+  ggml_matmul_reference(const ggml_matmul_reference &) = delete;
+  ggml_matmul_reference & operator=(const ggml_matmul_reference &) = delete;
+
+  void run() noexcept {
+    if (ggml_graph_compute(graph, &plan) != GGML_STATUS_SUCCESS) {
+      std::fprintf(stderr, "error: parallel matmul ggml reference compute failed\n");
+      std::abort();
+    }
+    sink += 1.0f;
+  }
+};
+
+}  // namespace
+
+namespace emel::bench {
+
+void append_emel_parallel_matmul_cases(std::vector<result> & results, const config & cfg) {
+  static lane_fixture fixture;
+  volatile float sink = 0.0f;
+
+  const auto measure_parallel = [&](const bench_case_spec & spec) {
+    auto buffers = make_case(spec.type, spec.tokens);
+    std::array<op_mul_mat, k_lanes> lane_events = {};
+    constexpr uint64_t rows_per_lane = static_cast<uint64_t>(k_dim) / k_lanes;
+    for (size_t lane = 0; lane < k_lanes; ++lane) {
+      lane_events[lane] = make_sliced_event(
+          buffers.ev, static_cast<uint64_t>(lane) * rows_per_lane, rows_per_lane);
+    }
+
+    auto fn = [&]() {
+      std::array<bool, k_lanes> lane_ok = {};
+      lane_scheduler scheduler{fixture.pool};
+      lane_scheduler::join_group group{};
+      for (size_t lane = 1; lane < k_lanes; ++lane) {
+        auto & kernel = fixture.kernels[lane];
+        const auto & lane_ev = lane_events[lane];
+        auto & ok_flag = lane_ok[lane];
+        const bool submitted =
+            scheduler.try_submit(group, [&kernel, &lane_ev, &ok_flag]() noexcept {
+              ok_flag = kernel.process_event(lane_ev);
+            });
+        if (!submitted) {
+          ok_flag = kernel.process_event(lane_ev);
+        }
+      }
+      lane_ok[0] = fixture.kernels[0].process_event(lane_events[0]);
+      (void)group.wait();
+      bool all_ok = true;
+      for (const bool ok : lane_ok) {
+        all_ok = all_ok && ok;
+      }
+      sink += all_ok ? buffers.output[0] : -1.0f;
+    };
+    results.push_back(measure_case(spec.name, cfg, fn));
+  };
+
+  for (const auto & spec : k_cases) {
+    measure_parallel(spec);
+  }
+  for (const auto & spec : k_ggml_cases) {
+    measure_parallel(spec);
+  }
+}
+
+void append_reference_parallel_matmul_cases(std::vector<result> & results, const config & cfg) {
+  static emel::kernel::sm kernel;
+  kernel.set_kind(host_kernel_kind());
+  volatile float sink = 0.0f;
+
+  for (const auto & spec : k_cases) {
+    auto buffers = make_case(spec.type, spec.tokens);
+    auto fn = [&]() {
+      const bool ok = kernel.process_event(buffers.ev);
+      sink += ok ? buffers.output[0] : -1.0f;
+    };
+    results.push_back(measure_case(spec.name, cfg, fn));
+  }
+  for (const auto & spec : k_ggml_cases) {
+    auto fixture =
+        std::make_unique<ggml_matmul_reference>(ggml_type_for(spec.type), spec.tokens);
+    auto fn = [&fixture]() { fixture->run(); };
+    results.push_back(measure_case(spec.name, cfg, fn));
+  }
+}
+
+}  // namespace emel::bench
diff --git a/tools/decode_wavefront_eval/CMakeLists.txt b/tools/decode_wavefront_eval/CMakeLists.txt
new file mode 100644
index 00000000..7025b3b2
--- /dev/null
+++ b/tools/decode_wavefront_eval/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.20)
+
+project(emel_decode_wavefront_eval_project LANGUAGES C CXX)
+
+if(NOT DEFINED EMEL_ROOT_DIR)
+  message(FATAL_ERROR "EMEL_ROOT_DIR must point at the emel.cpp repository root")
+endif()
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_C_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+set(EMEL_ENABLE_TESTS OFF CACHE BOOL "" FORCE)
+add_subdirectory("${EMEL_ROOT_DIR}" emel_root)
+
+find_package(Threads REQUIRED)
+
+add_executable(emel_decode_wavefront_eval
+  main.cpp
+)
+
+target_link_libraries(emel_decode_wavefront_eval
+  PRIVATE
+    emel_core
+    emel
+    Threads::Threads
+)
diff --git a/tools/decode_wavefront_eval/main.cpp b/tools/decode_wavefront_eval/main.cpp
new file mode 100644
index 00000000..c961db50
--- /dev/null
+++ b/tools/decode_wavefront_eval/main.cpp
@@ -0,0 +1,714 @@
+// Decode-wavefront real-model eval.
+//
+// Measures what the decode-wavefront's scheduling mechanism (thread-pool
+// inter-op parallelism) does on REAL LFM2.5 decode: N independent generators of
+// the SAME loaded model (shared read-only weights, per-generator KV/activation
+// state) each run a real generate(). We compare running the N generate() calls
+// sequentially vs forking them across the wavefront's thread_pool_scheduler.
+//
+// Everything is driven through public state-machine process_event(...) calls;
+// no kernel detail.hpp/actions.hpp helper is touched. The load recipe is the
+// architecture-generic path (load_hparams_from_gguf), so it works for lfm2.
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <span>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "../bench/model_load_strategy.hpp"
+#include "emel/error/error.hpp"
+#include "emel/gguf/loader/any.hpp"
+#include "emel/gguf/loader/errors.hpp"
+#include "emel/gguf/loader/events.hpp"
+#include "emel/gguf/loader/sm.hpp"
+#include "emel/io/events.hpp"
+#include "emel/io/read/sm.hpp"
+#include "emel/io/staged_read/sm.hpp"
+#include "emel/io/source/any.hpp"
+#include "emel/logits/sampler/events.hpp"
+#include "emel/model/data.hpp"
+#include "emel/model/detail.hpp"
+#include "emel/model/loader/errors.hpp"
+#include "emel/model/loader/events.hpp"
+#include "emel/model/loader/sm.hpp"
+#include "emel/model/tensor/errors.hpp"
+#include "emel/model/tensor/events.hpp"
+#include "emel/model/tensor/sm.hpp"
+#include "emel/sm.hpp"
+#include "emel/text/conditioner/sm.hpp"
+#include "emel/text/formatter/format.hpp"
+#include "emel/text/generator/errors.hpp"
+#include "emel/text/generator/events.hpp"
+#include "emel/text/generator/sm.hpp"
+#include "emel/text/tokenizer/sm.hpp"
+
+namespace {
+
+// The wavefront's lane pool (same type as
+// src/emel/text/generator/decode_wavefront/context.hpp).
+using lane_pool = emel::policy::thread_pool_scheduler<8u, 16u, 128u>;
+using lane_scheduler = emel::policy::thread_pool_scheduler_ref<lane_pool>;
+
+constexpr size_t k_output_capacity = 8192u;
+
+struct gguf_capture {
+  bool probe_done = false;
+  bool probe_error = false;
+  bool bind_done = false;
+  bool bind_error = false;
+  bool parse_done = false;
+  bool parse_error = false;
+  emel::gguf::loader::requirements requirements = {};
+  emel::error::type err = emel::error::cast(emel::gguf::loader::error::none);
+};
+
+struct load_capture {
+  bool done = false;
+  bool error = false;
+  emel::error::type err = emel::error::cast(emel::model::loader::error::none);
+};
+
+struct initialize_capture {
+  bool done = false;
+  bool error = false;
+  emel::error::type err = emel::error::cast(emel::text::generator::error::none);
+};
+
+struct generation_capture {
+  bool done = false;
+  bool error = false;
+  emel::error::type err = emel::error::cast(emel::text::generator::error::none);
+  int32_t tokens_generated = 0;
+  size_t output_length = 0u;
+};
+
+struct emel_fixture {
+  std::unique_ptr<emel::model::data> model_data =
+      std::make_unique<emel::model::data>();
+  std::vector<uint8_t> file_bytes = {};
+  std::vector<uint8_t> kv_arena = {};
+  uint64_t gguf_tensor_data_bytes = 0u;
+  std::vector<uint8_t> read_copy_storage = {};
+  std::vector<emel::gguf::loader::kv_entry> kv_entries = {};
+  uint32_t gguf_tensor_count = 0u;
+  std::vector<emel::model::tensor::effect_request> effect_requests = {};
+  std::vector<emel::model::tensor::effect_result> effect_results = {};
+  std::vector<emel::io::event::tensor_load_span> io_load_spans = {};
+  emel::gguf::loader::sm gguf_loader = {};
+  emel::io::read::sm io_read = {};
+  emel::io::staged_read::sm io_staged_read = {};
+  emel::io::loader::sm io_loader{
+      {.io_read = &io_read, .io_staged_read = &io_staged_read}};
+  emel::model::tensor::sm tensor_loader = {};
+  emel::model::loader::sm model_loader = {};
+  gguf_capture gguf = {};
+  load_capture load = {};
+};
+
+// One independent decode lane: its own tokenizer/conditioner/generator and
+// output buffer, all referencing the single shared model_data.
+struct lane_session {
+  emel::text::tokenizer::sm tokenizer = {};
+  emel::text::conditioner::sm conditioner = {};
+  std::unique_ptr<emel::text::generator::sm> generator = {};
+  initialize_capture initialize = {};
+  generation_capture generation = {};
+  std::array<char, k_output_capacity> output = {};
+  size_t output_length = 0u;
+  std::string reference_text = {};
+  bool last_ok = false;
+};
+
+// ---- gguf / model-loader callbacks (architecture-generic) ----
+
+void on_probe_done(void *owner,
+                   const emel::gguf::loader::events::probe_done &ev) {
+  auto &f = *static_cast<emel_fixture *>(owner);
+  f.gguf.probe_done = true;
+  f.gguf.probe_error = false;
+  f.gguf.requirements = ev.requirements_out;
+}
+void on_probe_error(void *owner,
+                    const emel::gguf::loader::events::probe_error &ev) {
+  auto &f = *static_cast<emel_fixture *>(owner);
+  f.gguf.probe_error = true;
+  f.gguf.err = ev.err;
+}
+void on_bind_done(void *owner, const emel::gguf::loader::events::bind_done &) {
+  auto &f = *static_cast<emel_fixture *>(owner);
+  f.gguf.bind_done = true;
+  f.gguf.bind_error = false;
+}
+void on_bind_error(void *owner,
+                   const emel::gguf::loader::events::bind_error &ev) {
+  auto &f = *static_cast<emel_fixture *>(owner);
+  f.gguf.bind_error = true;
+  f.gguf.err = ev.err;
+}
+void on_parse_done(void *owner, const emel::gguf::loader::events::parse_done &) {
+  auto &f = *static_cast<emel_fixture *>(owner);
+  f.gguf.parse_done = true;
+  f.gguf.parse_error = false;
+}
+void on_parse_error(void *owner,
+                    const emel::gguf::loader::events::parse_error &ev) {
+  auto &f = *static_cast<emel_fixture *>(owner);
+  f.gguf.parse_error = true;
+  f.gguf.err = ev.err;
+}
+void on_load_done(void *owner,
+                  const emel::model::loader::events::load_done &) {
+  auto &f = *static_cast<emel_fixture *>(owner);
+  f.load.done = true;
+  f.load.error = false;
+  f.load.err = emel::error::cast(emel::model::loader::error::none);
+}
+void on_load_error(void *owner,
+                   const emel::model::loader::events::load_error &ev) {
+  auto &f = *static_cast<emel_fixture *>(owner);
+  f.load.error = true;
+  f.load.err = ev.err;
+}
+
+void on_initialize_done(void *owner,
+                        const emel::text::generator::events::initialize_done &) {
+  auto &s = *static_cast<lane_session *>(owner);
+  s.initialize.done = true;
+  s.initialize.error = false;
+}
+void on_initialize_error(
+    void *owner, const emel::text::generator::events::initialize_error &ev) {
+  auto &s = *static_cast<lane_session *>(owner);
+  s.initialize.error = true;
+  s.initialize.err = ev.err;
+}
+void on_generation_done(
+    void *owner, const emel::text::generator::events::generation_done &ev) {
+  auto &s = *static_cast<lane_session *>(owner);
+  s.generation.done = true;
+  s.generation.error = false;
+  s.generation.tokens_generated = ev.tokens_generated;
+  s.generation.output_length = ev.output_length;
+}
+void on_generation_error(
+    void *owner, const emel::text::generator::events::generation_error &ev) {
+  auto &s = *static_cast<lane_session *>(owner);
+  s.generation.error = true;
+  s.generation.err = ev.err;
+  s.generation.tokens_generated = ev.tokens_generated;
+  s.generation.output_length = ev.output_length;
+}
+
+bool tokenizer_bind_dispatch(void *tokenizer_sm,
+                             const emel::text::tokenizer::event::bind &ev) {
+  return static_cast<emel::text::tokenizer::sm *>(tokenizer_sm)
+      ->process_event(ev);
+}
+bool tokenizer_tokenize_dispatch(
+    void *tokenizer_sm, const emel::text::tokenizer::event::tokenize &ev) {
+  return static_cast<emel::text::tokenizer::sm *>(tokenizer_sm)
+      ->process_event(ev);
+}
+
+emel::error::type map_gguf_error(const emel::error::type err) {
+  using ge = emel::gguf::loader::error;
+  using me = emel::model::loader::error;
+  switch (err) {
+  case emel::error::cast(ge::none):
+    return emel::error::cast(me::none);
+  case emel::error::cast(ge::invalid_request):
+    return emel::error::cast(me::invalid_request);
+  case emel::error::cast(ge::model_invalid):
+    return emel::error::cast(me::model_invalid);
+  case emel::error::cast(ge::capacity):
+    return emel::error::cast(me::backend_error);
+  case emel::error::cast(ge::parse_failed):
+    return emel::error::cast(me::parse_failed);
+  case emel::error::cast(ge::internal_error):
+    return emel::error::cast(me::internal_error);
+  default:
+    return emel::error::cast(me::untracked);
+  }
+}
+
+bool copy_tensor_names(const std::span<const uint8_t> file_image,
+                       emel::model::data &model_data) {
+  model_data.name_bytes_used = 0u;
+  for (uint32_t index = 0u; index < model_data.n_tensors; ++index) {
+    auto &tensor = model_data.tensors[index];
+    const size_t name_offset = static_cast<size_t>(tensor.name_offset);
+    const size_t name_length = static_cast<size_t>(tensor.name_length);
+    if (name_offset + name_length > file_image.size() ||
+        model_data.name_bytes_used + name_length >
+            model_data.name_storage.size()) {
+      return false;
+    }
+    const uint32_t copied_offset = model_data.name_bytes_used;
+    if (name_length > 0u) {
+      std::memcpy(model_data.name_storage.data() + copied_offset,
+                  file_image.data() + name_offset, name_length);
+    }
+    model_data.name_bytes_used += static_cast<uint32_t>(name_length);
+    tensor.name_offset = copied_offset;
+  }
+  return true;
+}
+
+emel::model::detail::kv_binding kv_binding_from_fixture(
+    const emel_fixture &fixture) {
+  return emel::model::detail::kv_binding{
+      .arena = std::span<const uint8_t>{fixture.kv_arena.data(),
+                                        fixture.kv_arena.size()},
+      .entries = std::span<const emel::gguf::loader::kv_entry>{
+          fixture.kv_entries.data(), fixture.kv_entries.size()},
+  };
+}
+
+// Architecture-generic metadata: resolves architecture from the gguf and calls
+// the registered load_hparams (lfm2, qwen3, llama, ...). No model-family gate.
+emel::error::type populate_model_metadata(const emel_fixture &fixture,
+                                          emel::model::data &model_data) {
+  return emel::model::detail::load_hparams_from_gguf(
+             kv_binding_from_fixture(fixture), model_data)
+             ? emel::error::cast(emel::model::loader::error::none)
+             : emel::error::cast(emel::model::loader::error::model_invalid);
+}
+
+emel::error::type prebind_emel_gguf_storage(emel_fixture &fixture) {
+  if (fixture.file_bytes.empty()) {
+    return emel::error::cast(emel::model::loader::error::invalid_request);
+  }
+  const std::span<const uint8_t> file_image{fixture.file_bytes.data(),
+                                            fixture.file_bytes.size()};
+  fixture.gguf_tensor_count = 0u;
+  fixture.gguf = {};
+  emel::gguf::loader::requirements requirements = {};
+  const emel::gguf::loader::event::probe_done_fn probe_done_cb{&fixture,
+                                                               on_probe_done};
+  const emel::gguf::loader::event::probe_error_fn probe_error_cb{&fixture,
+                                                                 on_probe_error};
+  const emel::gguf::loader::event::probe probe_ev{file_image, requirements,
+                                                  probe_done_cb, probe_error_cb};
+  if (!fixture.gguf_loader.process_event(probe_ev) || !fixture.gguf.probe_done ||
+      fixture.gguf.probe_error) {
+    return map_gguf_error(fixture.gguf.err);
+  }
+  if (requirements.tensor_count >
+      static_cast<uint32_t>(emel::model::data::k_max_tensors)) {
+    return emel::error::cast(emel::model::loader::error::model_invalid);
+  }
+  const uint64_t arena_bytes =
+      emel::gguf::loader::required_kv_arena_bytes(requirements);
+  if (arena_bytes == std::numeric_limits<uint64_t>::max()) {
+    return emel::error::cast(emel::model::loader::error::backend_error);
+  }
+  fixture.kv_arena.resize(static_cast<size_t>(arena_bytes));
+  fixture.kv_entries.resize(requirements.kv_count);
+  fixture.gguf_tensor_count = requirements.tensor_count;
+  fixture.gguf_tensor_data_bytes = requirements.tensor_data_bytes;
+  return emel::error::cast(emel::model::loader::error::none);
+}
+
+emel::error::type run_emel_parse_model(void *owner,
+                                       const emel::model::loader::event::load &req) {
+  auto &fixture = *static_cast<emel_fixture *>(owner);
+  if (req.file_image == nullptr || req.file_size == 0u) {
+    return emel::error::cast(emel::model::loader::error::invalid_request);
+  }
+  const std::span<const uint8_t> file_image{
+      static_cast<const uint8_t *>(req.file_image),
+      static_cast<size_t>(req.file_size)};
+
+  fixture.gguf = {};
+  const emel::gguf::loader::event::bind_done_fn bind_done_cb{&fixture,
+                                                             on_bind_done};
+  const emel::gguf::loader::event::bind_error_fn bind_error_cb{&fixture,
+                                                               on_bind_error};
+  const emel::gguf::loader::event::bind_storage bind_ev{
+      std::span<uint8_t>{fixture.kv_arena},
+      std::span<emel::gguf::loader::kv_entry>{fixture.kv_entries},
+      std::span<emel::model::data::tensor_record>{req.model_data.tensors.data(),
+                                                  fixture.gguf_tensor_count},
+      bind_done_cb, bind_error_cb};
+  if (!fixture.gguf_loader.process_event(bind_ev) || !fixture.gguf.bind_done ||
+      fixture.gguf.bind_error) {
+    return map_gguf_error(fixture.gguf.err);
+  }
+
+  fixture.gguf = {};
+  const emel::gguf::loader::event::parse_done_fn parse_done_cb{&fixture,
+                                                               on_parse_done};
+  const emel::gguf::loader::event::parse_error_fn parse_error_cb{&fixture,
+                                                                 on_parse_error};
+  const emel::gguf::loader::event::parse parse_ev{file_image, parse_done_cb,
+                                                  parse_error_cb};
+  if (!fixture.gguf_loader.process_event(parse_ev) || !fixture.gguf.parse_done ||
+      fixture.gguf.parse_error) {
+    return map_gguf_error(fixture.gguf.err);
+  }
+
+  req.model_data.n_tensors = fixture.gguf_tensor_count;
+  if (!copy_tensor_names(file_image, req.model_data)) {
+    return emel::error::cast(emel::model::loader::error::backend_error);
+  }
+  return populate_model_metadata(fixture, req.model_data);
+}
+
+emel::error::type run_emel_map_layers(void *,
+                                      const emel::model::loader::event::load &req) {
+  int32_t max_block_index = -1;
+  for (uint32_t index = 0u; index < req.model_data.n_tensors; ++index) {
+    int32_t block_index = -1;
+    if (emel::model::try_parse_block_index(
+            emel::model::tensor_name_view(req.model_data,
+                                          req.model_data.tensors[index]),
+            block_index) &&
+        block_index > max_block_index) {
+      max_block_index = block_index;
+    }
+  }
+  if (max_block_index >= 0) {
+    req.model_data.n_layers = max_block_index + 1;
+    return emel::error::cast(emel::model::loader::error::none);
+  }
+  if (req.model_data.params.n_layer > 0) {
+    req.model_data.n_layers = req.model_data.params.n_layer;
+    return emel::error::cast(emel::model::loader::error::none);
+  }
+  return emel::error::cast(emel::model::loader::error::model_invalid);
+}
+
+emel::error::type run_emel_validate_structure(
+    void *, const emel::model::loader::event::load &req) {
+  if (req.model_data.n_tensors == 0u || req.model_data.n_layers <= 0 ||
+      req.model_data.weights_data == nullptr ||
+      req.model_data.weights_size == 0u) {
+    return emel::error::cast(emel::model::loader::error::model_invalid);
+  }
+  return emel::error::cast(emel::model::loader::error::none);
+}
+
+emel::error::type run_emel_validate_architecture(
+    void *, const emel::model::loader::event::load &req) {
+  return emel::model::validate_execution_contract(req.model_data);
+}
+
+emel::text::tokenizer::preprocessor::preprocessor_kind
+generation_preprocessor_variant(const emel::model::data &model_data) {
+  using pk = emel::text::tokenizer::preprocessor::preprocessor_kind;
+  using tm = emel::model::data::tokenizer_model;
+  switch (model_data.vocab_data.tokenizer_model_id) {
+  case tm::SPM:
+    return pk::spm;
+  case tm::BPE:
+    return pk::bpe;
+  case tm::WPM:
+    return pk::wpm;
+  case tm::UGM:
+    return pk::ugm;
+  case tm::RWKV:
+    return pk::rwkv;
+  case tm::PLAMO2:
+    return pk::plamo2;
+  default:
+    return pk::fallback;
+  }
+}
+
+emel::text::encoders::encoder_kind
+generation_encoder_variant(const emel::model::data &model_data) {
+  using ek = emel::text::encoders::encoder_kind;
+  using tm = emel::model::data::tokenizer_model;
+  switch (model_data.vocab_data.tokenizer_model_id) {
+  case tm::SPM:
+    return ek::spm;
+  case tm::BPE:
+    return ek::bpe;
+  case tm::WPM:
+    return ek::wpm;
+  case tm::UGM:
+    return ek::ugm;
+  case tm::RWKV:
+    return ek::rwkv;
+  case tm::PLAMO2:
+    return ek::plamo2;
+  default:
+    return ek::fallback;
+  }
+}
+
+bool prepare_emel_fixture(emel_fixture &fixture, const std::string &model_path) {
+  if (emel::io::source::load_file_bytes(model_path, fixture.file_bytes) !=
+      emel::error::cast(emel::io::read::error::none)) {
+    std::fprintf(stderr, "load: source file load failed (%s)\n",
+                 model_path.c_str());
+    return false;
+  }
+  if (prebind_emel_gguf_storage(fixture) !=
+      emel::error::cast(emel::model::loader::error::none)) {
+    std::fprintf(stderr, "load: prebind gguf failed\n");
+    return false;
+  }
+  fixture.load = {};
+  fixture.effect_requests.resize(emel::model::data::k_max_tensors);
+  fixture.effect_results.resize(emel::model::data::k_max_tensors);
+  fixture.io_load_spans.resize(emel::model::data::k_max_tensors);
+  emel::model::loader::event::parse_model_fn parse_model{&fixture,
+                                                         run_emel_parse_model};
+  emel::model::loader::event::load load_ev{*fixture.model_data, parse_model};
+  load_ev.model_path = model_path;
+  load_ev.file_image = fixture.file_bytes.data();
+  load_ev.file_size = fixture.file_bytes.size();
+  load_ev.tensor_loader = &fixture.tensor_loader;
+  load_ev.effect_requests = std::span{fixture.effect_requests};
+  load_ev.effect_results = std::span{fixture.effect_results};
+  load_ev.io_load_spans = std::span<emel::io::event::tensor_load_span>{
+      fixture.io_load_spans.data(), fixture.io_load_spans.size()};
+  emel::tools::bind_model_load_io_strategy(load_ev, fixture.io_loader);
+  if (load_ev.io_strategy == emel::io::loader::event::strategy_kind::read_copy ||
+      load_ev.io_strategy ==
+          emel::io::loader::event::strategy_kind::staged_read) {
+    fixture.read_copy_storage.resize(
+        static_cast<size_t>(fixture.gguf_tensor_data_bytes));
+    load_ev.read_copy_storage = std::span<uint8_t>{fixture.read_copy_storage};
+  }
+  load_ev.map_layers = {nullptr, run_emel_map_layers};
+  load_ev.validate_structure = {nullptr, run_emel_validate_structure};
+  load_ev.validate_architecture_impl = {nullptr, run_emel_validate_architecture};
+  load_ev.on_done = {&fixture, on_load_done};
+  load_ev.on_error = {&fixture, on_load_error};
+  if (!fixture.model_loader.process_event(load_ev) || !fixture.load.done ||
+      fixture.load.error) {
+    std::fprintf(stderr, "load: model_loader failed done=%d error=%d err=%d\n",
+                 fixture.load.done ? 1 : 0, fixture.load.error ? 1 : 0,
+                 fixture.load.err);
+    return false;
+  }
+  return true;
+}
+
+bool initialize_lane(lane_session &s, const emel::model::data &model,
+                     int32_t prompt_capacity, int32_t tokens) {
+  const int32_t decode_capacity = std::max<int32_t>(4, tokens);
+  const int32_t block_capacity =
+      std::max<int32_t>(8, prompt_capacity + decode_capacity + 4);
+  s.initialize = {};
+  emel::error::type error_out =
+      emel::error::cast(emel::text::generator::error::none);
+  emel::text::generator::event::initialize request{
+      &s.tokenizer, tokenizer_bind_dispatch, tokenizer_tokenize_dispatch,
+      std::span<emel::logits::sampler::fn>{}};
+  request.preprocessor_variant = generation_preprocessor_variant(model);
+  request.encoder_variant = generation_encoder_variant(model);
+  request.add_special = false;
+  request.parse_special = false;
+  request.selection_mode =
+      emel::text::generator::selection_mode::preselected_argmax;
+  request.max_prompt_tokens = prompt_capacity;
+  request.max_generated_tokens = decode_capacity;
+  request.max_blocks = block_capacity;
+  request.block_tokens = 16;
+  request.strip_leading_space = false;
+  request.error_out = &error_out;
+  request.on_done = {&s, on_initialize_done};
+  request.on_error = {&s, on_initialize_error};
+  const bool accepted = s.generator->process_event(request);
+  return accepted && s.initialize.done && !s.initialize.error &&
+         error_out == emel::error::cast(emel::text::generator::error::none);
+}
+
+bool run_generate(lane_session &s, const std::string_view prompt,
+                  int32_t tokens) {
+  s.generation = {};
+  s.output_length = 0u;
+  emel::error::type error_out =
+      emel::error::cast(emel::text::generator::error::none);
+  std::array<emel::text::formatter::chat_message, 1> messages = {
+      {{.role = "user", .content = prompt}}};
+  emel::text::generator::event::generate request{
+      std::span<const emel::text::formatter::chat_message>{messages}, tokens,
+      std::span<char>{s.output}, s.output_length};
+  request.add_generation_prompt = false;
+  request.enable_thinking = false;
+  request.error_out = &error_out;
+  request.on_done = {&s, on_generation_done};
+  request.on_error = {&s, on_generation_error};
+  const bool accepted = s.generator->process_event(request);
+  s.last_ok = accepted && s.generation.done && !s.generation.error &&
+              error_out == emel::error::cast(emel::text::generator::error::none);
+  return s.last_ok;
+}
+
+std::string lane_output_text(const lane_session &s) {
+  return std::string{s.output.data(), s.generation.output_length};
+}
+
+void run_sequential(const std::span<std::unique_ptr<lane_session>> active,
+                    const std::string_view prompt, int32_t tokens) {
+  for (auto &s : active) {
+    run_generate(*s, prompt, tokens);
+  }
+}
+
+void run_parallel(lane_pool &pool,
+                  const std::span<std::unique_ptr<lane_session>> active,
+                  const std::string_view prompt, int32_t tokens) {
+  lane_scheduler scheduler{pool};
+  lane_scheduler::join_group group{};
+  for (auto &s : active) {
+    lane_session *lane = s.get();
+    (void)scheduler.try_submit(
+        group, [lane, prompt, tokens]() noexcept {
+          run_generate(*lane, prompt, tokens);
+        });
+  }
+  (void)group.wait();
+}
+
+double ns_per_pass(const std::chrono::steady_clock::time_point t0,
+                   const std::chrono::steady_clock::time_point t1,
+                   int32_t iters) {
+  const double ns =
+      std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
+  return ns / static_cast<double>(iters);
+}
+
+}  // namespace
+
+int main(int argc, char **argv) {
+  if (argc < 2) {
+    std::fprintf(stderr,
+                 "Usage: %s <model_path> [max_lanes=8] [tokens=32] [iters=20]\n",
+                 argc > 0 ? argv[0] : "emel_decode_wavefront_eval");
+    return 1;
+  }
+  const std::string model_path = argv[1];
+  const int32_t max_lanes =
+      argc > 2 ? std::clamp(static_cast<int32_t>(std::atoi(argv[2])), 1, 8) : 8;
+  const int32_t tokens =
+      argc > 3 ? std::max(1, std::atoi(argv[3])) : 32;
+  const int32_t iters = argc > 4 ? std::max(1, std::atoi(argv[4])) : 20;
+  constexpr std::string_view k_prompt =
+      "The history of artificial intelligence began";
+  const int32_t prompt_capacity = 64;
+
+  auto fixture = std::make_unique<emel_fixture>();
+  if (!prepare_emel_fixture(*fixture, model_path)) {
+    std::fprintf(stderr, "FAILED: prepare_emel_fixture\n");
+    return 1;
+  }
+  if (!emel::model::detail::load_vocab_from_gguf(
+          kv_binding_from_fixture(*fixture), fixture->model_data->vocab_data)) {
+    std::fprintf(stderr, "FAILED: load_vocab_from_gguf\n");
+    return 1;
+  }
+  fixture->model_data->params.n_vocab =
+      static_cast<int32_t>(fixture->model_data->vocab_data.n_tokens);
+
+  const emel::model::data &shared_model = *fixture->model_data;
+  std::printf("# model=%s arch=%.*s n_layer=%d n_embd=%d n_head=%d "
+              "n_head_kv=%d n_vocab=%d\n",
+              model_path.c_str(),
+              static_cast<int>(emel::model::architecture_name_view(shared_model)
+                                   .size()),
+              emel::model::architecture_name_view(shared_model).data(),
+              shared_model.params.n_layer, shared_model.params.n_embd,
+              shared_model.params.n_head, shared_model.params.n_head_kv,
+              shared_model.params.n_vocab);
+
+  // Build N independent generators sharing the one loaded model.
+  std::vector<std::unique_ptr<lane_session>> sessions;
+  sessions.reserve(static_cast<size_t>(max_lanes));
+  for (int32_t lane = 0; lane < max_lanes; ++lane) {
+    auto s = std::make_unique<lane_session>();
+    s->generator = std::make_unique<emel::text::generator::sm>(
+        shared_model, s->conditioner, nullptr,
+        emel::text::formatter::format_raw);
+    if (!initialize_lane(*s, shared_model, prompt_capacity, tokens)) {
+      std::fprintf(stderr, "FAILED: initialize_lane lane=%d\n", lane);
+      return 1;
+    }
+    sessions.push_back(std::move(s));
+  }
+
+  // Single-lane decode confirmation.
+  if (!run_generate(*sessions[0], k_prompt, tokens)) {
+    std::fprintf(stderr, "FAILED: single-lane generate\n");
+    return 1;
+  }
+  std::printf("# single_lane_ok tokens_generated=%d output=\"%.*s\"\n",
+              sessions[0]->generation.tokens_generated,
+              static_cast<int>(std::min<size_t>(
+                  sessions[0]->generation.output_length, 200u)),
+              sessions[0]->output.data());
+  std::printf("# prompt=\"%.*s\" tokens=%d iters=%d threads(pool)=8\n",
+              static_cast<int>(k_prompt.size()), k_prompt.data(), tokens,
+              iters);
+  std::printf("# (decode timing excludes model load + generator init; "
+              "model weights shared read-only across lanes)\n");
+
+  lane_pool pool;
+  const std::array<int32_t, 4> lane_counts = {1, 2, 4, 8};
+  for (const int32_t n : lane_counts) {
+    if (n > max_lanes) {
+      continue;
+    }
+    const std::span<std::unique_ptr<lane_session>> active{sessions.data(),
+                                                          static_cast<size_t>(n)};
+    // Warmup.
+    run_sequential(active, k_prompt, tokens);
+    // Reference outputs (sequential).
+    run_sequential(active, k_prompt, tokens);
+    for (auto &s : active) {
+      s->reference_text = lane_output_text(*s);
+    }
+    bool all_ok = true;
+    for (auto &s : active) {
+      all_ok = all_ok && s->last_ok && s->generation.tokens_generated > 0;
+    }
+
+    const auto seq0 = std::chrono::steady_clock::now();
+    for (int32_t i = 0; i < iters; ++i) {
+      run_sequential(active, k_prompt, tokens);
+    }
+    const auto seq1 = std::chrono::steady_clock::now();
+
+    const auto par0 = std::chrono::steady_clock::now();
+    for (int32_t i = 0; i < iters; ++i) {
+      run_parallel(pool, active, k_prompt, tokens);
+    }
+    const auto par1 = std::chrono::steady_clock::now();
+
+    // Determinism: parallel outputs must match the sequential reference.
+    bool deterministic = true;
+    for (auto &s : active) {
+      deterministic = deterministic && s->last_ok &&
+                      lane_output_text(*s) == s->reference_text;
+    }
+
+    const double seq_ns = ns_per_pass(seq0, seq1, iters);
+    const double par_ns = ns_per_pass(par0, par1, iters);
+    const double seq_ms = seq_ns / 1.0e6;
+    const double par_ms = par_ns / 1.0e6;
+    const double speedup = par_ns > 0.0 ? seq_ns / par_ns : 0.0;
+    const double total_tokens = static_cast<double>(n) * tokens;
+    const double seq_tok_s = total_tokens / (seq_ns / 1.0e9);
+    const double par_tok_s = total_tokens / (par_ns / 1.0e9);
+    std::printf("lanes=%d tokens=%d seq_ms=%.3f par_ms=%.3f speedup=%.2fx "
+                "seq_tok_s=%.1f par_tok_s=%.1f deterministic=%s%s\n",
+                n, tokens, seq_ms, par_ms, speedup, seq_tok_s, par_tok_s,
+                deterministic ? "yes" : "no", all_ok ? "" : " [GEN_FAIL]");
+  }
+  return 0;
+}
diff --git a/tools/generation_fixture_registry.hpp b/tools/generation_fixture_registry.hpp
index 8599c21b..79d38ccc 100644
--- a/tools/generation_fixture_registry.hpp
+++ b/tools/generation_fixture_registry.hpp
@@ -26,9 +26,17 @@ inline constexpr maintained_fixture k_lfm2_generation_fixture = {
     .current_publication = true,
 };
 
-inline constexpr std::array<maintained_fixture, 2> k_maintained_generation_fixtures = {
+inline constexpr maintained_fixture k_lfm2_230m_generation_fixture = {
+    .name = "LFM2.5-230M-Q8_0.gguf",
+    .slug = "lfm2_5_230m_q8_0",
+    .fixture_rel = "tests/models/LFM2.5-230M-Q8_0.gguf",
+    .current_publication = false,
+};
+
+inline constexpr std::array<maintained_fixture, 3> k_maintained_generation_fixtures = {
     k_qwen3_generation_fixture,
     k_lfm2_generation_fixture,
+    k_lfm2_230m_generation_fixture,
 };
 
 }  // namespace emel::tools::generation_fixture_registry
diff --git a/tools/generation_formatter_contract.hpp b/tools/generation_formatter_contract.hpp
index 7d0ebf88..2316090e 100644
--- a/tools/generation_formatter_contract.hpp
+++ b/tools/generation_formatter_contract.hpp
@@ -48,6 +48,29 @@ inline constexpr std::string_view k_unsupported_template_contract =
     "add_generation_prompt=true enable_thinking=false keep_past_thinking=false "
     "bos=<|startoftext|>";
 
+// LFM2.5 template revision that renames keep_past_thinking to
+// preserve_thinking (e.g. LFM2.5-230M); simple system/user chats render
+// byte-identically to the keep_past_thinking revision, so both bind the same
+// formatter. The parity lane's token-identical check guards this equivalence.
+inline constexpr std::string_view k_supported_preserve_thinking_contract =
+    "source=tokenizer.chat_template support=supported_contract "
+    "shape=structured_chat_messages_v1 roles=system,user tools=none "
+    "add_generation_prompt=true enable_thinking=false preserve_thinking=false "
+    "bos=<|startoftext|>";
+
+inline constexpr std::array<std::string_view, 9>
+    k_supported_preserve_thinking_template_markers = {
+        "{{- bos_token -}}",
+        "preserve_thinking",
+        "messages[0][\"role\"] == \"system\"",
+        "\"List of tools: [\"",
+        "message.role == \"assistant\"",
+        "</think>",
+        "add_generation_prompt",
+        "<|im_start|>assistant\\n",
+        "<|im_start|>system\\n",
+};
+
 inline constexpr std::array<std::string_view, 9> k_supported_primary_template_markers = {
     "{{- bos_token -}}",
     "keep_past_thinking",
@@ -381,6 +404,16 @@ inline bool template_matches_supported_contract(
   return true;
 }
 
+inline bool template_matches_supported_preserve_thinking_contract(
+    const std::string_view primary_template) noexcept {
+  for (const std::string_view marker : k_supported_preserve_thinking_template_markers) {
+    if (primary_template.find(marker) == std::string_view::npos) {
+      return false;
+    }
+  }
+  return true;
+}
+
 inline bool template_matches_supported_qwen_contract(
     const std::string_view primary_template) noexcept {
   for (const std::string_view marker : k_supported_qwen_primary_template_markers) {
@@ -423,6 +456,14 @@ inline formatter_binding resolve_primary_template_binding(
         .contract = k_supported_contract,
     };
   }
+  if (template_matches_supported_preserve_thinking_contract(primary_template)) {
+    return formatter_binding{
+        .formatter_ctx = &k_supported_formatter_sentinel,
+        .format_prompt = format_supported_contract,
+        .support = support_kind::supported_contract,
+        .contract = k_supported_preserve_thinking_contract,
+    };
+  }
   if (template_matches_supported_qwen_contract(primary_template)) {
     return formatter_binding{
         .formatter_ctx = &k_supported_qwen_formatter_sentinel,
@@ -506,6 +547,12 @@ inline reference_formatter_info resolve_reference_formatter_info(
     return formatter;
   }
 
+  if (template_matches_supported_preserve_thinking_contract(formatter.primary_template)) {
+    formatter.support = support_kind::supported_contract;
+    formatter.contract = k_supported_preserve_thinking_contract;
+    return formatter;
+  }
+
   if (template_matches_supported_qwen_contract(formatter.primary_template)) {
     formatter.support = support_kind::supported_contract;
     formatter.contract = k_supported_qwen_contract;
@@ -538,7 +585,9 @@ inline size_t formatted_capacity_upper_bound(
     const std::span<const emel::text::formatter::chat_message> messages,
     const bool add_generation_prompt) noexcept {
   size_t capacity = 0u;
-  if (binding.contract == k_supported_contract || binding.contract == k_no_template_contract) {
+  if (binding.contract == k_supported_contract ||
+      binding.contract == k_supported_preserve_thinking_contract ||
+      binding.contract == k_no_template_contract) {
     capacity += k_bos.size();
   }
   if (binding.contract == k_supported_gemma4_contract) {
diff --git a/tools/paritychecker/CMakeLists.txt b/tools/paritychecker/CMakeLists.txt
index 0765340e..f76450ec 100644
--- a/tools/paritychecker/CMakeLists.txt
+++ b/tools/paritychecker/CMakeLists.txt
@@ -43,6 +43,52 @@ if(PARITYCHECKER_REFERENCE_REF_VALUE STREQUAL "")
   set(PARITYCHECKER_REFERENCE_REF_VALUE "${REF_IMPL_REF}")
 endif()
 
+function(emel_patch_reference_metadata_logging reference_source_dir)
+  set(loader_path "${reference_source_dir}/src/llama-model-loader.cpp")
+  if(NOT EXISTS "${loader_path}")
+    message(FATAL_ERROR "Missing reference loader source at ${loader_path}")
+  endif()
+  file(READ "${loader_path}" loader_source)
+  set(patched_marker "            std::string value          = type == GGUF_TYPE_ARRAY")
+  string(FIND "${loader_source}" "${patched_marker}" patch_pos)
+  if(patch_pos EQUAL -1)
+    set(original_line "            std::string value          = gguf_kv_to_str(meta.get(), i);\n            const size_t MAX_VALUE_LEN = 40;")
+    set(patched_line "            std::string value          = type == GGUF_TYPE_ARRAY\n                ? \"<array>\"\n                : gguf_kv_to_str(meta.get(), i);\n            const size_t MAX_VALUE_LEN = 40;")
+    string(FIND "${loader_source}" "${original_line}" original_pos)
+    if(original_pos EQUAL -1)
+      set(original_line "            std::string value          = gguf_kv_to_str(metadata, i);\n            const size_t MAX_VALUE_LEN = 40;")
+      set(patched_line "            std::string value          = type == GGUF_TYPE_ARRAY\n                ? \"<array>\"\n                : gguf_kv_to_str(metadata, i);\n            const size_t MAX_VALUE_LEN = 40;")
+      string(FIND "${loader_source}" "${original_line}" original_pos)
+      if(original_pos EQUAL -1)
+        message(FATAL_ERROR
+          "Unable to patch reference loader metadata logging in ${loader_path}")
+      endif()
+    endif()
+    string(REPLACE "${original_line}" "${patched_line}" loader_source "${loader_source}")
+    file(WRITE "${loader_path}" "${loader_source}")
+  endif()
+
+  set(llama_path "${reference_source_dir}/src/llama.cpp")
+  if(NOT EXISTS "${llama_path}")
+    message(FATAL_ERROR "Missing reference llama source at ${llama_path}")
+  endif()
+  file(READ "${llama_path}" llama_source)
+  set(original_print "        model.load_stats(ml);\n        model.print_info();")
+  set(patched_print "        model.load_stats(ml);\n        // EMEL parity disables reference metadata-only logging here; decode remains unchanged.")
+  string(FIND "${llama_source}" "${patched_print}" print_patch_pos)
+  if(print_patch_pos EQUAL -1)
+    string(FIND "${llama_source}" "${original_print}" print_original_pos)
+    if(print_original_pos EQUAL -1)
+      message(FATAL_ERROR
+        "Unable to patch reference model info logging in ${llama_path}")
+    endif()
+    string(REPLACE "${original_print}" "${patched_print}" llama_source "${llama_source}")
+    file(WRITE "${llama_path}" "${llama_source}")
+  endif()
+endfunction()
+
+emel_patch_reference_metadata_logging("${reference_impl_SOURCE_DIR}")
+
 set(LLAMA_ALL_WARNINGS OFF CACHE BOOL "" FORCE)
 set(LLAMA_FATAL_WARNINGS OFF CACHE BOOL "" FORCE)
 include(${reference_impl_SOURCE_DIR}/ggml/cmake/common.cmake)
@@ -182,6 +228,7 @@ target_include_directories(paritychecker
     ${reference_impl_SOURCE_DIR}/src
     ${reference_impl_SOURCE_DIR}/ggml/include
     ${reference_impl_SOURCE_DIR}/include
+    ${reference_impl_SOURCE_DIR}/vendor
 )
 
 target_compile_definitions(paritychecker
@@ -217,6 +264,7 @@ target_include_directories(paritychecker_tests
     ${reference_impl_SOURCE_DIR}/src
     ${reference_impl_SOURCE_DIR}/ggml/include
     ${reference_impl_SOURCE_DIR}/include
+    ${reference_impl_SOURCE_DIR}/vendor
     ${DOCTEST_INCLUDE_DIR}
 )
 
diff --git a/tools/paritychecker/parity_engines.cpp b/tools/paritychecker/parity_engines.cpp
index 357b1ef3..a5df3ffa 100644
--- a/tools/paritychecker/parity_engines.cpp
+++ b/tools/paritychecker/parity_engines.cpp
@@ -918,6 +918,8 @@ struct reference_backend {
   int32_t vocab_size = 0;
   emel::tools::generation_formatter_contract::reference_formatter_info
       formatter = {};
+  emel::tools::generation_formatter_contract::formatter_binding
+      formatter_binding = {};
   int32_t emel_reference_decode_calls = 0;
   int32_t emel_reference_logits_calls = 0;
   int32_t direct_reference_decode_calls = 0;
@@ -1070,6 +1072,20 @@ bool quantized_contract_matches(const quantized_contract_summary &lhs,
          lhs.explicit_no_claim == rhs.explicit_no_claim;
 }
 
+bool audit_has_native_quantized_tensor_type(
+    const emel::model::llama::quantized_path_audit &audit,
+    const emel::kernel::event::dtype dtype) {
+  const int32_t tensor_type = static_cast<int32_t>(dtype);
+  for (const auto &stage : audit.stages) {
+    if (stage.tensor_type == tensor_type &&
+        stage.contract ==
+            emel::model::llama::quantized_contract_kind::native_quantized) {
+      return true;
+    }
+  }
+  return false;
+}
+
 emel::error::type sampler_select_argmax(generation_load_state &state,
                                         int32_t &candidate_ids,
                                         float &candidate_scores,
@@ -1687,9 +1703,8 @@ bool tokenize_reference_prompt(const reference_backend &backend,
   }
 
   std::string formatted_prompt = {};
-  if (!emel::tools::generation_formatter_contract::
-          format_reference_single_user_prompt(backend.formatter, opts.text,
-                                              formatted_prompt)) {
+  if (!emel::tools::generation_formatter_contract::format_single_user_prompt(
+          backend.formatter_binding, opts.text, formatted_prompt)) {
     return false;
   }
 
@@ -1911,16 +1926,27 @@ bool emel_token_is_stop(const emel::model::data::vocab &vocab,
   return token_id == vocab.eos_id || token_id == vocab.eot_id;
 }
 
-llama_context_ptr make_reference_context(reference_backend &backend) {
+uint32_t reference_context_capacity(const size_t prompt_token_count,
+                                    const int32_t max_tokens) noexcept {
+  constexpr uint32_t k_min_reference_context_tokens = 512u;
+  const uint64_t requested =
+      static_cast<uint64_t>(prompt_token_count) +
+      static_cast<uint64_t>(std::max(0, max_tokens)) + 1u;
+  const uint64_t bounded =
+      std::min<uint64_t>(requested, std::numeric_limits<uint32_t>::max());
+  return std::max<uint32_t>(k_min_reference_context_tokens,
+                            static_cast<uint32_t>(bounded));
+}
+
+llama_context_ptr make_reference_context(reference_backend &backend,
+                                         const uint32_t context_tokens = 512u) {
   llama_context_params context_params = llama_context_default_params();
+  const uint32_t reference_context_tokens =
+      std::max<uint32_t>(512u, context_tokens);
   context_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
-  context_params.n_ctx = 0;
-  const int32_t batch_capacity =
-      backend.model != nullptr
-          ? std::max(512, llama_model_n_ctx_train(backend.model.get()))
-          : 512;
-  context_params.n_batch = batch_capacity;
-  context_params.n_ubatch = batch_capacity;
+  context_params.n_ctx = reference_context_tokens;
+  context_params.n_batch = reference_context_tokens;
+  context_params.n_ubatch = reference_context_tokens;
   context_params.n_seq_max = 1;
   context_params.n_threads = 1;
   context_params.n_threads_batch = 1;
@@ -1942,16 +1968,18 @@ run_reference_generate(reference_backend &backend,
     return emel::error::cast(emel::text::generator::error::invalid_request);
   }
 
-  llama_context_ptr ctx = make_reference_context(backend);
-  if (ctx == nullptr) {
-    return emel::error::cast(emel::text::generator::error::backend);
-  }
-
   std::vector<llama_token> prompt_tokens;
   if (!tokenize_reference_prompt(backend, opts, prompt_tokens)) {
     return emel::error::cast(emel::text::generator::error::invalid_request);
   }
 
+  llama_context_ptr ctx = make_reference_context(
+      backend,
+      reference_context_capacity(prompt_tokens.size(), opts.max_tokens));
+  if (ctx == nullptr) {
+    return emel::error::cast(emel::text::generator::error::backend);
+  }
+
   llama_batch prompt_batch = llama_batch_get_one(
       prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
   if (run_direct_reference_decode(backend, ctx.get(), prompt_batch) != 0) {
@@ -17548,7 +17576,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
     emel::kernel::event::op_dup ev{
         .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17571,7 +17598,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
         .src0 = make_src_view(lhs.data(), static_cast<uint64_t>(k_vec_len)),
         .src1 = make_src_view(rhs.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17595,7 +17621,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
         .src0 = make_src_view(lhs.data(), static_cast<uint64_t>(k_vec_len)),
         .src1 = make_src_view(rhs.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17619,7 +17644,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
         .src0 = make_src_view(lhs.data(), static_cast<uint64_t>(k_vec_len)),
         .src1 = make_src_view(rhs.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17643,7 +17667,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
         .src0 = make_src_view(lhs.data(), static_cast<uint64_t>(k_vec_len)),
         .src1 = make_src_view(rhs.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17665,7 +17688,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
     emel::kernel::event::op_sqr ev{
         .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17686,7 +17708,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
     emel::kernel::event::op_sqrt ev{
         .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17707,7 +17728,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
     emel::kernel::event::op_log ev{
         .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17728,7 +17748,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
     emel::kernel::event::op_sin ev{
         .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17749,7 +17768,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
     emel::kernel::event::op_cos ev{
         .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17775,7 +17793,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
         .dst = make_dst_view(emel_out.data(),
                              static_cast<uint64_t>(k_softmax_width),
                              static_cast<uint64_t>(k_softmax_rows)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17806,7 +17823,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
                               static_cast<uint64_t>(k_mm_k)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_mm_n),
                              static_cast<uint64_t>(k_mm_m)),
-        .nth = 1,
     };
     std::vector<float> ggml_out;
     if (!exec(ev)) {
@@ -17825,7 +17841,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
     emel::kernel::event::op_unary ev{
         .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
         .subop = emel::kernel::event::unary_subop::neg,
     };
     std::vector<float> ggml_out;
@@ -17848,7 +17863,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
     emel::kernel::event::op_unary ev{
         .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
         .subop = emel::kernel::event::unary_subop::relu,
     };
     std::vector<float> ggml_out;
@@ -17871,7 +17885,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
     emel::kernel::event::op_unary ev{
         .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
         .subop = emel::kernel::event::unary_subop::exp,
     };
     std::vector<float> ggml_out;
@@ -17894,7 +17907,6 @@ bool run_backend_kernel_parity(const char *backend, exec_fn exec) {
     emel::kernel::event::op_sum ev{
         .src0 = make_src_view(src.data(), static_cast<uint64_t>(k_vec_len)),
         .dst = make_dst_view(emel_out.data(), static_cast<uint64_t>(k_vec_len)),
-        .nth = 1,
     };
     if (exec(ev)) {
       fail("op_sum", "expected unsupported op to be rejected");
@@ -18192,6 +18204,7 @@ int run_generation_harness_contract(
                  fixture->name.data());
     return 1;
   }
+  state.reference.formatter_binding = state.formatter_binding;
 
   const emel::error::type initialize_err =
       run_emel_initialize_generator(state, opts);
@@ -18311,11 +18324,29 @@ int run_generation_harness_contract(
       generation_diagnostics.shared_q6_dispatch_calls;
   const auto runtime_contract =
       runtime_quantized_contract_summary(generation_diagnostics);
-  if (generation_kernel_kind == emel::kernel::kernel_kind::aarch64 &&
+  emel::model::llama::quantized_path_audit audit_view{};
+  const bool audit_available =
+      build_execution_audit(*state.model_data, audit_view);
+  const bool native_q2_k =
+      audit_available &&
+      audit_has_native_quantized_tensor_type(
+          audit_view, emel::kernel::event::dtype::q2_k);
+  const bool native_q3_k =
+      audit_available &&
+      audit_has_native_quantized_tensor_type(
+          audit_view, emel::kernel::event::dtype::q3_k);
+  const bool native_q6_k =
+      audit_available &&
+      audit_has_native_quantized_tensor_type(
+          audit_view, emel::kernel::event::dtype::q6_k);
+  const bool optimized_flash_kernel =
+      generation_kernel_kind == emel::kernel::kernel_kind::aarch64 ||
+      generation_kernel_kind == emel::kernel::kernel_kind::x86_64;
+  if (optimized_flash_kernel &&
       (flash_dispatch_calls == 0u || optimized_flash_dispatch_calls == 0u ||
        shared_flash_dispatch_calls != 0u)) {
     std::fprintf(stderr,
-                 "generation flash proof failed (fixture=%s kernel_kind=%s "
+                 "generation optimized flash proof failed (fixture=%s kernel_kind=%s "
                  "flash_dispatch_calls=%" PRIu64
                  " optimized_flash_dispatch_calls=%" PRIu64
                  " shared_flash_dispatch_calls=%" PRIu64 ")\n",
@@ -18325,7 +18356,7 @@ int run_generation_harness_contract(
     dump_generation_failure_surface(state, &emel_result, nullptr, opts);
     return 1;
   }
-  if (generation_kernel_kind != emel::kernel::kernel_kind::aarch64 &&
+  if (!optimized_flash_kernel &&
       (optimized_flash_dispatch_calls != 0u ||
        shared_flash_dispatch_calls != 0u)) {
     std::fprintf(stderr,
@@ -18338,9 +18369,23 @@ int run_generation_harness_contract(
     dump_generation_failure_surface(state, &emel_result, nullptr, opts);
     return 1;
   }
-  if (optimized_q2_dispatch_calls != 0u || shared_q2_dispatch_calls != 0u ||
-      optimized_q3_dispatch_calls != 0u || shared_q3_dispatch_calls != 0u ||
-      shared_q6_dispatch_calls != 0u) {
+  const bool x86_quantized_proof_failed =
+      generation_kernel_kind == emel::kernel::kernel_kind::x86_64 &&
+      ((native_q2_k && optimized_q2_dispatch_calls == 0u) ||
+       (!native_q2_k && optimized_q2_dispatch_calls != 0u) ||
+       shared_q2_dispatch_calls != 0u ||
+       (native_q3_k && optimized_q3_dispatch_calls == 0u) ||
+       (!native_q3_k && optimized_q3_dispatch_calls != 0u) ||
+       shared_q3_dispatch_calls != 0u ||
+       (native_q6_k && optimized_q6_dispatch_calls == 0u) ||
+       (!native_q6_k && optimized_q6_dispatch_calls != 0u) ||
+       shared_q6_dispatch_calls != 0u);
+  const bool non_x86_quantized_proof_failed =
+      generation_kernel_kind != emel::kernel::kernel_kind::x86_64 &&
+      (optimized_q2_dispatch_calls != 0u || shared_q2_dispatch_calls != 0u ||
+       optimized_q3_dispatch_calls != 0u || shared_q3_dispatch_calls != 0u ||
+       shared_q6_dispatch_calls != 0u);
+  if (x86_quantized_proof_failed || non_x86_quantized_proof_failed) {
     std::fprintf(
         stderr,
         "generation quantized dispatch proof failed (fixture=%s kernel_kind=%s "
@@ -18372,8 +18417,7 @@ int run_generation_harness_contract(
     return 1;
   }
 
-  emel::model::llama::quantized_path_audit audit_view{};
-  if (build_execution_audit(*state.model_data, audit_view)) {
+  if (audit_available) {
     const auto audit_contract = build_quantized_contract_summary(audit_view);
     if (!quantized_contract_matches(runtime_contract, audit_contract)) {
       std::fprintf(
diff --git a/tools/paritychecker/paritychecker_tests.cpp b/tools/paritychecker/paritychecker_tests.cpp
index b079fba6..7833ff55 100644
--- a/tools/paritychecker/paritychecker_tests.cpp
+++ b/tools/paritychecker/paritychecker_tests.cpp
@@ -13,8 +13,6 @@
 
 #include <doctest/doctest.h>
 
-#include "llama.h"
-
 #include "../generation_fixture_registry.hpp"
 #include "../generation_formatter_contract.hpp"
 #include "emel/io/source/any.hpp"
@@ -90,25 +88,6 @@ bool file_exists(const std::filesystem::path &path) {
   return true;
 }
 
-struct llama_backend_guard {
-  llama_backend_guard() { llama_backend_init(); }
-
-  ~llama_backend_guard() { llama_backend_free(); }
-};
-
-bool reference_tokenizer_lane_supported(
-    const std::filesystem::path &model_path) {
-  llama_backend_guard backend_guard{};
-  llama_model_params model_params = llama_model_default_params();
-  model_params.vocab_only = true;
-  model_params.check_tensors = false;
-
-  std::unique_ptr<llama_model, decltype(&llama_model_free)> model(
-      llama_model_load_from_file(model_path.string().c_str(), model_params),
-      llama_model_free);
-  return model != nullptr && llama_model_get_vocab(model.get()) != nullptr;
-}
-
 std::filesystem::path maintained_generation_fixture_path(
     const emel::tools::generation_fixture_registry::maintained_fixture
         &fixture) {
@@ -128,6 +107,15 @@ std::filesystem::path maintained_generation_baseline_path(
           "_prompt_hello_max_tokens_" + std::to_string(max_tokens) + ".txt");
 }
 
+bool tokenizer_sweep_fixture_supported(const std::filesystem::path &path) {
+  const std::string name = path.filename().string();
+  return name.find("Llama-") != std::string::npos ||
+         name.find("distilgpt2") != std::string::npos ||
+         name.find("bert-base-uncased") != std::string::npos ||
+         name.find("flan-t5") != std::string::npos ||
+         name.find("rwkv") != std::string::npos;
+}
+
 std::vector<std::string> discover_models() {
   std::vector<std::string> models;
   const auto dir = models_dir();
@@ -142,13 +130,10 @@ std::vector<std::string> discover_models() {
     if (path.extension() != ".gguf") {
       continue;
     }
-    // The canonical Qwen generation fixture is covered by dedicated
-    // maintained-generation tests, not the generic tiny-model tokenizer parity
-    // sweep.
-    if (path.filename() == "Qwen3-0.6B-Q8_0.gguf") {
-      continue;
-    }
-    if (!reference_tokenizer_lane_supported(path)) {
+    // Keep llama.cpp backend lifetime inside the CLI subprocesses. The generic
+    // tokenizer sweep covers the tiny tokenizer fixtures; generation, speech,
+    // diarization, and embedding fixtures have dedicated gates.
+    if (!tokenizer_sweep_fixture_supported(path)) {
       continue;
     }
     models.push_back(path.string());
@@ -600,7 +585,8 @@ void check_generation_flash_attribution(const process_capture &capture) {
   CHECK(parse_flash_dispatch_metric(capture.stdout_text, "optimized") >= 0);
   CHECK(parse_flash_dispatch_metric(capture.stdout_text, "shared") >= 0);
   CHECK(parse_flash_dispatch_calls(capture.stdout_text) > 0);
-  if (expected_generation_kernel_kind() == "aarch64") {
+  if (expected_generation_kernel_kind() == "aarch64" ||
+      expected_generation_kernel_kind() == "x86_64") {
     CHECK(parse_flash_dispatch_metric(capture.stdout_text, "optimized") > 0);
     CHECK(parse_flash_dispatch_metric(capture.stdout_text, "shared") == 0);
   } else {
@@ -610,37 +596,59 @@ void check_generation_flash_attribution(const process_capture &capture) {
 }
 
 void check_generation_quantized_attribution(const process_capture &capture) {
-  CHECK(parse_named_metric(capture.stdout_text,
-                           "optimized_q2_dispatch_calls") >= 0);
-  CHECK(parse_named_metric(capture.stdout_text, "shared_q2_dispatch_calls") >=
-        0);
-  CHECK(parse_named_metric(capture.stdout_text,
-                           "optimized_q3_dispatch_calls") >= 0);
-  CHECK(parse_named_metric(capture.stdout_text, "shared_q3_dispatch_calls") >=
-        0);
-  CHECK(parse_named_metric(capture.stdout_text,
-                           "optimized_q6_dispatch_calls") >= 0);
-  CHECK(parse_named_metric(capture.stdout_text, "shared_q6_dispatch_calls") >=
-        0);
+  const int optimized_q2_dispatch_calls =
+      parse_named_metric(capture.stdout_text, "optimized_q2_dispatch_calls");
+  const int shared_q2_dispatch_calls =
+      parse_named_metric(capture.stdout_text, "shared_q2_dispatch_calls");
+  const int optimized_q3_dispatch_calls =
+      parse_named_metric(capture.stdout_text, "optimized_q3_dispatch_calls");
+  const int shared_q3_dispatch_calls =
+      parse_named_metric(capture.stdout_text, "shared_q3_dispatch_calls");
+  const int optimized_q6_dispatch_calls =
+      parse_named_metric(capture.stdout_text, "optimized_q6_dispatch_calls");
+  const int shared_q6_dispatch_calls =
+      parse_named_metric(capture.stdout_text, "shared_q6_dispatch_calls");
+  CHECK(optimized_q2_dispatch_calls >= 0);
+  CHECK(shared_q2_dispatch_calls >= 0);
+  CHECK(optimized_q3_dispatch_calls >= 0);
+  CHECK(shared_q3_dispatch_calls >= 0);
+  CHECK(optimized_q6_dispatch_calls >= 0);
+  CHECK(shared_q6_dispatch_calls >= 0);
   const int native_q8_0_dispatch_calls =
       parse_named_metric(capture.stdout_text, "native_q8_0_dispatch_calls");
   const int packed_q8_0_dispatch_calls =
       parse_named_metric(capture.stdout_text, "packed_q8_0_dispatch_calls");
+  const bool native_q2_k =
+      capture.stdout_text.find("tensor_type=q2_k contract=native_quantized") !=
+      std::string::npos;
+  const bool native_q3_k =
+      capture.stdout_text.find("tensor_type=q3_k contract=native_quantized") !=
+      std::string::npos;
+  const bool native_q6_k =
+      capture.stdout_text.find("tensor_type=q6_k contract=native_quantized") !=
+      std::string::npos;
   CHECK(native_q8_0_dispatch_calls >= 0);
   CHECK(packed_q8_0_dispatch_calls >= 0);
-  CHECK(native_q8_0_dispatch_calls + packed_q8_0_dispatch_calls > 0);
-  CHECK(parse_named_metric(capture.stdout_text,
-                           "optimized_q2_dispatch_calls") == 0);
-  CHECK(parse_named_metric(capture.stdout_text, "shared_q2_dispatch_calls") ==
-        0);
-  CHECK(parse_named_metric(capture.stdout_text,
-                           "optimized_q3_dispatch_calls") == 0);
-  CHECK(parse_named_metric(capture.stdout_text, "shared_q3_dispatch_calls") ==
-        0);
-  CHECK(parse_named_metric(capture.stdout_text,
-                           "optimized_q6_dispatch_calls") == 0);
-  CHECK(parse_named_metric(capture.stdout_text, "shared_q6_dispatch_calls") ==
-        0);
+  if (!native_q2_k && !native_q3_k && !native_q6_k) {
+    CHECK(native_q8_0_dispatch_calls + packed_q8_0_dispatch_calls > 0);
+  }
+  if (expected_generation_kernel_kind() == "x86_64") {
+    CHECK((native_q2_k ? optimized_q2_dispatch_calls > 0
+                       : optimized_q2_dispatch_calls == 0));
+    CHECK(shared_q2_dispatch_calls == 0);
+    CHECK((native_q3_k ? optimized_q3_dispatch_calls > 0
+                       : optimized_q3_dispatch_calls == 0));
+    CHECK(shared_q3_dispatch_calls == 0);
+    CHECK((native_q6_k ? optimized_q6_dispatch_calls > 0
+                       : optimized_q6_dispatch_calls == 0));
+    CHECK(shared_q6_dispatch_calls == 0);
+  } else {
+    CHECK(optimized_q2_dispatch_calls == 0);
+    CHECK(shared_q2_dispatch_calls == 0);
+    CHECK(optimized_q3_dispatch_calls == 0);
+    CHECK(shared_q3_dispatch_calls == 0);
+    CHECK(shared_q6_dispatch_calls == 0);
+  }
 }
 
 void check_generation_quantized_stage_audit(const process_capture &capture) {
@@ -1508,6 +1516,9 @@ TEST_CASE("paritychecker matches current maintained generation publication "
     CHECK(capture.stdout_text.find("reference_impl:") != std::string::npos);
     CHECK(capture.stdout_text.find("generation_baseline:") !=
           std::string::npos);
+    check_generation_flash_attribution(capture);
+    check_generation_quantized_attribution(capture);
+    check_generation_quantized_stage_audit(capture);
     CHECK(capture.stdout_text.find(
               "reference_impl: source=maintained_generation_baseline") ==
           std::string::npos);