audulus · wtholliday · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/docs/FP_CODEGEN_PLAN.md b/docs/FP_CODEGEN_PLAN.md
@@ -487,6 +487,19 @@ supports `f64` but the hot-path benchmarks don't exercise it). Option:
 Recommendation: **Option B** for now. Add f64 F variants later if real
 f64 workloads emerge.
 
+> **Resolved: Option A was implemented.** A real f64 DSP workload emerged
+> (`benchmark/biquad_f64.lyte`), and on the int path it ran ~2.9× slower
+> than the f32 biquad — every op paid a GPR↔FP crossing and nothing fused.
+> f64 now gets a full parallel `double` window (`d0..d3` + `dfsp`), the
+> exact analogue of the float window, with `D`-suffix StackOps for
+> arithmetic, comparisons, conversions, memory, and math, plus the mirrored
+> fused superinstructions (`get_get_dmul_sum*`, `get_set*D`,
+> `get_f64const_dgt_jiz`). The two FP windows use all 8 FP arg registers
+> (`v0..v7` / `xmm0..xmm7`). Result: f64 biquad dropped from ~0.36s to
+> ~0.16s on the VM-host reference (`benchmark/run.sh`), ~1.2× the f32
+> biquad — the residual gap is f64's 2× state bandwidth, not dispatch or
+> crossing overhead.
+
 ### 6.3 Function call arg passing
 
 When calling a function with mixed int and float args, how are they

diff --git a/docs/Stack_VM.md b/docs/Stack_VM.md
@@ -33,11 +33,13 @@ On bare-metal Apple M4, the Stack VM runs biquad at 0.104s, sort at
   It was deleted — see `docs/HOT_LOCALS.md` for the design exploration
   and why the cache never paid off on this VM.
 - **No per-op runtime type check.** The codegen tracks each stack slot's
-  type statically, and int vs f32 ops pick the right window at emit
-  time — `IAdd` reads `t0/t1`, `FAddF` reads `f0/f1`, and so on. f64
-  values ride the int window as bit patterns (rare in hot code).
+  type statically, and int / f32 / f64 ops pick the right window at emit
+  time — `IAdd` reads `t0/t1`, `FAddF` reads `f0/f1`, `DAddD` reads
+  `d0/d1`, and so on. f64 now has its own dedicated `double` window
+  (`d0..d3`, spilling to `*dfsp`), the exact analogue of the f32 float
+  window, so f64 arithmetic also stays in FP registers.
 - **Per-window stack-depth validation.** `src/stack_depth.rs` runs
-  forward over each function tracking int and float stack depth
+  forward over each function tracking int, float, and double stack depth
   independently; any jump target or call site that doesn't match
   between incoming edges is a codegen bug.
 
@@ -247,8 +249,14 @@ Two non-obvious choices here, both load-bearing:
    register-indirect store. Commit `21f2949`.
 
 f32 and f64 coexist on the logical operand stack. The codegen tracks
-each slot's type statically; f32 slots live in the float window, f64
-slots (rare) ride the int window as bit patterns.
+each slot's type statically; f32 slots live in the float window (`f0..f3`)
+and f64 slots live in a parallel double window (`d0..d3`). The two FP
+windows together use the full 8 FP argument registers (`v0..v7` on
+aarch64, `xmm0..xmm7` on x86-64). f64 gets the same fused superinstructions
+as f32 — the multiply-accumulate sum chain, variable-move chains, and the
+const-compare-branch — so f64 DSP loops reach near-parity with f32: the
+biquad f64 benchmark now runs ~1.2× the f32 time, down from ~2.9× before the
+double window existed. The remaining gap is f64's 2× state bandwidth.
 
 ## Hot local cache (removed)
 
@@ -477,8 +485,6 @@ VM is safe to call from a real-time audio thread.
 
 ### Limitations
 
-- **f64 is second-class.** f64 values ride the int window, pay GPR↔FP
-  crossings on every op. Fine for correctness tests, not hot-pathed.
 - **No SIMD vector types in the VM.** `f32x4` lowers to per-lane
   scalar f32 ops via fused loads/stores. A proper SIMD window would
   need another register tier.

diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -18,6 +18,9 @@ libfuzzer-sys = "0.4.12"
 lyte = { path = "../" }
 libc = "0.2.186"
 
+[build-dependencies]
+cc = "1.2.62"
+
 [[bin]]
 name = "lexer"
 path = "fuzz_targets/lexer.rs"

diff --git a/fuzz/build.rs b/fuzz/build.rs
@@ -0,0 +1,14 @@
+fn main() {
+    // The stack backend depends on the Clang-only C interpreter. Mirroring
+    // cli/build.rs, enable `has_stack_interp` when the C compiler is Clang so
+    // the differential fuzz target can build and run that backend.
+    let clang_detected = match cc::Build::new().try_get_compiler() {
+        Ok(tool) => tool.is_like_clang(),
+        Err(_) => false, // probe failed: treat as "not Clang"
+    };
+    if clang_detected {
+        println!("cargo:rustc-cfg=has_stack_interp");
+    }
+    // Declared unconditionally so cfg checking recognizes the name either way.
+    println!("cargo:rustc-check-cfg=cfg(has_stack_interp)");
+}
diff --git a/fuzz/fuzz_targets/differential.rs b/fuzz/fuzz_targets/differential.rs
@@ -60,6 +60,21 @@ impl<'a> Gen<'a> {
             ));
         }
 
+        // Optionally emit an `a * b + c` helper for each float type. The call
+        // exercises float argument/return-value bridging — i.e. saving and
+        // restoring the relevant float window (f0..f3 / d0..d3) around a call.
+        let mut has_fma = [false; FLOAT_KINDS.len()];
+        for (k, kind) in FLOAT_KINDS.iter().enumerate() {
+            if self.next() % 3 == 0 {
+                has_fma[k] = true;
+                decls.push(format!(
+                    "{}(a: {t}, b: {t}, c: {t}) -> {t} {{\n    return a * b + c\n}}",
+                    kind.fma,
+                    t = kind.ty
+                ));
+            }
+        }
+
         // 1-4 initial integer variables
         let n_vars = (self.next() % 4 + 1) as usize;
         for i in 0..n_vars {
@@ -106,6 +121,24 @@ impl<'a> Gen<'a> {
             vars.push((name, VarType::Enum(n_variants)));
         }
 
+        // Optionally declare 1-2 variables of each float type. f32 drives the
+        // stack VM's single-precision float window (f0..f3, spilling through
+        // fsp); f64 drives the double window (d0..d3, spilling through dfsp).
+        // Values are kept small so later `as i32` casts stay well inside i32 range.
+        let mut has_floats = [false; FLOAT_KINDS.len()];
+        for (k, kind) in FLOAT_KINDS.iter().enumerate() {
+            if self.next() % 2 == 0 {
+                has_floats[k] = true;
+                let n_fvars = (self.next() % 2 + 1) as usize; // 1-2
+                for i in 0..n_fvars {
+                    let name = format!("{}{}", kind.var_prefix, i);
+                    main_lines
+                        .push(format!("    var {} = {}", name, self.gen_float_literal(kind.ty)));
+                    vars.push((name, VarType::Float(kind.ty)));
+                }
+            }
+        }
+
         // 1-5 computed values, each printed
         let n_stmts = (self.next() % 5 + 1) as usize;
         for i in 0..n_stmts {
@@ -116,6 +149,25 @@ impl<'a> Gen<'a> {
             vars.push((name, VarType::Int));
         }
 
+        // Float computations, each printed as i32. Printing the truncated
+        // integer (rather than the raw float) keeps output comparable across
+        // backends — Rust's float Display ("3") and the C interpreter's printf
+        // ("3.0") format integral floats differently, so raw float prints
+        // would diverge on formatting alone. The float arithmetic itself is
+        // still fully exercised before the cast.
+        for (k, kind) in FLOAT_KINDS.iter().enumerate() {
+            if !has_floats[k] {
+                continue;
+            }
+            let n_fstmts = (self.next() % 3 + 1) as usize; // 1-3
+            for i in 0..n_fstmts {
+                let name = format!("{}r{}", kind.var_prefix, i);
+                let expr = self.gen_float_expr(&vars, kind, has_fma[k], 2);
+                main_lines.push(format!("    let {} = {}", name, expr));
+                main_lines.push(format!("    print({} as i32)", name));
+            }
+        }
+
         main_lines.push("}".to_string());
 
         let mut parts = decls;
@@ -289,11 +341,101 @@ impl<'a> Gen<'a> {
         let val = (self.next() % 200) as i32 - 100;
         self.format_int(val)
     }
+
+    /// Emit a small float literal of type `ty` ("f32" or "f64") whose value
+    /// lies in [0.0, 9.9]. An f32 literal is written bare: bare float
+    /// literals already have type f32, and `as f32` on one would be an
+    /// unsupported identity conversion. An f64 literal is the same digits
+    /// wrapped in an explicit `as f64` conversion. The small range keeps
+    /// products of a few literals far inside i32 range for the final cast.
+    fn gen_float_literal(&mut self, ty: &str) -> String {
+        let int_part = self.next() % 10;
+        let tenths = self.next() % 10;
+        let digits = format!("{}.{}", int_part, tenths);
+        match ty {
+            "f32" => digits,
+            _ => format!("({} as {})", digits, ty),
+        }
+    }
+
+    /// Build a random expression of float type `kind.ty`, recursing at most
+    /// `depth` levels; `has_fma` says whether this type's helper was emitted.
+    fn gen_float_expr(
+        &mut self,
+        vars: &[(String, VarType)],
+        kind: &FloatKind,
+        has_fma: bool,
+        depth: u8,
+    ) -> String {
+        if depth == 0 {
+            return self.gen_float_leaf(vars, kind);
+        }
+        // Rolls 6..=7 pick the helper call; only roll them when it exists.
+        let arm_count = if has_fma { 8 } else { 6 };
+        match self.next() % arm_count {
+            2..=3 => self.gen_float_var(vars, kind.ty),
+            // Binary arithmetic; division is left out to avoid safety errors.
+            4..=5 => {
+                const OPS: [&str; 3] = ["+", "-", "*"];
+                let op = OPS[self.next() as usize % OPS.len()];
+                let lhs = self.gen_float_expr(vars, kind, has_fma, depth - 1);
+                let rhs = self.gen_float_expr(vars, kind, has_fma, depth - 1);
+                format!("({} {} {})", lhs, op, rhs)
+            }
+            // `a * b + c` helper call: float args and return value cross a call.
+            6..=7 => {
+                let args: Vec<String> = (0..3)
+                    .map(|_| self.gen_float_expr(vars, kind, has_fma, depth - 1))
+                    .collect();
+                format!("{}({})", kind.fma, args.join(", "))
+            }
+            // Rolls 0..=1 (and the unreachable remainder): a literal.
+            _ => self.gen_float_literal(kind.ty),
+        }
+    }
+
+    /// Depth-0 case of `gen_float_expr`: a variable reference or a literal.
+    fn gen_float_leaf(&mut self, vars: &[(String, VarType)], kind: &FloatKind) -> String {
+        match self.next() % 2 {
+            0 => self.gen_float_var(vars, kind.ty),
+            _ => self.gen_float_literal(kind.ty),
+        }
+    }
+
+    /// Name a random in-scope variable whose type is the float type `ty`.
+    /// If no such variable has been declared, emit a literal of that type.
+    fn gen_float_var(&mut self, vars: &[(String, VarType)], ty: &str) -> String {
+        let candidates: Vec<&str> = vars
+            .iter()
+            .filter(|(_, var_ty)| matches!(var_ty, VarType::Float(vt) if *vt == ty))
+            .map(|(name, _)| name.as_str())
+            .collect();
+        // Nothing to reference: a literal keeps the expression well-typed.
+        if candidates.is_empty() {
+            return self.gen_float_literal(ty);
+        }
+        let pick = self.next() as usize % candidates.len();
+        candidates[pick].to_string()
+    }
+}
+
+/// A float type the generator can emit, paired with the names it uses for
+/// that type's variables and `a * b + c` helper.
+struct FloatKind {
+    ty: &'static str, // type name as written in generated source ("f32"/"f64")
+    var_prefix: &'static str, // prefix of this type's generated variable names
+    fma: &'static str, // name of this type's `a * b + c` helper function
 }
 
+const FLOAT_KINDS: [FloatKind; 2] = [
+    FloatKind { ty: "f32", var_prefix: "sv", fma: "sfma" }, // float window (f0..f3)
+    FloatKind { ty: "f64", var_prefix: "fv", fma: "ffma" }, // double window (d0..d3)
+];
+
 #[derive(Clone)]
 enum VarType {
     Int,
+    Float(&'static str),
     Struct,
     Array(usize),
     Enum(usize),
@@ -321,6 +463,11 @@ fn capture_stdout<F: FnOnce()>(f: F) -> String {
     f();
 
     std::io::stdout().flush().ok();
+    // The stack backend's C interpreter prints via C stdio (printf), which
+    // buffers independently of Rust's stdout. Flush all C streams before
+    // restoring fd 1, or the buffered output is lost and the capture comes
+    // back empty.
+    unsafe { libc::fflush(std::ptr::null_mut()) };
 
     unsafe { libc::dup2(saved_fd, 1) };
     unsafe { libc::close(saved_fd) };
@@ -356,6 +503,17 @@ fn run_backend(program: &str, backend: &str) -> Option<String> {
             });
             Some(output)
         }
+        #[cfg(has_stack_interp)]
+        "stack" => {
+            // Compile to the stack VM and run it through the C interpreter
+            // (the same path cli/src/main.rs uses for `--backend stack`).
+            let output = capture_stdout(|| {
+                if let Ok(program) = compiler.compile_stack() {
+                    let _ = lyte::stack_interp_bridge::run(&program);
+                }
+            });
+            Some(output)
+        }
         #[cfg(target_arch = "aarch64")]
         "asm" => {
             let output = capture_stdout(|| {
@@ -460,4 +618,10 @@ fuzz_target!(|data: &[u8]| {
         let llvm_output = run_backend(&program, "llvm");
         assert_same("VM", &vm_output, "LLVM", &llvm_output, &program);
     }
+
+    #[cfg(has_stack_interp)]
+    {
+        let stack_output = run_backend(&program, "stack");
+        assert_same("VM", &vm_output, "STACK", &stack_output, &program);
+    }
 });
diff --git a/src/compiler.rs b/src/compiler.rs
@@ -1980,7 +1980,31 @@ mod tests {
     // At Return/ReturnVoid the entry depth must be 0 (plus the op's
     // own delta, which is 0 for both return ops).
     fn assert_f_window_balanced(program: &crate::stack_ir::StackProgram, label: &str) {
-        use crate::stack_depth::float_stack_delta;
+        assert_window_balanced(
+            program,
+            label,
+            crate::stack_depth::float_stack_delta,
+            "f-window",
+        );
+        assert_window_balanced(
+            program,
+            label,
+            crate::stack_depth::double_stack_delta,
+            "d-window",
+        );
+    }
+
+    /// Verify a register-window (float or double) stays balanced across the
+    /// CFG: every merge agrees on depth, every Return leaves depth 0, and no
+    /// op underflows. `delta` selects which window to check.
+    #[cfg(test)]
+    fn assert_window_balanced(
+        program: &crate::stack_ir::StackProgram,
+        label: &str,
+        delta: fn(&crate::stack_ir::StackOp) -> i32,
+        window: &str,
+    ) {
+        let float_stack_delta = delta;
         use crate::stack_ir::StackOp;
 
         for func in &program.functions {
@@ -2044,7 +2068,9 @@ mod tests {
                         }
                     }
                     StackOp::FusedF32ConstFGtJumpIfZeroF(_, off)
-                    | StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off) => {
+                    | StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off)
+                    | StackOp::FusedF64ConstDGtJumpIfZeroD(_, off)
+                    | StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, off) => {
                         let t = (i as i64 + 1 + *off as i64) as usize;
                         if t < n {
                             succs.push(t);
@@ -2066,9 +2092,9 @@ mod tests {
                         worklist.push(s);
                     } else if in_depth[s] != d_out {
                         panic!(
-                            "[{}] {}: f-window depth mismatch at op {} \
+                            "[{}] {}: {} depth mismatch at op {} \
                              (from op {}): {} vs {}",
-                            label, func.name, s, i, in_depth[s], d_out,
+                            label, func.name, window, s, i, in_depth[s], d_out,
                         );
                     }
                 }
@@ -2081,19 +2107,21 @@ mod tests {
                 if matches!(op, StackOp::Return | StackOp::ReturnVoid) {
                     assert!(
                         in_depth[i] == 0,
-                        "[{}] {}: f-window leaks {} slot(s) at return op {}",
+                        "[{}] {}: {} leaks {} slot(s) at return op {}",
                         label,
                         func.name,
+                        window,
                         in_depth[i],
                         i,
                     );
                 }
                 let d_out = in_depth[i] + float_stack_delta(op);
                 assert!(
                     d_out >= 0,
-                    "[{}] {}: f-window underflow at op {} ({:?}): in={} delta={}",
+                    "[{}] {}: {} underflow at op {} ({:?}): in={} delta={}",
                     label,
                     func.name,
+                    window,
                     i,
                     op,
                     in_depth[i],