diff --git a/docs/FP_CODEGEN_PLAN.md b/docs/FP_CODEGEN_PLAN.md
index 88c1c32f..6729c245 100644
--- a/docs/FP_CODEGEN_PLAN.md
+++ b/docs/FP_CODEGEN_PLAN.md
@@ -487,6 +487,19 @@ supports `f64` but the hot-path benchmarks don't exercise it). Option:
 Recommendation: **Option B** for now. Add f64 F variants later if real
 f64 workloads emerge.
 
+> **Resolved: Option A was implemented.** A real f64 DSP workload emerged
+> (`benchmark/biquad_f64.lyte`), and on the int path it ran ~2.9× slower
+> than the f32 biquad — every op paid a GPR↔FP crossing and nothing fused.
+> f64 now gets a full parallel `double` window (`d0..d3` + `dfsp`), the
+> exact analogue of the float window, with `D`-suffix StackOps for
+> arithmetic, comparisons, conversions, memory, and math, plus the mirrored
+> fused superinstructions (`get_get_dmul_sum*`, `get_set*D`,
+> `get_f64const_dgt_jiz`). The two FP windows use all 8 FP arg registers
+> (`v0..v7` / `xmm0..xmm7`). Result: f64 biquad dropped from ~0.36s to
+> ~0.16s on the VM-host reference (`benchmark/run.sh`), ~1.2× the f32
+> biquad — the residual gap is f64's 2× state bandwidth, not dispatch or
+> crossing overhead.
+
 ### 6.3 Function call arg passing
 
 When calling a function with mixed int and float args, how are they
diff --git a/docs/Stack_VM.md b/docs/Stack_VM.md
index 9bfb67d5..4aa26c73 100644
--- a/docs/Stack_VM.md
+++ b/docs/Stack_VM.md
@@ -33,11 +33,13 @@ On bare-metal Apple M4, the Stack VM runs biquad at 0.104s, sort at
   It was deleted — see `docs/HOT_LOCALS.md` for the design exploration
   and why the cache never paid off on this VM.
 - **No per-op runtime type check.** The codegen tracks each stack slot's
-  type statically, and int vs f32 ops pick the right window at emit
-  time — `IAdd` reads `t0/t1`, `FAddF` reads `f0/f1`, and so on. f64
-  values ride the int window as bit patterns (rare in hot code).
+  type statically, and int / f32 / f64 ops pick the right window at emit
+  time — `IAdd` reads `t0/t1`, `FAddF` reads `f0/f1`, `DAddD` reads
+  `d0/d1`, and so on. f64 now has its own dedicated `double` window
+  (`d0..d3`, spilling to `*dfsp`), the exact analogue of the f32 float
+  window, so f64 arithmetic also stays in FP registers.
 - **Per-window stack-depth validation.** `src/stack_depth.rs` runs
-  forward over each function tracking int and float stack depth
+  forward over each function tracking int, float, and double stack depth
   independently; any jump target or call site that doesn't match
   between incoming edges is a codegen bug.
 
@@ -247,8 +249,14 @@ Two non-obvious choices here, both load-bearing:
    register-indirect store. Commit `21f2949`.
 
 f32 and f64 coexist on the logical operand stack. The codegen tracks
-each slot's type statically; f32 slots live in the float window, f64
-slots (rare) ride the int window as bit patterns.
+each slot's type statically; f32 slots live in the float window (`f0..f3`)
+and f64 slots live in a parallel double window (`d0..d3`). The two FP
+windows together use the full 8 FP argument registers (`v0..v7` on
+aarch64, `xmm0..xmm7` on x86-64). f64 gets the same fused superinstructions
+as f32 — the multiply-accumulate sum chain, variable-move chains, and the
+const-compare-branch — so f64 DSP loops reach near-parity with f32: the
+biquad f64 benchmark runs at ~1.2× the f32 time, down from ~2.9× before the
+double window existed. The remaining gap is f64's 2× state bandwidth.
 
 ## Hot local cache (removed)
 
@@ -477,8 +485,6 @@ VM is safe to call from a real-time audio thread.
 
 ### Limitations
 
-- **f64 is second-class.** f64 values ride the int window, pay GPR↔FP
-  crossings on every op. Fine for correctness tests, not hot-pathed.
 - **No SIMD vector types in the VM.** `f32x4` lowers to per-lane
   scalar f32 ops via fused loads/stores. A proper SIMD window would
   need another register tier.
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index a0c12e42..4735ce44 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -18,6 +18,9 @@ libfuzzer-sys = "0.4.12"
 lyte = { path = "../" }
 libc = "0.2.186"
 
+[build-dependencies]
+cc = "1.2.62"
+
 [[bin]]
 name = "lexer"
 path = "fuzz_targets/lexer.rs"
diff --git a/fuzz/build.rs b/fuzz/build.rs
new file mode 100644
index 00000000..b8f1360e
--- /dev/null
+++ b/fuzz/build.rs
@@ -0,0 +1,14 @@
+fn main() {
+    // Propagate has_stack_interp cfg if the C compiler is Clang.
+    // Mirrors cli/build.rs so the differential fuzz target can compile and
+    // run the stack backend (which depends on the Clang-only C interpreter).
+    let compiler = cc::Build::new().try_get_compiler();
+    let is_clang = compiler
+        .as_ref()
+        .map(|c| c.is_like_clang())
+        .unwrap_or(false);
+    if is_clang {
+        println!("cargo:rustc-cfg=has_stack_interp");
+    }
+    println!("cargo:rustc-check-cfg=cfg(has_stack_interp)");
+}
diff --git a/fuzz/fuzz_targets/differential.rs b/fuzz/fuzz_targets/differential.rs
index b6b40496..ed5b0605 100644
--- a/fuzz/fuzz_targets/differential.rs
+++ b/fuzz/fuzz_targets/differential.rs
@@ -60,6 +60,21 @@ impl<'a> Gen<'a> {
             ));
         }
 
+        // Optionally emit an `a * b + c` helper for each float type. The call
+        // exercises float argument/return-value bridging — i.e. saving and
+        // restoring the relevant float window (f0..f3 / d0..d3) around a call.
+        let mut has_fma = [false; FLOAT_KINDS.len()];
+        for (k, kind) in FLOAT_KINDS.iter().enumerate() {
+            if self.next() % 3 == 0 {
+                has_fma[k] = true;
+                decls.push(format!(
+                    "{}(a: {t}, b: {t}, c: {t}) -> {t} {{\n    return a * b + c\n}}",
+                    kind.fma,
+                    t = kind.ty
+                ));
+            }
+        }
+
         // 1-4 initial integer variables
         let n_vars = (self.next() % 4 + 1) as usize;
         for i in 0..n_vars {
@@ -106,6 +121,24 @@ impl<'a> Gen<'a> {
             vars.push((name, VarType::Enum(n_variants)));
         }
 
+        // Optionally declare 1-2 variables of each float type. f32 drives the
+        // stack VM's f32 float window (f0..f3, spilling through fsp); f64
+        // drives the double window (d0..d3, spilling through dfsp). Values are
+        // kept small so downstream `as i32` casts stay well inside i32 range.
+        let mut has_floats = [false; FLOAT_KINDS.len()];
+        for (k, kind) in FLOAT_KINDS.iter().enumerate() {
+            if self.next() % 2 == 0 {
+                has_floats[k] = true;
+                let n_fvars = (self.next() % 2 + 1) as usize; // 1-2
+                for i in 0..n_fvars {
+                    let name = format!("{}{}", kind.var_prefix, i);
+                    main_lines
+                        .push(format!("    var {} = {}", name, self.gen_float_literal(kind.ty)));
+                    vars.push((name, VarType::Float(kind.ty)));
+                }
+            }
+        }
+
         // 1-5 computed values, each printed
         let n_stmts = (self.next() % 5 + 1) as usize;
         for i in 0..n_stmts {
@@ -116,6 +149,25 @@ impl<'a> Gen<'a> {
             vars.push((name, VarType::Int));
         }
 
+        // Float computations, each printed as i32. Printing the truncated
+        // integer (rather than the raw float) keeps output comparable across
+        // backends — Rust's float Display ("3") and the C interpreter's printf
+        // ("3.0") format integral floats differently, so raw float prints
+        // would diverge on formatting alone. The float arithmetic itself is
+        // still fully exercised before the cast.
+        for (k, kind) in FLOAT_KINDS.iter().enumerate() {
+            if !has_floats[k] {
+                continue;
+            }
+            let n_fstmts = (self.next() % 3 + 1) as usize; // 1-3
+            for i in 0..n_fstmts {
+                let name = format!("{}r{}", kind.var_prefix, i);
+                let expr = self.gen_float_expr(&vars, kind, has_fma[k], 2);
+                main_lines.push(format!("    let {} = {}", name, expr));
+                main_lines.push(format!("    print({} as i32)", name));
+            }
+        }
+
         main_lines.push("}".to_string());
 
         let mut parts = decls;
@@ -289,11 +341,101 @@ impl<'a> Gen<'a> {
         let val = (self.next() % 200) as i32 - 100;
         self.format_int(val)
     }
+
+    /// A small float literal in [0.0, 9.9], typed as `ty` ("f32"/"f64").
+    /// Bare float literals are already f32, so an `as f32` cast would be an
+    /// unsupported identity conversion — emit the bare literal for f32 and an
+    /// explicit `as f64` conversion for f64. Values are bounded so that
+    /// products of a few of these stay far inside i32 range after the final
+    /// `as i32` cast.
+    fn gen_float_literal(&mut self, ty: &str) -> String {
+        let whole = self.next() % 10;
+        let frac = self.next() % 10;
+        if ty == "f32" {
+            format!("{}.{}", whole, frac)
+        } else {
+            format!("({}.{} as {})", whole, frac, ty)
+        }
+    }
+
+    fn gen_float_expr(
+        &mut self,
+        vars: &[(String, VarType)],
+        kind: &FloatKind,
+        has_fma: bool,
+        depth: u8,
+    ) -> String {
+        if depth == 0 {
+            return self.gen_float_leaf(vars, kind);
+        }
+        let max_choice = if has_fma { 8 } else { 6 };
+        match self.next() % max_choice {
+            // Float literal
+            0..=1 => self.gen_float_literal(kind.ty),
+            // Float variable reference (of this type)
+            2..=3 => self.gen_float_var(vars, kind.ty),
+            // Binary arithmetic (no division — avoids safety errors)
+            4..=5 => {
+                let ops = ["+", "-", "*"];
+                let op = ops[self.next() as usize % ops.len()];
+                let l = self.gen_float_expr(vars, kind, has_fma, depth - 1);
+                let r = self.gen_float_expr(vars, kind, has_fma, depth - 1);
+                format!("({} {} {})", l, op, r)
+            }
+            // Helper call (only if its `a * b + c` helper was emitted) —
+            // exercises float argument/return-value bridging across a call.
+            6..=7 => {
+                let a = self.gen_float_expr(vars, kind, has_fma, depth - 1);
+                let b = self.gen_float_expr(vars, kind, has_fma, depth - 1);
+                let c = self.gen_float_expr(vars, kind, has_fma, depth - 1);
+                format!("{}({}, {}, {})", kind.fma, a, b, c)
+            }
+            _ => self.gen_float_literal(kind.ty),
+        }
+    }
+
+    fn gen_float_leaf(&mut self, vars: &[(String, VarType)], kind: &FloatKind) -> String {
+        if self.next() % 2 == 0 {
+            self.gen_float_var(vars, kind.ty)
+        } else {
+            self.gen_float_literal(kind.ty)
+        }
+    }
+
+    fn gen_float_var(&mut self, vars: &[(String, VarType)], ty: &str) -> String {
+        let fvars: Vec<&str> = vars
+            .iter()
+            .filter_map(|(name, t)| match t {
+                VarType::Float(vt) if *vt == ty => Some(name.as_str()),
+                _ => None,
+            })
+            .collect();
+        if fvars.is_empty() {
+            self.gen_float_literal(ty)
+        } else {
+            let idx = self.next() as usize % fvars.len();
+            fvars[idx].to_string()
+        }
+    }
+}
+
+/// A float type the generator can emit, paired with the names it uses for
+/// that type's variables and `a * b + c` helper.
+struct FloatKind {
+    ty: &'static str,
+    var_prefix: &'static str,
+    fma: &'static str,
 }
 
+const FLOAT_KINDS: [FloatKind; 2] = [
+    FloatKind { ty: "f32", var_prefix: "sv", fma: "sfma" },
+    FloatKind { ty: "f64", var_prefix: "fv", fma: "ffma" },
+];
+
 #[derive(Clone)]
 enum VarType {
     Int,
+    Float(&'static str),
     Struct,
     Array(usize),
     Enum(usize),
@@ -321,6 +463,11 @@ fn capture_stdout<F: FnOnce()>(f: F) -> String {
     f();
 
     std::io::stdout().flush().ok();
+    // The stack backend's C interpreter prints via C stdio (printf), which
+    // buffers independently of Rust's stdout. Flush all C streams before
+    // restoring fd 1, or the buffered output is lost and the capture comes
+    // back empty. SAFETY: fflush(NULL) is defined to flush all output streams.
+    unsafe { libc::fflush(std::ptr::null_mut()) };
 
     unsafe { libc::dup2(saved_fd, 1) };
     unsafe { libc::close(saved_fd) };
@@ -356,6 +503,17 @@ fn run_backend(program: &str, backend: &str) -> Option<String> {
             });
             Some(output)
         }
+        #[cfg(has_stack_interp)]
+        "stack" => {
+            // Compile to the stack VM and run it through the C interpreter
+            // (the same path cli/src/main.rs uses for `--backend stack`).
+            let output = capture_stdout(|| {
+                if let Ok(program) = compiler.compile_stack() {
+                    let _ = lyte::stack_interp_bridge::run(&program);
+                }
+            });
+            Some(output)
+        }
         #[cfg(target_arch = "aarch64")]
         "asm" => {
             let output = capture_stdout(|| {
@@ -460,4 +618,10 @@ fuzz_target!(|data: &[u8]| {
         let llvm_output = run_backend(&program, "llvm");
         assert_same("VM", &vm_output, "LLVM", &llvm_output, &program);
     }
+
+    #[cfg(has_stack_interp)]
+    {
+        let stack_output = run_backend(&program, "stack");
+        assert_same("VM", &vm_output, "STACK", &stack_output, &program);
+    }
 });
diff --git a/src/compiler.rs b/src/compiler.rs
index 12172642..225023b7 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -1980,7 +1980,31 @@ mod tests {
     // At Return/ReturnVoid the entry depth must be 0 (plus the op's
     // own delta, which is 0 for both return ops).
     fn assert_f_window_balanced(program: &crate::stack_ir::StackProgram, label: &str) {
-        use crate::stack_depth::float_stack_delta;
+        assert_window_balanced(
+            program,
+            label,
+            crate::stack_depth::float_stack_delta,
+            "f-window",
+        );
+        assert_window_balanced(
+            program,
+            label,
+            crate::stack_depth::double_stack_delta,
+            "d-window",
+        );
+    }
+
+    /// Verify a register window (float or double) stays balanced across the
+    /// CFG: every merge agrees on depth, every Return sees depth 0, and no op
+    /// underflows. `delta` selects the window; `window` labels it in failures.
+    #[cfg(test)]
+    fn assert_window_balanced(
+        program: &crate::stack_ir::StackProgram,
+        label: &str,
+        delta: fn(&crate::stack_ir::StackOp) -> i32,
+        window: &str,
+    ) {
+        let float_stack_delta = delta;
         use crate::stack_ir::StackOp;
 
         for func in &program.functions {
@@ -2044,7 +2068,9 @@ mod tests {
                         }
                     }
                     StackOp::FusedF32ConstFGtJumpIfZeroF(_, off)
-                    | StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off) => {
+                    | StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off)
+                    | StackOp::FusedF64ConstDGtJumpIfZeroD(_, off)
+                    | StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, off) => {
                         let t = (i as i64 + 1 + *off as i64) as usize;
                         if t < n {
                             succs.push(t);
@@ -2066,9 +2092,9 @@ mod tests {
                         worklist.push(s);
                     } else if in_depth[s] != d_out {
                         panic!(
-                            "[{}] {}: f-window depth mismatch at op {} \
+                            "[{}] {}: {} depth mismatch at op {} \
                              (from op {}): {} vs {}",
-                            label, func.name, s, i, in_depth[s], d_out,
+                            label, func.name, window, s, i, in_depth[s], d_out,
                         );
                     }
                 }
@@ -2081,9 +2107,10 @@ mod tests {
                 if matches!(op, StackOp::Return | StackOp::ReturnVoid) {
                     assert!(
                         in_depth[i] == 0,
-                        "[{}] {}: f-window leaks {} slot(s) at return op {}",
+                        "[{}] {}: {} leaks {} slot(s) at return op {}",
                         label,
                         func.name,
+                        window,
                         in_depth[i],
                         i,
                     );
@@ -2091,9 +2118,10 @@ mod tests {
                 let d_out = in_depth[i] + float_stack_delta(op);
                 assert!(
                     d_out >= 0,
-                    "[{}] {}: f-window underflow at op {} ({:?}): in={} delta={}",
+                    "[{}] {}: {} underflow at op {} ({:?}): in={} delta={}",
                     label,
                     func.name,
+                    window,
                     i,
                     op,
                     in_depth[i],
diff --git a/src/stack_codegen.rs b/src/stack_codegen.rs
index b8b70c15..4d28ff1f 100644
--- a/src/stack_codegen.rs
+++ b/src/stack_codegen.rs
@@ -563,9 +563,11 @@ impl<'a> FunctionTranslator<'a> {
                     // path. Just emit the return.
                     func.emit(StackOp::ReturnVoid);
                 } else {
-                    // Fall-through return: bridge f32 results back to t0.
+                    // Fall-through return: bridge f32/f64 results back to t0.
                     if matches!(&*self.decl.ret, Type::Float32) {
                         func.emit(StackOp::FToBitsF);
+                    } else if matches!(&*self.decl.ret, Type::Float64) {
+                        func.emit(StackOp::DToBitsD);
                     }
                     func.emit(StackOp::Return);
                 }
@@ -600,6 +602,8 @@ impl<'a> FunctionTranslator<'a> {
                 let ty = self.expr_type(expr);
                 if matches!(&*ty, Type::Float32) {
                     func.emit(StackOp::DropF);
+                } else if matches!(&*ty, Type::Float64) {
+                    func.emit(StackOp::DropD);
                 } else {
                     func.emit(StackOp::Drop);
                 }
@@ -663,6 +667,8 @@ impl<'a> FunctionTranslator<'a> {
                 let ty = self.expr_type(expr);
                 if matches!(&*ty, Type::Float32) {
                     func.emit(StackOp::DropF);
+                } else if matches!(&*ty, Type::Float64) {
+                    func.emit(StackOp::DropD);
                 } else {
                     func.emit(StackOp::Drop);
                 }
@@ -709,7 +715,7 @@ impl<'a> FunctionTranslator<'a> {
                     }
                     Type::Float64 => {
                         let value: f64 = s.parse().unwrap_or(0.0);
-                        func.emit(StackOp::F64Const(value));
+                        func.emit(StackOp::F64ConstD(value));
                     }
                     _ => {
                         let value: f32 = s.parse().unwrap_or(0.0);
@@ -796,17 +802,10 @@ impl<'a> FunctionTranslator<'a> {
                     // Scalar: translate init, store in local.
                     self.translate_expr(init, func);
                     let local = self.alloc_scalar();
-                    let is_f32 = matches!(&*ty, Type::Float32);
                     if self.void_ctx {
-                        if is_f32 {
-                            func.emit(StackOp::LocalSetF(local));
-                        } else {
-                            func.emit(StackOp::LocalSet(local));
-                        }
-                    } else if is_f32 {
-                        func.emit(StackOp::LocalTeeF(local));
+                        self.emit_local_set(&ty, local, func);
                     } else {
-                        func.emit(StackOp::LocalTee(local));
+                        self.emit_local_tee(&ty, local, func);
                     }
                     self.variables.insert(name, LocalKind::Scalar(local));
                     self.variable_types.insert(name, ty);
@@ -832,15 +831,10 @@ impl<'a> FunctionTranslator<'a> {
 
                 if !self.is_ptr_type(&ty) {
                     let local = self.alloc_scalar();
-                    let is_f32 = matches!(&*ty, Type::Float32);
                     if let Some(init_id) = init {
                         if !self.try_emit_binop_set(local, init_id, func) {
                             self.translate_expr(init_id, func);
-                            if is_f32 {
-                                func.emit(StackOp::LocalSetF(local));
-                            } else {
-                                func.emit(StackOp::LocalSet(local));
-                            }
+                            self.emit_local_set(&ty, local, func);
                         }
                     } else {
                         // Uninitialized local — zero the slot via the int window.
@@ -940,11 +934,13 @@ impl<'a> FunctionTranslator<'a> {
                     func.emit(StackOp::MemCopy(size));
                     func.emit(StackOp::ReturnVoid);
                 } else {
-                    // f32 return values travel through t0 (int window). If
-                    // the preceding expression left the value in the float
+                    // f32/f64 return values travel through t0 (int window).
+                    // If the preceding expression left the value in an FP
                     // window, bridge it back to the int window first.
                     if matches!(&*ret_ty, Type::Float32) {
                         func.emit(StackOp::FToBitsF);
+                    } else if matches!(&*ret_ty, Type::Float64) {
+                        func.emit(StackOp::DToBitsD);
                     }
                     func.emit(StackOp::Return);
                 }
@@ -1073,11 +1069,7 @@ impl<'a> FunctionTranslator<'a> {
         if let Some(&kind) = self.variables.get(&name) {
             match kind {
                 LocalKind::Scalar(slot) => {
-                    if matches!(&*ty, Type::Float32) {
-                        func.emit(StackOp::LocalGetF(slot));
-                    } else {
-                        func.emit(StackOp::LocalGet(slot));
-                    }
+                    self.emit_local_get(&ty, slot, func);
                 }
                 LocalKind::Reference(slot) => {
                     func.emit(StackOp::LocalGet(slot));
@@ -1198,34 +1190,34 @@ impl<'a> FunctionTranslator<'a> {
         match op {
             Binop::Plus => match &*ty {
                 Type::Float32 => func.emit(StackOp::FAddF),
-                Type::Float64 => func.emit(StackOp::DAdd),
+                Type::Float64 => func.emit(StackOp::DAddD),
                 _ => func.emit(StackOp::IAdd),
             },
             Binop::Minus => match &*ty {
                 Type::Float32 => func.emit(StackOp::FSubF),
-                Type::Float64 => func.emit(StackOp::DSub),
+                Type::Float64 => func.emit(StackOp::DSubD),
                 _ => func.emit(StackOp::ISub),
             },
             Binop::Mult => match &*ty {
                 Type::Float32 => func.emit(StackOp::FMulF),
-                Type::Float64 => func.emit(StackOp::DMul),
+                Type::Float64 => func.emit(StackOp::DMulD),
                 _ => func.emit(StackOp::IMul),
             },
             Binop::Div => match &*ty {
                 Type::Float32 => func.emit(StackOp::FDivF),
-                Type::Float64 => func.emit(StackOp::DDiv),
+                Type::Float64 => func.emit(StackOp::DDivD),
                 Type::UInt32 | Type::UInt8 => func.emit(StackOp::UDiv),
                 _ => func.emit(StackOp::IDiv),
             },
             Binop::Mod => func.emit(StackOp::IRem),
             Binop::Pow => match &*ty {
                 Type::Float32 => func.emit(StackOp::FPowF),
-                Type::Float64 => func.emit(StackOp::DPow),
+                Type::Float64 => func.emit(StackOp::DPowD),
                 _ => func.emit(StackOp::IPow),
             },
             Binop::Equal => match &*ty {
                 Type::Float32 => func.emit(StackOp::FEqF),
-                Type::Float64 => func.emit(StackOp::DEq),
+                Type::Float64 => func.emit(StackOp::DEqD),
                 Type::Name(_, _) | Type::Tuple(_) | Type::Array(_, _) => {
                     let size = ty.size(self.decls) as u32;
                     func.emit(StackOp::MemEq(size));
@@ -1238,6 +1230,7 @@ impl<'a> FunctionTranslator<'a> {
             },
             Binop::NotEqual => match &*ty {
                 Type::Float32 => func.emit(StackOp::FNeF),
+                Type::Float64 => func.emit(StackOp::DNeD),
                 Type::Name(_, _) | Type::Tuple(_) | Type::Array(_, _) => {
                     let size = ty.size(self.decls) as u32;
                     func.emit(StackOp::MemNe(size));
@@ -1250,27 +1243,27 @@ impl<'a> FunctionTranslator<'a> {
             },
             Binop::Less => match &*ty {
                 Type::Float32 => func.emit(StackOp::FLtF),
-                Type::Float64 => func.emit(StackOp::DLt),
+                Type::Float64 => func.emit(StackOp::DLtD),
                 Type::UInt32 | Type::UInt8 => func.emit(StackOp::ULt),
                 _ => func.emit(StackOp::ILt),
             },
             Binop::Greater => {
                 match &*ty {
                     Type::Float32 => func.emit(StackOp::FGtF),
-                    Type::Float64 => func.emit(StackOp::IGt), // TODO: DGt
+                    Type::Float64 => func.emit(StackOp::DGtD),
                     Type::UInt32 | Type::UInt8 => func.emit(StackOp::UGt),
                     _ => func.emit(StackOp::IGt),
                 }
             }
             Binop::Leq => match &*ty {
                 Type::Float32 => func.emit(StackOp::FLeF),
-                Type::Float64 => func.emit(StackOp::DLe),
+                Type::Float64 => func.emit(StackOp::DLeD),
                 _ => func.emit(StackOp::ILe),
             },
             Binop::Geq => {
                 match &*ty {
                     Type::Float32 => func.emit(StackOp::FGeF),
-                    Type::Float64 => func.emit(StackOp::IGe), // TODO: DGe
+                    Type::Float64 => func.emit(StackOp::DGeD),
                     Type::UInt32 | Type::UInt8 => func.emit(StackOp::IGe), // unsigned uses signed
                     _ => func.emit(StackOp::IGe),
                 }
@@ -1284,7 +1277,6 @@ impl<'a> FunctionTranslator<'a> {
     /// Translate an assignment expression.
     fn translate_assign(&mut self, lhs_id: ExprID, rhs_id: ExprID, func: &mut StackFunction) {
         let lhs_ty = self.representation_type(lhs_id);
-        let lhs_is_f32 = matches!(&*lhs_ty, Type::Float32);
 
         // Check for captured variable assignment (double indirection).
         if let Expr::Id(name) = &self.decl.arena.exprs[lhs_id] {
@@ -1294,31 +1286,17 @@ impl<'a> FunctionTranslator<'a> {
                 let val_local = self.alloc_scalar();
                 let addr_local = *self.captured_slots.get(&name).unwrap();
                 if self.void_ctx {
-                    if lhs_is_f32 {
-                        func.emit(StackOp::LocalSetF(val_local));
-                    } else {
-                        func.emit(StackOp::LocalSet(val_local));
-                    }
-                } else if lhs_is_f32 {
-                    func.emit(StackOp::LocalTeeF(val_local));
+                    self.emit_local_set(&lhs_ty, val_local, func);
                 } else {
-                    func.emit(StackOp::LocalTee(val_local));
+                    self.emit_local_tee(&lhs_ty, val_local, func);
                 }
                 // Stack: [value]. Need to store through captured pointer.
                 // Push addr, then value, then store.
                 func.emit(StackOp::LocalGet(addr_local)); // push captured addr
-                if lhs_is_f32 {
-                    func.emit(StackOp::LocalGetF(val_local));
-                } else {
-                    func.emit(StackOp::LocalGet(val_local)); // push value
-                }
+                self.emit_local_get(&lhs_ty, val_local, func); // push value
                 self.emit_store_op(&lhs_ty, func);
                 if !self.void_ctx {
-                    if lhs_is_f32 {
-                        func.emit(StackOp::LocalGetF(val_local));
-                    } else {
-                        func.emit(StackOp::LocalGet(val_local)); // result value
-                    }
+                    self.emit_local_get(&lhs_ty, val_local, func); // result value
                 }
                 return;
             }
@@ -1336,15 +1314,9 @@ impl<'a> FunctionTranslator<'a> {
                 }
                 self.translate_expr(rhs_id, func);
                 if self.void_ctx {
-                    if lhs_is_f32 {
-                        func.emit(StackOp::LocalSetF(slot));
-                    } else {
-                        func.emit(StackOp::LocalSet(slot));
-                    }
-                } else if lhs_is_f32 {
-                    func.emit(StackOp::LocalTeeF(slot));
+                    self.emit_local_set(&lhs_ty, slot, func);
                 } else {
-                    func.emit(StackOp::LocalTee(slot));
+                    self.emit_local_tee(&lhs_ty, slot, func);
                 }
                 return;
             }
@@ -1440,31 +1412,19 @@ impl<'a> FunctionTranslator<'a> {
                 self.translate_expr(rhs_id, func);
                 self.emit_wrap_for_expected_slice(lhs_ty, rhs_id, func);
                 let tmp = self.alloc_scalar();
-                if lhs_is_f32 {
-                    func.emit(StackOp::LocalSetF(tmp));
-                } else {
-                    func.emit(StackOp::LocalSet(tmp));
-                }
+                self.emit_local_set(&lhs_ty, tmp, func);
                 tmp
             }
         } else {
             self.translate_expr(rhs_id, func);
             self.emit_wrap_for_expected_slice(lhs_ty, rhs_id, func);
             let tmp = self.alloc_scalar();
-            if lhs_is_f32 {
-                func.emit(StackOp::LocalSetF(tmp));
-            } else {
-                func.emit(StackOp::LocalSet(tmp));
-            }
+            self.emit_local_set(&lhs_ty, tmp, func);
             tmp
         };
 
         self.translate_lvalue(lhs_id, func); // pushes address
-        if lhs_is_f32 {
-            func.emit(StackOp::LocalGetF(val_local));
-        } else {
-            func.emit(StackOp::LocalGet(val_local));
-        }
+        self.emit_local_get(&lhs_ty, val_local, func);
 
         // For Func type field assignment, only copy func_idx (8 bytes).
         if matches!(&*lhs_ty, Type::Func(_, _)) {
@@ -1481,11 +1441,7 @@ impl<'a> FunctionTranslator<'a> {
 
         self.emit_store_op(&lhs_ty, func);
         if !self.void_ctx {
-            if lhs_is_f32 {
-                func.emit(StackOp::LocalGetF(val_local));
-            } else {
-                func.emit(StackOp::LocalGet(val_local));
-            }
+            self.emit_local_get(&lhs_ty, val_local, func);
         }
     }
 
@@ -1673,7 +1629,7 @@ impl<'a> FunctionTranslator<'a> {
         match op {
             Unop::Neg => match &*ty {
                 Type::Float32 => func.emit(StackOp::FNegF),
-                Type::Float64 => func.emit(StackOp::DNeg),
+                Type::Float64 => func.emit(StackOp::DNegD),
                 _ => func.emit(StackOp::INeg),
             },
             Unop::Not => {
@@ -1728,6 +1684,7 @@ impl<'a> FunctionTranslator<'a> {
                     let ty = self.expr_type(arg_id);
                     match &*ty {
                         Type::Float32 => func.emit(StackOp::PrintF32F),
+                        Type::Float64 => func.emit(StackOp::PrintF64D),
                         _ => func.emit(StackOp::PrintI32),
                     }
                 }
@@ -1807,29 +1764,29 @@ impl<'a> FunctionTranslator<'a> {
                 ("isinf$f32", StackOp::IsinfF32F),
             ];
             let unary_math_f64: &[(&str, StackOp)] = &[
-                ("sin$f64", StackOp::SinF64),
-                ("cos$f64", StackOp::CosF64),
-                ("tan$f64", StackOp::TanF64),
-                ("asin$f64", StackOp::AsinF64),
-                ("acos$f64", StackOp::AcosF64),
-                ("atan$f64", StackOp::AtanF64),
-                ("sinh$f64", StackOp::SinhF64),
-                ("cosh$f64", StackOp::CoshF64),
-                ("tanh$f64", StackOp::TanhF64),
-                ("asinh$f64", StackOp::AsinhF64),
-                ("acosh$f64", StackOp::AcoshF64),
-                ("atanh$f64", StackOp::AtanhF64),
-                ("ln$f64", StackOp::LnF64),
-                ("exp$f64", StackOp::ExpF64),
-                ("exp2$f64", StackOp::Exp2F64),
-                ("log10$f64", StackOp::Log10F64),
-                ("log2$f64", StackOp::Log2F64),
-                ("sqrt$f64", StackOp::SqrtF64),
-                ("abs$f64", StackOp::AbsF64),
-                ("floor$f64", StackOp::FloorF64),
-                ("ceil$f64", StackOp::CeilF64),
-                ("isnan$f64", StackOp::IsnanF64),
-                ("isinf$f64", StackOp::IsinfF64),
+                ("sin$f64", StackOp::SinF64D),
+                ("cos$f64", StackOp::CosF64D),
+                ("tan$f64", StackOp::TanF64D),
+                ("asin$f64", StackOp::AsinF64D),
+                ("acos$f64", StackOp::AcosF64D),
+                ("atan$f64", StackOp::AtanF64D),
+                ("sinh$f64", StackOp::SinhF64D),
+                ("cosh$f64", StackOp::CoshF64D),
+                ("tanh$f64", StackOp::TanhF64D),
+                ("asinh$f64", StackOp::AsinhF64D),
+                ("acosh$f64", StackOp::AcoshF64D),
+                ("atanh$f64", StackOp::AtanhF64D),
+                ("ln$f64", StackOp::LnF64D),
+                ("exp$f64", StackOp::ExpF64D),
+                ("exp2$f64", StackOp::Exp2F64D),
+                ("log10$f64", StackOp::Log10F64D),
+                ("log2$f64", StackOp::Log2F64D),
+                ("sqrt$f64", StackOp::SqrtF64D),
+                ("abs$f64", StackOp::AbsF64D),
+                ("floor$f64", StackOp::FloorF64D),
+                ("ceil$f64", StackOp::CeilF64D),
+                ("isnan$f64", StackOp::IsnanF64D),
+                ("isinf$f64", StackOp::IsinfF64D),
             ];
             for (n, op) in unary_math_f32.iter() {
                 if *name == *n {
@@ -1856,7 +1813,7 @@ impl<'a> FunctionTranslator<'a> {
             if *name == "atan2$f64$f64" {
                 self.translate_expr(arg_ids[0], func);
                 self.translate_expr(arg_ids[1], func);
-                func.emit(StackOp::Atan2F64);
+                func.emit(StackOp::Atan2F64D);
                 return;
             }
 
@@ -1870,7 +1827,7 @@ impl<'a> FunctionTranslator<'a> {
             if *name == "pow$f64$f64" {
                 self.translate_expr(arg_ids[0], func);
                 self.translate_expr(arg_ids[1], func);
-                func.emit(StackOp::DPow);
+                func.emit(StackOp::DPowD);
                 return;
             }
 
@@ -1883,24 +1840,23 @@ impl<'a> FunctionTranslator<'a> {
                 let is_f64 = name.contains("f64");
                 let is_min = name.contains("min");
                 // Local set/get for the a/b temps: f32 goes through the
-                // float window (LocalSetF/LocalGetF); f64 through the
-                // int window (LocalSet/LocalGet — f64 values ride as
-                // u64 bit patterns).
+                // float window (LocalSetF/LocalGetF), f64 through the
+                // double window (LocalSetD/LocalGetD).
                 let local_set = |slot: u16| {
                     if is_f64 {
-                        StackOp::LocalSet(slot)
+                        StackOp::LocalSetD(slot)
                     } else {
                         StackOp::LocalSetF(slot)
                     }
                 };
                 let local_get = |slot: u16| {
                     if is_f64 {
-                        StackOp::LocalGet(slot)
+                        StackOp::LocalGetD(slot)
                     } else {
                         StackOp::LocalGetF(slot)
                     }
                 };
-                let cmp_lt = if is_f64 { StackOp::DLt } else { StackOp::FLtF };
+                let cmp_lt = if is_f64 { StackOp::DLtD } else { StackOp::FLtF };
                 self.translate_expr(arg_ids[0], func);
                 let a_local = self.alloc_scalar();
                 func.emit(local_set(a_local));
@@ -1975,6 +1931,8 @@ impl<'a> FunctionTranslator<'a> {
                                 let arg_ty = self.expr_type(arg);
                                 if matches!(&*arg_ty, Type::Float32) {
                                     func.emit(StackOp::FToBitsF);
+                                } else if matches!(&*arg_ty, Type::Float64) {
+                                    func.emit(StackOp::DToBitsD);
                                 }
                                 c_arg_count += 1;
                             }
@@ -1995,6 +1953,8 @@ impl<'a> FunctionTranslator<'a> {
                         // translate_call's +1 invariant still holds.
                         if matches!(&*ret_ty, Type::Float32) {
                             func.emit(StackOp::BitsToFF);
+                        } else if matches!(&*ret_ty, Type::Float64) {
+                            func.emit(StackOp::BitsToDD);
                         }
                         return;
                     }
@@ -2041,6 +2001,8 @@ impl<'a> FunctionTranslator<'a> {
                     let arg_ty = self.expr_type(*arg_id);
                     if matches!(&*arg_ty, Type::Float32) {
                         func.emit(StackOp::FToBitsF);
+                    } else if matches!(&*arg_ty, Type::Float64) {
+                        func.emit(StackOp::DToBitsD);
                     }
                 }
                 if param_ty.is_some_and(|t| matches!(&*t, Type::Slice(_))) {
@@ -2082,6 +2044,9 @@ impl<'a> FunctionTranslator<'a> {
                 // Bridge into the float window so the surrounding codegen
                 // can consume them as f32 directly.
                 func.emit(StackOp::BitsToFF);
+            } else if matches!(&*ret_ty, Type::Float64) {
+                // Same for f64: bridge t0 bits into the double window.
+                func.emit(StackOp::BitsToDD);
             }
             // Otherwise the call already pushed its return value.
 
@@ -2553,10 +2518,10 @@ impl<'a> FunctionTranslator<'a> {
         match (&*src_ty, &*target_ty) {
             (Type::Int32, Type::Float32) => func.emit(StackOp::I32ToF32F),
             (Type::Float32, Type::Int32) => func.emit(StackOp::F32ToI32F),
-            (Type::Int32, Type::Float64) => func.emit(StackOp::I32ToF64),
-            (Type::Float64, Type::Int32) => func.emit(StackOp::F64ToI32),
-            (Type::Float32, Type::Float64) => func.emit(StackOp::F32ToF64),
-            (Type::Float64, Type::Float32) => func.emit(StackOp::F64ToF32),
+            (Type::Int32, Type::Float64) => func.emit(StackOp::I32ToF64D),
+            (Type::Float64, Type::Int32) => func.emit(StackOp::F64ToI32D),
+            (Type::Float32, Type::Float64) => func.emit(StackOp::F32ToF64D),
+            (Type::Float64, Type::Float32) => func.emit(StackOp::F64ToF32D),
             (Type::Int32, Type::Int8) | (Type::UInt32, Type::Int8) => func.emit(StackOp::I32ToI8),
             (Type::Int8, Type::Int32) => func.emit(StackOp::I8ToI32),
             (Type::Int32, Type::UInt32) | (Type::UInt32, Type::Int32) => {
@@ -2707,13 +2672,41 @@ impl<'a> FunctionTranslator<'a> {
         }
     }
 
+    /// Emit the `local.get` variant for the TOS window a scalar of `ty`
+    /// occupies: `LocalGetF` for f32, `LocalGetD` for f64, else `LocalGet`.
+    fn emit_local_get(&self, ty: &TypeID, slot: u16, func: &mut StackFunction) {
+        func.emit(match &**ty {
+            Type::Float32 => StackOp::LocalGetF(slot),
+            Type::Float64 => StackOp::LocalGetD(slot),
+            _ => StackOp::LocalGet(slot),
+        });
+    }
+
+    /// Emit `LocalSetF` (f32), `LocalSetD` (f64) or `LocalSet` (any other `ty`).
+    fn emit_local_set(&self, ty: &TypeID, slot: u16, func: &mut StackFunction) {
+        func.emit(match &**ty {
+            Type::Float32 => StackOp::LocalSetF(slot),
+            Type::Float64 => StackOp::LocalSetD(slot),
+            _ => StackOp::LocalSet(slot),
+        });
+    }
+
+    /// Emit `LocalTeeF` (f32), `LocalTeeD` (f64) or `LocalTee` (any other `ty`).
+    fn emit_local_tee(&self, ty: &TypeID, slot: u16, func: &mut StackFunction) {
+        func.emit(match &**ty {
+            Type::Float32 => StackOp::LocalTeeF(slot),
+            Type::Float64 => StackOp::LocalTeeD(slot),
+            _ => StackOp::LocalTee(slot),
+        });
+    }
+
     /// Emit a load instruction based on type. Pops address, pushes value.
     fn emit_load(&self, ty: &TypeID, func: &mut StackFunction) {
         match &**ty {
             Type::Bool | Type::Int8 | Type::UInt8 => func.emit(StackOp::Load8),
             Type::Float32 => func.emit(StackOp::LoadF32F),
             Type::Int32 | Type::UInt32 => func.emit(StackOp::Load32),
-            Type::Float64 => func.emit(StackOp::Load64),
+            Type::Float64 => func.emit(StackOp::LoadF64D),
             _ => func.emit(StackOp::Load64),
         }
     }
@@ -2732,7 +2725,7 @@ impl<'a> FunctionTranslator<'a> {
                 func.emit(StackOp::Load32Off(offset));
             }
             Type::Float64 => {
-                func.emit(StackOp::Load64Off(offset));
+                func.emit(StackOp::LoadF64OffD(offset));
             }
             _ => {
                 func.emit(StackOp::Load64Off(offset));
@@ -2750,7 +2743,7 @@ impl<'a> FunctionTranslator<'a> {
                 Type::Bool | Type::Int8 | Type::UInt8 => func.emit(StackOp::Store8),
                 Type::Float32 => func.emit(StackOp::StoreF32F),
                 Type::Int32 | Type::UInt32 => func.emit(StackOp::Store32),
-                Type::Float64 => func.emit(StackOp::Store64),
+                Type::Float64 => func.emit(StackOp::StoreF64D),
                 _ => func.emit(StackOp::Store64),
             }
         }
@@ -2844,7 +2837,7 @@ impl<'a> FunctionTranslator<'a> {
                     func.emit(StackOp::Store32Off(offset));
                 }
                 Type::Float64 => {
-                    func.emit(StackOp::Store64Off(offset));
+                    func.emit(StackOp::StoreF64OffD(offset));
                 }
                 _ => {
                     func.emit(StackOp::Store32Off(offset));
diff --git a/src/stack_depth.rs b/src/stack_depth.rs
index eccc7627..62bc8d18 100644
--- a/src/stack_depth.rs
+++ b/src/stack_depth.rs
@@ -28,6 +28,8 @@ pub fn compute_depths(func: &StackFunction) -> Vec<u8> {
             | StackOp::FusedBoundsCheck8JumpIfZero(_, off) => Some(*off),
             StackOp::FusedF32ConstFGtJumpIfZeroF(_, off) => Some(*off),
             StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off) => Some(*off),
+            StackOp::FusedF64ConstDGtJumpIfZeroD(_, off) => Some(*off),
+            StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, off) => Some(*off),
             _ => None,
         };
         if let Some(off) = off {
@@ -315,6 +317,190 @@ pub fn stack_delta(op: &StackOp) -> i32 {
         | StackOp::FusedGetGetFMulSum7F(_, _)
         | StackOp::FusedGetGetFMulSum8F(_, _) => 0,
         StackOp::FusedTeeSliceStore32F(_, _, _) => 0,
+
+        // === Double-window (D) ops: integer-window deltas ===
+        // Pure d-window ops don't touch the int window.
+        StackOp::F64ConstD(_)
+        | StackOp::LocalGetD(_)
+        | StackOp::LocalSetD(_)
+        | StackOp::LocalTeeD(_)
+        | StackOp::DropD
+        | StackOp::DAddD
+        | StackOp::DSubD
+        | StackOp::DMulD
+        | StackOp::DDivD
+        | StackOp::DPowD
+        | StackOp::DNegD
+        | StackOp::F32ToF64D
+        | StackOp::F64ToF32D
+        | StackOp::SinF64D
+        | StackOp::CosF64D
+        | StackOp::TanF64D
+        | StackOp::AsinF64D
+        | StackOp::AcosF64D
+        | StackOp::AtanF64D
+        | StackOp::SinhF64D
+        | StackOp::CoshF64D
+        | StackOp::TanhF64D
+        | StackOp::AsinhF64D
+        | StackOp::AcoshF64D
+        | StackOp::AtanhF64D
+        | StackOp::LnF64D
+        | StackOp::ExpF64D
+        | StackOp::Exp2F64D
+        | StackOp::Log10F64D
+        | StackOp::Log2F64D
+        | StackOp::SqrtF64D
+        | StackOp::AbsF64D
+        | StackOp::FloorF64D
+        | StackOp::CeilF64D
+        | StackOp::Atan2F64D
+        | StackOp::PrintF64D => 0,
+
+        // d-window → int window: push 1 to the int window.
+        StackOp::DEqD
+        | StackOp::DNeD
+        | StackOp::DLtD
+        | StackOp::DLeD
+        | StackOp::DGtD
+        | StackOp::DGeD
+        | StackOp::F64ToI32D
+        | StackOp::DToBitsD
+        | StackOp::IsnanF64D
+        | StackOp::IsinfF64D => 1,
+
+        // int window → d-window: pop 1 from the int window. Loads/stores
+        // take the address from the int window (value lives in d-window).
+        StackOp::I32ToF64D
+        | StackOp::BitsToDD
+        | StackOp::LoadF64D
+        | StackOp::LoadF64OffD(_)
+        | StackOp::StoreF64D
+        | StackOp::StoreF64OffD(_) => -1,
+
+        // Fused d-window superinstructions read operands from locals[] and
+        // operate only on the d-window, so the int window is untouched.
+        StackOp::FusedGetGetDMulD(_, _)
+        | StackOp::FusedGetGetDMulDAddD(_, _)
+        | StackOp::FusedGetGetDMulDSubD(_, _)
+        | StackOp::FusedGetGetDMulSum2D(_, _)
+        | StackOp::FusedGetGetDMulSum3D(_, _)
+        | StackOp::FusedGetGetDMulSum4D(_, _)
+        | StackOp::FusedGetGetDMulSum5D(_, _)
+        | StackOp::FusedGetGetDMulSum6D(_, _)
+        | StackOp::FusedGetGetDMulSum7D(_, _)
+        | StackOp::FusedGetGetDMulSum8D(_, _)
+        | StackOp::FusedGetSetD(_, _)
+        | StackOp::FusedGetSet2D(_)
+        | StackOp::FusedGetSet3D(_)
+        | StackOp::FusedGetSet4D(_)
+        | StackOp::FusedGetSet5D(_)
+        | StackOp::FusedGetSet6D(_)
+        | StackOp::FusedGetSet7D(_)
+        | StackOp::FusedGetSet8D(_)
+        | StackOp::FusedF64ConstDGtJumpIfZeroD(_, _)
+        | StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, _) => 0,
+    }
+}
+
+/// Net change in double-window depth for `op` (pushes minus pops on d0..d3).
+///
+/// The f64 analogue of `float_stack_delta`, tracked independently of the
+/// integer and float windows: crossing ops such as `DToBitsD` or `F64ToF32D`
+/// also appear in those windows' delta functions. Non-d-window ops yield 0.
+pub fn double_stack_delta(op: &StackOp) -> i32 {
+    match op {
+        // Pushes onto the d-window.
+        StackOp::F64ConstD(_)
+        | StackOp::LocalGetD(_)
+        | StackOp::I32ToF64D
+        | StackOp::BitsToDD
+        | StackOp::F32ToF64D
+        | StackOp::LoadF64D
+        | StackOp::LoadF64OffD(_) => 1,
+
+        // Pops from the d-window (single value).
+        StackOp::LocalSetD(_)
+        | StackOp::DropD
+        | StackOp::F64ToI32D
+        | StackOp::DToBitsD
+        | StackOp::F64ToF32D
+        | StackOp::IsnanF64D
+        | StackOp::IsinfF64D
+        | StackOp::StoreF64D
+        | StackOp::StoreF64OffD(_)
+        | StackOp::PrintF64D => -1,
+
+        // Peek (LocalTeeD) and unary in-window ops: net 0.
+        StackOp::LocalTeeD(_)
+        | StackOp::DNegD
+        | StackOp::SinF64D
+        | StackOp::CosF64D
+        | StackOp::TanF64D
+        | StackOp::AsinF64D
+        | StackOp::AcosF64D
+        | StackOp::AtanF64D
+        | StackOp::SinhF64D
+        | StackOp::CoshF64D
+        | StackOp::TanhF64D
+        | StackOp::AsinhF64D
+        | StackOp::AcoshF64D
+        | StackOp::AtanhF64D
+        | StackOp::LnF64D
+        | StackOp::ExpF64D
+        | StackOp::Exp2F64D
+        | StackOp::Log10F64D
+        | StackOp::Log2F64D
+        | StackOp::SqrtF64D
+        | StackOp::AbsF64D
+        | StackOp::FloorF64D
+        | StackOp::CeilF64D => 0,
+
+        // Binary d-window arith: pop 2, push 1 = -1.
+        StackOp::DAddD
+        | StackOp::DSubD
+        | StackOp::DMulD
+        | StackOp::DDivD
+        | StackOp::DPowD
+        | StackOp::Atan2F64D => -1,
+
+        // Double comparisons: pop 2 from d-window (result goes to int).
+        StackOp::DEqD
+        | StackOp::DNeD
+        | StackOp::DLtD
+        | StackOp::DLeD
+        | StackOp::DGtD
+        | StackOp::DGeD => -2,
+
+        // Fused d-window superinstructions.
+        // Push one result: bare mul and the mul-accumulate sums.
+        StackOp::FusedGetGetDMulD(_, _)
+        | StackOp::FusedGetGetDMulSum2D(_, _)
+        | StackOp::FusedGetGetDMulSum3D(_, _)
+        | StackOp::FusedGetGetDMulSum4D(_, _)
+        | StackOp::FusedGetGetDMulSum5D(_, _)
+        | StackOp::FusedGetGetDMulSum6D(_, _)
+        | StackOp::FusedGetGetDMulSum7D(_, _)
+        | StackOp::FusedGetGetDMulSum8D(_, _) => 1,
+
+        // Accumulate onto d0 in place (net 0). Get/set move chains and the
+        // direct-from-local compare/jump don't touch the d-window.
+        StackOp::FusedGetGetDMulDAddD(_, _)
+        | StackOp::FusedGetGetDMulDSubD(_, _)
+        | StackOp::FusedGetSetD(_, _)
+        | StackOp::FusedGetSet2D(_)
+        | StackOp::FusedGetSet3D(_)
+        | StackOp::FusedGetSet4D(_)
+        | StackOp::FusedGetSet5D(_)
+        | StackOp::FusedGetSet6D(_)
+        | StackOp::FusedGetSet7D(_)
+        | StackOp::FusedGetSet8D(_)
+        | StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, _) => 0,
+
+        // Pops d0 for the comparison before branching.
+        StackOp::FusedF64ConstDGtJumpIfZeroD(_, _) => -1,
+
+        _ => 0,
     }
 }
 
@@ -419,14 +605,17 @@ pub fn float_stack_delta(op: &StackOp) -> i32 {
         // f0 ± coeff*state from frame slot — net 0 in f-window.
         StackOp::FusedGetAddrFMulFAddF(_, _, _) | StackOp::FusedGetAddrFMulFSubF(_, _, _) => 0,
 
-        // Crossings: F→int pops f-window
+        // Crossings: F→int pops f-window. F32ToF64D pops the f-window
+        // (f32) and pushes the d-window (handled in double_stack_delta).
         StackOp::F32ToI32F
         | StackOp::FToBitsF
         | StackOp::IsnanF32F
         | StackOp::IsinfF32F
-        | StackOp::F32ToF64 => -1,
-        // int→F pushes f-window
-        StackOp::I32ToF32F | StackOp::BitsToFF | StackOp::F64ToF32 => 1,
+        | StackOp::F32ToF64
+        | StackOp::F32ToF64D => -1,
+        // int→F pushes f-window. F64ToF32D pops the d-window and pushes
+        // the f-window (f32).
+        StackOp::I32ToF32F | StackOp::BitsToFF | StackOp::F64ToF32 | StackOp::F64ToF32D => 1,
 
         // f-window stores pop f0
         StackOp::StoreF32F | StackOp::StoreF32OffF(_) => -1,
diff --git a/src/stack_inline.rs b/src/stack_inline.rs
index 5396f925..1b32d006 100644
--- a/src/stack_inline.rs
+++ b/src/stack_inline.rs
@@ -154,6 +154,12 @@ fn inline_calls_in(func: &mut StackFunction, bodies: &[Option<Vec<StackOp>>]) {
             StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off) => {
                 Some((i as i64 + 1 + *off as i64) as usize)
             }
+            StackOp::FusedF64ConstDGtJumpIfZeroD(_, off) => {
+                Some((i as i64 + 1 + *off as i64) as usize)
+            }
+            StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, off) => {
+                Some((i as i64 + 1 + *off as i64) as usize)
+            }
             _ => None,
         };
 
@@ -183,6 +189,8 @@ fn inline_calls_in(func: &mut StackFunction, bodies: &[Option<Vec<StackOp>>]) {
                 | StackOp::FusedBoundsCheck8JumpIfZero(_, o) => *o = new_off as i32,
                 StackOp::FusedF32ConstFGtJumpIfZeroF(_, o) => *o = new_off as i32,
                 StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, o) => *o = new_off as i32,
+                StackOp::FusedF64ConstDGtJumpIfZeroD(_, o) => *o = new_off as i32,
+                StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, o) => *o = new_off as i32,
                 _ => unreachable!(),
             }
         }
diff --git a/src/stack_interp.c b/src/stack_interp.c
index 116ee8c6..08bb3767 100644
--- a/src/stack_interp.c
+++ b/src/stack_interp.c
@@ -103,6 +103,16 @@ static inline void store_f32_unaligned(void* p, float v) {
     memcpy(p, &v, sizeof(v));
 }
 
+static inline double load_f64_unaligned(const void* p) {
+    double v;
+    memcpy(&v, p, sizeof(v));  // byte copy: p needs no particular alignment
+    return v;
+}
+
+static inline void store_f64_unaligned(void* p, double v) {
+    memcpy(p, &v, sizeof(v));  // byte copy: p needs no particular alignment
+}
+
 // ============================================================================
 // Integer power
 // ============================================================================
@@ -147,7 +157,7 @@ static int64_t ipow(int64_t base, uint32_t exp) {
 // cross between GPR and FP register files, dodging the ~3-cycle
 // fmov/movq penalty that the old "f32 bit-pattern in u64" design
 // paid on every float arithmetic op.
-#define HANDLER_ARGS Ctx* ctx, Instruction* pc, uint64_t* sp, float* fsp, uint64_t* locals, uint64_t t0, uint64_t t1, uint64_t t2, uint64_t t3, float f0, float f1, float f2, float f3, void* _nh_raw
+#define HANDLER_ARGS Ctx* ctx, Instruction* pc, uint64_t* sp, float* fsp, double* dfsp, uint64_t* locals, uint64_t t0, uint64_t t1, uint64_t t2, uint64_t t3, float f0, float f1, float f2, float f3, double d0, double d1, double d2, double d3, void* _nh_raw
 
 #define HANDLER(name) PRESERVE_NONE void name(HANDLER_ARGS)
 // Cast nh from void* for use in NEXT macro.
@@ -163,14 +173,14 @@ static int64_t ipow(int64_t base, uint32_t exp) {
     do { \
         Instruction* _next = pc + 1; \
         void* _new_nh = (_next + 1)->handler; \
-        __attribute__((musttail)) return ((Handler)_nh_raw)(ctx, _next, sp, fsp, locals, t0, t1, t2, t3, f0, f1, f2, f3, _new_nh); \
+        __attribute__((musttail)) return ((Handler)_nh_raw)(ctx, _next, sp, fsp, dfsp, locals, t0, t1, t2, t3, f0, f1, f2, f3, d0, d1, d2, d3, _new_nh); \
     } while(0)
 
 #define DISPATCH() \
     do { \
         Handler _target_h = (Handler)pc->handler; \
         void* _new_nh = (pc + 1)->handler; \
-        __attribute__((musttail)) return _target_h(ctx, pc, sp, fsp, locals, t0, t1, t2, t3, f0, f1, f2, f3, _new_nh); \
+        __attribute__((musttail)) return _target_h(ctx, pc, sp, fsp, dfsp, locals, t0, t1, t2, t3, f0, f1, f2, f3, d0, d1, d2, d3, _new_nh); \
     } while(0)
 
 // How many backward jumps between cancel-callback invocations. Must match
@@ -211,6 +221,22 @@ static int64_t ipow(int64_t base, uint32_t exp) {
 #define FDROP1() do { f0 = f1; f1 = f2; f2 = f3; f3 = *--fsp; } while(0)
 #define FBINOP_SHIFT() do { f1 = f2; f2 = f3; f3 = *--fsp; } while(0)
 
+// f64 TOS window (d0 = top) — exact mirror of the f32 window above, typed
+// `double` to avoid the GPR↔FP crossings of the old integer-window path.
+// DPUSH spills d3 to *dfsp++; DPOP/DDROP1 refill d3 from *--dfsp;
+// DBINOP_SHIFT keeps d0 (the binop result) and shifts d2, d3, spill up.
+#define DPUSH(val) do { \
+    *dfsp++ = d3; \
+    d3 = d2; d2 = d1; d1 = d0; d0 = (val); \
+} while(0)
+
+#define DPOP(dst) do { \
+    (dst) = d0; d0 = d1; d1 = d2; d2 = d3; d3 = *--dfsp; \
+} while(0)
+
+#define DDROP1() do { d0 = d1; d1 = d2; d2 = d3; d3 = *--dfsp; } while(0)
+#define DBINOP_SHIFT() do { d1 = d2; d2 = d3; d3 = *--dfsp; } while(0)
+
 // ============================================================================
 // Handlers
 // ============================================================================
@@ -1766,6 +1792,226 @@ HANDLER(op_print_f32_f) {
     NEXT();
 }
 
+// ============================================================================
+// Double-window (D) handlers — the f64 analogue of the float window.
+// f64 values live in d0..d3 (FP regs), spilling through dfsp. Each handler
+// mirrors its f32 counterpart above with float→double / f→d / fsp→dfsp.
+// ============================================================================
+
+// --- Constants and locals (double window) ---
+
+HANDLER(op_f64_const_d) {
+    // imm[0] is the f64 bit pattern. Reinterpret as double and push.
+    DPUSH(as_f64(pc->imm[0]));
+    NEXT();
+}
+
+// imm[0] is a pre-shifted byte offset into locals[] (see encode_imm),
+// so the handler emits a single `ldr d, [locals, imm0]` with no shift.
+HANDLER(op_local_get_d) {
+    DPUSH(*(double*)((uint8_t*)locals + pc->imm[0]));
+    NEXT();
+}
+HANDLER(op_local_set_d) {
+    *(double*)((uint8_t*)locals + pc->imm[0]) = d0;
+    DDROP1();
+    NEXT();
+}
+HANDLER(op_local_tee_d) {
+    *(double*)((uint8_t*)locals + pc->imm[0]) = d0;  // store without popping: d0 stays on the window
+    NEXT();
+}
+HANDLER(op_drop_d) {
+    DDROP1();
+    NEXT();
+}
+
+// --- Double arithmetic (binary): pop b=d0, a=d1, push a OP b ---
+
+HANDLER(op_dadd_d) {
+    d0 = d1 + d0;
+    DBINOP_SHIFT();
+    NEXT();
+}
+HANDLER(op_dsub_d) {
+    d0 = d1 - d0;
+    DBINOP_SHIFT();
+    NEXT();
+}
+HANDLER(op_dmul_d) {
+    d0 = d1 * d0;
+    DBINOP_SHIFT();
+    NEXT();
+}
+HANDLER(op_ddiv_d) {
+    d0 = d1 / d0;
+    DBINOP_SHIFT();
+    NEXT();
+}
+HANDLER(op_dpow_d) {
+    d0 = pow(d1, d0);
+    DBINOP_SHIFT();
+    NEXT();
+}
+HANDLER(op_dneg_d) {
+    d0 = -d0;
+    NEXT();
+}
+
+// --- Comparisons: pop b=d0, a=d1 from d-window, push (a OP b) as 0/1 to int window ---
+
+#define DW_CMP(name, op) \
+HANDLER(name) { \
+    PUSH((d1 op d0) ? 1ULL : 0ULL); \
+    /* drop both doubles: d2/d3 move up, then refill two slots from the spill stack */ \
+    d0 = d2; d1 = d3; d2 = *--dfsp; d3 = *--dfsp; \
+    NEXT(); \
+}
+DW_CMP(op_deq_d, ==)
+DW_CMP(op_dne_d, !=)
+DW_CMP(op_dlt_d, <)
+DW_CMP(op_dle_d, <=)
+DW_CMP(op_dgt_d, >)
+DW_CMP(op_dge_d, >=)
+
+// --- Conversions / window crossings ---
+
+HANDLER(op_f64_to_i32_d) {
+    // Pop d0, push int t0 = (int32)d0. NOTE(review): NaN / out-of-range input is UB in C.
+    int64_t v = (int64_t)(int32_t)d0;  // convert to int32, then sign-extend to the 64-bit slot
+    PUSH((uint64_t)v);
+    DDROP1();
+    NEXT();
+}
+HANDLER(op_i32_to_f64_d) {
+    // Pop t0 (int32), push d0 = (double)i
+    double v = (double)(int32_t)t0;
+    DROP1();
+    DPUSH(v);
+    NEXT();
+}
+HANDLER(op_to_bits_d) {
+    // Pop d0, push int t0 = bit pattern of d0
+    PUSH(from_f64(d0));
+    DDROP1();
+    NEXT();
+}
+HANDLER(op_from_bits_d) {
+    // Pop t0 (bit pattern), push d0 = double
+    double v = as_f64(t0);
+    DROP1();
+    DPUSH(v);
+    NEXT();
+}
+HANDLER(op_f32_to_f64_d) {
+    // Pop f0 (f32) from float window, push widened d0 (f64).
+    double v = (double)f0;
+    FDROP1();
+    DPUSH(v);
+    NEXT();
+}
+HANDLER(op_f64_to_f32_d) {
+    // Pop d0 (f64) from double window, push narrowed f0 (f32).
+    float v = (float)d0;
+    DDROP1();
+    FPUSH(v);
+    NEXT();
+}
+
+// --- Double memory loads: pop addr from int window, push to d-window ---
+
+HANDLER(op_load_f64_d) {
+    double v = load_f64_unaligned((const void*)t0);
+    DROP1();
+    DPUSH(v);
+    NEXT();
+}
+HANDLER(op_load_f64_off_d) {
+    int32_t off = (int32_t)pc->imm[0];
+    double v = load_f64_unaligned((uint8_t*)t0 + off);
+    DROP1();
+    DPUSH(v);
+    NEXT();
+}
+
+// --- Double memory stores: pop d0 (value), pop t0 (addr) ---
+
+HANDLER(op_store_f64_d) {
+    store_f64_unaligned((void*)t0, d0);
+    DROP1();
+    DDROP1();
+    NEXT();
+}
+HANDLER(op_store_f64_off_d) {
+    int32_t off = (int32_t)pc->imm[0];
+    store_f64_unaligned((uint8_t*)t0 + off, d0);
+    DROP1();
+    DDROP1();
+    NEXT();
+}
+
+// --- Math intrinsics (double window) ---
+
+#define DW_F64_UNARY(name, func) \
+HANDLER(name) { \
+    d0 = func(d0); \
+    NEXT(); \
+}
+DW_F64_UNARY(op_sin_f64_d,   sin)
+DW_F64_UNARY(op_cos_f64_d,   cos)
+DW_F64_UNARY(op_tan_f64_d,   tan)
+DW_F64_UNARY(op_asin_f64_d,  asin)
+DW_F64_UNARY(op_acos_f64_d,  acos)
+DW_F64_UNARY(op_atan_f64_d,  atan)
+DW_F64_UNARY(op_sinh_f64_d,  sinh)
+DW_F64_UNARY(op_cosh_f64_d,  cosh)
+DW_F64_UNARY(op_tanh_f64_d,  tanh)
+DW_F64_UNARY(op_asinh_f64_d, asinh)
+DW_F64_UNARY(op_acosh_f64_d, acosh)
+DW_F64_UNARY(op_atanh_f64_d, atanh)
+DW_F64_UNARY(op_ln_f64_d,    log)
+DW_F64_UNARY(op_exp_f64_d,   exp)
+DW_F64_UNARY(op_exp2_f64_d,  exp2)
+DW_F64_UNARY(op_log10_f64_d, log10)
+DW_F64_UNARY(op_log2_f64_d,  log2)
+DW_F64_UNARY(op_sqrt_f64_d,  sqrt)
+DW_F64_UNARY(op_abs_f64_d,   fabs)
+DW_F64_UNARY(op_floor_f64_d, floor)
+DW_F64_UNARY(op_ceil_f64_d,  ceil)
+
+HANDLER(op_atan2_f64_d) {
+    // Binary in d-window: pop b=d0, a=d1, push atan2(a, b).
+    d0 = atan2(d1, d0);
+    DBINOP_SHIFT();
+    NEXT();
+}
+
+HANDLER(op_isnan_f64_d) {
+    int v = isnan(d0) ? 1 : 0;
+    PUSH((uint64_t)v);
+    DDROP1();
+    NEXT();
+}
+HANDLER(op_isinf_f64_d) {
+    int v = isinf(d0) ? 1 : 0;
+    PUSH((uint64_t)v);
+    DDROP1();
+    NEXT();
+}
+
+// --- Debug ---
+
+HANDLER(op_print_f64_d) {
+    double val = d0;
+    DDROP1();
+    if (val == floor(val) && fabs(val) < 1e15) {  // integral and below 1e15: print with one decimal
+        printf("%.1f\n", val);
+    } else {  // fractional, |val| >= 1e15, inf or NaN: default %g formatting
+        printf("%g\n", val);
+    }
+    NEXT();
+}
+
 // --- Float-window fused superinstructions (Phase 5) ---
 
 // NOTE: imm[0] and imm[1] on these handlers are pre-shifted byte offsets
@@ -1959,6 +2205,119 @@ HANDLER(op_fused_get_f32const_fgt_jiz_f) {
     NEXT();
 }
 
+// ============================================================================
+// Double-window (D) fused superinstructions — mirror the F-window set.
+// Operands are read directly from locals[] by pre-shifted byte offset.
+// ============================================================================
+
+HANDLER(op_fused_get_get_dmul_d) {
+    double a = *(double*)((uint8_t*)locals + pc->imm[0]);
+    double b = *(double*)((uint8_t*)locals + pc->imm[1]);
+    DPUSH(a * b);
+    NEXT();
+}
+HANDLER(op_fused_get_get_dmul_dadd_d) {
+    double a = *(double*)((uint8_t*)locals + pc->imm[0]);
+    double b = *(double*)((uint8_t*)locals + pc->imm[1]);
+    d0 = d0 + a * b;
+    NEXT();
+}
+HANDLER(op_fused_get_get_dmul_dsub_d) {
+    double a = *(double*)((uint8_t*)locals + pc->imm[0]);
+    double b = *(double*)((uint8_t*)locals + pc->imm[1]);
+    d0 = d0 - a * b;
+    NEXT();
+}
+
+#define DMUL_SUM_HANDLER(name, TERMS) \
+HANDLER(name) { \
+    uint8_t sub_mask = (uint8_t)pc->imm[2]; /* bit i set: term i is subtracted */ \
+    double acc = 0.0; \
+    for (int i = 0; i < (TERMS); i++) { \
+        uint8_t a_idx = imm_u8(pc, i * 2); /* local slot index of factor a */ \
+        uint8_t b_idx = imm_u8(pc, i * 2 + 1); /* local slot index of factor b */ \
+        double a = *(double*)((uint8_t*)locals + (size_t)a_idx * 8); /* slots are 8 bytes */ \
+        double b = *(double*)((uint8_t*)locals + (size_t)b_idx * 8); \
+        double prod = a * b; \
+        acc = (sub_mask & (1u << i)) ? (acc - prod) : (acc + prod); \
+    } \
+    DPUSH(acc); /* one result pushed onto the d-window */ \
+    NEXT(); \
+}
+DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum2_d, 2)
+DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum3_d, 3)
+DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum4_d, 4)
+DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum5_d, 5)
+DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum6_d, 6)
+DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum7_d, 7)
+DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum8_d, 8)
+
+#define COPY_D_PAIR(N) do { \
+    uint8_t src = imm_u8(pc, (N) * 2); \
+    uint8_t dst = imm_u8(pc, (N) * 2 + 1); \
+    *(double*)((uint8_t*)locals + (size_t)dst * 8) = *(double*)((uint8_t*)locals + (size_t)src * 8); \
+} while (0)
+
+HANDLER(op_fused_get_set_d) {
+    COPY_D_PAIR(0);
+    NEXT();
+}
+HANDLER(op_fused_get_set2_d) {
+    COPY_D_PAIR(0); COPY_D_PAIR(1);
+    NEXT();
+}
+HANDLER(op_fused_get_set3_d) {
+    COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2);
+    NEXT();
+}
+HANDLER(op_fused_get_set4_d) {
+    COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); COPY_D_PAIR(3);
+    NEXT();
+}
+HANDLER(op_fused_get_set5_d) {
+    COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); COPY_D_PAIR(3); COPY_D_PAIR(4);
+    NEXT();
+}
+HANDLER(op_fused_get_set6_d) {
+    COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); COPY_D_PAIR(3); COPY_D_PAIR(4);
+    COPY_D_PAIR(5);
+    NEXT();
+}
+HANDLER(op_fused_get_set7_d) {
+    COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); COPY_D_PAIR(3); COPY_D_PAIR(4);
+    COPY_D_PAIR(5); COPY_D_PAIR(6);
+    NEXT();
+}
+HANDLER(op_fused_get_set8_d) {
+    COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); COPY_D_PAIR(3); COPY_D_PAIR(4);
+    COPY_D_PAIR(5); COPY_D_PAIR(6); COPY_D_PAIR(7);
+    NEXT();
+}
+
+HANDLER(op_fused_f64const_dgt_jiz_d) {
+    double val = d0;
+    double limit = as_f64(pc->imm[0]);
+    DDROP1();  // the compared value is consumed whether or not the branch is taken
+    if (!(val > limit)) {  // jump when the compare is false; a NaN operand also jumps
+        int64_t off = (int64_t)pc->imm[1];
+        pc = pc + 1 + off;  // offset is relative to the following instruction
+        if (off < 0) POLL_CANCEL();  // only backward jumps poll for cancellation
+        DISPATCH();
+    }
+    NEXT();
+}
+HANDLER(op_fused_get_f64const_dgt_jiz_d) {
+    double val = *(double*)((uint8_t*)locals + pc->imm[0]);  // read straight from locals; d-window untouched
+    double limit = as_f64(pc->imm[1]);
+    if (!(val > limit)) {  // jump when the compare is false; a NaN operand also jumps
+        int64_t off = (int64_t)pc->imm[2];
+        pc = pc + 1 + off;  // offset is relative to the following instruction
+        if (off < 0) POLL_CANCEL();  // only backward jumps poll for cancellation
+        DISPATCH();
+    }
+    NEXT();
+}
+
 // ============================================================================
 // Entry point
 // ============================================================================
@@ -1992,7 +2351,7 @@ int64_t stack_interp_run(Ctx* ctx, uint32_t entry_func) {
     // and x86-64.
     Instruction* pc = ctx->functions[entry_func].code;
     Handler initial_nh = (Handler)(pc + 1)->handler;
-    ((Handler)pc->handler)(ctx, pc, ctx->stack_base, ctx->float_stack, entry_locals, 0, 0, 0, 0, 0.0f, 0.0f, 0.0f, 0.0f, initial_nh);
+    ((Handler)pc->handler)(ctx, pc, ctx->stack_base, ctx->float_stack, ctx->double_stack, entry_locals, 0, 0, 0, 0, 0.0f, 0.0f, 0.0f, 0.0f, 0.0, 0.0, 0.0, 0.0, initial_nh);
 
     return ctx->result;
 }
diff --git a/src/stack_interp.h b/src/stack_interp.h
index 2c9506f1..9ba3a360 100644
--- a/src/stack_interp.h
+++ b/src/stack_interp.h
@@ -87,6 +87,15 @@ typedef struct Ctx {
     float*       float_stack;
     size_t       float_stack_cap;
 
+    // Double spill stack: backing store for the f64 TOS window (d0..d3)
+    // when its depth exceeds 4. Mirrors float_stack but typed `double`,
+    // so f64 arithmetic stays in FP registers and never pays the GPR↔FP
+    // crossing the old "f64 bit-pattern in u64" design forced. The live
+    // top pointer lives in the `dfsp` handler argument; this is the base
+    // for bounds checks and the initial value passed to the entry handler.
+    double*      double_stack;
+    size_t       double_stack_cap;
+
     // Closure pointer (set by call_closure, read by handlers)
     uint64_t     closure_ptr;
 
@@ -148,25 +157,34 @@ typedef struct Ctx {
 // float values coexist on the logical stack; static types at each
 // position tell the codegen which window to use.
 //
-// The window is typed as `float` (not `double`) so f32 arithmetic
-// compiles to direct single-precision FMA/fadd/... instructions
-// without the fcvt round-trips that a double-typed window forces on
-// every op. f64 values — rare in our hot workloads — still travel
-// through the integer window paying GPR↔FP crossings.
+// The f32 window is typed as `float` (not `double`) so f32 arithmetic
+// compiles to direct single-precision FMA/fadd/... instructions without
+// the fcvt round-trips that a double-typed window forces on every op.
+//
+// f64 gets its own parallel 4-slot window (d0..d3) typed `double`, living
+// in a separate set of FP/SIMD registers, with its own spill pointer
+// `dfsp`. f64 arithmetic stays in FP registers throughout, the same way
+// f32 does. The two windows together use 8 FP argument registers
+// (v0..v7 on aarch64, xmm0..xmm7 on x86-64) — the full FP arg budget.
 typedef PRESERVE_NONE void (*Handler)(
     Ctx*          ctx,
     Instruction*  pc,
     uint64_t*     sp,
-    float*        fsp,    // float spill pointer (lives in a GPR via preserve_none)
+    float*        fsp,    // float (f32) spill pointer (lives in a GPR via preserve_none)
+    double*       dfsp,   // double (f64) spill pointer (lives in a GPR via preserve_none)
     uint64_t*     locals, // frame pointer: scalars, then local memory contiguously
     uint64_t      t0,     // int TOS window (GPRs)
     uint64_t      t1,
     uint64_t      t2,
     uint64_t      t3,
-    float         f0,     // float TOS window (FP regs)
+    float         f0,     // f32 TOS window (FP regs)
     float         f1,
     float         f2,
     float         f3,
+    double        d0,     // f64 TOS window (FP regs)
+    double        d1,
+    double        d2,
+    double        d3,
     void*         nh      // preloaded handler for the NEXT instruction (cast to Handler)
 );
 
diff --git a/src/stack_interp_bridge.rs b/src/stack_interp_bridge.rs
index 4d6615e4..c87972f3 100644
--- a/src/stack_interp_bridge.rs
+++ b/src/stack_interp_bridge.rs
@@ -42,6 +42,8 @@ struct Ctx {
     stack_base: *mut u64,
     float_stack: *mut f32,
     float_stack_cap: usize,
+    double_stack: *mut f64,
+    double_stack_cap: usize,
     closure_ptr: u64,
     result: i64,
     done: i32,
@@ -286,6 +288,82 @@ extern "C" {
     fn op_fused_get_set8_f();
     fn op_fused_f32const_fgt_jiz_f();
     fn op_fused_get_f32const_fgt_jiz_f();
+
+    // === Double-window (D) handlers ===
+    fn op_f64_const_d();
+    fn op_local_get_d();
+    fn op_local_set_d();
+    fn op_local_tee_d();
+    fn op_drop_d();
+    fn op_dadd_d();
+    fn op_dsub_d();
+    fn op_dmul_d();
+    fn op_ddiv_d();
+    fn op_dpow_d();
+    fn op_dneg_d();
+    fn op_deq_d();
+    fn op_dne_d();
+    fn op_dlt_d();
+    fn op_dle_d();
+    fn op_dgt_d();
+    fn op_dge_d();
+    fn op_f64_to_i32_d();
+    fn op_i32_to_f64_d();
+    fn op_to_bits_d();
+    fn op_from_bits_d();
+    fn op_f32_to_f64_d();
+    fn op_f64_to_f32_d();
+    fn op_load_f64_d();
+    fn op_load_f64_off_d();
+    fn op_store_f64_d();
+    fn op_store_f64_off_d();
+    fn op_sin_f64_d();
+    fn op_cos_f64_d();
+    fn op_tan_f64_d();
+    fn op_asin_f64_d();
+    fn op_acos_f64_d();
+    fn op_atan_f64_d();
+    fn op_sinh_f64_d();
+    fn op_cosh_f64_d();
+    fn op_tanh_f64_d();
+    fn op_asinh_f64_d();
+    fn op_acosh_f64_d();
+    fn op_atanh_f64_d();
+    fn op_ln_f64_d();
+    fn op_exp_f64_d();
+    fn op_exp2_f64_d();
+    fn op_log10_f64_d();
+    fn op_log2_f64_d();
+    fn op_sqrt_f64_d();
+    fn op_abs_f64_d();
+    fn op_floor_f64_d();
+    fn op_ceil_f64_d();
+    fn op_atan2_f64_d();
+    fn op_isnan_f64_d();
+    fn op_isinf_f64_d();
+    fn op_print_f64_d();
+
+    // === Double-window fused handlers ===
+    fn op_fused_get_get_dmul_d();
+    fn op_fused_get_get_dmul_dadd_d();
+    fn op_fused_get_get_dmul_dsub_d();
+    fn op_fused_get_get_dmul_sum2_d();
+    fn op_fused_get_get_dmul_sum3_d();
+    fn op_fused_get_get_dmul_sum4_d();
+    fn op_fused_get_get_dmul_sum5_d();
+    fn op_fused_get_get_dmul_sum6_d();
+    fn op_fused_get_get_dmul_sum7_d();
+    fn op_fused_get_get_dmul_sum8_d();
+    fn op_fused_get_set_d();
+    fn op_fused_get_set2_d();
+    fn op_fused_get_set3_d();
+    fn op_fused_get_set4_d();
+    fn op_fused_get_set5_d();
+    fn op_fused_get_set6_d();
+    fn op_fused_get_set7_d();
+    fn op_fused_get_set8_d();
+    fn op_fused_f64const_dgt_jiz_d();
+    fn op_fused_get_f64const_dgt_jiz_d();
 }
 
 /// Get the C handler function pointer for a StackOp.
@@ -522,6 +600,84 @@ fn handler_for(op: &StackOp) -> *const () {
         StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, _) => {
             op_fused_get_f32const_fgt_jiz_f as *const ()
         }
+
+        // === Double-window (D) ops ===
+        StackOp::F64ConstD(_) => op_f64_const_d as *const (),
+        StackOp::LocalGetD(_) => op_local_get_d as *const (),
+        StackOp::LocalSetD(_) => op_local_set_d as *const (),
+        StackOp::LocalTeeD(_) => op_local_tee_d as *const (),
+        StackOp::DropD => op_drop_d as *const (),
+        StackOp::DAddD => op_dadd_d as *const (),
+        StackOp::DSubD => op_dsub_d as *const (),
+        StackOp::DMulD => op_dmul_d as *const (),
+        StackOp::DDivD => op_ddiv_d as *const (),
+        StackOp::DPowD => op_dpow_d as *const (),
+        StackOp::DNegD => op_dneg_d as *const (),
+        StackOp::DEqD => op_deq_d as *const (),
+        StackOp::DNeD => op_dne_d as *const (),
+        StackOp::DLtD => op_dlt_d as *const (),
+        StackOp::DLeD => op_dle_d as *const (),
+        StackOp::DGtD => op_dgt_d as *const (),
+        StackOp::DGeD => op_dge_d as *const (),
+        StackOp::F64ToI32D => op_f64_to_i32_d as *const (),
+        StackOp::I32ToF64D => op_i32_to_f64_d as *const (),
+        StackOp::DToBitsD => op_to_bits_d as *const (),
+        StackOp::BitsToDD => op_from_bits_d as *const (),
+        StackOp::F32ToF64D => op_f32_to_f64_d as *const (),
+        StackOp::F64ToF32D => op_f64_to_f32_d as *const (),
+        StackOp::LoadF64D => op_load_f64_d as *const (),
+        StackOp::LoadF64OffD(_) => op_load_f64_off_d as *const (),
+        StackOp::StoreF64D => op_store_f64_d as *const (),
+        StackOp::StoreF64OffD(_) => op_store_f64_off_d as *const (),
+        StackOp::SinF64D => op_sin_f64_d as *const (),
+        StackOp::CosF64D => op_cos_f64_d as *const (),
+        StackOp::TanF64D => op_tan_f64_d as *const (),
+        StackOp::AsinF64D => op_asin_f64_d as *const (),
+        StackOp::AcosF64D => op_acos_f64_d as *const (),
+        StackOp::AtanF64D => op_atan_f64_d as *const (),
+        StackOp::SinhF64D => op_sinh_f64_d as *const (),
+        StackOp::CoshF64D => op_cosh_f64_d as *const (),
+        StackOp::TanhF64D => op_tanh_f64_d as *const (),
+        StackOp::AsinhF64D => op_asinh_f64_d as *const (),
+        StackOp::AcoshF64D => op_acosh_f64_d as *const (),
+        StackOp::AtanhF64D => op_atanh_f64_d as *const (),
+        StackOp::LnF64D => op_ln_f64_d as *const (),
+        StackOp::ExpF64D => op_exp_f64_d as *const (),
+        StackOp::Exp2F64D => op_exp2_f64_d as *const (),
+        StackOp::Log10F64D => op_log10_f64_d as *const (),
+        StackOp::Log2F64D => op_log2_f64_d as *const (),
+        StackOp::SqrtF64D => op_sqrt_f64_d as *const (),
+        StackOp::AbsF64D => op_abs_f64_d as *const (),
+        StackOp::FloorF64D => op_floor_f64_d as *const (),
+        StackOp::CeilF64D => op_ceil_f64_d as *const (),
+        StackOp::Atan2F64D => op_atan2_f64_d as *const (),
+        StackOp::IsnanF64D => op_isnan_f64_d as *const (),
+        StackOp::IsinfF64D => op_isinf_f64_d as *const (),
+        StackOp::PrintF64D => op_print_f64_d as *const (),
+
+        // === Double-window fused ops ===
+        StackOp::FusedGetGetDMulD(_, _) => op_fused_get_get_dmul_d as *const (),
+        StackOp::FusedGetGetDMulDAddD(_, _) => op_fused_get_get_dmul_dadd_d as *const (),
+        StackOp::FusedGetGetDMulDSubD(_, _) => op_fused_get_get_dmul_dsub_d as *const (),
+        StackOp::FusedGetGetDMulSum2D(_, _) => op_fused_get_get_dmul_sum2_d as *const (),
+        StackOp::FusedGetGetDMulSum3D(_, _) => op_fused_get_get_dmul_sum3_d as *const (),
+        StackOp::FusedGetGetDMulSum4D(_, _) => op_fused_get_get_dmul_sum4_d as *const (),
+        StackOp::FusedGetGetDMulSum5D(_, _) => op_fused_get_get_dmul_sum5_d as *const (),
+        StackOp::FusedGetGetDMulSum6D(_, _) => op_fused_get_get_dmul_sum6_d as *const (),
+        StackOp::FusedGetGetDMulSum7D(_, _) => op_fused_get_get_dmul_sum7_d as *const (),
+        StackOp::FusedGetGetDMulSum8D(_, _) => op_fused_get_get_dmul_sum8_d as *const (),
+        StackOp::FusedGetSetD(_, _) => op_fused_get_set_d as *const (),
+        StackOp::FusedGetSet2D(_) => op_fused_get_set2_d as *const (),
+        StackOp::FusedGetSet3D(_) => op_fused_get_set3_d as *const (),
+        StackOp::FusedGetSet4D(_) => op_fused_get_set4_d as *const (),
+        StackOp::FusedGetSet5D(_) => op_fused_get_set5_d as *const (),
+        StackOp::FusedGetSet6D(_) => op_fused_get_set6_d as *const (),
+        StackOp::FusedGetSet7D(_) => op_fused_get_set7_d as *const (),
+        StackOp::FusedGetSet8D(_) => op_fused_get_set8_d as *const (),
+        StackOp::FusedF64ConstDGtJumpIfZeroD(_, _) => op_fused_f64const_dgt_jiz_d as *const (),
+        StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, _) => {
+            op_fused_get_f64const_dgt_jiz_d as *const ()
+        }
     }
 }
 
@@ -719,6 +875,42 @@ fn encode_imm(op: &StackOp, func_idx: u32) -> [u64; 3] {
         StackOp::FusedGetF32ConstFGtJumpIfZeroF(n, v, off) => {
             [(*n as u64) * 8, f32::to_bits(*v) as u64, *off as i64 as u64]
         }
+        // === Double-window (D) ops ===
+        // f64 const: imm[0] is the f64 bit pattern. Local indices are
+        // pre-shifted to byte offsets so the handler emits `ldr d` with
+        // no scale (locals[] has 8-byte stride, same as the f32 path).
+        StackOp::F64ConstD(v) => [f64::to_bits(*v), 0, 0],
+        StackOp::LocalGetD(n) | StackOp::LocalSetD(n) | StackOp::LocalTeeD(n) => {
+            [(*n as u64) * 8, 0, 0]
+        }
+        StackOp::LoadF64OffD(o) | StackOp::StoreF64OffD(o) => [*o as i64 as u64, 0, 0],
+        // Double-window fused ops. get_get_dmul[_dadd|_dsub] carry two pre-shifted
+        // byte offsets; the sumN/get-set chains carry packed raw u8 local indices (the
+        // handler scales them); compare-jumps carry [local byte offset,] f64 bits, off.
+        StackOp::FusedGetGetDMulD(a, b)
+        | StackOp::FusedGetGetDMulDAddD(a, b)
+        | StackOp::FusedGetGetDMulDSubD(a, b) => [(*a as u64) * 8, (*b as u64) * 8, 0],
+        StackOp::FusedGetGetDMulSum2D(p, mask) => pack_u8_imms_with_tail(p, *mask),
+        StackOp::FusedGetGetDMulSum3D(p, mask) => pack_u8_imms_with_tail(p, *mask),
+        StackOp::FusedGetGetDMulSum4D(p, mask) => pack_u8_imms_with_tail(p, *mask),
+        StackOp::FusedGetGetDMulSum5D(p, mask) => pack_u8_imms_with_tail(p, *mask),
+        StackOp::FusedGetGetDMulSum6D(p, mask) => pack_u8_imms_with_tail(p, *mask),
+        StackOp::FusedGetGetDMulSum7D(p, mask) => pack_u8_imms_with_tail(p, *mask),
+        StackOp::FusedGetGetDMulSum8D(p, mask) => pack_u8_imms_with_tail(p, *mask),
+        StackOp::FusedGetSetD(src, dst) => pack_u8_imms(&[*src, *dst]),
+        StackOp::FusedGetSet2D(p) => pack_u8_imms(p),
+        StackOp::FusedGetSet3D(p) => pack_u8_imms(p),
+        StackOp::FusedGetSet4D(p) => pack_u8_imms(p),
+        StackOp::FusedGetSet5D(p) => pack_u8_imms(p),
+        StackOp::FusedGetSet6D(p) => pack_u8_imms(p),
+        StackOp::FusedGetSet7D(p) => pack_u8_imms(p),
+        StackOp::FusedGetSet8D(p) => pack_u8_imms(p),
+        StackOp::FusedF64ConstDGtJumpIfZeroD(v, off) => {
+            [f64::to_bits(*v), *off as i64 as u64, 0]
+        }
+        StackOp::FusedGetF64ConstDGtJumpIfZeroD(n, v, off) => {
+            [(*n as u64) * 8, f64::to_bits(*v), *off as i64 as u64]
+        }
         _ => [0, 0, 0],
     }
 }
@@ -749,6 +941,7 @@ pub struct StackBackend {
     operand_stack: Vec<u64>,
     frame_stack: Vec<u64>,
     float_stack: Vec<f32>,
+    double_stack: Vec<f64>,
     ctx: Ctx,
 }
 
@@ -793,6 +986,8 @@ impl StackBackend {
         let mut frame_stack: Vec<u64> = vec![0u64; frame_stack_cap];
         let float_stack_cap: usize = 64 * 1024;
         let mut float_stack: Vec<f32> = vec![0.0f32; float_stack_cap];
+        let double_stack_cap: usize = 64 * 1024;
+        let mut double_stack: Vec<f64> = vec![0.0f64; double_stack_cap];
 
         let ctx = Ctx {
             call_stack: call_stack.as_mut_ptr(),
@@ -807,6 +1002,8 @@ impl StackBackend {
             stack_base: operand_stack.as_mut_ptr(),
             float_stack: float_stack.as_mut_ptr(),
             float_stack_cap,
+            double_stack: double_stack.as_mut_ptr(),
+            double_stack_cap,
             closure_ptr: 0,
             result: 0,
             done: 0,
@@ -825,6 +1022,7 @@ impl StackBackend {
             operand_stack,
             frame_stack,
             float_stack,
+            double_stack,
             ctx,
         }
     }
@@ -869,6 +1067,7 @@ impl StackBackend {
             &self.operand_stack,
             &self.frame_stack,
             &self.float_stack,
+            &self.double_stack,
         );
         result
     }
diff --git a/src/stack_ir.rs b/src/stack_ir.rs
index 5707e629..5e0d06d6 100644
--- a/src/stack_ir.rs
+++ b/src/stack_ir.rs
@@ -483,6 +483,121 @@ pub enum StackOp {
     /// if !(locals[n] > const) jump. Pop 0, conditionally jump.
     FusedGetF32ConstFGtJumpIfZeroF(u16, f32, i32),
 
+    // === Double-window (D) ops — the f64 analogue of the float window ===
+    // f64 values live in a parallel 4-slot FP register window (d0..d3),
+    // spilling through `dfsp`. These mirror the F-window ops one-for-one
+    // so f64 arithmetic stays in FP registers and never pays the GPR↔FP
+    // crossing the old int-window f64 path forced on every op.
+    /// Push an f64 constant onto the double window.
+    F64ConstD(f64),
+    /// Push scalar local N (interpreted as f64) onto the double window.
+    LocalGetD(u16),
+    /// Pop top of double window into scalar local N (stored as f64 bits).
+    LocalSetD(u16),
+    /// Copy top of double window into scalar local N (don't pop).
+    LocalTeeD(u16),
+    /// Pop and discard top of double window.
+    DropD,
+
+    // Double arithmetic (binary pop 2 push 1, all in d-window).
+    DAddD,
+    DSubD,
+    DMulD,
+    DDivD,
+    DPowD,
+    /// Pop 1 push 1 (negate) on d-window.
+    DNegD,
+
+    // Double comparisons: pop 2 from d-window, push 0/1 to the int window.
+    DEqD,
+    DNeD,
+    DLtD,
+    DLeD,
+    DGtD,
+    DGeD,
+
+    // Crossings between the double window and the int / float windows.
+    /// Pop d0 (f64), push i32 (truncated) to int window.
+    F64ToI32D,
+    /// Pop i32 from int window, push f64 to d-window.
+    I32ToF64D,
+    /// Pop d0, push its raw u64 bit pattern to the int window (call/return
+    /// bridge — mirrors FToBitsF).
+    DToBitsD,
+    /// Pop u64 bits from int window, push as f64 to d-window (mirrors BitsToFF).
+    BitsToDD,
+    /// Pop f0 (f32) from the float window, push widened f64 to d-window.
+    F32ToF64D,
+    /// Pop d0 (f64) from the double window, push narrowed f32 to f-window.
+    F64ToF32D,
+
+    // Double memory loads/stores: address comes from the int window, the
+    // value lives in the d-window.
+    LoadF64D,
+    LoadF64OffD(i32),
+    StoreF64D,
+    StoreF64OffD(i32),
+
+    // Double math intrinsics (pop 1 push 1 in d-window unless noted).
+    SinF64D,
+    CosF64D,
+    TanF64D,
+    AsinF64D,
+    AcosF64D,
+    AtanF64D,
+    SinhF64D,
+    CoshF64D,
+    TanhF64D,
+    AsinhF64D,
+    AcoshF64D,
+    AtanhF64D,
+    LnF64D,
+    ExpF64D,
+    Exp2F64D,
+    Log10F64D,
+    Log2F64D,
+    SqrtF64D,
+    AbsF64D,
+    FloorF64D,
+    CeilF64D,
+    /// Pop d0, push i32 0/1 to int window.
+    IsnanF64D,
+    IsinfF64D,
+    /// Pop 2 from d-window, push 1 (atan2).
+    Atan2F64D,
+    /// Pop d0 and print it.
+    PrintF64D,
+
+    // === Double-window fused superinstructions (mirror the F-window set) ===
+    /// Push locals[a] * locals[b] onto the d-window.
+    FusedGetGetDMulD(u16, u16),
+    /// d0 += locals[a] * locals[b] (accumulate; net 0 on the d-window).
+    FusedGetGetDMulDAddD(u16, u16),
+    /// d0 -= locals[a] * locals[b].
+    FusedGetGetDMulDSubD(u16, u16),
+    /// Fused multiply-accumulate sum: read N (local,local) pairs, multiply
+    /// each, sum with per-term add/sub from the mask, push one result.
+    FusedGetGetDMulSum2D([u8; 4], u8),
+    FusedGetGetDMulSum3D([u8; 6], u8),
+    FusedGetGetDMulSum4D([u8; 8], u8),
+    FusedGetGetDMulSum5D([u8; 10], u8),
+    FusedGetGetDMulSum6D([u8; 12], u8),
+    FusedGetGetDMulSum7D([u8; 14], u8),
+    FusedGetGetDMulSum8D([u8; 16], u8),
+    /// d-window variable-move chains: locals[dst] = locals[src] (×N).
+    FusedGetSetD(u8, u8),
+    FusedGetSet2D([u8; 4]),
+    FusedGetSet3D([u8; 6]),
+    FusedGetSet4D([u8; 8]),
+    FusedGetSet5D([u8; 10]),
+    FusedGetSet6D([u8; 12]),
+    FusedGetSet7D([u8; 14]),
+    FusedGetSet8D([u8; 16]),
+    /// if !(d0 > const) jump; pops d0.
+    FusedF64ConstDGtJumpIfZeroD(f64, i32),
+    /// if !(locals[n] > const) jump. Pop 0, conditionally jump.
+    FusedGetF64ConstDGtJumpIfZeroD(u16, f64, i32),
+
     Halt,
     Nop,
 }
@@ -1034,6 +1149,146 @@ impl fmt::Display for StackOp {
             StackOp::FusedGetF32ConstFGtJumpIfZeroF(n, v, o) => {
                 write!(f, "fw.fused.get_f32const_fgt_jiz {} {} {}", n, v, o)
             }
+            // === Double-window (D) ops ===
+            StackOp::F64ConstD(v) => write!(f, "dw.f64.const {}", v),
+            StackOp::LocalGetD(n) => write!(f, "dw.local.get {}", n),
+            StackOp::LocalSetD(n) => write!(f, "dw.local.set {}", n),
+            StackOp::LocalTeeD(n) => write!(f, "dw.local.tee {}", n),
+            StackOp::DropD => write!(f, "dw.drop"),
+            StackOp::DAddD => write!(f, "dw.f64.add"),
+            StackOp::DSubD => write!(f, "dw.f64.sub"),
+            StackOp::DMulD => write!(f, "dw.f64.mul"),
+            StackOp::DDivD => write!(f, "dw.f64.div"),
+            StackOp::DPowD => write!(f, "dw.f64.pow"),
+            StackOp::DNegD => write!(f, "dw.f64.neg"),
+            StackOp::DEqD => write!(f, "dw.f64.eq"),
+            StackOp::DNeD => write!(f, "dw.f64.ne"),
+            StackOp::DLtD => write!(f, "dw.f64.lt"),
+            StackOp::DLeD => write!(f, "dw.f64.le"),
+            StackOp::DGtD => write!(f, "dw.f64.gt"),
+            StackOp::DGeD => write!(f, "dw.f64.ge"),
+            StackOp::F64ToI32D => write!(f, "dw.convert.f64_to_i32"),
+            StackOp::I32ToF64D => write!(f, "dw.convert.i32_to_f64"),
+            StackOp::DToBitsD => write!(f, "dw.to_bits"),
+            StackOp::BitsToDD => write!(f, "dw.from_bits"),
+            StackOp::F32ToF64D => write!(f, "dw.convert.f32_to_f64"),
+            StackOp::F64ToF32D => write!(f, "dw.convert.f64_to_f32"),
+            StackOp::LoadF64D => write!(f, "dw.f64.load"),
+            StackOp::LoadF64OffD(o) => write!(f, "dw.f64.load offset={}", o),
+            StackOp::StoreF64D => write!(f, "dw.f64.store"),
+            StackOp::StoreF64OffD(o) => write!(f, "dw.f64.store offset={}", o),
+            StackOp::SinF64D => write!(f, "dw.f64.sin"),
+            StackOp::CosF64D => write!(f, "dw.f64.cos"),
+            StackOp::TanF64D => write!(f, "dw.f64.tan"),
+            StackOp::AsinF64D => write!(f, "dw.f64.asin"),
+            StackOp::AcosF64D => write!(f, "dw.f64.acos"),
+            StackOp::AtanF64D => write!(f, "dw.f64.atan"),
+            StackOp::SinhF64D => write!(f, "dw.f64.sinh"),
+            StackOp::CoshF64D => write!(f, "dw.f64.cosh"),
+            StackOp::TanhF64D => write!(f, "dw.f64.tanh"),
+            StackOp::AsinhF64D => write!(f, "dw.f64.asinh"),
+            StackOp::AcoshF64D => write!(f, "dw.f64.acosh"),
+            StackOp::AtanhF64D => write!(f, "dw.f64.atanh"),
+            StackOp::LnF64D => write!(f, "dw.f64.ln"),
+            StackOp::ExpF64D => write!(f, "dw.f64.exp"),
+            StackOp::Exp2F64D => write!(f, "dw.f64.exp2"),
+            StackOp::Log10F64D => write!(f, "dw.f64.log10"),
+            StackOp::Log2F64D => write!(f, "dw.f64.log2"),
+            StackOp::SqrtF64D => write!(f, "dw.f64.sqrt"),
+            StackOp::AbsF64D => write!(f, "dw.f64.abs"),
+            StackOp::FloorF64D => write!(f, "dw.f64.floor"),
+            StackOp::CeilF64D => write!(f, "dw.f64.ceil"),
+            StackOp::IsnanF64D => write!(f, "dw.f64.isnan"),
+            StackOp::IsinfF64D => write!(f, "dw.f64.isinf"),
+            StackOp::Atan2F64D => write!(f, "dw.f64.atan2"),
+            StackOp::PrintF64D => write!(f, "dw.debug.print_f64"),
+            // === Double-window fused superinstructions ===
+            StackOp::FusedGetGetDMulD(a, b) => write!(f, "dw.fused.get_get_dmul {} {}", a, b),
+            StackOp::FusedGetGetDMulDAddD(a, b) => {
+                write!(f, "dw.fused.get_get_dmul_dadd {} {}", a, b)
+            }
+            StackOp::FusedGetGetDMulDSubD(a, b) => {
+                write!(f, "dw.fused.get_get_dmul_dsub {} {}", a, b)
+            }
+            StackOp::FusedGetGetDMulSum2D(p, mask) => write!(
+                f,
+                "dw.fused.get_get_dmul_sum2 {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], mask
+            ),
+            StackOp::FusedGetGetDMulSum3D(p, mask) => write!(
+                f,
+                "dw.fused.get_get_dmul_sum3 {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], mask
+            ),
+            StackOp::FusedGetGetDMulSum4D(p, mask) => write!(
+                f,
+                "dw.fused.get_get_dmul_sum4 {} {} {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], mask
+            ),
+            StackOp::FusedGetGetDMulSum5D(p, mask) => write!(
+                f,
+                "dw.fused.get_get_dmul_sum5 {} {} {} {} {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], mask
+            ),
+            StackOp::FusedGetGetDMulSum6D(p, mask) => write!(
+                f,
+                "dw.fused.get_get_dmul_sum6 {} {} {} {} {} {} {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], mask
+            ),
+            StackOp::FusedGetGetDMulSum7D(p, mask) => write!(
+                f,
+                "dw.fused.get_get_dmul_sum7 {} {} {} {} {} {} {} {} {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12],
+                p[13], mask
+            ),
+            StackOp::FusedGetGetDMulSum8D(p, mask) => write!(
+                f,
+                "dw.fused.get_get_dmul_sum8 {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12],
+                p[13], p[14], p[15], mask
+            ),
+            StackOp::FusedGetSetD(src, dst) => write!(f, "dw.fused.get_set {} {}", src, dst),
+            StackOp::FusedGetSet2D(p) => {
+                write!(f, "dw.fused.get_set2 {} {} {} {}", p[0], p[1], p[2], p[3])
+            }
+            StackOp::FusedGetSet3D(p) => write!(
+                f,
+                "dw.fused.get_set3 {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5]
+            ),
+            StackOp::FusedGetSet4D(p) => write!(
+                f,
+                "dw.fused.get_set4 {} {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]
+            ),
+            StackOp::FusedGetSet5D(p) => write!(
+                f,
+                "dw.fused.get_set5 {} {} {} {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9]
+            ),
+            StackOp::FusedGetSet6D(p) => write!(
+                f,
+                "dw.fused.get_set6 {} {} {} {} {} {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11]
+            ),
+            StackOp::FusedGetSet7D(p) => write!(
+                f,
+                "dw.fused.get_set7 {} {} {} {} {} {} {} {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12],
+                p[13]
+            ),
+            StackOp::FusedGetSet8D(p) => write!(
+                f,
+                "dw.fused.get_set8 {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {}",
+                p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12],
+                p[13], p[14], p[15]
+            ),
+            StackOp::FusedF64ConstDGtJumpIfZeroD(v, o) => {
+                write!(f, "dw.fused.f64const_dgt_jiz {} {}", v, o)
+            }
+            StackOp::FusedGetF64ConstDGtJumpIfZeroD(n, v, o) => {
+                write!(f, "dw.fused.get_f64const_dgt_jiz {} {} {}", n, v, o)
+            }
             StackOp::Halt => write!(f, "halt"),
             StackOp::Nop => write!(f, "nop"),
         }
diff --git a/src/stack_optimize.rs b/src/stack_optimize.rs
index 28293d17..a6051286 100644
--- a/src/stack_optimize.rs
+++ b/src/stack_optimize.rs
@@ -39,6 +39,8 @@ fn compute_jump_targets(ops: &[StackOp]) -> Vec<bool> {
             | StackOp::FusedBoundsCheck8JumpIfZero(_, off) => Some(*off),
             StackOp::FusedF32ConstFGtJumpIfZeroF(_, off) => Some(*off),
             StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off) => Some(*off),
+            StackOp::FusedF64ConstDGtJumpIfZeroD(_, off) => Some(*off),
+            StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, off) => Some(*off),
             _ => None,
         };
         if let Some(off) = off {
@@ -139,6 +141,130 @@ fn make_fused_get_get_fmul_sum_f(bytes: &[u8], sub_mask: u8) -> StackOp {
     }
 }
 
+// === Double-window (D) chain builders — mirror the F-window helpers ===
+
+/// Builds the d-window variable-move superinstruction for a packed run of
+/// `(src, dst)` local-index bytes.
+///
+/// `bytes` holds two bytes per move, so its length selects the variant:
+/// 2 → `FusedGetSetD`, 4 → `FusedGetSet2D`, ... 16 → `FusedGetSet8D`.
+/// Any other length is a caller bug and hits `unreachable!`.
+fn make_fused_get_set_d(bytes: &[u8]) -> StackOp {
+    // Copy the length-checked slice into the variant's fixed-size array;
+    // `N` is inferred from the variant being built.
+    fn fixed<const N: usize>(src: &[u8]) -> [u8; N] {
+        let mut out = [0u8; N];
+        out.copy_from_slice(src);
+        out
+    }
+
+    match bytes.len() {
+        2 => StackOp::FusedGetSetD(bytes[0], bytes[1]),
+        4 => StackOp::FusedGetSet2D(fixed(bytes)),
+        6 => StackOp::FusedGetSet3D(fixed(bytes)),
+        8 => StackOp::FusedGetSet4D(fixed(bytes)),
+        10 => StackOp::FusedGetSet5D(fixed(bytes)),
+        12 => StackOp::FusedGetSet6D(fixed(bytes)),
+        14 => StackOp::FusedGetSet7D(fixed(bytes)),
+        16 => StackOp::FusedGetSet8D(fixed(bytes)),
+        _ => unreachable!("unsupported double get/set chain length"),
+    }
+}
+
+/// Packs `pairs` get/set op pairs from `start` into `[src, dst, ..]` bytes; `None` on any mismatch.
+fn packed_get_set_d_chain(ops: &[StackOp], start: usize, pairs: usize) -> Option<Vec<u8>> {
+    let mut packed = Vec::with_capacity(pairs * 2);
+    for at in (0..pairs).map(|pair| start + pair * 2) {
+        match (&ops[at], &ops[at + 1]) {
+            (StackOp::LocalGetD(src @ 0..=255), StackOp::LocalSetD(dst @ 0..=255)) => {
+                packed.extend_from_slice(&[*src as u8, *dst as u8]);
+            }
+            _ => return None,
+        }
+    }
+    Some(packed)
+}
+
+fn make_fused_get_get_dmul_sum_d(bytes: &[u8], sub_mask: u8) -> StackOp {
+    match bytes.len() { // two local-index bytes per product term; the length picks the SumN variant
+        4 => StackOp::FusedGetGetDMulSum2D([bytes[0], bytes[1], bytes[2], bytes[3]], sub_mask),
+        6 => StackOp::FusedGetGetDMulSum3D( // 3 terms
+            [bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5]],
+            sub_mask, // passed through unchanged into every variant
+        ),
+        8 => StackOp::FusedGetGetDMulSum4D( // 4 terms
+            [
+                bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7],
+            ],
+            sub_mask,
+        ),
+        10 => StackOp::FusedGetGetDMulSum5D( // 5 terms
+            [
+                bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7],
+                bytes[8], bytes[9],
+            ],
+            sub_mask,
+        ),
+        12 => StackOp::FusedGetGetDMulSum6D( // 6 terms
+            [
+                bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7],
+                bytes[8], bytes[9], bytes[10], bytes[11],
+            ],
+            sub_mask,
+        ),
+        14 => StackOp::FusedGetGetDMulSum7D( // 7 terms
+            [
+                bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7],
+                bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13],
+            ],
+            sub_mask,
+        ),
+        16 => StackOp::FusedGetGetDMulSum8D( // 8 terms
+            [
+                bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7],
+                bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14],
+                bytes[15],
+            ],
+            sub_mask,
+        ),
+        _ => unreachable!("unsupported double mul-sum chain length"), // fuse() only tries 2..=8 terms
+    }
+}
+
+/// Collects a run of `terms` d-window product ops starting at `start` into the
+/// packed form consumed by the `FusedGetGetDMulSum*D` superinstructions.
+///
+/// The run must open with `FusedGetGetDMulD` and continue with `...DMulDAddD` /
+/// `...DMulDSubD` ops, every local index being at most 255. Returns the `(a, b)`
+/// index bytes in order plus a mask with bit `k` set when term `k` subtracts;
+/// returns `None` as soon as an op breaks the pattern.
+fn packed_dmul_sum_fused_chain(
+    ops: &[StackOp],
+    start: usize,
+    terms: usize,
+) -> Option<(Vec<u8>, u8)> {
+    let mut packed = Vec::with_capacity(terms * 2);
+    let mut sub_mask = 0u8;
+    match &ops[start] {
+        StackOp::FusedGetGetDMulD(a @ 0..=255, b @ 0..=255) => {
+            packed.extend_from_slice(&[*a as u8, *b as u8]);
+        }
+        _ => return None,
+    }
+    for term in 1..terms {
+        let (a, b, subtract) = match &ops[start + term] {
+            StackOp::FusedGetGetDMulDAddD(a @ 0..=255, b @ 0..=255) => (*a, *b, false),
+            StackOp::FusedGetGetDMulDSubD(a @ 0..=255, b @ 0..=255) => (*a, *b, true),
+            _ => return None,
+        };
+        packed.extend_from_slice(&[a as u8, b as u8]);
+        if subtract {
+            sub_mask |= 1 << term;
+        }
+    }
+    Some((packed, sub_mask))
+}
+
 fn make_fused_bounds_check_jiz(bytes: &[u8], off: i32) -> StackOp {
     match bytes.len() {
         2 => StackOp::FusedBoundsCheck1JumpIfZero([bytes[0], bytes[1]], off),
@@ -594,6 +720,186 @@ fn fuse(func: &mut StackFunction) {
             continue;
         }
 
+        // ================= Double-window (D) fusions =================
+        // Mirror the F-window rules above. Ordered so the multi-term
+        // mul-sum chain (built from already-fused ops on a prior pass) is
+        // tried before the 4-op and 3-op builders, and the 4-op builders
+        // before the 3-op one so the longest match wins at each position.
+
+        // Constant-fold fw.f32.const v + dw.convert.f32_to_f64 → dw.f64.const.
+        // Widening f32→f64 is exact, so this is always valid; it both folds
+        // the many `<lit> as f64` conversions and exposes the constant to
+        // the d-window compare-jump fusion below.
+        if i + 1 < len && !spans_target(i, 2) {
+            if let (StackOp::F32ConstF(v), StackOp::F32ToF64D) = (&ops[i], &ops[i + 1]) {
+                let v = *v as f64;
+                ops[i] = StackOp::F64ConstD(v);
+                ops[i + 1] = StackOp::Nop;
+                i += 2;
+                continue;
+            }
+        }
+
+        // dw.fused.get_get_dmul[_dadd|_dsub]* chain → get_get_dmul_sumN.
+        let mut fused_dmul_sum = false;
+        for terms in (2..=8).rev() {
+            let span = terms;
+            if i + span <= len && !spans_target(i, span) {
+                if let Some((pairs, sub_mask)) = packed_dmul_sum_fused_chain(ops, i, terms) {
+                    ops[i] = make_fused_get_get_dmul_sum_d(&pairs, sub_mask);
+                    for slot in ops.iter_mut().take(i + span).skip(i + 1) {
+                        *slot = StackOp::Nop;
+                    }
+                    i += span;
+                    fused_dmul_sum = true;
+                    break;
+                }
+            }
+        }
+        if fused_dmul_sum {
+            continue;
+        }
+
+        // dw.local.get a + dw.local.get b + dw.f64.mul + dw.f64.add
+        //              → dw.fused.get_get_dmul_dadd.
+        if i + 3 < len && !spans_target(i, 4) {
+            if let (
+                StackOp::LocalGetD(a),
+                StackOp::LocalGetD(b),
+                StackOp::DMulD,
+                StackOp::DAddD,
+            ) = (&ops[i], &ops[i + 1], &ops[i + 2], &ops[i + 3])
+            {
+                ops[i] = StackOp::FusedGetGetDMulDAddD(*a, *b);
+                ops[i + 1] = StackOp::Nop;
+                ops[i + 2] = StackOp::Nop;
+                ops[i + 3] = StackOp::Nop;
+                i += 4;
+                continue;
+            }
+        }
+
+        // dw.local.get a + dw.local.get b + dw.f64.mul + dw.f64.sub
+        //              → dw.fused.get_get_dmul_dsub.
+        if i + 3 < len && !spans_target(i, 4) {
+            if let (
+                StackOp::LocalGetD(a),
+                StackOp::LocalGetD(b),
+                StackOp::DMulD,
+                StackOp::DSubD,
+            ) = (&ops[i], &ops[i + 1], &ops[i + 2], &ops[i + 3])
+            {
+                ops[i] = StackOp::FusedGetGetDMulDSubD(*a, *b);
+                ops[i + 1] = StackOp::Nop;
+                ops[i + 2] = StackOp::Nop;
+                ops[i + 3] = StackOp::Nop;
+                i += 4;
+                continue;
+            }
+        }
+
+        // dw.local.get a + dw.local.get b + dw.f64.mul → dw.fused.get_get_dmul.
+        if i + 2 < len && !spans_target(i, 3) {
+            if let (StackOp::LocalGetD(a), StackOp::LocalGetD(b), StackOp::DMulD) =
+                (&ops[i], &ops[i + 1], &ops[i + 2])
+            {
+                let a = *a;
+                let b = *b;
+                ops[i] = StackOp::FusedGetGetDMulD(a, b);
+                ops[i + 1] = StackOp::Nop;
+                ops[i + 2] = StackOp::Nop;
+                i += 3;
+                continue;
+            }
+        }
+
+        // dw.f64.const v + dw.f64.gt + jump_if_zero off
+        //              → FusedF64ConstDGtJumpIfZeroD.
+        if i + 2 < len && !spans_target(i, 3) {
+            if let (StackOp::F64ConstD(v), StackOp::DGtD, StackOp::JumpIfZero(off)) =
+                (&ops[i], &ops[i + 1], &ops[i + 2])
+            {
+                let v = *v;
+                let new_off = *off + 2;
+                ops[i] = StackOp::FusedF64ConstDGtJumpIfZeroD(v, new_off);
+                ops[i + 1] = StackOp::Nop;
+                ops[i + 2] = StackOp::Nop;
+                i += 3;
+                continue;
+            }
+        }
+
+        // dw.local.get n + FusedF64ConstDGtJumpIfZeroD v off
+        //              → FusedGetF64ConstDGtJumpIfZeroD.
+        if i + 1 < len && !spans_target(i, 2) {
+            if let (StackOp::LocalGetD(n), StackOp::FusedF64ConstDGtJumpIfZeroD(v, off)) =
+                (&ops[i], &ops[i + 1])
+            {
+                let n = *n;
+                let v = *v;
+                let new_off = *off + 1;
+                ops[i] = StackOp::FusedGetF64ConstDGtJumpIfZeroD(n, v, new_off);
+                ops[i + 1] = StackOp::Nop;
+                i += 2;
+                continue;
+            }
+        }
+
+        // dw.local.tee N + dw.drop → dw.local.set N.
+        if i + 1 < len && !spans_target(i, 2) {
+            if let StackOp::LocalTeeD(n) = ops[i] {
+                if matches!(ops[i + 1], StackOp::DropD) {
+                    ops[i] = StackOp::LocalSetD(n);
+                    ops[i + 1] = StackOp::Nop;
+                    i += 2;
+                    continue;
+                }
+            }
+        }
+
+        // dw.local.get + dw.drop → nop (dead read).
+        if i + 1 < len && !spans_target(i, 2) {
+            if matches!(ops[i], StackOp::LocalGetD(_)) && matches!(ops[i + 1], StackOp::DropD) {
+                ops[i] = StackOp::Nop;
+                ops[i + 1] = StackOp::Nop;
+                i += 2;
+                continue;
+            }
+        }
+
+        // dw.local.get src + dw.local.set dst (×N) → FusedGetSetND (moves).
+        let mut fused_get_set_d_chain = false;
+        for pairs in (2..=8).rev() {
+            let span = pairs * 2;
+            if i + span <= len && !spans_target(i, span) {
+                if let Some(bytes) = packed_get_set_d_chain(ops, i, pairs) {
+                    ops[i] = make_fused_get_set_d(&bytes);
+                    for slot in ops.iter_mut().take(i + span).skip(i + 1) {
+                        *slot = StackOp::Nop;
+                    }
+                    i += span;
+                    fused_get_set_d_chain = true;
+                    break;
+                }
+            }
+        }
+        if fused_get_set_d_chain {
+            continue;
+        }
+        // Single dw.local.get + dw.local.set → FusedGetSetD.
+        if i + 1 < len && !spans_target(i, 2) {
+            if let (StackOp::LocalGetD(src), StackOp::LocalSetD(dst)) = (&ops[i], &ops[i + 1]) {
+                if *src < 256 && *dst < 256 {
+                    let src = *src as u8;
+                    let dst = *dst as u8;
+                    ops[i] = StackOp::FusedGetSetD(src, dst);
+                    ops[i + 1] = StackOp::Nop;
+                    i += 2;
+                    continue;
+                }
+            }
+        }
+
         // Float-window: fw.local.get a + fw.local.get b + fw.f32.mul + fw.f32.add
         //              → fw.fused.get_get_fmul_fadd.
         if i + 3 < len && !spans_target(i, 4) {
@@ -1111,6 +1417,18 @@ fn strip_nops(func: &mut StackFunction) {
                 let new_off = target_new as i32 - new_idx[old] as i32 - 1;
                 StackOp::FusedGetF32ConstFGtJumpIfZeroF(*n, *v, new_off)
             }
+            StackOp::FusedF64ConstDGtJumpIfZeroD(v, off) => {
+                let target_old = (old as i64 + 1 + *off as i64) as usize;
+                let target_new = new_idx[target_old];
+                let new_off = target_new as i32 - new_idx[old] as i32 - 1;
+                StackOp::FusedF64ConstDGtJumpIfZeroD(*v, new_off)
+            }
+            StackOp::FusedGetF64ConstDGtJumpIfZeroD(n, v, off) => {
+                let target_old = (old as i64 + 1 + *off as i64) as usize;
+                let target_new = new_idx[target_old];
+                let new_off = target_new as i32 - new_idx[old] as i32 - 1;
+                StackOp::FusedGetF64ConstDGtJumpIfZeroD(*n, *v, new_off)
+            }
             other => other.clone(),
         };
         new_ops.push(adjusted);
diff --git a/tests/cases/f64_window.lyte b/tests/cases/f64_window.lyte
new file mode 100644
index 00000000..7485b278
--- /dev/null
+++ b/tests/cases/f64_window.lyte
@@ -0,0 +1,79 @@
+// f64 double-window coverage: arithmetic, all six comparisons, a three-term
+// fused multiply-add sum chain, f64 across function calls (arg/return
+// bridging), `as f64` / `as i32` conversions, the sqrt math intrinsic, and
+// negation (written as `0.0 - x`). Exercises the StoreF64/LoadF64 d-window
+// paths via a struct as well.
+//
+// expected stdout:
+// compilation successful
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// assert(true)
+// 42
+
+struct Acc { a: f64, b: f64, c: f64 }
+// approx: true when |x - y| is below 0.0001; the asserts in main use it for f64 equality.
+approx(x: f64, y: f64) -> bool {
+    var d = x - y
+    if d < 0.0 as f64 { d = 0.0 as f64 - d }
+    return d < 0.0001 as f64
+}
+
+// Returns a * b + c; its call in main exercises f64 argument and return value bridging.
+fma(a: f64, b: f64, c: f64) -> f64 {
+    return a * b + c
+}
+// Entry point: 14 asserts (matching the 14 `assert(true)` lines expected above), then prints 42.
+main {
+    var x = 3.0 as f64
+    var y = 2.0 as f64
+
+    // Arithmetic: +, -, *, / and subtraction from zero, each compared via approx.
+    assert(approx(x + y, 5.0 as f64))
+    assert(approx(x - y, 1.0 as f64))
+    assert(approx(x * y, 6.0 as f64))
+    assert(approx(x / y, 1.5 as f64))
+    assert(approx(0.0 as f64 - x, -3.0 as f64))
+
+    // All six comparisons: >, <, >=, <=, ==, !=.
+    assert(x > y)
+    assert(y < x)
+    assert(x >= 3.0 as f64)
+    assert(y <= 2.0 as f64)
+    assert(x == 3.0 as f64)
+    assert(x != y)
+
+    // Fused multiply-add sum chain: b0*x + b1*x1 - b2*x2 (two added terms, one subtracted).
+    var b0 = 1.0 as f64
+    var b1 = 2.0 as f64
+    var b2 = 0.5 as f64
+    var x1 = 4.0 as f64
+    var x2 = 6.0 as f64
+    var sum = b0 * x + b1 * x1 - b2 * x2
+    // 1*3 + 2*4 - 0.5*6 = 3 + 8 - 3 = 8
+    assert(approx(sum, 8.0 as f64))
+
+    // f64 across a function call (argument + return bridging): 3*2 + 1 = 7.
+    assert(approx(fma(x, y, 1.0 as f64), 7.0 as f64))
+
+    // Math intrinsic + struct store/load through the d-window: a = 4, b = 8, c = 12.
+    var acc: Acc
+    acc.a = sqrt(16.0 as f64)
+    acc.b = acc.a * 2.0 as f64
+    acc.c = acc.a + acc.b
+    assert(approx(acc.c, 12.0 as f64))
+
+    // Conversion f64 -> i32: 3 * 2 * 7 = 42, the final expected stdout line.
+    print((x * y * 7.0 as f64) as i32)
+}