diff --git a/docs/FP_CODEGEN_PLAN.md b/docs/FP_CODEGEN_PLAN.md index 88c1c32f..6729c245 100644 --- a/docs/FP_CODEGEN_PLAN.md +++ b/docs/FP_CODEGEN_PLAN.md @@ -487,6 +487,19 @@ supports `f64` but the hot-path benchmarks don't exercise it). Option: Recommendation: **Option B** for now. Add f64 F variants later if real f64 workloads emerge. +> **Resolved: Option A was implemented.** A real f64 DSP workload emerged +> (`benchmark/biquad_f64.lyte`), and on the int path it ran ~2.9× slower +> than the f32 biquad — every op paid a GPR↔FP crossing and nothing fused. +> f64 now gets a full parallel `double` window (`d0..d3` + `dfsp`), the +> exact analogue of the float window, with `D`-suffix StackOps for +> arithmetic, comparisons, conversions, memory, and math, plus the mirrored +> fused superinstructions (`get_get_dmul_sum*`, `get_set*D`, +> `get_f64const_dgt_jiz`). The two FP windows use all 8 FP arg registers +> (`v0..v7` / `xmm0..xmm7`). Result: f64 biquad dropped from ~0.36s to +> ~0.16s on the VM-host reference (`benchmark/run.sh`), ~1.2× the f32 +> biquad — the residual gap is f64's 2× state bandwidth, not dispatch or +> crossing overhead. + ### 6.3 Function call arg passing When calling a function with mixed int and float args, how are they diff --git a/docs/Stack_VM.md b/docs/Stack_VM.md index 9bfb67d5..4aa26c73 100644 --- a/docs/Stack_VM.md +++ b/docs/Stack_VM.md @@ -33,11 +33,13 @@ On bare-metal Apple M4, the Stack VM runs biquad at 0.104s, sort at It was deleted — see `docs/HOT_LOCALS.md` for the design exploration and why the cache never paid off on this VM. - **No per-op runtime type check.** The codegen tracks each stack slot's - type statically, and int vs f32 ops pick the right window at emit - time — `IAdd` reads `t0/t1`, `FAddF` reads `f0/f1`, and so on. f64 - values ride the int window as bit patterns (rare in hot code). 
+ type statically, and int / f32 / f64 ops pick the right window at emit + time — `IAdd` reads `t0/t1`, `FAddF` reads `f0/f1`, `DAddD` reads + `d0/d1`, and so on. f64 now has its own dedicated `double` window + (`d0..d3`, spilling to `*dfsp`), the exact analogue of the f32 float + window, so f64 arithmetic also stays in FP registers. - **Per-window stack-depth validation.** `src/stack_depth.rs` runs - forward over each function tracking int and float stack depth + forward over each function tracking int, float, and double stack depth independently; any jump target or call site that doesn't match between incoming edges is a codegen bug. @@ -247,8 +249,14 @@ Two non-obvious choices here, both load-bearing: register-indirect store. Commit `21f2949`. f32 and f64 coexist on the logical operand stack. The codegen tracks -each slot's type statically; f32 slots live in the float window, f64 -slots (rare) ride the int window as bit patterns. +each slot's type statically; f32 slots live in the float window (`f0..f3`) +and f64 slots live in a parallel double window (`d0..d3`). The two FP +windows together use the full 8 FP argument registers (`v0..v7` on +aarch64, `xmm0..xmm7` on x86-64). f64 gets the same fused superinstructions +as f32 — the multiply-accumulate sum chain, variable-move chains, and the +const-compare-branch — so f64 DSP loops reach near-parity with f32 (the +biquad f64 benchmark runs ~1.2× the f32 time, the gap being f64's 2× state +bandwidth, versus ~2.9× before the double window existed). ## Hot local cache (removed) @@ -477,8 +485,6 @@ VM is safe to call from a real-time audio thread. ### Limitations -- **f64 is second-class.** f64 values ride the int window, pay GPR↔FP - crossings on every op. Fine for correctness tests, not hot-pathed. - **No SIMD vector types in the VM.** `f32x4` lowers to per-lane scalar f32 ops via fused loads/stores. A proper SIMD window would need another register tier. 
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index a0c12e42..4735ce44 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -18,6 +18,9 @@ libfuzzer-sys = "0.4.12" lyte = { path = "../" } libc = "0.2.186" +[build-dependencies] +cc = "1.2.62" + [[bin]] name = "lexer" path = "fuzz_targets/lexer.rs" diff --git a/fuzz/build.rs b/fuzz/build.rs new file mode 100644 index 00000000..b8f1360e --- /dev/null +++ b/fuzz/build.rs @@ -0,0 +1,14 @@ +fn main() { + // Propagate has_stack_interp cfg if the C compiler is Clang. + // Mirrors cli/build.rs so the differential fuzz target can compile and + // run the stack backend (which depends on the Clang-only C interpreter). + let compiler = cc::Build::new().try_get_compiler(); + let is_clang = compiler + .as_ref() + .map(|c| c.is_like_clang()) + .unwrap_or(false); + if is_clang { + println!("cargo:rustc-cfg=has_stack_interp"); + } + println!("cargo:rustc-check-cfg=cfg(has_stack_interp)"); +} diff --git a/fuzz/fuzz_targets/differential.rs b/fuzz/fuzz_targets/differential.rs index b6b40496..ed5b0605 100644 --- a/fuzz/fuzz_targets/differential.rs +++ b/fuzz/fuzz_targets/differential.rs @@ -60,6 +60,21 @@ impl<'a> Gen<'a> { )); } + // Optionally emit an `a * b + c` helper for each float type. The call + // exercises float argument/return-value bridging — i.e. saving and + // restoring the relevant float window (f0..f3 / d0..d3) around a call. + let mut has_fma = [false; FLOAT_KINDS.len()]; + for (k, kind) in FLOAT_KINDS.iter().enumerate() { + if self.next() % 3 == 0 { + has_fma[k] = true; + decls.push(format!( + "{}(a: {t}, b: {t}, c: {t}) -> {t} {{\n return a * b + c\n}}", + kind.fma, + t = kind.ty + )); + } + } + // 1-4 initial integer variables let n_vars = (self.next() % 4 + 1) as usize; for i in 0..n_vars { @@ -106,6 +121,24 @@ impl<'a> Gen<'a> { vars.push((name, VarType::Enum(n_variants))); } + // Optionally declare 1-2 variables of each float type. 
f32 drives the + // stack VM's single float window (f0..f3, spilling through fsp); f64 + // drives the double window (d0..d3, spilling through dfsp). Values are + // kept small so downstream `as i32` casts stay well inside i32 range. + let mut has_floats = [false; FLOAT_KINDS.len()]; + for (k, kind) in FLOAT_KINDS.iter().enumerate() { + if self.next() % 2 == 0 { + has_floats[k] = true; + let n_fvars = (self.next() % 2 + 1) as usize; // 1-2 + for i in 0..n_fvars { + let name = format!("{}{}", kind.var_prefix, i); + main_lines + .push(format!(" var {} = {}", name, self.gen_float_literal(kind.ty))); + vars.push((name, VarType::Float(kind.ty))); + } + } + } + // 1-5 computed values, each printed let n_stmts = (self.next() % 5 + 1) as usize; for i in 0..n_stmts { @@ -116,6 +149,25 @@ impl<'a> Gen<'a> { vars.push((name, VarType::Int)); } + // Float computations, each printed as i32. Printing the truncated + // integer (rather than the raw float) keeps output comparable across + // backends — Rust's float Display ("3") and the C interpreter's printf + // ("3.0") format integral floats differently, so raw float prints + // would diverge on formatting alone. The float arithmetic itself is + // still fully exercised before the cast. + for (k, kind) in FLOAT_KINDS.iter().enumerate() { + if !has_floats[k] { + continue; + } + let n_fstmts = (self.next() % 3 + 1) as usize; // 1-3 + for i in 0..n_fstmts { + let name = format!("{}r{}", kind.var_prefix, i); + let expr = self.gen_float_expr(&vars, kind, has_fma[k], 2); + main_lines.push(format!(" let {} = {}", name, expr)); + main_lines.push(format!(" print({} as i32)", name)); + } + } + main_lines.push("}".to_string()); let mut parts = decls; @@ -289,11 +341,101 @@ impl<'a> Gen<'a> { let val = (self.next() % 200) as i32 - 100; self.format_int(val) } + + /// A small float literal in [0.0, 9.9], typed as `ty` ("f32"/"f64"). 
+ /// Bare float literals are already f32, so an `as f32` cast would be an + /// unsupported identity conversion — emit the bare literal for f32 and an + /// explicit `as f64` conversion for f64. Values are bounded so that + /// products of a few of these stay far inside i32 range after the final + /// `as i32` cast. + fn gen_float_literal(&mut self, ty: &str) -> String { + let whole = self.next() % 10; + let frac = self.next() % 10; + if ty == "f32" { + format!("{}.{}", whole, frac) + } else { + format!("({}.{} as {})", whole, frac, ty) + } + } + + fn gen_float_expr( + &mut self, + vars: &[(String, VarType)], + kind: &FloatKind, + has_fma: bool, + depth: u8, + ) -> String { + if depth == 0 { + return self.gen_float_leaf(vars, kind); + } + let max_choice = if has_fma { 8 } else { 6 }; + match self.next() % max_choice { + // Float literal + 0..=1 => self.gen_float_literal(kind.ty), + // Float variable reference (of this type) + 2..=3 => self.gen_float_var(vars, kind.ty), + // Binary arithmetic (no division — avoids safety errors) + 4..=5 => { + let ops = ["+", "-", "*"]; + let op = ops[self.next() as usize % ops.len()]; + let l = self.gen_float_expr(vars, kind, has_fma, depth - 1); + let r = self.gen_float_expr(vars, kind, has_fma, depth - 1); + format!("({} {} {})", l, op, r) + } + // Helper call (only if its `a * b + c` helper was emitted) — + // exercises float argument/return-value bridging across a call. 
+ 6..=7 => { + let a = self.gen_float_expr(vars, kind, has_fma, depth - 1); + let b = self.gen_float_expr(vars, kind, has_fma, depth - 1); + let c = self.gen_float_expr(vars, kind, has_fma, depth - 1); + format!("{}({}, {}, {})", kind.fma, a, b, c) + } + _ => self.gen_float_literal(kind.ty), + } + } + + fn gen_float_leaf(&mut self, vars: &[(String, VarType)], kind: &FloatKind) -> String { + if self.next() % 2 == 0 { + self.gen_float_var(vars, kind.ty) + } else { + self.gen_float_literal(kind.ty) + } + } + + fn gen_float_var(&mut self, vars: &[(String, VarType)], ty: &str) -> String { + let fvars: Vec<&str> = vars + .iter() + .filter_map(|(name, t)| match t { + VarType::Float(vt) if *vt == ty => Some(name.as_str()), + _ => None, + }) + .collect(); + if fvars.is_empty() { + self.gen_float_literal(ty) + } else { + let idx = self.next() as usize % fvars.len(); + fvars[idx].to_string() + } + } +} + +/// A float type the generator can emit, paired with the names it uses for +/// that type's variables and `a * b + c` helper. +struct FloatKind { + ty: &'static str, + var_prefix: &'static str, + fma: &'static str, } +const FLOAT_KINDS: [FloatKind; 2] = [ + FloatKind { ty: "f32", var_prefix: "sv", fma: "sfma" }, + FloatKind { ty: "f64", var_prefix: "fv", fma: "ffma" }, +]; + #[derive(Clone)] enum VarType { Int, + Float(&'static str), Struct, Array(usize), Enum(usize), @@ -321,6 +463,11 @@ fn capture_stdout(f: F) -> String { f(); std::io::stdout().flush().ok(); + // The stack backend's C interpreter prints via C stdio (printf), which + // buffers independently of Rust's stdout. Flush all C streams before + // restoring fd 1, or the buffered output is lost and the capture comes + // back empty. 
+ unsafe { libc::fflush(std::ptr::null_mut()) }; unsafe { libc::dup2(saved_fd, 1) }; unsafe { libc::close(saved_fd) }; @@ -356,6 +503,17 @@ fn run_backend(program: &str, backend: &str) -> Option { }); Some(output) } + #[cfg(has_stack_interp)] + "stack" => { + // Compile to the stack VM and run it through the C interpreter + // (the same path cli/src/main.rs uses for `--backend stack`). + let output = capture_stdout(|| { + if let Ok(program) = compiler.compile_stack() { + let _ = lyte::stack_interp_bridge::run(&program); + } + }); + Some(output) + } #[cfg(target_arch = "aarch64")] "asm" => { let output = capture_stdout(|| { @@ -460,4 +618,10 @@ fuzz_target!(|data: &[u8]| { let llvm_output = run_backend(&program, "llvm"); assert_same("VM", &vm_output, "LLVM", &llvm_output, &program); } + + #[cfg(has_stack_interp)] + { + let stack_output = run_backend(&program, "stack"); + assert_same("VM", &vm_output, "STACK", &stack_output, &program); + } }); diff --git a/src/compiler.rs b/src/compiler.rs index 12172642..225023b7 100644 --- a/src/compiler.rs +++ b/src/compiler.rs @@ -1980,7 +1980,31 @@ mod tests { // At Return/ReturnVoid the entry depth must be 0 (plus the op's // own delta, which is 0 for both return ops). fn assert_f_window_balanced(program: &crate::stack_ir::StackProgram, label: &str) { - use crate::stack_depth::float_stack_delta; + assert_window_balanced( + program, + label, + crate::stack_depth::float_stack_delta, + "f-window", + ); + assert_window_balanced( + program, + label, + crate::stack_depth::double_stack_delta, + "d-window", + ); + } + + /// Verify a register-window (float or double) stays balanced across the + /// CFG: every merge agrees on depth, every Return leaves depth 0, and no + /// op underflows. `delta` selects which window to check. 
+ #[cfg(test)] + fn assert_window_balanced( + program: &crate::stack_ir::StackProgram, + label: &str, + delta: fn(&crate::stack_ir::StackOp) -> i32, + window: &str, + ) { + let float_stack_delta = delta; use crate::stack_ir::StackOp; for func in &program.functions { @@ -2044,7 +2068,9 @@ mod tests { } } StackOp::FusedF32ConstFGtJumpIfZeroF(_, off) - | StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off) => { + | StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off) + | StackOp::FusedF64ConstDGtJumpIfZeroD(_, off) + | StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, off) => { let t = (i as i64 + 1 + *off as i64) as usize; if t < n { succs.push(t); @@ -2066,9 +2092,9 @@ mod tests { worklist.push(s); } else if in_depth[s] != d_out { panic!( - "[{}] {}: f-window depth mismatch at op {} \ + "[{}] {}: {} depth mismatch at op {} \ (from op {}): {} vs {}", - label, func.name, s, i, in_depth[s], d_out, + label, func.name, window, s, i, in_depth[s], d_out, ); } } @@ -2081,9 +2107,10 @@ mod tests { if matches!(op, StackOp::Return | StackOp::ReturnVoid) { assert!( in_depth[i] == 0, - "[{}] {}: f-window leaks {} slot(s) at return op {}", + "[{}] {}: {} leaks {} slot(s) at return op {}", label, func.name, + window, in_depth[i], i, ); @@ -2091,9 +2118,10 @@ mod tests { let d_out = in_depth[i] + float_stack_delta(op); assert!( d_out >= 0, - "[{}] {}: f-window underflow at op {} ({:?}): in={} delta={}", + "[{}] {}: {} underflow at op {} ({:?}): in={} delta={}", label, func.name, + window, i, op, in_depth[i], diff --git a/src/stack_codegen.rs b/src/stack_codegen.rs index b8b70c15..4d28ff1f 100644 --- a/src/stack_codegen.rs +++ b/src/stack_codegen.rs @@ -563,9 +563,11 @@ impl<'a> FunctionTranslator<'a> { // path. Just emit the return. func.emit(StackOp::ReturnVoid); } else { - // Fall-through return: bridge f32 results back to t0. + // Fall-through return: bridge f32/f64 results back to t0. 
if matches!(&*self.decl.ret, Type::Float32) { func.emit(StackOp::FToBitsF); + } else if matches!(&*self.decl.ret, Type::Float64) { + func.emit(StackOp::DToBitsD); } func.emit(StackOp::Return); } @@ -600,6 +602,8 @@ impl<'a> FunctionTranslator<'a> { let ty = self.expr_type(expr); if matches!(&*ty, Type::Float32) { func.emit(StackOp::DropF); + } else if matches!(&*ty, Type::Float64) { + func.emit(StackOp::DropD); } else { func.emit(StackOp::Drop); } @@ -663,6 +667,8 @@ impl<'a> FunctionTranslator<'a> { let ty = self.expr_type(expr); if matches!(&*ty, Type::Float32) { func.emit(StackOp::DropF); + } else if matches!(&*ty, Type::Float64) { + func.emit(StackOp::DropD); } else { func.emit(StackOp::Drop); } @@ -709,7 +715,7 @@ impl<'a> FunctionTranslator<'a> { } Type::Float64 => { let value: f64 = s.parse().unwrap_or(0.0); - func.emit(StackOp::F64Const(value)); + func.emit(StackOp::F64ConstD(value)); } _ => { let value: f32 = s.parse().unwrap_or(0.0); @@ -796,17 +802,10 @@ impl<'a> FunctionTranslator<'a> { // Scalar: translate init, store in local. 
self.translate_expr(init, func); let local = self.alloc_scalar(); - let is_f32 = matches!(&*ty, Type::Float32); if self.void_ctx { - if is_f32 { - func.emit(StackOp::LocalSetF(local)); - } else { - func.emit(StackOp::LocalSet(local)); - } - } else if is_f32 { - func.emit(StackOp::LocalTeeF(local)); + self.emit_local_set(&ty, local, func); } else { - func.emit(StackOp::LocalTee(local)); + self.emit_local_tee(&ty, local, func); } self.variables.insert(name, LocalKind::Scalar(local)); self.variable_types.insert(name, ty); @@ -832,15 +831,10 @@ impl<'a> FunctionTranslator<'a> { if !self.is_ptr_type(&ty) { let local = self.alloc_scalar(); - let is_f32 = matches!(&*ty, Type::Float32); if let Some(init_id) = init { if !self.try_emit_binop_set(local, init_id, func) { self.translate_expr(init_id, func); - if is_f32 { - func.emit(StackOp::LocalSetF(local)); - } else { - func.emit(StackOp::LocalSet(local)); - } + self.emit_local_set(&ty, local, func); } } else { // Uninitialized local — zero the slot via the int window. @@ -940,11 +934,13 @@ impl<'a> FunctionTranslator<'a> { func.emit(StackOp::MemCopy(size)); func.emit(StackOp::ReturnVoid); } else { - // f32 return values travel through t0 (int window). If - // the preceding expression left the value in the float + // f32/f64 return values travel through t0 (int window). + // If the preceding expression left the value in a FP // window, bridge it back to the int window first. 
if matches!(&*ret_ty, Type::Float32) { func.emit(StackOp::FToBitsF); + } else if matches!(&*ret_ty, Type::Float64) { + func.emit(StackOp::DToBitsD); } func.emit(StackOp::Return); } @@ -1073,11 +1069,7 @@ impl<'a> FunctionTranslator<'a> { if let Some(&kind) = self.variables.get(&name) { match kind { LocalKind::Scalar(slot) => { - if matches!(&*ty, Type::Float32) { - func.emit(StackOp::LocalGetF(slot)); - } else { - func.emit(StackOp::LocalGet(slot)); - } + self.emit_local_get(&ty, slot, func); } LocalKind::Reference(slot) => { func.emit(StackOp::LocalGet(slot)); @@ -1198,34 +1190,34 @@ impl<'a> FunctionTranslator<'a> { match op { Binop::Plus => match &*ty { Type::Float32 => func.emit(StackOp::FAddF), - Type::Float64 => func.emit(StackOp::DAdd), + Type::Float64 => func.emit(StackOp::DAddD), _ => func.emit(StackOp::IAdd), }, Binop::Minus => match &*ty { Type::Float32 => func.emit(StackOp::FSubF), - Type::Float64 => func.emit(StackOp::DSub), + Type::Float64 => func.emit(StackOp::DSubD), _ => func.emit(StackOp::ISub), }, Binop::Mult => match &*ty { Type::Float32 => func.emit(StackOp::FMulF), - Type::Float64 => func.emit(StackOp::DMul), + Type::Float64 => func.emit(StackOp::DMulD), _ => func.emit(StackOp::IMul), }, Binop::Div => match &*ty { Type::Float32 => func.emit(StackOp::FDivF), - Type::Float64 => func.emit(StackOp::DDiv), + Type::Float64 => func.emit(StackOp::DDivD), Type::UInt32 | Type::UInt8 => func.emit(StackOp::UDiv), _ => func.emit(StackOp::IDiv), }, Binop::Mod => func.emit(StackOp::IRem), Binop::Pow => match &*ty { Type::Float32 => func.emit(StackOp::FPowF), - Type::Float64 => func.emit(StackOp::DPow), + Type::Float64 => func.emit(StackOp::DPowD), _ => func.emit(StackOp::IPow), }, Binop::Equal => match &*ty { Type::Float32 => func.emit(StackOp::FEqF), - Type::Float64 => func.emit(StackOp::DEq), + Type::Float64 => func.emit(StackOp::DEqD), Type::Name(_, _) | Type::Tuple(_) | Type::Array(_, _) => { let size = ty.size(self.decls) as u32; 
func.emit(StackOp::MemEq(size)); @@ -1238,6 +1230,7 @@ impl<'a> FunctionTranslator<'a> { }, Binop::NotEqual => match &*ty { Type::Float32 => func.emit(StackOp::FNeF), + Type::Float64 => func.emit(StackOp::DNeD), Type::Name(_, _) | Type::Tuple(_) | Type::Array(_, _) => { let size = ty.size(self.decls) as u32; func.emit(StackOp::MemNe(size)); @@ -1250,27 +1243,27 @@ impl<'a> FunctionTranslator<'a> { }, Binop::Less => match &*ty { Type::Float32 => func.emit(StackOp::FLtF), - Type::Float64 => func.emit(StackOp::DLt), + Type::Float64 => func.emit(StackOp::DLtD), Type::UInt32 | Type::UInt8 => func.emit(StackOp::ULt), _ => func.emit(StackOp::ILt), }, Binop::Greater => { match &*ty { Type::Float32 => func.emit(StackOp::FGtF), - Type::Float64 => func.emit(StackOp::IGt), // TODO: DGt + Type::Float64 => func.emit(StackOp::DGtD), Type::UInt32 | Type::UInt8 => func.emit(StackOp::UGt), _ => func.emit(StackOp::IGt), } } Binop::Leq => match &*ty { Type::Float32 => func.emit(StackOp::FLeF), - Type::Float64 => func.emit(StackOp::DLe), + Type::Float64 => func.emit(StackOp::DLeD), _ => func.emit(StackOp::ILe), }, Binop::Geq => { match &*ty { Type::Float32 => func.emit(StackOp::FGeF), - Type::Float64 => func.emit(StackOp::IGe), // TODO: DGe + Type::Float64 => func.emit(StackOp::DGeD), Type::UInt32 | Type::UInt8 => func.emit(StackOp::IGe), // unsigned uses signed _ => func.emit(StackOp::IGe), } @@ -1284,7 +1277,6 @@ impl<'a> FunctionTranslator<'a> { /// Translate an assignment expression. fn translate_assign(&mut self, lhs_id: ExprID, rhs_id: ExprID, func: &mut StackFunction) { let lhs_ty = self.representation_type(lhs_id); - let lhs_is_f32 = matches!(&*lhs_ty, Type::Float32); // Check for captured variable assignment (double indirection). 
if let Expr::Id(name) = &self.decl.arena.exprs[lhs_id] { @@ -1294,31 +1286,17 @@ impl<'a> FunctionTranslator<'a> { let val_local = self.alloc_scalar(); let addr_local = *self.captured_slots.get(&name).unwrap(); if self.void_ctx { - if lhs_is_f32 { - func.emit(StackOp::LocalSetF(val_local)); - } else { - func.emit(StackOp::LocalSet(val_local)); - } - } else if lhs_is_f32 { - func.emit(StackOp::LocalTeeF(val_local)); + self.emit_local_set(&lhs_ty, val_local, func); } else { - func.emit(StackOp::LocalTee(val_local)); + self.emit_local_tee(&lhs_ty, val_local, func); } // Stack: [value]. Need to store through captured pointer. // Push addr, then value, then store. func.emit(StackOp::LocalGet(addr_local)); // push captured addr - if lhs_is_f32 { - func.emit(StackOp::LocalGetF(val_local)); - } else { - func.emit(StackOp::LocalGet(val_local)); // push value - } + self.emit_local_get(&lhs_ty, val_local, func); // push value self.emit_store_op(&lhs_ty, func); if !self.void_ctx { - if lhs_is_f32 { - func.emit(StackOp::LocalGetF(val_local)); - } else { - func.emit(StackOp::LocalGet(val_local)); // result value - } + self.emit_local_get(&lhs_ty, val_local, func); // result value } return; } @@ -1336,15 +1314,9 @@ impl<'a> FunctionTranslator<'a> { } self.translate_expr(rhs_id, func); if self.void_ctx { - if lhs_is_f32 { - func.emit(StackOp::LocalSetF(slot)); - } else { - func.emit(StackOp::LocalSet(slot)); - } - } else if lhs_is_f32 { - func.emit(StackOp::LocalTeeF(slot)); + self.emit_local_set(&lhs_ty, slot, func); } else { - func.emit(StackOp::LocalTee(slot)); + self.emit_local_tee(&lhs_ty, slot, func); } return; } @@ -1440,31 +1412,19 @@ impl<'a> FunctionTranslator<'a> { self.translate_expr(rhs_id, func); self.emit_wrap_for_expected_slice(lhs_ty, rhs_id, func); let tmp = self.alloc_scalar(); - if lhs_is_f32 { - func.emit(StackOp::LocalSetF(tmp)); - } else { - func.emit(StackOp::LocalSet(tmp)); - } + self.emit_local_set(&lhs_ty, tmp, func); tmp } } else { 
self.translate_expr(rhs_id, func); self.emit_wrap_for_expected_slice(lhs_ty, rhs_id, func); let tmp = self.alloc_scalar(); - if lhs_is_f32 { - func.emit(StackOp::LocalSetF(tmp)); - } else { - func.emit(StackOp::LocalSet(tmp)); - } + self.emit_local_set(&lhs_ty, tmp, func); tmp }; self.translate_lvalue(lhs_id, func); // pushes address - if lhs_is_f32 { - func.emit(StackOp::LocalGetF(val_local)); - } else { - func.emit(StackOp::LocalGet(val_local)); - } + self.emit_local_get(&lhs_ty, val_local, func); // For Func type field assignment, only copy func_idx (8 bytes). if matches!(&*lhs_ty, Type::Func(_, _)) { @@ -1481,11 +1441,7 @@ impl<'a> FunctionTranslator<'a> { self.emit_store_op(&lhs_ty, func); if !self.void_ctx { - if lhs_is_f32 { - func.emit(StackOp::LocalGetF(val_local)); - } else { - func.emit(StackOp::LocalGet(val_local)); - } + self.emit_local_get(&lhs_ty, val_local, func); } } @@ -1673,7 +1629,7 @@ impl<'a> FunctionTranslator<'a> { match op { Unop::Neg => match &*ty { Type::Float32 => func.emit(StackOp::FNegF), - Type::Float64 => func.emit(StackOp::DNeg), + Type::Float64 => func.emit(StackOp::DNegD), _ => func.emit(StackOp::INeg), }, Unop::Not => { @@ -1728,6 +1684,7 @@ impl<'a> FunctionTranslator<'a> { let ty = self.expr_type(arg_id); match &*ty { Type::Float32 => func.emit(StackOp::PrintF32F), + Type::Float64 => func.emit(StackOp::PrintF64D), _ => func.emit(StackOp::PrintI32), } } @@ -1807,29 +1764,29 @@ impl<'a> FunctionTranslator<'a> { ("isinf$f32", StackOp::IsinfF32F), ]; let unary_math_f64: &[(&str, StackOp)] = &[ - ("sin$f64", StackOp::SinF64), - ("cos$f64", StackOp::CosF64), - ("tan$f64", StackOp::TanF64), - ("asin$f64", StackOp::AsinF64), - ("acos$f64", StackOp::AcosF64), - ("atan$f64", StackOp::AtanF64), - ("sinh$f64", StackOp::SinhF64), - ("cosh$f64", StackOp::CoshF64), - ("tanh$f64", StackOp::TanhF64), - ("asinh$f64", StackOp::AsinhF64), - ("acosh$f64", StackOp::AcoshF64), - ("atanh$f64", StackOp::AtanhF64), - ("ln$f64", StackOp::LnF64), - 
("exp$f64", StackOp::ExpF64), - ("exp2$f64", StackOp::Exp2F64), - ("log10$f64", StackOp::Log10F64), - ("log2$f64", StackOp::Log2F64), - ("sqrt$f64", StackOp::SqrtF64), - ("abs$f64", StackOp::AbsF64), - ("floor$f64", StackOp::FloorF64), - ("ceil$f64", StackOp::CeilF64), - ("isnan$f64", StackOp::IsnanF64), - ("isinf$f64", StackOp::IsinfF64), + ("sin$f64", StackOp::SinF64D), + ("cos$f64", StackOp::CosF64D), + ("tan$f64", StackOp::TanF64D), + ("asin$f64", StackOp::AsinF64D), + ("acos$f64", StackOp::AcosF64D), + ("atan$f64", StackOp::AtanF64D), + ("sinh$f64", StackOp::SinhF64D), + ("cosh$f64", StackOp::CoshF64D), + ("tanh$f64", StackOp::TanhF64D), + ("asinh$f64", StackOp::AsinhF64D), + ("acosh$f64", StackOp::AcoshF64D), + ("atanh$f64", StackOp::AtanhF64D), + ("ln$f64", StackOp::LnF64D), + ("exp$f64", StackOp::ExpF64D), + ("exp2$f64", StackOp::Exp2F64D), + ("log10$f64", StackOp::Log10F64D), + ("log2$f64", StackOp::Log2F64D), + ("sqrt$f64", StackOp::SqrtF64D), + ("abs$f64", StackOp::AbsF64D), + ("floor$f64", StackOp::FloorF64D), + ("ceil$f64", StackOp::CeilF64D), + ("isnan$f64", StackOp::IsnanF64D), + ("isinf$f64", StackOp::IsinfF64D), ]; for (n, op) in unary_math_f32.iter() { if *name == *n { @@ -1856,7 +1813,7 @@ impl<'a> FunctionTranslator<'a> { if *name == "atan2$f64$f64" { self.translate_expr(arg_ids[0], func); self.translate_expr(arg_ids[1], func); - func.emit(StackOp::Atan2F64); + func.emit(StackOp::Atan2F64D); return; } @@ -1870,7 +1827,7 @@ impl<'a> FunctionTranslator<'a> { if *name == "pow$f64$f64" { self.translate_expr(arg_ids[0], func); self.translate_expr(arg_ids[1], func); - func.emit(StackOp::DPow); + func.emit(StackOp::DPowD); return; } @@ -1883,24 +1840,23 @@ impl<'a> FunctionTranslator<'a> { let is_f64 = name.contains("f64"); let is_min = name.contains("min"); // Local set/get for the a/b temps: f32 goes through the - // float window (LocalSetF/LocalGetF); f64 through the - // int window (LocalSet/LocalGet — f64 values ride as - // u64 bit patterns). 
+ // float window (LocalSetF/LocalGetF), f64 through the + // double window (LocalSetD/LocalGetD). let local_set = |slot: u16| { if is_f64 { - StackOp::LocalSet(slot) + StackOp::LocalSetD(slot) } else { StackOp::LocalSetF(slot) } }; let local_get = |slot: u16| { if is_f64 { - StackOp::LocalGet(slot) + StackOp::LocalGetD(slot) } else { StackOp::LocalGetF(slot) } }; - let cmp_lt = if is_f64 { StackOp::DLt } else { StackOp::FLtF }; + let cmp_lt = if is_f64 { StackOp::DLtD } else { StackOp::FLtF }; self.translate_expr(arg_ids[0], func); let a_local = self.alloc_scalar(); func.emit(local_set(a_local)); @@ -1975,6 +1931,8 @@ impl<'a> FunctionTranslator<'a> { let arg_ty = self.expr_type(arg); if matches!(&*arg_ty, Type::Float32) { func.emit(StackOp::FToBitsF); + } else if matches!(&*arg_ty, Type::Float64) { + func.emit(StackOp::DToBitsD); } c_arg_count += 1; } @@ -1995,6 +1953,8 @@ impl<'a> FunctionTranslator<'a> { // translate_call's +1 invariant still holds. if matches!(&*ret_ty, Type::Float32) { func.emit(StackOp::BitsToFF); + } else if matches!(&*ret_ty, Type::Float64) { + func.emit(StackOp::BitsToDD); } return; } @@ -2041,6 +2001,8 @@ impl<'a> FunctionTranslator<'a> { let arg_ty = self.expr_type(*arg_id); if matches!(&*arg_ty, Type::Float32) { func.emit(StackOp::FToBitsF); + } else if matches!(&*arg_ty, Type::Float64) { + func.emit(StackOp::DToBitsD); } } if param_ty.is_some_and(|t| matches!(&*t, Type::Slice(_))) { @@ -2082,6 +2044,9 @@ impl<'a> FunctionTranslator<'a> { // Bridge into the float window so the surrounding codegen // can consume them as f32 directly. func.emit(StackOp::BitsToFF); + } else if matches!(&*ret_ty, Type::Float64) { + // Same for f64: bridge t0 bits into the double window. + func.emit(StackOp::BitsToDD); } // Otherwise the call already pushed its return value. 
@@ -2553,10 +2518,10 @@ impl<'a> FunctionTranslator<'a> { match (&*src_ty, &*target_ty) { (Type::Int32, Type::Float32) => func.emit(StackOp::I32ToF32F), (Type::Float32, Type::Int32) => func.emit(StackOp::F32ToI32F), - (Type::Int32, Type::Float64) => func.emit(StackOp::I32ToF64), - (Type::Float64, Type::Int32) => func.emit(StackOp::F64ToI32), - (Type::Float32, Type::Float64) => func.emit(StackOp::F32ToF64), - (Type::Float64, Type::Float32) => func.emit(StackOp::F64ToF32), + (Type::Int32, Type::Float64) => func.emit(StackOp::I32ToF64D), + (Type::Float64, Type::Int32) => func.emit(StackOp::F64ToI32D), + (Type::Float32, Type::Float64) => func.emit(StackOp::F32ToF64D), + (Type::Float64, Type::Float32) => func.emit(StackOp::F64ToF32D), (Type::Int32, Type::Int8) | (Type::UInt32, Type::Int8) => func.emit(StackOp::I32ToI8), (Type::Int8, Type::Int32) => func.emit(StackOp::I8ToI32), (Type::Int32, Type::UInt32) | (Type::UInt32, Type::Int32) => { @@ -2707,13 +2672,41 @@ impl<'a> FunctionTranslator<'a> { } } + /// Emit the window-appropriate `local.get` for a scalar of `ty`: + /// f32 → float window, f64 → double window, everything else → int. + fn emit_local_get(&self, ty: &TypeID, slot: u16, func: &mut StackFunction) { + match &**ty { + Type::Float32 => func.emit(StackOp::LocalGetF(slot)), + Type::Float64 => func.emit(StackOp::LocalGetD(slot)), + _ => func.emit(StackOp::LocalGet(slot)), + } + } + + /// Window-appropriate `local.set` (see `emit_local_get`). + fn emit_local_set(&self, ty: &TypeID, slot: u16, func: &mut StackFunction) { + match &**ty { + Type::Float32 => func.emit(StackOp::LocalSetF(slot)), + Type::Float64 => func.emit(StackOp::LocalSetD(slot)), + _ => func.emit(StackOp::LocalSet(slot)), + } + } + + /// Window-appropriate `local.tee` (see `emit_local_get`). 
+ fn emit_local_tee(&self, ty: &TypeID, slot: u16, func: &mut StackFunction) { + match &**ty { + Type::Float32 => func.emit(StackOp::LocalTeeF(slot)), + Type::Float64 => func.emit(StackOp::LocalTeeD(slot)), + _ => func.emit(StackOp::LocalTee(slot)), + } + } + /// Emit a load instruction based on type. Pops address, pushes value. fn emit_load(&self, ty: &TypeID, func: &mut StackFunction) { match &**ty { Type::Bool | Type::Int8 | Type::UInt8 => func.emit(StackOp::Load8), Type::Float32 => func.emit(StackOp::LoadF32F), Type::Int32 | Type::UInt32 => func.emit(StackOp::Load32), - Type::Float64 => func.emit(StackOp::Load64), + Type::Float64 => func.emit(StackOp::LoadF64D), _ => func.emit(StackOp::Load64), } } @@ -2732,7 +2725,7 @@ impl<'a> FunctionTranslator<'a> { func.emit(StackOp::Load32Off(offset)); } Type::Float64 => { - func.emit(StackOp::Load64Off(offset)); + func.emit(StackOp::LoadF64OffD(offset)); } _ => { func.emit(StackOp::Load64Off(offset)); @@ -2750,7 +2743,7 @@ impl<'a> FunctionTranslator<'a> { Type::Bool | Type::Int8 | Type::UInt8 => func.emit(StackOp::Store8), Type::Float32 => func.emit(StackOp::StoreF32F), Type::Int32 | Type::UInt32 => func.emit(StackOp::Store32), - Type::Float64 => func.emit(StackOp::Store64), + Type::Float64 => func.emit(StackOp::StoreF64D), _ => func.emit(StackOp::Store64), } } @@ -2844,7 +2837,7 @@ impl<'a> FunctionTranslator<'a> { func.emit(StackOp::Store32Off(offset)); } Type::Float64 => { - func.emit(StackOp::Store64Off(offset)); + func.emit(StackOp::StoreF64OffD(offset)); } _ => { func.emit(StackOp::Store32Off(offset)); diff --git a/src/stack_depth.rs b/src/stack_depth.rs index eccc7627..62bc8d18 100644 --- a/src/stack_depth.rs +++ b/src/stack_depth.rs @@ -28,6 +28,8 @@ pub fn compute_depths(func: &StackFunction) -> Vec { | StackOp::FusedBoundsCheck8JumpIfZero(_, off) => Some(*off), StackOp::FusedF32ConstFGtJumpIfZeroF(_, off) => Some(*off), StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off) => Some(*off), + 
StackOp::FusedF64ConstDGtJumpIfZeroD(_, off) => Some(*off), + StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, off) => Some(*off), _ => None, }; if let Some(off) = off { @@ -315,6 +317,190 @@ pub fn stack_delta(op: &StackOp) -> i32 { | StackOp::FusedGetGetFMulSum7F(_, _) | StackOp::FusedGetGetFMulSum8F(_, _) => 0, StackOp::FusedTeeSliceStore32F(_, _, _) => 0, + + // === Double-window (D) ops: integer-window deltas === + // Pure d-window ops don't touch the int window. + StackOp::F64ConstD(_) + | StackOp::LocalGetD(_) + | StackOp::LocalSetD(_) + | StackOp::LocalTeeD(_) + | StackOp::DropD + | StackOp::DAddD + | StackOp::DSubD + | StackOp::DMulD + | StackOp::DDivD + | StackOp::DPowD + | StackOp::DNegD + | StackOp::F32ToF64D + | StackOp::F64ToF32D + | StackOp::SinF64D + | StackOp::CosF64D + | StackOp::TanF64D + | StackOp::AsinF64D + | StackOp::AcosF64D + | StackOp::AtanF64D + | StackOp::SinhF64D + | StackOp::CoshF64D + | StackOp::TanhF64D + | StackOp::AsinhF64D + | StackOp::AcoshF64D + | StackOp::AtanhF64D + | StackOp::LnF64D + | StackOp::ExpF64D + | StackOp::Exp2F64D + | StackOp::Log10F64D + | StackOp::Log2F64D + | StackOp::SqrtF64D + | StackOp::AbsF64D + | StackOp::FloorF64D + | StackOp::CeilF64D + | StackOp::Atan2F64D + | StackOp::PrintF64D => 0, + + // d-window → int window: push 1 to the int window. + StackOp::DEqD + | StackOp::DNeD + | StackOp::DLtD + | StackOp::DLeD + | StackOp::DGtD + | StackOp::DGeD + | StackOp::F64ToI32D + | StackOp::DToBitsD + | StackOp::IsnanF64D + | StackOp::IsinfF64D => 1, + + // int window → d-window: pop 1 from the int window. Loads/stores + // take the address from the int window (value lives in d-window). + StackOp::I32ToF64D + | StackOp::BitsToDD + | StackOp::LoadF64D + | StackOp::LoadF64OffD(_) + | StackOp::StoreF64D + | StackOp::StoreF64OffD(_) => -1, + + // Fused d-window superinstructions read operands from locals[] and + // operate only on the d-window, so the int window is untouched. 
+ StackOp::FusedGetGetDMulD(_, _) + | StackOp::FusedGetGetDMulDAddD(_, _) + | StackOp::FusedGetGetDMulDSubD(_, _) + | StackOp::FusedGetGetDMulSum2D(_, _) + | StackOp::FusedGetGetDMulSum3D(_, _) + | StackOp::FusedGetGetDMulSum4D(_, _) + | StackOp::FusedGetGetDMulSum5D(_, _) + | StackOp::FusedGetGetDMulSum6D(_, _) + | StackOp::FusedGetGetDMulSum7D(_, _) + | StackOp::FusedGetGetDMulSum8D(_, _) + | StackOp::FusedGetSetD(_, _) + | StackOp::FusedGetSet2D(_) + | StackOp::FusedGetSet3D(_) + | StackOp::FusedGetSet4D(_) + | StackOp::FusedGetSet5D(_) + | StackOp::FusedGetSet6D(_) + | StackOp::FusedGetSet7D(_) + | StackOp::FusedGetSet8D(_) + | StackOp::FusedF64ConstDGtJumpIfZeroD(_, _) + | StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, _) => 0, + } +} + +/// Stack depth change for an instruction (double window only). +/// +/// The f64 analogue of `float_stack_delta`. Tracks d0..d3 occupancy +/// independently from the integer and float windows. Ops that don't +/// touch the double window contribute 0 via the catch-all. +pub fn double_stack_delta(op: &StackOp) -> i32 { + match op { + // Pushes onto the d-window. + StackOp::F64ConstD(_) + | StackOp::LocalGetD(_) + | StackOp::I32ToF64D + | StackOp::BitsToDD + | StackOp::F32ToF64D + | StackOp::LoadF64D + | StackOp::LoadF64OffD(_) => 1, + + // Pops from the d-window (single value). + StackOp::LocalSetD(_) + | StackOp::DropD + | StackOp::F64ToI32D + | StackOp::DToBitsD + | StackOp::F64ToF32D + | StackOp::IsnanF64D + | StackOp::IsinfF64D + | StackOp::StoreF64D + | StackOp::StoreF64OffD(_) + | StackOp::PrintF64D => -1, + + // Peek (LocalTeeD) and unary in-window ops: net 0. 
+ StackOp::LocalTeeD(_) + | StackOp::DNegD + | StackOp::SinF64D + | StackOp::CosF64D + | StackOp::TanF64D + | StackOp::AsinF64D + | StackOp::AcosF64D + | StackOp::AtanF64D + | StackOp::SinhF64D + | StackOp::CoshF64D + | StackOp::TanhF64D + | StackOp::AsinhF64D + | StackOp::AcoshF64D + | StackOp::AtanhF64D + | StackOp::LnF64D + | StackOp::ExpF64D + | StackOp::Exp2F64D + | StackOp::Log10F64D + | StackOp::Log2F64D + | StackOp::SqrtF64D + | StackOp::AbsF64D + | StackOp::FloorF64D + | StackOp::CeilF64D => 0, + + // Binary d-window arith: pop 2, push 1 = -1. + StackOp::DAddD + | StackOp::DSubD + | StackOp::DMulD + | StackOp::DDivD + | StackOp::DPowD + | StackOp::Atan2F64D => -1, + + // Double comparisons: pop 2 from d-window (result goes to int). + StackOp::DEqD + | StackOp::DNeD + | StackOp::DLtD + | StackOp::DLeD + | StackOp::DGtD + | StackOp::DGeD => -2, + + // Fused d-window superinstructions. + // Push one result: bare mul and the mul-accumulate sums. + StackOp::FusedGetGetDMulD(_, _) + | StackOp::FusedGetGetDMulSum2D(_, _) + | StackOp::FusedGetGetDMulSum3D(_, _) + | StackOp::FusedGetGetDMulSum4D(_, _) + | StackOp::FusedGetGetDMulSum5D(_, _) + | StackOp::FusedGetGetDMulSum6D(_, _) + | StackOp::FusedGetGetDMulSum7D(_, _) + | StackOp::FusedGetGetDMulSum8D(_, _) => 1, + + // Accumulate onto d0 in place (net 0). Get/set move chains and the + // direct-from-local compare/jump don't touch the d-window. + StackOp::FusedGetGetDMulDAddD(_, _) + | StackOp::FusedGetGetDMulDSubD(_, _) + | StackOp::FusedGetSetD(_, _) + | StackOp::FusedGetSet2D(_) + | StackOp::FusedGetSet3D(_) + | StackOp::FusedGetSet4D(_) + | StackOp::FusedGetSet5D(_) + | StackOp::FusedGetSet6D(_) + | StackOp::FusedGetSet7D(_) + | StackOp::FusedGetSet8D(_) + | StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, _) => 0, + + // Pops d0 for the comparison before branching. 
+ StackOp::FusedF64ConstDGtJumpIfZeroD(_, _) => -1, + + _ => 0, } } @@ -419,14 +605,17 @@ pub fn float_stack_delta(op: &StackOp) -> i32 { // f0 ± coeff*state from frame slot — net 0 in f-window. StackOp::FusedGetAddrFMulFAddF(_, _, _) | StackOp::FusedGetAddrFMulFSubF(_, _, _) => 0, - // Crossings: F→int pops f-window + // Crossings: F→int pops f-window. F32ToF64D pops the f-window + // (f32) and pushes the d-window (handled in double_stack_delta). StackOp::F32ToI32F | StackOp::FToBitsF | StackOp::IsnanF32F | StackOp::IsinfF32F - | StackOp::F32ToF64 => -1, - // int→F pushes f-window - StackOp::I32ToF32F | StackOp::BitsToFF | StackOp::F64ToF32 => 1, + | StackOp::F32ToF64 + | StackOp::F32ToF64D => -1, + // int→F pushes f-window. F64ToF32D pops the d-window and pushes + // the f-window (f32). + StackOp::I32ToF32F | StackOp::BitsToFF | StackOp::F64ToF32 | StackOp::F64ToF32D => 1, // f-window stores pop f0 StackOp::StoreF32F | StackOp::StoreF32OffF(_) => -1, diff --git a/src/stack_inline.rs b/src/stack_inline.rs index 5396f925..1b32d006 100644 --- a/src/stack_inline.rs +++ b/src/stack_inline.rs @@ -154,6 +154,12 @@ fn inline_calls_in(func: &mut StackFunction, bodies: &[Option>]) { StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off) => { Some((i as i64 + 1 + *off as i64) as usize) } + StackOp::FusedF64ConstDGtJumpIfZeroD(_, off) => { + Some((i as i64 + 1 + *off as i64) as usize) + } + StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, off) => { + Some((i as i64 + 1 + *off as i64) as usize) + } _ => None, }; @@ -183,6 +189,8 @@ fn inline_calls_in(func: &mut StackFunction, bodies: &[Option>]) { | StackOp::FusedBoundsCheck8JumpIfZero(_, o) => *o = new_off as i32, StackOp::FusedF32ConstFGtJumpIfZeroF(_, o) => *o = new_off as i32, StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, o) => *o = new_off as i32, + StackOp::FusedF64ConstDGtJumpIfZeroD(_, o) => *o = new_off as i32, + StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, o) => *o = new_off as i32, _ => unreachable!(), } } diff --git 
a/src/stack_interp.c b/src/stack_interp.c index 116ee8c6..08bb3767 100644 --- a/src/stack_interp.c +++ b/src/stack_interp.c @@ -103,6 +103,16 @@ static inline void store_f32_unaligned(void* p, float v) { memcpy(p, &v, sizeof(v)); } +static inline double load_f64_unaligned(const void* p) { + double v; + memcpy(&v, p, sizeof(v)); + return v; +} + +static inline void store_f64_unaligned(void* p, double v) { + memcpy(p, &v, sizeof(v)); +} + // ============================================================================ // Integer power // ============================================================================ @@ -147,7 +157,7 @@ static int64_t ipow(int64_t base, uint32_t exp) { // cross between GPR and FP register files, dodging the ~3-cycle // fmov/movq penalty that the old "f32 bit-pattern in u64" design // paid on every float arithmetic op. -#define HANDLER_ARGS Ctx* ctx, Instruction* pc, uint64_t* sp, float* fsp, uint64_t* locals, uint64_t t0, uint64_t t1, uint64_t t2, uint64_t t3, float f0, float f1, float f2, float f3, void* _nh_raw +#define HANDLER_ARGS Ctx* ctx, Instruction* pc, uint64_t* sp, float* fsp, double* dfsp, uint64_t* locals, uint64_t t0, uint64_t t1, uint64_t t2, uint64_t t3, float f0, float f1, float f2, float f3, double d0, double d1, double d2, double d3, void* _nh_raw #define HANDLER(name) PRESERVE_NONE void name(HANDLER_ARGS) // Cast nh from void* for use in NEXT macro. 
@@ -163,14 +173,14 @@ static int64_t ipow(int64_t base, uint32_t exp) { do { \ Instruction* _next = pc + 1; \ void* _new_nh = (_next + 1)->handler; \ - __attribute__((musttail)) return ((Handler)_nh_raw)(ctx, _next, sp, fsp, locals, t0, t1, t2, t3, f0, f1, f2, f3, _new_nh); \ + __attribute__((musttail)) return ((Handler)_nh_raw)(ctx, _next, sp, fsp, dfsp, locals, t0, t1, t2, t3, f0, f1, f2, f3, d0, d1, d2, d3, _new_nh); \ } while(0) #define DISPATCH() \ do { \ Handler _target_h = (Handler)pc->handler; \ void* _new_nh = (pc + 1)->handler; \ - __attribute__((musttail)) return _target_h(ctx, pc, sp, fsp, locals, t0, t1, t2, t3, f0, f1, f2, f3, _new_nh); \ + __attribute__((musttail)) return _target_h(ctx, pc, sp, fsp, dfsp, locals, t0, t1, t2, t3, f0, f1, f2, f3, d0, d1, d2, d3, _new_nh); \ } while(0) // How many backward jumps between cancel-callback invocations. Must match @@ -211,6 +221,22 @@ static int64_t ipow(int64_t base, uint32_t exp) { #define FDROP1() do { f0 = f1; f1 = f2; f2 = f3; f3 = *--fsp; } while(0) #define FBINOP_SHIFT() do { f1 = f2; f2 = f3; f3 = *--fsp; } while(0) +// f64 TOS window push/pop — exact mirror of the f32 window above, but +// typed `double` and spilling through `dfsp`. Keeps f64 values in FP +// registers across arithmetic, dodging the GPR↔FP crossings the old +// integer-window f64 path paid on every op. 
+#define DPUSH(val) do { \ + *dfsp++ = d3; \ + d3 = d2; d2 = d1; d1 = d0; d0 = (val); \ +} while(0) + +#define DPOP(dst) do { \ + (dst) = d0; d0 = d1; d1 = d2; d2 = d3; d3 = *--dfsp; \ +} while(0) + +#define DDROP1() do { d0 = d1; d1 = d2; d2 = d3; d3 = *--dfsp; } while(0) +#define DBINOP_SHIFT() do { d1 = d2; d2 = d3; d3 = *--dfsp; } while(0) + // ============================================================================ // Handlers // ============================================================================ @@ -1766,6 +1792,226 @@ HANDLER(op_print_f32_f) { NEXT(); } +// ============================================================================ +// Double-window (D) handlers — the f64 analogue of the float window. +// f64 values live in d0..d3 (FP regs), spilling through dfsp. Each handler +// mirrors its f32 counterpart above with float→double / f→d / fsp→dfsp. +// ============================================================================ + +// --- Constants and locals (double window) --- + +HANDLER(op_f64_const_d) { + // imm[0] is the f64 bit pattern. Reinterpret as double and push. + DPUSH(as_f64(pc->imm[0])); + NEXT(); +} + +// imm[0] is a pre-shifted byte offset into locals[] (see encode_imm), +// so the handler emits a single `ldr d, [locals, imm0]` with no shift. 
+HANDLER(op_local_get_d) { + DPUSH(*(double*)((uint8_t*)locals + pc->imm[0])); + NEXT(); +} +HANDLER(op_local_set_d) { + *(double*)((uint8_t*)locals + pc->imm[0]) = d0; + DDROP1(); + NEXT(); +} +HANDLER(op_local_tee_d) { + *(double*)((uint8_t*)locals + pc->imm[0]) = d0; + NEXT(); +} +HANDLER(op_drop_d) { + DDROP1(); + NEXT(); +} + +// --- Double arithmetic (binary): pop b=d0, a=d1, push a OP b --- + +HANDLER(op_dadd_d) { + d0 = d1 + d0; + DBINOP_SHIFT(); + NEXT(); +} +HANDLER(op_dsub_d) { + d0 = d1 - d0; + DBINOP_SHIFT(); + NEXT(); +} +HANDLER(op_dmul_d) { + d0 = d1 * d0; + DBINOP_SHIFT(); + NEXT(); +} +HANDLER(op_ddiv_d) { + d0 = d1 / d0; + DBINOP_SHIFT(); + NEXT(); +} +HANDLER(op_dpow_d) { + d0 = pow(d1, d0); + DBINOP_SHIFT(); + NEXT(); +} +HANDLER(op_dneg_d) { + d0 = -d0; + NEXT(); +} + +// --- Comparisons: pop 2 from d-window, push 0/1 to int window --- + +#define DW_CMP(name, op) \ +HANDLER(name) { \ + PUSH((d1 op d0) ? 1ULL : 0ULL); \ + /* drop both doubles */ \ + d0 = d2; d1 = d3; d2 = *--dfsp; d3 = *--dfsp; \ + NEXT(); \ +} +DW_CMP(op_deq_d, ==) +DW_CMP(op_dne_d, !=) +DW_CMP(op_dlt_d, <) +DW_CMP(op_dle_d, <=) +DW_CMP(op_dgt_d, >) +DW_CMP(op_dge_d, >=) + +// --- Conversions / window crossings --- + +HANDLER(op_f64_to_i32_d) { + // Pop d0, push int t0 = (int32)d0 + int64_t v = (int64_t)(int32_t)d0; + PUSH((uint64_t)v); + DDROP1(); + NEXT(); +} +HANDLER(op_i32_to_f64_d) { + // Pop t0 (int32), push d0 = (double)i + double v = (double)(int32_t)t0; + DROP1(); + DPUSH(v); + NEXT(); +} +HANDLER(op_to_bits_d) { + // Pop d0, push int t0 = bit pattern of d0 + PUSH(from_f64(d0)); + DDROP1(); + NEXT(); +} +HANDLER(op_from_bits_d) { + // Pop t0 (bit pattern), push d0 = double + double v = as_f64(t0); + DROP1(); + DPUSH(v); + NEXT(); +} +HANDLER(op_f32_to_f64_d) { + // Pop f0 (f32) from float window, push widened d0 (f64). + double v = (double)f0; + FDROP1(); + DPUSH(v); + NEXT(); +} +HANDLER(op_f64_to_f32_d) { + // Pop d0 (f64) from double window, push narrowed f0 (f32). 
+ float v = (float)d0; + DDROP1(); + FPUSH(v); + NEXT(); +} + +// --- Double memory loads: pop addr from int window, push to d-window --- + +HANDLER(op_load_f64_d) { + double v = load_f64_unaligned((const void*)t0); + DROP1(); + DPUSH(v); + NEXT(); +} +HANDLER(op_load_f64_off_d) { + int32_t off = (int32_t)pc->imm[0]; + double v = load_f64_unaligned((uint8_t*)t0 + off); + DROP1(); + DPUSH(v); + NEXT(); +} + +// --- Double memory stores: pop d0 (value), pop t0 (addr) --- + +HANDLER(op_store_f64_d) { + store_f64_unaligned((void*)t0, d0); + DROP1(); + DDROP1(); + NEXT(); +} +HANDLER(op_store_f64_off_d) { + int32_t off = (int32_t)pc->imm[0]; + store_f64_unaligned((uint8_t*)t0 + off, d0); + DROP1(); + DDROP1(); + NEXT(); +} + +// --- Math intrinsics (double window) --- + +#define DW_F64_UNARY(name, func) \ +HANDLER(name) { \ + d0 = func(d0); \ + NEXT(); \ +} +DW_F64_UNARY(op_sin_f64_d, sin) +DW_F64_UNARY(op_cos_f64_d, cos) +DW_F64_UNARY(op_tan_f64_d, tan) +DW_F64_UNARY(op_asin_f64_d, asin) +DW_F64_UNARY(op_acos_f64_d, acos) +DW_F64_UNARY(op_atan_f64_d, atan) +DW_F64_UNARY(op_sinh_f64_d, sinh) +DW_F64_UNARY(op_cosh_f64_d, cosh) +DW_F64_UNARY(op_tanh_f64_d, tanh) +DW_F64_UNARY(op_asinh_f64_d, asinh) +DW_F64_UNARY(op_acosh_f64_d, acosh) +DW_F64_UNARY(op_atanh_f64_d, atanh) +DW_F64_UNARY(op_ln_f64_d, log) +DW_F64_UNARY(op_exp_f64_d, exp) +DW_F64_UNARY(op_exp2_f64_d, exp2) +DW_F64_UNARY(op_log10_f64_d, log10) +DW_F64_UNARY(op_log2_f64_d, log2) +DW_F64_UNARY(op_sqrt_f64_d, sqrt) +DW_F64_UNARY(op_abs_f64_d, fabs) +DW_F64_UNARY(op_floor_f64_d, floor) +DW_F64_UNARY(op_ceil_f64_d, ceil) + +HANDLER(op_atan2_f64_d) { + // Binary in d-window: pop b=d0, a=d1, push atan2(a, b). + d0 = atan2(d1, d0); + DBINOP_SHIFT(); + NEXT(); +} + +HANDLER(op_isnan_f64_d) { + int v = isnan(d0) ? 1 : 0; + PUSH((uint64_t)v); + DDROP1(); + NEXT(); +} +HANDLER(op_isinf_f64_d) { + int v = isinf(d0) ? 
1 : 0; + PUSH((uint64_t)v); + DDROP1(); + NEXT(); +} + +// --- Debug --- + +HANDLER(op_print_f64_d) { + double val = d0; + DDROP1(); + if (val == floor(val) && fabs(val) < 1e15) { + printf("%.1f\n", val); + } else { + printf("%g\n", val); + } + NEXT(); +} + // --- Float-window fused superinstructions (Phase 5) --- // NOTE: imm[0] and imm[1] on these handlers are pre-shifted byte offsets @@ -1959,6 +2205,119 @@ HANDLER(op_fused_get_f32const_fgt_jiz_f) { NEXT(); } +// ============================================================================ +// Double-window (D) fused superinstructions — mirror the F-window set. +// Operands are read directly from locals[] by pre-shifted byte offset. +// ============================================================================ + +HANDLER(op_fused_get_get_dmul_d) { + double a = *(double*)((uint8_t*)locals + pc->imm[0]); + double b = *(double*)((uint8_t*)locals + pc->imm[1]); + DPUSH(a * b); + NEXT(); +} +HANDLER(op_fused_get_get_dmul_dadd_d) { + double a = *(double*)((uint8_t*)locals + pc->imm[0]); + double b = *(double*)((uint8_t*)locals + pc->imm[1]); + d0 = d0 + a * b; + NEXT(); +} +HANDLER(op_fused_get_get_dmul_dsub_d) { + double a = *(double*)((uint8_t*)locals + pc->imm[0]); + double b = *(double*)((uint8_t*)locals + pc->imm[1]); + d0 = d0 - a * b; + NEXT(); +} + +#define DMUL_SUM_HANDLER(name, TERMS) \ +HANDLER(name) { \ + uint8_t sub_mask = (uint8_t)pc->imm[2]; \ + double acc = 0.0; \ + for (int i = 0; i < (TERMS); i++) { \ + uint8_t a_idx = imm_u8(pc, i * 2); \ + uint8_t b_idx = imm_u8(pc, i * 2 + 1); \ + double a = *(double*)((uint8_t*)locals + (size_t)a_idx * 8); \ + double b = *(double*)((uint8_t*)locals + (size_t)b_idx * 8); \ + double prod = a * b; \ + acc = (sub_mask & (1u << i)) ? 
(acc - prod) : (acc + prod); \ + } \ + DPUSH(acc); \ + NEXT(); \ +} +DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum2_d, 2) +DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum3_d, 3) +DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum4_d, 4) +DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum5_d, 5) +DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum6_d, 6) +DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum7_d, 7) +DMUL_SUM_HANDLER(op_fused_get_get_dmul_sum8_d, 8) + +#define COPY_D_PAIR(N) do { \ + uint8_t src = imm_u8(pc, (N) * 2); \ + uint8_t dst = imm_u8(pc, (N) * 2 + 1); \ + *(double*)((uint8_t*)locals + (size_t)dst * 8) = *(double*)((uint8_t*)locals + (size_t)src * 8); \ +} while (0) + +HANDLER(op_fused_get_set_d) { + COPY_D_PAIR(0); + NEXT(); +} +HANDLER(op_fused_get_set2_d) { + COPY_D_PAIR(0); COPY_D_PAIR(1); + NEXT(); +} +HANDLER(op_fused_get_set3_d) { + COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); + NEXT(); +} +HANDLER(op_fused_get_set4_d) { + COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); COPY_D_PAIR(3); + NEXT(); +} +HANDLER(op_fused_get_set5_d) { + COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); COPY_D_PAIR(3); COPY_D_PAIR(4); + NEXT(); +} +HANDLER(op_fused_get_set6_d) { + COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); COPY_D_PAIR(3); COPY_D_PAIR(4); + COPY_D_PAIR(5); + NEXT(); +} +HANDLER(op_fused_get_set7_d) { + COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); COPY_D_PAIR(3); COPY_D_PAIR(4); + COPY_D_PAIR(5); COPY_D_PAIR(6); + NEXT(); +} +HANDLER(op_fused_get_set8_d) { + COPY_D_PAIR(0); COPY_D_PAIR(1); COPY_D_PAIR(2); COPY_D_PAIR(3); COPY_D_PAIR(4); + COPY_D_PAIR(5); COPY_D_PAIR(6); COPY_D_PAIR(7); + NEXT(); +} + +HANDLER(op_fused_f64const_dgt_jiz_d) { + double val = d0; + double limit = as_f64(pc->imm[0]); + DDROP1(); + if (!(val > limit)) { + int64_t off = (int64_t)pc->imm[1]; + pc = pc + 1 + off; + if (off < 0) POLL_CANCEL(); + DISPATCH(); + } + NEXT(); +} +HANDLER(op_fused_get_f64const_dgt_jiz_d) { + double val = *(double*)((uint8_t*)locals + pc->imm[0]); + double limit = 
as_f64(pc->imm[1]); + if (!(val > limit)) { + int64_t off = (int64_t)pc->imm[2]; + pc = pc + 1 + off; + if (off < 0) POLL_CANCEL(); + DISPATCH(); + } + NEXT(); +} + // ============================================================================ // Entry point // ============================================================================ @@ -1992,7 +2351,7 @@ int64_t stack_interp_run(Ctx* ctx, uint32_t entry_func) { // and x86-64. Instruction* pc = ctx->functions[entry_func].code; Handler initial_nh = (Handler)(pc + 1)->handler; - ((Handler)pc->handler)(ctx, pc, ctx->stack_base, ctx->float_stack, entry_locals, 0, 0, 0, 0, 0.0f, 0.0f, 0.0f, 0.0f, initial_nh); + ((Handler)pc->handler)(ctx, pc, ctx->stack_base, ctx->float_stack, ctx->double_stack, entry_locals, 0, 0, 0, 0, 0.0f, 0.0f, 0.0f, 0.0f, 0.0, 0.0, 0.0, 0.0, initial_nh); return ctx->result; } diff --git a/src/stack_interp.h b/src/stack_interp.h index 2c9506f1..9ba3a360 100644 --- a/src/stack_interp.h +++ b/src/stack_interp.h @@ -87,6 +87,15 @@ typedef struct Ctx { float* float_stack; size_t float_stack_cap; + // Double spill stack: backing store for the f64 TOS window (d0..d3) + // when its depth exceeds 4. Mirrors float_stack but typed `double`, + // so f64 arithmetic stays in FP registers and never pays the GPR↔FP + // crossing the old "f64 bit-pattern in u64" design forced. The live + // top pointer lives in the `dfsp` handler argument; this is the base + // for bounds checks and the initial value passed to the entry handler. + double* double_stack; + size_t double_stack_cap; + // Closure pointer (set by call_closure, read by handlers) uint64_t closure_ptr; @@ -148,25 +157,34 @@ typedef struct Ctx { // float values coexist on the logical stack; static types at each // position tell the codegen which window to use. // -// The window is typed as `float` (not `double`) so f32 arithmetic -// compiles to direct single-precision FMA/fadd/... 
instructions -// without the fcvt round-trips that a double-typed window forces on -// every op. f64 values — rare in our hot workloads — still travel -// through the integer window paying GPR↔FP crossings. +// The f32 window is typed as `float` (not `double`) so f32 arithmetic +// compiles to direct single-precision FMA/fadd/... instructions without +// the fcvt round-trips that a double-typed window forces on every op. +// +// f64 gets its own parallel 4-slot window (d0..d3) typed `double`, living +// in a separate set of FP/SIMD registers, with its own spill pointer +// `dfsp`. f64 arithmetic stays in FP registers throughout, the same way +// f32 does. The two windows together use 8 FP argument registers +// (v0..v7 on aarch64, xmm0..xmm7 on x86-64) — the full FP arg budget. typedef PRESERVE_NONE void (*Handler)( Ctx* ctx, Instruction* pc, uint64_t* sp, - float* fsp, // float spill pointer (lives in a GPR via preserve_none) + float* fsp, // float (f32) spill pointer (lives in a GPR via preserve_none) + double* dfsp, // double (f64) spill pointer (lives in a GPR via preserve_none) uint64_t* locals, // frame pointer: scalars, then local memory contiguously uint64_t t0, // int TOS window (GPRs) uint64_t t1, uint64_t t2, uint64_t t3, - float f0, // float TOS window (FP regs) + float f0, // f32 TOS window (FP regs) float f1, float f2, float f3, + double d0, // f64 TOS window (FP regs) + double d1, + double d2, + double d3, void* nh // preloaded handler for the NEXT instruction (cast to Handler) ); diff --git a/src/stack_interp_bridge.rs b/src/stack_interp_bridge.rs index 4d6615e4..c87972f3 100644 --- a/src/stack_interp_bridge.rs +++ b/src/stack_interp_bridge.rs @@ -42,6 +42,8 @@ struct Ctx { stack_base: *mut u64, float_stack: *mut f32, float_stack_cap: usize, + double_stack: *mut f64, + double_stack_cap: usize, closure_ptr: u64, result: i64, done: i32, @@ -286,6 +288,82 @@ extern "C" { fn op_fused_get_set8_f(); fn op_fused_f32const_fgt_jiz_f(); fn 
op_fused_get_f32const_fgt_jiz_f(); + + // === Double-window (D) handlers === + fn op_f64_const_d(); + fn op_local_get_d(); + fn op_local_set_d(); + fn op_local_tee_d(); + fn op_drop_d(); + fn op_dadd_d(); + fn op_dsub_d(); + fn op_dmul_d(); + fn op_ddiv_d(); + fn op_dpow_d(); + fn op_dneg_d(); + fn op_deq_d(); + fn op_dne_d(); + fn op_dlt_d(); + fn op_dle_d(); + fn op_dgt_d(); + fn op_dge_d(); + fn op_f64_to_i32_d(); + fn op_i32_to_f64_d(); + fn op_to_bits_d(); + fn op_from_bits_d(); + fn op_f32_to_f64_d(); + fn op_f64_to_f32_d(); + fn op_load_f64_d(); + fn op_load_f64_off_d(); + fn op_store_f64_d(); + fn op_store_f64_off_d(); + fn op_sin_f64_d(); + fn op_cos_f64_d(); + fn op_tan_f64_d(); + fn op_asin_f64_d(); + fn op_acos_f64_d(); + fn op_atan_f64_d(); + fn op_sinh_f64_d(); + fn op_cosh_f64_d(); + fn op_tanh_f64_d(); + fn op_asinh_f64_d(); + fn op_acosh_f64_d(); + fn op_atanh_f64_d(); + fn op_ln_f64_d(); + fn op_exp_f64_d(); + fn op_exp2_f64_d(); + fn op_log10_f64_d(); + fn op_log2_f64_d(); + fn op_sqrt_f64_d(); + fn op_abs_f64_d(); + fn op_floor_f64_d(); + fn op_ceil_f64_d(); + fn op_atan2_f64_d(); + fn op_isnan_f64_d(); + fn op_isinf_f64_d(); + fn op_print_f64_d(); + + // === Double-window fused handlers === + fn op_fused_get_get_dmul_d(); + fn op_fused_get_get_dmul_dadd_d(); + fn op_fused_get_get_dmul_dsub_d(); + fn op_fused_get_get_dmul_sum2_d(); + fn op_fused_get_get_dmul_sum3_d(); + fn op_fused_get_get_dmul_sum4_d(); + fn op_fused_get_get_dmul_sum5_d(); + fn op_fused_get_get_dmul_sum6_d(); + fn op_fused_get_get_dmul_sum7_d(); + fn op_fused_get_get_dmul_sum8_d(); + fn op_fused_get_set_d(); + fn op_fused_get_set2_d(); + fn op_fused_get_set3_d(); + fn op_fused_get_set4_d(); + fn op_fused_get_set5_d(); + fn op_fused_get_set6_d(); + fn op_fused_get_set7_d(); + fn op_fused_get_set8_d(); + fn op_fused_f64const_dgt_jiz_d(); + fn op_fused_get_f64const_dgt_jiz_d(); } /// Get the C handler function pointer for a StackOp. 
@@ -522,6 +600,84 @@ fn handler_for(op: &StackOp) -> *const () { StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, _) => { op_fused_get_f32const_fgt_jiz_f as *const () } + + // === Double-window (D) ops === + StackOp::F64ConstD(_) => op_f64_const_d as *const (), + StackOp::LocalGetD(_) => op_local_get_d as *const (), + StackOp::LocalSetD(_) => op_local_set_d as *const (), + StackOp::LocalTeeD(_) => op_local_tee_d as *const (), + StackOp::DropD => op_drop_d as *const (), + StackOp::DAddD => op_dadd_d as *const (), + StackOp::DSubD => op_dsub_d as *const (), + StackOp::DMulD => op_dmul_d as *const (), + StackOp::DDivD => op_ddiv_d as *const (), + StackOp::DPowD => op_dpow_d as *const (), + StackOp::DNegD => op_dneg_d as *const (), + StackOp::DEqD => op_deq_d as *const (), + StackOp::DNeD => op_dne_d as *const (), + StackOp::DLtD => op_dlt_d as *const (), + StackOp::DLeD => op_dle_d as *const (), + StackOp::DGtD => op_dgt_d as *const (), + StackOp::DGeD => op_dge_d as *const (), + StackOp::F64ToI32D => op_f64_to_i32_d as *const (), + StackOp::I32ToF64D => op_i32_to_f64_d as *const (), + StackOp::DToBitsD => op_to_bits_d as *const (), + StackOp::BitsToDD => op_from_bits_d as *const (), + StackOp::F32ToF64D => op_f32_to_f64_d as *const (), + StackOp::F64ToF32D => op_f64_to_f32_d as *const (), + StackOp::LoadF64D => op_load_f64_d as *const (), + StackOp::LoadF64OffD(_) => op_load_f64_off_d as *const (), + StackOp::StoreF64D => op_store_f64_d as *const (), + StackOp::StoreF64OffD(_) => op_store_f64_off_d as *const (), + StackOp::SinF64D => op_sin_f64_d as *const (), + StackOp::CosF64D => op_cos_f64_d as *const (), + StackOp::TanF64D => op_tan_f64_d as *const (), + StackOp::AsinF64D => op_asin_f64_d as *const (), + StackOp::AcosF64D => op_acos_f64_d as *const (), + StackOp::AtanF64D => op_atan_f64_d as *const (), + StackOp::SinhF64D => op_sinh_f64_d as *const (), + StackOp::CoshF64D => op_cosh_f64_d as *const (), + StackOp::TanhF64D => op_tanh_f64_d as *const (), + 
StackOp::AsinhF64D => op_asinh_f64_d as *const (), + StackOp::AcoshF64D => op_acosh_f64_d as *const (), + StackOp::AtanhF64D => op_atanh_f64_d as *const (), + StackOp::LnF64D => op_ln_f64_d as *const (), + StackOp::ExpF64D => op_exp_f64_d as *const (), + StackOp::Exp2F64D => op_exp2_f64_d as *const (), + StackOp::Log10F64D => op_log10_f64_d as *const (), + StackOp::Log2F64D => op_log2_f64_d as *const (), + StackOp::SqrtF64D => op_sqrt_f64_d as *const (), + StackOp::AbsF64D => op_abs_f64_d as *const (), + StackOp::FloorF64D => op_floor_f64_d as *const (), + StackOp::CeilF64D => op_ceil_f64_d as *const (), + StackOp::Atan2F64D => op_atan2_f64_d as *const (), + StackOp::IsnanF64D => op_isnan_f64_d as *const (), + StackOp::IsinfF64D => op_isinf_f64_d as *const (), + StackOp::PrintF64D => op_print_f64_d as *const (), + + // === Double-window fused ops === + StackOp::FusedGetGetDMulD(_, _) => op_fused_get_get_dmul_d as *const (), + StackOp::FusedGetGetDMulDAddD(_, _) => op_fused_get_get_dmul_dadd_d as *const (), + StackOp::FusedGetGetDMulDSubD(_, _) => op_fused_get_get_dmul_dsub_d as *const (), + StackOp::FusedGetGetDMulSum2D(_, _) => op_fused_get_get_dmul_sum2_d as *const (), + StackOp::FusedGetGetDMulSum3D(_, _) => op_fused_get_get_dmul_sum3_d as *const (), + StackOp::FusedGetGetDMulSum4D(_, _) => op_fused_get_get_dmul_sum4_d as *const (), + StackOp::FusedGetGetDMulSum5D(_, _) => op_fused_get_get_dmul_sum5_d as *const (), + StackOp::FusedGetGetDMulSum6D(_, _) => op_fused_get_get_dmul_sum6_d as *const (), + StackOp::FusedGetGetDMulSum7D(_, _) => op_fused_get_get_dmul_sum7_d as *const (), + StackOp::FusedGetGetDMulSum8D(_, _) => op_fused_get_get_dmul_sum8_d as *const (), + StackOp::FusedGetSetD(_, _) => op_fused_get_set_d as *const (), + StackOp::FusedGetSet2D(_) => op_fused_get_set2_d as *const (), + StackOp::FusedGetSet3D(_) => op_fused_get_set3_d as *const (), + StackOp::FusedGetSet4D(_) => op_fused_get_set4_d as *const (), + StackOp::FusedGetSet5D(_) => 
op_fused_get_set5_d as *const (), + StackOp::FusedGetSet6D(_) => op_fused_get_set6_d as *const (), + StackOp::FusedGetSet7D(_) => op_fused_get_set7_d as *const (), + StackOp::FusedGetSet8D(_) => op_fused_get_set8_d as *const (), + StackOp::FusedF64ConstDGtJumpIfZeroD(_, _) => op_fused_f64const_dgt_jiz_d as *const (), + StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, _) => { + op_fused_get_f64const_dgt_jiz_d as *const () + } } } @@ -719,6 +875,42 @@ fn encode_imm(op: &StackOp, func_idx: u32) -> [u64; 3] { StackOp::FusedGetF32ConstFGtJumpIfZeroF(n, v, off) => { [(*n as u64) * 8, f32::to_bits(*v) as u64, *off as i64 as u64] } + // === Double-window (D) ops === + // f64 const: imm[0] is the f64 bit pattern. Local indices are + // pre-shifted to byte offsets so the handler emits `ldr d` with + // no scale (locals[] has 8-byte stride, same as the f32 path). + StackOp::F64ConstD(v) => [f64::to_bits(*v), 0, 0], + StackOp::LocalGetD(n) | StackOp::LocalSetD(n) | StackOp::LocalTeeD(n) => { + [(*n as u64) * 8, 0, 0] + } + StackOp::LoadF64OffD(o) | StackOp::StoreF64OffD(o) => [*o as i64 as u64, 0, 0], + // Double-window fused ops. get_get_dmul* carry two pre-shifted byte + // offsets; the sum/get-set chains carry raw u8 local indices (the + // handler shifts them); the compare-jumps carry an f64 const + off. 
+ StackOp::FusedGetGetDMulD(a, b) + | StackOp::FusedGetGetDMulDAddD(a, b) + | StackOp::FusedGetGetDMulDSubD(a, b) => [(*a as u64) * 8, (*b as u64) * 8, 0], + StackOp::FusedGetGetDMulSum2D(p, mask) => pack_u8_imms_with_tail(p, *mask), + StackOp::FusedGetGetDMulSum3D(p, mask) => pack_u8_imms_with_tail(p, *mask), + StackOp::FusedGetGetDMulSum4D(p, mask) => pack_u8_imms_with_tail(p, *mask), + StackOp::FusedGetGetDMulSum5D(p, mask) => pack_u8_imms_with_tail(p, *mask), + StackOp::FusedGetGetDMulSum6D(p, mask) => pack_u8_imms_with_tail(p, *mask), + StackOp::FusedGetGetDMulSum7D(p, mask) => pack_u8_imms_with_tail(p, *mask), + StackOp::FusedGetGetDMulSum8D(p, mask) => pack_u8_imms_with_tail(p, *mask), + StackOp::FusedGetSetD(src, dst) => pack_u8_imms(&[*src, *dst]), + StackOp::FusedGetSet2D(p) => pack_u8_imms(p), + StackOp::FusedGetSet3D(p) => pack_u8_imms(p), + StackOp::FusedGetSet4D(p) => pack_u8_imms(p), + StackOp::FusedGetSet5D(p) => pack_u8_imms(p), + StackOp::FusedGetSet6D(p) => pack_u8_imms(p), + StackOp::FusedGetSet7D(p) => pack_u8_imms(p), + StackOp::FusedGetSet8D(p) => pack_u8_imms(p), + StackOp::FusedF64ConstDGtJumpIfZeroD(v, off) => { + [f64::to_bits(*v), *off as i64 as u64, 0] + } + StackOp::FusedGetF64ConstDGtJumpIfZeroD(n, v, off) => { + [(*n as u64) * 8, f64::to_bits(*v), *off as i64 as u64] + } _ => [0, 0, 0], } } @@ -749,6 +941,7 @@ pub struct StackBackend { operand_stack: Vec, frame_stack: Vec, float_stack: Vec, + double_stack: Vec, ctx: Ctx, } @@ -793,6 +986,8 @@ impl StackBackend { let mut frame_stack: Vec = vec![0u64; frame_stack_cap]; let float_stack_cap: usize = 64 * 1024; let mut float_stack: Vec = vec![0.0f32; float_stack_cap]; + let double_stack_cap: usize = 64 * 1024; + let mut double_stack: Vec = vec![0.0f64; double_stack_cap]; let ctx = Ctx { call_stack: call_stack.as_mut_ptr(), @@ -807,6 +1002,8 @@ impl StackBackend { stack_base: operand_stack.as_mut_ptr(), float_stack: float_stack.as_mut_ptr(), float_stack_cap, + double_stack: 
double_stack.as_mut_ptr(), + double_stack_cap, closure_ptr: 0, result: 0, done: 0, @@ -825,6 +1022,7 @@ impl StackBackend { operand_stack, frame_stack, float_stack, + double_stack, ctx, } } @@ -869,6 +1067,7 @@ impl StackBackend { &self.operand_stack, &self.frame_stack, &self.float_stack, + &self.double_stack, ); result } diff --git a/src/stack_ir.rs b/src/stack_ir.rs index 5707e629..5e0d06d6 100644 --- a/src/stack_ir.rs +++ b/src/stack_ir.rs @@ -483,6 +483,121 @@ pub enum StackOp { /// if !(locals[n] > const) jump. Pop 0, conditionally jump. FusedGetF32ConstFGtJumpIfZeroF(u16, f32, i32), + // === Double-window (D) ops — the f64 analogue of the float window === + // f64 values live in a parallel 4-slot FP register window (d0..d3), + // spilling through `dfsp`. These mirror the F-window ops one-for-one + // so f64 arithmetic stays in FP registers and never pays the GPR↔FP + // crossing the old int-window f64 path forced on every op. + /// Push an f64 constant onto the double window. + F64ConstD(f64), + /// Push scalar local N (interpreted as f64) onto the double window. + LocalGetD(u16), + /// Pop top of double window into scalar local N (stored as f64 bits). + LocalSetD(u16), + /// Copy top of double window into scalar local N (don't pop). + LocalTeeD(u16), + /// Pop and discard top of double window. + DropD, + + // Double arithmetic (binary pop 2 push 1, all in d-window). + DAddD, + DSubD, + DMulD, + DDivD, + DPowD, + /// Pop 1 push 1 (negate) on d-window. + DNegD, + + // Double comparisons: pop 2 from d-window, push 0/1 to the int window. + DEqD, + DNeD, + DLtD, + DLeD, + DGtD, + DGeD, + + // Crossings between the double window and the int / float windows. + /// Pop d0 (f64), push i32 (truncated) to int window. + F64ToI32D, + /// Pop i32 from int window, push f64 to d-window. + I32ToF64D, + /// Pop d0, push its raw u64 bit pattern to the int window (call/return + /// bridge — mirrors FToBitsF). 
+ DToBitsD, + /// Pop u64 bits from int window, push as f64 to d-window (mirrors BitsToFF). + BitsToDD, + /// Pop f0 (f32) from the float window, push widened f64 to d-window. + F32ToF64D, + /// Pop d0 (f64) from the double window, push narrowed f32 to f-window. + F64ToF32D, + + // Double memory loads/stores: address comes from the int window, the + // value lives in the d-window. + LoadF64D, + LoadF64OffD(i32), + StoreF64D, + StoreF64OffD(i32), + + // Double math intrinsics (pop 1 push 1 in d-window unless noted). + SinF64D, + CosF64D, + TanF64D, + AsinF64D, + AcosF64D, + AtanF64D, + SinhF64D, + CoshF64D, + TanhF64D, + AsinhF64D, + AcoshF64D, + AtanhF64D, + LnF64D, + ExpF64D, + Exp2F64D, + Log10F64D, + Log2F64D, + SqrtF64D, + AbsF64D, + FloorF64D, + CeilF64D, + /// Pop d0, push i32 0/1 to int window. + IsnanF64D, + IsinfF64D, + /// Pop 2 from d-window, push 1 (atan2). + Atan2F64D, + /// Pop d0 and print it. + PrintF64D, + + // === Double-window fused superinstructions (mirror the F-window set) === + /// Push locals[a] * locals[b] onto the d-window. + FusedGetGetDMulD(u16, u16), + /// d0 += locals[a] * locals[b] (accumulate; net 0 on the d-window). + FusedGetGetDMulDAddD(u16, u16), + /// d0 -= locals[a] * locals[b]. + FusedGetGetDMulDSubD(u16, u16), + /// Fused multiply-accumulate sum: read N (local,local) pairs, multiply + /// each, sum with per-term add/sub from the mask, push one result. + FusedGetGetDMulSum2D([u8; 4], u8), + FusedGetGetDMulSum3D([u8; 6], u8), + FusedGetGetDMulSum4D([u8; 8], u8), + FusedGetGetDMulSum5D([u8; 10], u8), + FusedGetGetDMulSum6D([u8; 12], u8), + FusedGetGetDMulSum7D([u8; 14], u8), + FusedGetGetDMulSum8D([u8; 16], u8), + /// d-window variable-move chains: locals[dst] = locals[src] (×N). + FusedGetSetD(u8, u8), + FusedGetSet2D([u8; 4]), + FusedGetSet3D([u8; 6]), + FusedGetSet4D([u8; 8]), + FusedGetSet5D([u8; 10]), + FusedGetSet6D([u8; 12]), + FusedGetSet7D([u8; 14]), + FusedGetSet8D([u8; 16]), + /// if !(d0 > const) jump; pops d0. 
+ FusedF64ConstDGtJumpIfZeroD(f64, i32), + /// if !(locals[n] > const) jump. Pop 0, conditionally jump. + FusedGetF64ConstDGtJumpIfZeroD(u16, f64, i32), + Halt, Nop, } @@ -1034,6 +1149,146 @@ impl fmt::Display for StackOp { StackOp::FusedGetF32ConstFGtJumpIfZeroF(n, v, o) => { write!(f, "fw.fused.get_f32const_fgt_jiz {} {} {}", n, v, o) } + // === Double-window (D) ops === + StackOp::F64ConstD(v) => write!(f, "dw.f64.const {}", v), + StackOp::LocalGetD(n) => write!(f, "dw.local.get {}", n), + StackOp::LocalSetD(n) => write!(f, "dw.local.set {}", n), + StackOp::LocalTeeD(n) => write!(f, "dw.local.tee {}", n), + StackOp::DropD => write!(f, "dw.drop"), + StackOp::DAddD => write!(f, "dw.f64.add"), + StackOp::DSubD => write!(f, "dw.f64.sub"), + StackOp::DMulD => write!(f, "dw.f64.mul"), + StackOp::DDivD => write!(f, "dw.f64.div"), + StackOp::DPowD => write!(f, "dw.f64.pow"), + StackOp::DNegD => write!(f, "dw.f64.neg"), + StackOp::DEqD => write!(f, "dw.f64.eq"), + StackOp::DNeD => write!(f, "dw.f64.ne"), + StackOp::DLtD => write!(f, "dw.f64.lt"), + StackOp::DLeD => write!(f, "dw.f64.le"), + StackOp::DGtD => write!(f, "dw.f64.gt"), + StackOp::DGeD => write!(f, "dw.f64.ge"), + StackOp::F64ToI32D => write!(f, "dw.convert.f64_to_i32"), + StackOp::I32ToF64D => write!(f, "dw.convert.i32_to_f64"), + StackOp::DToBitsD => write!(f, "dw.to_bits"), + StackOp::BitsToDD => write!(f, "dw.from_bits"), + StackOp::F32ToF64D => write!(f, "dw.convert.f32_to_f64"), + StackOp::F64ToF32D => write!(f, "dw.convert.f64_to_f32"), + StackOp::LoadF64D => write!(f, "dw.f64.load"), + StackOp::LoadF64OffD(o) => write!(f, "dw.f64.load offset={}", o), + StackOp::StoreF64D => write!(f, "dw.f64.store"), + StackOp::StoreF64OffD(o) => write!(f, "dw.f64.store offset={}", o), + StackOp::SinF64D => write!(f, "dw.f64.sin"), + StackOp::CosF64D => write!(f, "dw.f64.cos"), + StackOp::TanF64D => write!(f, "dw.f64.tan"), + StackOp::AsinF64D => write!(f, "dw.f64.asin"), + StackOp::AcosF64D => write!(f, 
"dw.f64.acos"), + StackOp::AtanF64D => write!(f, "dw.f64.atan"), + StackOp::SinhF64D => write!(f, "dw.f64.sinh"), + StackOp::CoshF64D => write!(f, "dw.f64.cosh"), + StackOp::TanhF64D => write!(f, "dw.f64.tanh"), + StackOp::AsinhF64D => write!(f, "dw.f64.asinh"), + StackOp::AcoshF64D => write!(f, "dw.f64.acosh"), + StackOp::AtanhF64D => write!(f, "dw.f64.atanh"), + StackOp::LnF64D => write!(f, "dw.f64.ln"), + StackOp::ExpF64D => write!(f, "dw.f64.exp"), + StackOp::Exp2F64D => write!(f, "dw.f64.exp2"), + StackOp::Log10F64D => write!(f, "dw.f64.log10"), + StackOp::Log2F64D => write!(f, "dw.f64.log2"), + StackOp::SqrtF64D => write!(f, "dw.f64.sqrt"), + StackOp::AbsF64D => write!(f, "dw.f64.abs"), + StackOp::FloorF64D => write!(f, "dw.f64.floor"), + StackOp::CeilF64D => write!(f, "dw.f64.ceil"), + StackOp::IsnanF64D => write!(f, "dw.f64.isnan"), + StackOp::IsinfF64D => write!(f, "dw.f64.isinf"), + StackOp::Atan2F64D => write!(f, "dw.f64.atan2"), + StackOp::PrintF64D => write!(f, "dw.debug.print_f64"), + // === Double-window fused superinstructions === + StackOp::FusedGetGetDMulD(a, b) => write!(f, "dw.fused.get_get_dmul {} {}", a, b), + StackOp::FusedGetGetDMulDAddD(a, b) => { + write!(f, "dw.fused.get_get_dmul_dadd {} {}", a, b) + } + StackOp::FusedGetGetDMulDSubD(a, b) => { + write!(f, "dw.fused.get_get_dmul_dsub {} {}", a, b) + } + StackOp::FusedGetGetDMulSum2D(p, mask) => write!( + f, + "dw.fused.get_get_dmul_sum2 {} {} {} {} {}", + p[0], p[1], p[2], p[3], mask + ), + StackOp::FusedGetGetDMulSum3D(p, mask) => write!( + f, + "dw.fused.get_get_dmul_sum3 {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], mask + ), + StackOp::FusedGetGetDMulSum4D(p, mask) => write!( + f, + "dw.fused.get_get_dmul_sum4 {} {} {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], mask + ), + StackOp::FusedGetGetDMulSum5D(p, mask) => write!( + f, + "dw.fused.get_get_dmul_sum5 {} {} {} {} {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 
p[8], p[9], mask + ), + StackOp::FusedGetGetDMulSum6D(p, mask) => write!( + f, + "dw.fused.get_get_dmul_sum6 {} {} {} {} {} {} {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], mask + ), + StackOp::FusedGetGetDMulSum7D(p, mask) => write!( + f, + "dw.fused.get_get_dmul_sum7 {} {} {} {} {} {} {} {} {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], + p[13], mask + ), + StackOp::FusedGetGetDMulSum8D(p, mask) => write!( + f, + "dw.fused.get_get_dmul_sum8 {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], + p[13], p[14], p[15], mask + ), + StackOp::FusedGetSetD(src, dst) => write!(f, "dw.fused.get_set {} {}", src, dst), + StackOp::FusedGetSet2D(p) => { + write!(f, "dw.fused.get_set2 {} {} {} {}", p[0], p[1], p[2], p[3]) + } + StackOp::FusedGetSet3D(p) => write!( + f, + "dw.fused.get_set3 {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5] + ), + StackOp::FusedGetSet4D(p) => write!( + f, + "dw.fused.get_set4 {} {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7] + ), + StackOp::FusedGetSet5D(p) => write!( + f, + "dw.fused.get_set5 {} {} {} {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9] + ), + StackOp::FusedGetSet6D(p) => write!( + f, + "dw.fused.get_set6 {} {} {} {} {} {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11] + ), + StackOp::FusedGetSet7D(p) => write!( + f, + "dw.fused.get_set7 {} {} {} {} {} {} {} {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], + p[13] + ), + StackOp::FusedGetSet8D(p) => write!( + f, + "dw.fused.get_set8 {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {}", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], + p[13], p[14], p[15] + ), + 
StackOp::FusedF64ConstDGtJumpIfZeroD(v, o) => { + write!(f, "dw.fused.f64const_dgt_jiz {} {}", v, o) + } + StackOp::FusedGetF64ConstDGtJumpIfZeroD(n, v, o) => { + write!(f, "dw.fused.get_f64const_dgt_jiz {} {} {}", n, v, o) + } StackOp::Halt => write!(f, "halt"), StackOp::Nop => write!(f, "nop"), } diff --git a/src/stack_optimize.rs b/src/stack_optimize.rs index 28293d17..a6051286 100644 --- a/src/stack_optimize.rs +++ b/src/stack_optimize.rs @@ -39,6 +39,8 @@ fn compute_jump_targets(ops: &[StackOp]) -> Vec { | StackOp::FusedBoundsCheck8JumpIfZero(_, off) => Some(*off), StackOp::FusedF32ConstFGtJumpIfZeroF(_, off) => Some(*off), StackOp::FusedGetF32ConstFGtJumpIfZeroF(_, _, off) => Some(*off), + StackOp::FusedF64ConstDGtJumpIfZeroD(_, off) => Some(*off), + StackOp::FusedGetF64ConstDGtJumpIfZeroD(_, _, off) => Some(*off), _ => None, }; if let Some(off) = off { @@ -139,6 +141,130 @@ fn make_fused_get_get_fmul_sum_f(bytes: &[u8], sub_mask: u8) -> StackOp { } } +// === Double-window (D) chain builders — mirror the F-window helpers === + +fn make_fused_get_set_d(bytes: &[u8]) -> StackOp { + match bytes.len() { + 2 => StackOp::FusedGetSetD(bytes[0], bytes[1]), + 4 => StackOp::FusedGetSet2D([bytes[0], bytes[1], bytes[2], bytes[3]]), + 6 => StackOp::FusedGetSet3D([bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5]]), + 8 => StackOp::FusedGetSet4D([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ]), + 10 => StackOp::FusedGetSet5D([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + bytes[8], bytes[9], + ]), + 12 => StackOp::FusedGetSet6D([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + bytes[8], bytes[9], bytes[10], bytes[11], + ]), + 14 => StackOp::FusedGetSet7D([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], + ]), + 16 => StackOp::FusedGetSet8D([ + bytes[0], 
bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15], + ]), + _ => unreachable!("unsupported double get/set chain length"), + } +} + +fn packed_get_set_d_chain(ops: &[StackOp], start: usize, pairs: usize) -> Option> { + let mut bytes = Vec::with_capacity(pairs * 2); + for j in 0..pairs { + match (&ops[start + j * 2], &ops[start + j * 2 + 1]) { + (StackOp::LocalGetD(src), StackOp::LocalSetD(dst)) if *src < 256 && *dst < 256 => { + bytes.push(*src as u8); + bytes.push(*dst as u8); + } + _ => return None, + } + } + Some(bytes) +} + +fn make_fused_get_get_dmul_sum_d(bytes: &[u8], sub_mask: u8) -> StackOp { + match bytes.len() { + 4 => StackOp::FusedGetGetDMulSum2D([bytes[0], bytes[1], bytes[2], bytes[3]], sub_mask), + 6 => StackOp::FusedGetGetDMulSum3D( + [bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5]], + sub_mask, + ), + 8 => StackOp::FusedGetGetDMulSum4D( + [ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ], + sub_mask, + ), + 10 => StackOp::FusedGetGetDMulSum5D( + [ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + bytes[8], bytes[9], + ], + sub_mask, + ), + 12 => StackOp::FusedGetGetDMulSum6D( + [ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + bytes[8], bytes[9], bytes[10], bytes[11], + ], + sub_mask, + ), + 14 => StackOp::FusedGetGetDMulSum7D( + [ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], + ], + sub_mask, + ), + 16 => StackOp::FusedGetGetDMulSum8D( + [ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], + bytes[15], + ], + sub_mask, + ), + _ => unreachable!("unsupported double mul-sum chain length"), + } +} + +fn 
packed_dmul_sum_fused_chain( + ops: &[StackOp], + start: usize, + terms: usize, +) -> Option<(Vec, u8)> { + let mut bytes = Vec::with_capacity(terms * 2); + let mut sub_mask = 0u8; + + match &ops[start] { + StackOp::FusedGetGetDMulD(a, b) if *a < 256 && *b < 256 => { + bytes.push(*a as u8); + bytes.push(*b as u8); + } + _ => return None, + } + + for term in 1..terms { + match &ops[start + term] { + StackOp::FusedGetGetDMulDAddD(a, b) if *a < 256 && *b < 256 => { + bytes.push(*a as u8); + bytes.push(*b as u8); + } + StackOp::FusedGetGetDMulDSubD(a, b) if *a < 256 && *b < 256 => { + bytes.push(*a as u8); + bytes.push(*b as u8); + sub_mask |= 1 << term; + } + _ => return None, + } + } + + Some((bytes, sub_mask)) +} + fn make_fused_bounds_check_jiz(bytes: &[u8], off: i32) -> StackOp { match bytes.len() { 2 => StackOp::FusedBoundsCheck1JumpIfZero([bytes[0], bytes[1]], off), @@ -594,6 +720,186 @@ fn fuse(func: &mut StackFunction) { continue; } + // ================= Double-window (D) fusions ================= + // Mirror the F-window rules above. Ordered so the multi-term + // mul-sum chain (built from already-fused ops on a prior pass) is + // tried before the 4-op and 3-op builders, and the 4-op builders + // before the 3-op one so the longest match wins at each position. + + // Constant-fold fw.f32.const v + dw.convert.f32_to_f64 → dw.f64.const. + // Widening f32→f64 is exact, so this is always valid; it both folds + // the many `0.5 as f64`-style literal conversions and exposes the constant to + // the d-window compare-jump fusion below. + if i + 1 < len && !spans_target(i, 2) { + if let (StackOp::F32ConstF(v), StackOp::F32ToF64D) = (&ops[i], &ops[i + 1]) { + let v = *v as f64; + ops[i] = StackOp::F64ConstD(v); + ops[i + 1] = StackOp::Nop; + i += 2; + continue; + } + } + + // dw.fused.get_get_dmul[_dadd|_dsub]* chain → get_get_dmul_sumN. 
+ let mut fused_dmul_sum = false; + for terms in (2..=8).rev() { + let span = terms; + if i + span <= len && !spans_target(i, span) { + if let Some((pairs, sub_mask)) = packed_dmul_sum_fused_chain(ops, i, terms) { + ops[i] = make_fused_get_get_dmul_sum_d(&pairs, sub_mask); + for slot in ops.iter_mut().take(i + span).skip(i + 1) { + *slot = StackOp::Nop; + } + i += span; + fused_dmul_sum = true; + break; + } + } + } + if fused_dmul_sum { + continue; + } + + // dw.local.get a + dw.local.get b + dw.f64.mul + dw.f64.add + // → dw.fused.get_get_dmul_dadd. + if i + 3 < len && !spans_target(i, 4) { + if let ( + StackOp::LocalGetD(a), + StackOp::LocalGetD(b), + StackOp::DMulD, + StackOp::DAddD, + ) = (&ops[i], &ops[i + 1], &ops[i + 2], &ops[i + 3]) + { + ops[i] = StackOp::FusedGetGetDMulDAddD(*a, *b); + ops[i + 1] = StackOp::Nop; + ops[i + 2] = StackOp::Nop; + ops[i + 3] = StackOp::Nop; + i += 4; + continue; + } + } + + // dw.local.get a + dw.local.get b + dw.f64.mul + dw.f64.sub + // → dw.fused.get_get_dmul_dsub. + if i + 3 < len && !spans_target(i, 4) { + if let ( + StackOp::LocalGetD(a), + StackOp::LocalGetD(b), + StackOp::DMulD, + StackOp::DSubD, + ) = (&ops[i], &ops[i + 1], &ops[i + 2], &ops[i + 3]) + { + ops[i] = StackOp::FusedGetGetDMulDSubD(*a, *b); + ops[i + 1] = StackOp::Nop; + ops[i + 2] = StackOp::Nop; + ops[i + 3] = StackOp::Nop; + i += 4; + continue; + } + } + + // dw.local.get a + dw.local.get b + dw.f64.mul → dw.fused.get_get_dmul. + if i + 2 < len && !spans_target(i, 3) { + if let (StackOp::LocalGetD(a), StackOp::LocalGetD(b), StackOp::DMulD) = + (&ops[i], &ops[i + 1], &ops[i + 2]) + { + let a = *a; + let b = *b; + ops[i] = StackOp::FusedGetGetDMulD(a, b); + ops[i + 1] = StackOp::Nop; + ops[i + 2] = StackOp::Nop; + i += 3; + continue; + } + } + + // dw.f64.const v + dw.f64.gt + jump_if_zero off + // → FusedF64ConstDGtJumpIfZeroD. 
+ if i + 2 < len && !spans_target(i, 3) { + if let (StackOp::F64ConstD(v), StackOp::DGtD, StackOp::JumpIfZero(off)) = + (&ops[i], &ops[i + 1], &ops[i + 2]) + { + let v = *v; + let new_off = *off + 2; + ops[i] = StackOp::FusedF64ConstDGtJumpIfZeroD(v, new_off); + ops[i + 1] = StackOp::Nop; + ops[i + 2] = StackOp::Nop; + i += 3; + continue; + } + } + + // dw.local.get n + FusedF64ConstDGtJumpIfZeroD v off + // → FusedGetF64ConstDGtJumpIfZeroD. + if i + 1 < len && !spans_target(i, 2) { + if let (StackOp::LocalGetD(n), StackOp::FusedF64ConstDGtJumpIfZeroD(v, off)) = + (&ops[i], &ops[i + 1]) + { + let n = *n; + let v = *v; + let new_off = *off + 1; + ops[i] = StackOp::FusedGetF64ConstDGtJumpIfZeroD(n, v, new_off); + ops[i + 1] = StackOp::Nop; + i += 2; + continue; + } + } + + // dw.local.tee N + dw.drop → dw.local.set N. + if i + 1 < len && !spans_target(i, 2) { + if let StackOp::LocalTeeD(n) = ops[i] { + if matches!(ops[i + 1], StackOp::DropD) { + ops[i] = StackOp::LocalSetD(n); + ops[i + 1] = StackOp::Nop; + i += 2; + continue; + } + } + } + + // dw.local.get + dw.drop → nop (dead read). + if i + 1 < len && !spans_target(i, 2) { + if matches!(ops[i], StackOp::LocalGetD(_)) && matches!(ops[i + 1], StackOp::DropD) { + ops[i] = StackOp::Nop; + ops[i + 1] = StackOp::Nop; + i += 2; + continue; + } + } + + // dw.local.get src + dw.local.set dst (×N) → FusedGetSetND (moves). + let mut fused_get_set_d_chain = false; + for pairs in (2..=8).rev() { + let span = pairs * 2; + if i + span <= len && !spans_target(i, span) { + if let Some(bytes) = packed_get_set_d_chain(ops, i, pairs) { + ops[i] = make_fused_get_set_d(&bytes); + for slot in ops.iter_mut().take(i + span).skip(i + 1) { + *slot = StackOp::Nop; + } + i += span; + fused_get_set_d_chain = true; + break; + } + } + } + if fused_get_set_d_chain { + continue; + } + // Single dw.local.get + dw.local.set → FusedGetSetD. 
+ if i + 1 < len && !spans_target(i, 2) { + if let (StackOp::LocalGetD(src), StackOp::LocalSetD(dst)) = (&ops[i], &ops[i + 1]) { + if *src < 256 && *dst < 256 { + let src = *src as u8; + let dst = *dst as u8; + ops[i] = StackOp::FusedGetSetD(src, dst); + ops[i + 1] = StackOp::Nop; + i += 2; + continue; + } + } + } + // Float-window: fw.local.get a + fw.local.get b + fw.f32.mul + fw.f32.add // → fw.fused.get_get_fmul_fadd. if i + 3 < len && !spans_target(i, 4) { @@ -1111,6 +1417,18 @@ fn strip_nops(func: &mut StackFunction) { let new_off = target_new as i32 - new_idx[old] as i32 - 1; StackOp::FusedGetF32ConstFGtJumpIfZeroF(*n, *v, new_off) } + StackOp::FusedF64ConstDGtJumpIfZeroD(v, off) => { + let target_old = (old as i64 + 1 + *off as i64) as usize; + let target_new = new_idx[target_old]; + let new_off = target_new as i32 - new_idx[old] as i32 - 1; + StackOp::FusedF64ConstDGtJumpIfZeroD(*v, new_off) + } + StackOp::FusedGetF64ConstDGtJumpIfZeroD(n, v, off) => { + let target_old = (old as i64 + 1 + *off as i64) as usize; + let target_new = new_idx[target_old]; + let new_off = target_new as i32 - new_idx[old] as i32 - 1; + StackOp::FusedGetF64ConstDGtJumpIfZeroD(*n, *v, new_off) + } other => other.clone(), }; new_ops.push(adjusted); diff --git a/tests/cases/f64_window.lyte b/tests/cases/f64_window.lyte new file mode 100644 index 00000000..7485b278 --- /dev/null +++ b/tests/cases/f64_window.lyte @@ -0,0 +1,79 @@ +// f64 double-window coverage: arithmetic, all comparisons, the fused +// multiply-add sum chain, f64 across function calls (arg/return bridging), +// deep expression chains that spill the d-window, conversions, math +// intrinsics, and negation. Exercises the StoreF64/LoadF64 d-window paths +// via a struct as well. 
+// +// expected stdout: +// compilation successful +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// assert(true) +// 42 + +struct Acc { a: f64, b: f64, c: f64 } + +approx(x: f64, y: f64) -> bool { + var d = x - y + if d < 0.0 as f64 { d = 0.0 as f64 - d } + return d < 0.0001 as f64 +} + +// f64 argument and return value bridging across a call. +fma(a: f64, b: f64, c: f64) -> f64 { + return a * b + c +} + +main { + var x = 3.0 as f64 + var y = 2.0 as f64 + + // Arithmetic. + assert(approx(x + y, 5.0 as f64)) + assert(approx(x - y, 1.0 as f64)) + assert(approx(x * y, 6.0 as f64)) + assert(approx(x / y, 1.5 as f64)) + assert(approx(0.0 as f64 - x, -3.0 as f64)) + + // All six comparisons. + assert(x > y) + assert(y < x) + assert(x >= 3.0 as f64) + assert(y <= 2.0 as f64) + assert(x == 3.0 as f64) + assert(x != y) + + // Fused multiply-add sum chain: b0*x + b1*x1 - b2*x2. + var b0 = 1.0 as f64 + var b1 = 2.0 as f64 + var b2 = 0.5 as f64 + var x1 = 4.0 as f64 + var x2 = 6.0 as f64 + var sum = b0 * x + b1 * x1 - b2 * x2 + // 1*3 + 2*4 - 0.5*6 = 3 + 8 - 3 = 8 + assert(approx(sum, 8.0 as f64)) + + // f64 across a function call (argument + return bridging). + assert(approx(fma(x, y, 1.0 as f64), 7.0 as f64)) + + // Math intrinsic + struct store/load through the d-window. + var acc: Acc + acc.a = sqrt(16.0 as f64) + acc.b = acc.a * 2.0 as f64 + acc.c = acc.a + acc.b + assert(approx(acc.c, 12.0 as f64)) + + // Conversion round trip f64 -> i32. + print((x * y * 7.0 as f64) as i32) +}