From 72b5e1eccb9daec25e7e2ec3e801ba6ca6e946e5 Mon Sep 17 00:00:00 2001
From: deftio <deftio@deftio.com>
Date: Tue, 21 Apr 2026 06:36:22 -0700
Subject: [PATCH 1/6] updated accuracy tables, also updated cross builds

---
 README.md                                   |  34 +-
 VERSION                                     |   2 +-
 docs/README.md                              |  33 +-
 docs/building.md                            |  63 +++-
 examples/posix-example/FR_Math_Example1.cpp |   2 +-
 idf_component.yml                           |   2 +-
 library.json                                |   2 +-
 library.properties                          |   2 +-
 llms.txt                                    |   2 +-
 makefile                                    |  38 ++-
 pages/assets/site.js                        |   2 +-
 pages/guide/building.html                   |  64 +++-
 pages/index.html                            |  40 ++-
 scripts/accuracy_report.sh                  | 150 +++++++++
 scripts/crossbuild-docker.sh                | 123 ++++++++
 src/FR_defs.h                               |  34 +-
 src/FR_math.c                               | 101 ++++--
 src/FR_math.h                               |   4 +-
 src/FR_math_2D.cpp                          |   2 +-
 src/FR_math_2D.h                            |   2 +-
 tests/test_full_coverage.c                  |   8 +-
 tests/test_tdd.cpp                          | 324 ++++++++++++++++++++
 tools/make_release.sh                       |   7 +-
 23 files changed, 921 insertions(+), 120 deletions(-)
 create mode 100755 scripts/accuracy_report.sh
 create mode 100755 scripts/crossbuild-docker.sh

diff --git a/README.md b/README.md
index e1b4e87..ffc2c10 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 [![CI](https://github.com/deftio/fr_math/actions/workflows/ci.yml/badge.svg)](https://github.com/deftio/fr_math/actions/workflows/ci.yml)
 [![Coverage](https://img.shields.io/badge/coverage-99%25-brightgreen.svg)](#building-and-testing)
 [![Docs](https://img.shields.io/badge/docs-online-blue.svg)](https://deftio.github.io/fr_math/)
-[![Version](https://img.shields.io/badge/version-2.0.5-blue.svg)](release_notes.md)
+[![Version](https://img.shields.io/badge/version-2.0.6-blue.svg)](release_notes.md)
 
 # FR_Math: A C Language Fixed-Point Math Library for Embedded Systems
 
@@ -40,17 +40,27 @@ See [`docker/`](docker/) for the cross-compile setup.
 
 Errors below are measured at Q16.16 (s15.16). All functions accept any
 radix — Q16.16 is just the reference point for the table.
-
-| Function | Max error | Note |
-|---|---|---|
-| sin / cos | 5 LSB (~7.7e-5) | Exact at 0, 90, 180, 270 |
-| sqrt | ≤ 0.5 LSB | Round-to-nearest |
-| log2 | ≤ 4 LSB | 65-entry mantissa table |
-| pow2 | ≤ 1 LSB (integers exact) | 65-entry fraction table |
-| ln, log10 | ≤ 4 LSB | Via FR_MULK28 from log2 |
-| hypot (exact) | ≤ 0.5 LSB | 64-bit intermediate |
-| hypot_fast (4-seg) | 0.34% | Shift-only, no multiply |
-| hypot_fast8 (8-seg) | 0.10% | Shift-only, no multiply |
+Percent errors skip expected values near zero (|expected| < 0.01).
+
+<!-- ACCURACY_TABLE_START -->
+| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
+|---|---:|---:|---:|---|
+| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
+| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
+| asin / acos | 512.6 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
+| atan2 | 44.7 | 2.9913 | 0.0383 | 65536x5 radii + specials |
+| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
+| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
+| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
+| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
+| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
+| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
+| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
+| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
+| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
+| hypot_fast (4-seg) | 216112.8 | 0.3418 | 0.1923 | Shift-only, no multiply |
+| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
+<!-- ACCURACY_TABLE_END -->
 
 ### What's in the box
 
diff --git a/VERSION b/VERSION
index e010258..157e54f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.0.5
+2.0.6
diff --git a/docs/README.md b/docs/README.md
index 1d4a7c9..896de6e 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -43,18 +43,27 @@ or any tooling. If you want the browser version, look in
 Errors below are measured at Q16.16 (s15.16). All functions accept any
 radix — Q16.16 is just the reference point for the table. See the
 [TDD report](../build/test_tdd_report.md) for sweeps at radixes 8, 12,
-16, and 24.
-
-| Function | Max error | Note |
-|---|---|---|
-| sin / cos | 5 LSB (~7.7e-5) | Exact at 0, 90, 180, 270 |
-| sqrt | ≤ 0.5 LSB | Round-to-nearest |
-| log2 | ≤ 4 LSB | 65-entry mantissa table |
-| pow2 | ≤ 1 LSB (integers exact) | 65-entry fraction table |
-| ln, log10 | ≤ 4 LSB | Via FR_MULK28 from log2 |
-| hypot (exact) | ≤ 0.5 LSB | 64-bit intermediate |
-| hypot_fast (4-seg) | 0.34% | Shift-only, no multiply |
-| hypot_fast8 (8-seg) | 0.10% | Shift-only, no multiply |
+16, and 24. Percent errors skip expected values near zero (|expected| < 0.01).
+
+<!-- ACCURACY_TABLE_START -->
+| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
+|---|---:|---:|---:|---|
+| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
+| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
+| asin / acos | 512.6 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
+| atan2 | 44.7 | 2.9913 | 0.0383 | 65536x5 radii + specials |
+| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
+| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
+| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
+| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
+| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
+| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
+| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
+| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
+| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
+| hypot_fast (4-seg) | 216112.8 | 0.3418 | 0.1923 | Shift-only, no multiply |
+| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
+<!-- ACCURACY_TABLE_END -->
 
 ## What's in the box
 
diff --git a/docs/building.md b/docs/building.md
index 81de9bb..2265b69 100644
--- a/docs/building.md
+++ b/docs/building.md
@@ -134,22 +134,63 @@ you improved a polynomial approximation), update the pinned values in
 
 The library has no CPU-specific code. It compiles and runs
 identically on all of the targets listed below. The only requirement
-is an integer pipeline and the standard `<stdint.h>`
-header. You do *not* need a floating-point unit, and you do
-*not* need `libm`.
+is an integer pipeline and `<stdint.h>` (or define `FR_NO_STDINT`
+for bare-metal toolchains that lack it — `FR_defs.h` provides
+fallback typedefs). You do *not* need a floating-point unit, and
+you do *not* need `libm`.
 
 | Target | Toolchain | Tested? |
 | --- | --- | --- |
-| x86 / x86_64 Linux | `gcc`, `clang` | CI. |
+| x86 / x86_64 Linux | `gcc`, `clang`, `tcc` | CI + Docker. |
 | macOS arm64 / x86_64 | Apple `clang` | CI. |
 | Windows x86_64 | MSVC, `clang-cl`, MinGW | Manual. |
-| ARM Cortex-M0/M3/M4/M7 | `arm-none-eabi-gcc`, IAR, Keil | Manual. |
-| RISC-V rv32imc | `riscv32-unknown-elf-gcc` | Manual. |
-| AVR (ATmega328P, etc.) | `avr-gcc` | Manual. |
+| AArch64 (ARM64) | `aarch64-linux-gnu-gcc` | Docker. |
+| ARM32 / Thumb | `arm-none-eabi-gcc`, IAR, Keil | Docker. |
+| RISC-V rv64 / rv32 | `riscv64-linux-gnu-gcc`, `riscv64-unknown-elf-gcc` | Docker. |
+| AVR (ATmega328P, ATtiny85) | `avr-gcc` | Docker. |
 | Arduino (AVR, SAMD, etc.) | `arduino-cli` | Manual. |
-| MSP430 | `msp430-elf-gcc` | Manual. |
+| MSP430 | `msp430-gcc` | Docker. |
+| Motorola 68k | `m68k-linux-gnu-gcc` | Docker. |
+| Motorola 68HC11 | `m68hc11-gcc` | Docker. |
+| PowerPC | `powerpc-linux-gnu-gcc` | Docker. |
+| Xtensa LX106 (ESP8266) | `xtensa-lx106-elf-gcc` | Docker. |
 | 8051 | `sdcc` | Manual. |
 
+### Code size (.text section, compiled with `-Os`)
+
+All sizes are for the complete `FR_math.c` — every function
+included, nothing stripped.  With `-ffunction-sections` and
+linker `--gc-sections`, only the functions your application
+references are linked, so real flash usage will be smaller.
+
+<!-- SIZE_TABLE_START -->
+| Target | .text (bytes) |
+|---|---:|
+| GCC ARM32 Thumb | 4,530 |
+| GCC RISC-V (rv64) | 4,830 |
+| GCC RISC-V (rv32) | 5,068 |
+| GCC Xtensa LX106 (ESP8266) | 5,548 |
+| GCC ARM32 | 5,820 |
+| GCC m68k | 5,626 |
+| GCC x86-64 | 6,130 |
+| Clang x86-64 | 6,728 |
+| GCC AArch64 (ARM64) | 6,396 |
+| GCC x86-32 | 7,362 |
+| GCC PowerPC | 7,940 |
+| GCC MSP430 | 9,958 |
+| TCC x86 | 10,190 |
+| GCC AVR5 (ATmega328P) | 11,912 |
+| GCC AVR ATtiny85 | 12,410 |
+| GCC 68HC11 | 17,331 |
+<!-- SIZE_TABLE_END -->
+
+To regenerate this table, run the Docker cross-build
+(requires the [xelp](https://github.com/deftio/xelp) Docker image):
+
+```bash
+scripts/crossbuild-docker.sh
+```
+
 ### Example: RISC-V
 
 ```bash
@@ -180,9 +221,9 @@ arduino-cli compile --fqbn arduino:avr:uno examples/trig-functions
 arduino-cli compile --fqbn arduino:avr:uno examples/wave-generators
 ```
 
-Expect the whole integer-only library to land around a few
-kilobytes of flash. The wave, trig, and log modules can be compiled
-in independently if you want to strip further.
+See the [code size table](#code-size-text-section-compiled-with--os) above
+for exact numbers. With linker dead-code elimination, only the
+functions you call are linked.
 
 ## CI
 
diff --git a/examples/posix-example/FR_Math_Example1.cpp b/examples/posix-example/FR_Math_Example1.cpp
index 3d2edee..6a7e42f 100644
--- a/examples/posix-example/FR_Math_Example1.cpp
+++ b/examples/posix-example/FR_Math_Example1.cpp
@@ -249,7 +249,7 @@ int putSingleChar(char x)
 }
 //===============================================
 // main program for testing the functions
-int main(int argc, char *argv[])
+int main(int /*argc*/, char * /*argv*/[])
 {
 	int ret_val = 0;
 	int i;
diff --git a/idf_component.yml b/idf_component.yml
index e9083b8..d5c5d11 100644
--- a/idf_component.yml
+++ b/idf_component.yml
@@ -1,4 +1,4 @@
-version: "2.0.5"
+version: "2.0.6"
 description: "Compact fixed-point math library for embedded systems. Integer-only with caller-selectable radix. Trig, log/exp, sqrt, hypot, wave generators, ADSR, and 2D transforms. Zero dependencies."
 url: "https://github.com/deftio/fr_math"
 repository: "https://github.com/deftio/fr_math.git"
diff --git a/library.json b/library.json
index a9a743c..daa7684 100644
--- a/library.json
+++ b/library.json
@@ -1,6 +1,6 @@
 {
     "name": "FR_Math",
-    "version": "2.0.5",
+    "version": "2.0.6",
     "description": "Compact fixed-point math library for embedded systems. Integer-only with caller-selectable radix. Trig, log/exp, sqrt, hypot, wave generators, ADSR, and 2D transforms in 4KB of flash. Zero dependencies.",
     "keywords": [
         "fixed-point",
diff --git a/library.properties b/library.properties
index ac2b952..47d834a 100644
--- a/library.properties
+++ b/library.properties
@@ -1,5 +1,5 @@
 name=FR_Math
-version=2.0.5
+version=2.0.6
 author=M. A. Chatterjee <deftio@deftio.com>
 maintainer=M. A. Chatterjee <deftio@deftio.com>
 sentence=Compact fixed-point math library for embedded systems. 4KB flash, zero dependencies, any radix.
diff --git a/llms.txt b/llms.txt
index bd23772..33e0b78 100644
--- a/llms.txt
+++ b/llms.txt
@@ -9,7 +9,7 @@ or libraries. Pure C99, zero dependencies beyond `<stdint.h>`.
 - Repository: https://github.com/deftio/fr_math
 - Documentation: https://deftio.github.io/fr_math/
 - License: BSD-2-Clause
-- Version: 2.0.5
+- Version: 2.0.6
 
 ## Key concept: radix parameter
 
diff --git a/makefile b/makefile
index a609532..307b0dd 100644
--- a/makefile
+++ b/makefile
@@ -14,8 +14,11 @@ EXAMPLE_DIR = examples
 BUILD_DIR = build
 COV_DIR = coverage
 
-# Compiler flags
-CFLAGS = -I$(SRC_DIR) -Wall -Os
+# Compiler flags — full warnings, fail on any warning
+# LIB_WARN: strictest for library source (includes -Wconversion -Wpedantic)
+# CFLAGS:   for tests/examples (no -Wconversion/-Wpedantic — macro casts are intentional)
+LIB_WARN = -Wall -Wextra -Wpedantic -Wshadow -Wconversion -Werror
+CFLAGS   = -I$(SRC_DIR) -Wall -Wextra -Wshadow -Werror -Os
 CXXFLAGS = $(CFLAGS)
 TEST_FLAGS = -ftest-coverage -fprofile-arcs
 LDFLAGS = -lm
@@ -43,10 +46,10 @@ dirs:
 lib: dirs $(BUILD_DIR)/FR_math.o $(BUILD_DIR)/FR_math_2D.o
 
 $(BUILD_DIR)/FR_math.o: $(SRC_DIR)/FR_math.c $(HEADERS)
-	$(CC) $(CFLAGS) -c $< -o $@
+	$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os -c $< -o $@
 
 $(BUILD_DIR)/FR_math_2D.o: $(SRC_DIR)/FR_math_2D.cpp $(HEADERS)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+	$(CXX) -I$(SRC_DIR) $(LIB_WARN) -Os -c $< -o $@
 
 # Build examples
 .PHONY: examples
@@ -66,8 +69,8 @@ test-tdd: $(BUILD_DIR)/test_tdd
 	@echo "Report written to $(BUILD_DIR)/test_tdd_report.md"
 
 $(BUILD_DIR)/test_tdd: $(TEST_DIR)/test_tdd.cpp $(SRC_DIR)/FR_math.c $(SRC_DIR)/FR_math_2D.cpp
-	$(CC) $(CFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/test_tdd_FR_math.o
-	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/test_tdd_FR_math_2D.o
+	$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/test_tdd_FR_math.o
+	$(CXX) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/test_tdd_FR_math_2D.o
 	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) $(TEST_DIR)/test_tdd.cpp $(BUILD_DIR)/test_tdd_FR_math.o $(BUILD_DIR)/test_tdd_FR_math_2D.o $(LDFLAGS) -o $@
 
 .PHONY: test-basic
@@ -107,7 +110,10 @@ $(BUILD_DIR)/test_comprehensive: $(TEST_DIR)/test_comprehensive.c $(SRC_DIR)/FR_
 	$(CC) $(CFLAGS) $(TEST_FLAGS) $^ $(LDFLAGS) -o $@
 
 $(BUILD_DIR)/test_2d: $(TEST_DIR)/test_2d_math.c $(SRC_DIR)/FR_math.c $(SRC_DIR)/FR_math_2D.cpp
-	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) $^ $(LDFLAGS) -o $@
+	$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/test_2d_FR_math.o
+	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/test_2d_FR_math_2D.o
+	$(CC) $(CFLAGS) $(TEST_FLAGS) -c $(TEST_DIR)/test_2d_math.c -o $(BUILD_DIR)/test_2d_math.o
+	$(CXX) $(TEST_FLAGS) $(BUILD_DIR)/test_2d_math.o $(BUILD_DIR)/test_2d_FR_math.o $(BUILD_DIR)/test_2d_FR_math_2D.o $(LDFLAGS) -o $@
 
 $(BUILD_DIR)/test_overflow: $(TEST_DIR)/test_overflow_saturation.c $(SRC_DIR)/FR_math.c
 	$(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@
@@ -116,7 +122,15 @@ $(BUILD_DIR)/test_full: $(TEST_DIR)/test_full_coverage.c $(SRC_DIR)/FR_math.c
 	$(CC) $(CFLAGS) $(TEST_FLAGS) $^ $(LDFLAGS) -o $@
 
 $(BUILD_DIR)/test_2d_complete: $(TEST_DIR)/test_2d_complete.cpp $(SRC_DIR)/FR_math.c $(SRC_DIR)/FR_math_2D.cpp
-	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) $^ $(LDFLAGS) -o $@
+	$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/test_2dc_FR_math.o
+	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/test_2dc_FR_math_2D.o
+	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) $(TEST_DIR)/test_2d_complete.cpp $(BUILD_DIR)/test_2dc_FR_math.o $(BUILD_DIR)/test_2dc_FR_math_2D.o $(LDFLAGS) -o $@
+
+# Accuracy summary table (extract from test_tdd output)
+.PHONY: accuracy
+accuracy: dirs $(BUILD_DIR)/test_tdd
+	@echo "Running accuracy report..."
+	@./$(BUILD_DIR)/test_tdd 2>/dev/null | sed -n '/ACCURACY_TABLE_START/,/ACCURACY_TABLE_END/p'
 
 # Coverage report using gcov (no external dependencies)
 .PHONY: coverage
@@ -127,8 +141,8 @@ coverage:
 .PHONY: coverage-html
 coverage-html: clean dirs
 	@echo "Building with coverage flags..."
-	@$(CC) $(CFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/FR_math.o
-	@$(CXX) $(CXXFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/FR_math_2D.o
+	@$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/FR_math.o
+	@$(CXX) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/FR_math_2D.o
 	@$(CC) $(CFLAGS) $(TEST_FLAGS) $(TEST_DIR)/fr_math_test.c $(BUILD_DIR)/FR_math.o $(BUILD_DIR)/FR_math_2D.o $(LDFLAGS) -lstdc++ -o $(BUILD_DIR)/fr_test
 	@echo "Running tests for coverage..."
 	@./$(BUILD_DIR)/fr_test
@@ -173,8 +187,8 @@ cleanall: clean
 .PHONY: coverage-basic
 coverage-basic: clean dirs
 	@echo "Building with coverage flags..."
-	@$(CC) $(CFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/FR_math.o
-	@$(CXX) $(CXXFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/FR_math_2D.o
+	@$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/FR_math.o
+	@$(CXX) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/FR_math_2D.o
 	@$(CC) $(CFLAGS) $(TEST_FLAGS) $(TEST_DIR)/fr_math_test.c $(BUILD_DIR)/FR_math.o $(BUILD_DIR)/FR_math_2D.o $(LDFLAGS) -lstdc++ -o $(BUILD_DIR)/fr_test
 	@echo "Running tests..."
 	@./$(BUILD_DIR)/fr_test
diff --git a/pages/assets/site.js b/pages/assets/site.js
index 8fff01d..8f5df7c 100644
--- a/pages/assets/site.js
+++ b/pages/assets/site.js
@@ -16,7 +16,7 @@
    ════════════════════════════════════════════════════════════════════ */
 
 (function () {
-    var FR_VERSION = 'v2.0.5';
+    var FR_VERSION = 'v2.0.6';
 
     // Detect whether we're a top-level page or inside guide/.
     // Works for both file:// and http(s):// because we look for the
diff --git a/pages/guide/building.html b/pages/guide/building.html
index 3676403..f63195f 100644
--- a/pages/guide/building.html
+++ b/pages/guide/building.html
@@ -152,25 +152,68 @@ <h2>Cross-compilation</h2>
 
 <p>The library has no CPU-specific code. It compiles and runs
 identically on all of the targets listed below. The only requirement
-is an integer pipeline and the standard <code>&lt;stdint.h&gt;</code>
-header. You do <em>not</em> need a floating-point unit, and you do
-<em>not</em> need <code>libm</code>.</p>
+is an integer pipeline and <code>&lt;stdint.h&gt;</code> (or define
+<code>FR_NO_STDINT</code> for bare-metal toolchains that lack it &mdash;
+<code>FR_defs.h</code> provides fallback typedefs). You do <em>not</em>
+need a floating-point unit, and you do <em>not</em> need
+<code>libm</code>.</p>
 
 <table>
 <thead><tr><th>Target</th><th>Toolchain</th><th>Tested?</th></tr></thead>
 <tbody>
-<tr><td>x86 / x86_64 Linux</td><td><code>gcc</code>, <code>clang</code></td><td>CI.</td></tr>
+<tr><td>x86 / x86_64 Linux</td><td><code>gcc</code>, <code>clang</code>, <code>tcc</code></td><td>CI + Docker.</td></tr>
 <tr><td>macOS arm64 / x86_64</td><td>Apple <code>clang</code></td><td>CI.</td></tr>
 <tr><td>Windows x86_64</td><td>MSVC, <code>clang-cl</code>, MinGW</td><td>Manual.</td></tr>
-<tr><td>ARM Cortex-M0/M3/M4/M7</td><td><code>arm-none-eabi-gcc</code>, IAR, Keil</td><td>Manual.</td></tr>
-<tr><td>RISC-V rv32imc</td><td><code>riscv32-unknown-elf-gcc</code></td><td>Manual.</td></tr>
-<tr><td>AVR (ATmega328P, etc.)</td><td><code>avr-gcc</code></td><td>Manual.</td></tr>
+<tr><td>AArch64 (ARM64)</td><td><code>aarch64-linux-gnu-gcc</code></td><td>Docker.</td></tr>
+<tr><td>ARM32 / Thumb</td><td><code>arm-none-eabi-gcc</code>, IAR, Keil</td><td>Docker.</td></tr>
+<tr><td>RISC-V rv64 / rv32</td><td><code>riscv64-linux-gnu-gcc</code>, <code>riscv64-unknown-elf-gcc</code></td><td>Docker.</td></tr>
+<tr><td>AVR (ATmega328P, ATtiny85)</td><td><code>avr-gcc</code></td><td>Docker.</td></tr>
 <tr><td>Arduino (AVR, SAMD, etc.)</td><td><code>arduino-cli</code></td><td>Manual.</td></tr>
-<tr><td>MSP430</td><td><code>msp430-elf-gcc</code></td><td>Manual.</td></tr>
+<tr><td>MSP430</td><td><code>msp430-gcc</code></td><td>Docker.</td></tr>
+<tr><td>Motorola 68k</td><td><code>m68k-linux-gnu-gcc</code></td><td>Docker.</td></tr>
+<tr><td>Motorola 68HC11</td><td><code>m68hc11-gcc</code></td><td>Docker.</td></tr>
+<tr><td>PowerPC</td><td><code>powerpc-linux-gnu-gcc</code></td><td>Docker.</td></tr>
+<tr><td>Xtensa LX106 (ESP8266)</td><td><code>xtensa-lx106-elf-gcc</code></td><td>Docker.</td></tr>
 <tr><td>8051</td><td><code>sdcc</code></td><td>Manual.</td></tr>
 </tbody>
 </table>
 
+<h3>Code size (<code>.text</code> section, compiled with <code>-Os</code>)</h3>
+
+<p>All sizes are for the complete <code>FR_math.c</code> &mdash; every function
+included, nothing stripped. With <code>-ffunction-sections</code> and
+linker <code>--gc-sections</code>, only the functions your application
+references are linked, so real flash usage will be smaller.</p>
+
+<!-- SIZE_TABLE_START -->
+<table>
+<thead><tr><th>Target</th><th>.text (bytes)</th></tr></thead>
+<tbody>
+<tr><td>GCC ARM32 Thumb</td><td>4,530</td></tr>
+<tr><td>GCC RISC-V (rv64)</td><td>4,830</td></tr>
+<tr><td>GCC RISC-V (rv32)</td><td>5,068</td></tr>
+<tr><td>GCC Xtensa LX106 (ESP8266)</td><td>5,548</td></tr>
+<tr><td>GCC ARM32</td><td>5,820</td></tr>
+<tr><td>GCC m68k</td><td>5,626</td></tr>
+<tr><td>GCC x86-64</td><td>6,130</td></tr>
+<tr><td>Clang x86-64</td><td>6,728</td></tr>
+<tr><td>GCC AArch64 (ARM64)</td><td>6,396</td></tr>
+<tr><td>GCC x86-32</td><td>7,362</td></tr>
+<tr><td>GCC PowerPC</td><td>7,940</td></tr>
+<tr><td>GCC MSP430</td><td>9,958</td></tr>
+<tr><td>TCC x86</td><td>10,190</td></tr>
+<tr><td>GCC AVR5 (ATmega328P)</td><td>11,912</td></tr>
+<tr><td>GCC AVR ATtiny85</td><td>12,410</td></tr>
+<tr><td>GCC 68HC11</td><td>17,331</td></tr>
+</tbody>
+</table>
+<!-- SIZE_TABLE_END -->
+
+<p>To regenerate this table, run the Docker cross-build
+(requires the <a href="https://github.com/deftio/xelp">xelp</a> Docker image):</p>
+
+<pre><code class="language-bash">scripts/crossbuild-docker.sh</code></pre>
+
 <h3>Example: RISC-V</h3>
 
 <pre><code class="language-bash">riscv32-unknown-elf-gcc -Os -ffunction-sections -fdata-sections \
@@ -195,9 +238,8 @@ <h3>Example: Arduino</h3>
 arduino-cli compile --fqbn arduino:avr:uno examples/trig-functions
 arduino-cli compile --fqbn arduino:avr:uno examples/wave-generators</code></pre>
 
-<p>Expect the whole integer-only library to land around a few
-kilobytes of flash. The wave, trig, and log modules can be compiled
-in independently if you want to strip further.</p>
+<p>See the code size table above for exact numbers. With linker
+dead-code elimination, only the functions you call are linked.</p>
 
 <h2>CI</h2>
 
diff --git a/pages/index.html b/pages/index.html
index 04ba463..d25e5fe 100644
--- a/pages/index.html
+++ b/pages/index.html
@@ -47,21 +47,31 @@ <h2>Measured accuracy</h2>
 <p>Errors below are measured at Q16.16 (s15.16). All functions accept any
 radix &mdash; Q16.16 is just the reference point for the table.
 See the <a href="https://github.com/deftio/fr_math/blob/master/build/test_tdd_report.md">TDD
-report</a> for sweeps at radixes 8, 12, 16, and 24.</p>
-
-<table>
-<thead><tr><th>Function</th><th>Max error</th><th>Note</th></tr></thead>
-<tbody>
-<tr><td>sin / cos</td><td>5 LSB (~7.7e-5)</td><td>Exact at 0, 90, 180, 270</td></tr>
-<tr><td>sqrt</td><td>&le; 0.5 LSB</td><td>Round-to-nearest</td></tr>
-<tr><td>log2</td><td>&le; 4 LSB</td><td>65-entry mantissa table</td></tr>
-<tr><td>pow2</td><td>&le; 1 LSB (integers exact)</td><td>65-entry fraction table</td></tr>
-<tr><td>ln, log10</td><td>&le; 4 LSB</td><td>Via FR_MULK28 from log2</td></tr>
-<tr><td>hypot (exact)</td><td>&le; 0.5 LSB</td><td>64-bit intermediate</td></tr>
-<tr><td>hypot_fast (4-seg)</td><td>0.34%</td><td>Shift-only, no multiply</td></tr>
-<tr><td>hypot_fast8 (8-seg)</td><td>0.10%</td><td>Shift-only, no multiply</td></tr>
-</tbody>
-</table>
+report</a> for sweeps at radixes 8, 12, 16, and 24.
+Percent errors skip expected values near zero (|expected| &lt; 0.01).</p>
+
+<!-- ACCURACY_TABLE_START -->
+<table>
+<thead><tr><th>Function</th><th>Max err (LSB)</th><th>Max err (%)</th><th>Avg err (%)</th><th>Note</th></tr></thead>
+<tbody>
+<tr><td>sin / cos</td><td>7.5</td><td>0.7169</td><td>0.0100</td><td>65536-pt sweep + specials</td></tr>
+<tr><td>tan</td><td>38020.4</td><td>0.7118</td><td>0.0162</td><td>65536-pt sweep (skip poles)</td></tr>
+<tr><td>asin / acos</td><td>512.6</td><td>0.7025</td><td>0.0105</td><td>65536-pt; sqrt approx near boundary</td></tr>
+<tr><td>atan2</td><td>44.7</td><td>2.9913</td><td>0.0383</td><td>65536x5 radii + specials</td></tr>
+<tr><td>sqrt</td><td>28.4</td><td>0.0003</td><td>0.0000</td><td>Round-to-nearest</td></tr>
+<tr><td>log2</td><td>10.5</td><td>0.2479</td><td>0.0045</td><td>65-entry mantissa table</td></tr>
+<tr><td>pow2</td><td>220.4</td><td>0.1373</td><td>0.0057</td><td>65-entry fraction table</td></tr>
+<tr><td>ln, log10</td><td>0.7</td><td>0.0015</td><td>0.0004</td><td>Via FR_MULK28 from log2</td></tr>
+<tr><td>exp</td><td>65.7</td><td>0.0719</td><td>0.0051</td><td>FR_MULK28 + FR_pow2</td></tr>
+<tr><td>exp_fast</td><td>195.5</td><td>0.0719</td><td>0.0064</td><td>Shift-only scaling</td></tr>
+<tr><td>pow10</td><td>143.4</td><td>0.1163</td><td>0.0075</td><td>FR_MULK28 + FR_pow2</td></tr>
+<tr><td>pow10_fast</td><td>581.9</td><td>0.1163</td><td>0.0100</td><td>Shift-only scaling</td></tr>
+<tr><td>hypot (exact)</td><td>0.2</td><td>0.0001</td><td>0.0000</td><td>64-bit intermediate</td></tr>
+<tr><td>hypot_fast (4-seg)</td><td>216112.8</td><td>0.3418</td><td>0.1923</td><td>Shift-only, no multiply</td></tr>
+<tr><td>hypot_fast8 (8-seg)</td><td>59968.8</td><td>0.0977</td><td>0.0508</td><td>Shift-only, no multiply</td></tr>
+</tbody>
+</table>
+<!-- ACCURACY_TABLE_END -->
 
 <h2>What&rsquo;s in the box</h2>
 
diff --git a/scripts/accuracy_report.sh b/scripts/accuracy_report.sh
new file mode 100755
index 0000000..b426cea
--- /dev/null
+++ b/scripts/accuracy_report.sh
@@ -0,0 +1,150 @@
+#!/usr/bin/env bash
+#
+# accuracy_report.sh — extract the accuracy table from test_tdd and
+# optionally patch it into README.md, docs/README.md, and pages/index.html.
+#
+# Usage:
+#   scripts/accuracy_report.sh            # build, run, print table to stdout
+#   scripts/accuracy_report.sh --update   # also patch the three doc files
+#
+# The table is delimited by sentinel comments:
+#   <!-- ACCURACY_TABLE_START -->
+#   ...
+#   <!-- ACCURACY_TABLE_END -->
+#
+# Exit status: 0 on success, non-zero on build or extraction failure.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${PROJECT_ROOT}"
+
+MODE="print"
+for arg in "$@"; do
+    case "$arg" in
+        --update) MODE="update" ;;
+        -h|--help)
+            echo "Usage: scripts/accuracy_report.sh [--update]"
+            echo "  (no args)   Build test_tdd, run it, print accuracy table"
+            echo "  --update    Also patch README.md, docs/README.md, pages/index.html"
+            exit 0
+            ;;
+        *) echo "Unknown option: $arg" >&2; exit 1 ;;
+    esac
+done
+
+# -----------------------------------------------------------------------
+# 1. Build test_tdd
+# -----------------------------------------------------------------------
+echo "Building test_tdd..." >&2
+make -s dirs >&2                # keep stdout reserved for the accuracy table
+make -s build/test_tdd >&2      # stdout -> stderr; stderr already goes to stderr
+
+# -----------------------------------------------------------------------
+# 2. Run and capture the accuracy table
+# -----------------------------------------------------------------------
+echo "Running test_tdd..." >&2
+OUTPUT=$(./build/test_tdd 2>/dev/null)
+
+# Extract lines between sentinels (inclusive)
+TABLE=$(echo "$OUTPUT" | sed -n '/<!-- ACCURACY_TABLE_START -->/,/<!-- ACCURACY_TABLE_END -->/p')
+
+if [ -z "$TABLE" ]; then
+    echo "ERROR: Could not find ACCURACY_TABLE_START/END sentinels in output" >&2
+    exit 1
+fi
+
+# Data rows only ('|' lines minus header + separator); '|| true' lets the check below report an empty match
+DATA_ROWS=$(echo "$TABLE" | grep '^|' | tail -n +3 || true)
+
+if [ -z "$DATA_ROWS" ]; then
+    echo "ERROR: No data rows found in accuracy table" >&2
+    exit 1
+fi
+
+echo "$TABLE"
+
+if [ "$MODE" != "update" ]; then
+    exit 0
+fi
+
+# -----------------------------------------------------------------------
+# 3. Patch markdown files (README.md, docs/README.md)
+# -----------------------------------------------------------------------
+patch_markdown() {
+    # $1: markdown file whose sentinel-delimited table is rewritten in place.
+    local file="$1"
+    if [ ! -f "$file" ]; then
+        echo "  skip: $file not found" >&2
+        return
+    fi
+
+    # Build replacement block: sentinel + header + separator + data + sentinel
+    local replacement="<!-- ACCURACY_TABLE_START -->"$'\n'
+    replacement+="| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |"$'\n'
+    replacement+="|---|---:|---:|---:|---|"$'\n'
+    replacement+="$DATA_ROWS"$'\n'
+    replacement+="<!-- ACCURACY_TABLE_END -->"
+
+    # Pass the text to perl via the environment so '$', '@', '\' or braces in a row cannot alter the program
+    ACC_TABLE="$replacement" perl -0777 -i -pe '
+        s{<!-- ACCURACY_TABLE_START -->.*?<!-- ACCURACY_TABLE_END -->}
+         {$ENV{ACC_TABLE}}s
+    ' "$file"
+
+    echo "  patched: $file" >&2
+}
+
+patch_markdown "README.md"
+patch_markdown "docs/README.md"
+
+# -----------------------------------------------------------------------
+# 4. Patch HTML file (pages/index.html)
+# -----------------------------------------------------------------------
+patch_html() {
+    # $1: HTML file whose sentinel-delimited <table> is rewritten in place.
+    local file="$1"
+    if [ ! -f "$file" ]; then
+        echo "  skip: $file not found" >&2
+        return
+    fi
+
+    # Convert markdown data rows to HTML <tr> rows
+    local html_rows=""
+    while IFS= read -r line; do
+        # Strip the outer pipes, then split on a real tab: $'..' emits it; "\t" in a sed replacement is GNU-only
+        local cells
+        cells=$(echo "$line" | sed 's/^| //;s/ |$//' | sed $'s/ | /\t/g')
+        local tr="<tr>"
+        while IFS=$'\t' read -r c1 c2 c3 c4 c5; do
+            tr+="<td>${c1}</td><td>${c2}</td><td>${c3}</td><td>${c4}</td><td>${c5}</td>"
+        done <<< "$cells"
+        tr+="</tr>"
+        if [ -n "$html_rows" ]; then
+            html_rows+=$'\n'
+        fi
+        html_rows+="$tr"
+    done <<< "$DATA_ROWS"
+
+    # Build the replacement block; it reaches perl via the environment, never spliced into the perl program
+    local replacement="<!-- ACCURACY_TABLE_START -->"$'\n'
+    replacement+="<table>"$'\n'
+    replacement+="<thead><tr><th>Function</th><th>Max err (LSB)</th><th>Max err (%)</th><th>Avg err (%)</th><th>Note</th></tr></thead>"$'\n'
+    replacement+="<tbody>"$'\n'
+    replacement+="$html_rows"$'\n'
+    replacement+="</tbody>"$'\n'
+    replacement+="</table>"$'\n'
+    replacement+="<!-- ACCURACY_TABLE_END -->"
+
+    ACC_TABLE="$replacement" perl -0777 -i -pe '
+        s{<!-- ACCURACY_TABLE_START -->.*?<!-- ACCURACY_TABLE_END -->}
+         {$ENV{ACC_TABLE}}s
+    ' "$file"
+
+    echo "  patched: $file" >&2
+}
+
+patch_html "pages/index.html"
+
+echo "Accuracy table updated in all doc files." >&2
diff --git a/scripts/crossbuild-docker.sh b/scripts/crossbuild-docker.sh
new file mode 100755
index 0000000..7f10d6d
--- /dev/null
+++ b/scripts/crossbuild-docker.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+# crossbuild-docker.sh -- cross-compile FR_math inside Docker container
+# Runs inside the xelp-crossbuild Docker image.
+# Reports object file and .text section sizes for each target.
+
+set -e   # exit on any unguarded failure; the per-target compile steps below are guarded
+
+SRC=/fr_math/src/FR_math.c      # the one source file built for every target
+INCLUDE="-I/fr_math/src"        # header search path handed to every compiler
+OBJ=/tmp/FR_math.o              # scratch object, reused by every target
+
+SEP="============================================================"   # banner rule
+
+# Accumulate summary rows: "label|text_size", each ended by a literal "\n"
+SUMMARY=""
+
+print_sizes() {
+    local label="$1"
+    echo ""
+    echo "$SEP"
+    echo "$label"
+    echo "$SEP"
+    if [ ! -f "$OBJ" ]; then
+        echo "  (build failed)"
+        SUMMARY="${SUMMARY}${label}|FAIL\n"
+        return
+    fi
+    OBJ_SIZE=$(stat -c%s "$OBJ" 2>/dev/null || wc -c < "$OBJ")
+    TEXT_SIZE=$(size "$OBJ" 2>/dev/null | awk 'FNR==2{print $1}')
+    printf "  obj file size: %6s bytes\n" "$OBJ_SIZE"
+    printf "  .text section: %6s bytes\n" "$TEXT_SIZE"
+    SUMMARY="${SUMMARY}${label}|${TEXT_SIZE}\n"
+    rm -f "$OBJ"
+}
+
+# Report header.
+printf '\nFR_Math cross-compilation size report\n'
+printf 'Date: %s\n\n' "$(date -u '+%Y-%m-%d %H:%M UTC')"
+# Each build ends in "|| true": a failing compiler must not stop the run.
+
+# --- x86 ---
+gcc -c "$SRC" $INCLUDE -Os -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC x86-64"
+
+clang -c "$SRC" $INCLUDE -Os -Wall -o "$OBJ" 2>&1 || true
+print_sizes "Clang x86-64"
+
+gcc -c "$SRC" $INCLUDE -Os -m32 -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC x86-32"
+
+tcc -c "$SRC" $INCLUDE -o "$OBJ" 2>&1 || true
+print_sizes "TCC x86"
+
+# --- ARM ---
+aarch64-linux-gnu-gcc -c "$SRC" $INCLUDE -Os -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC AArch64 (ARM64)"
+
+arm-none-eabi-gcc -c "$SRC" $INCLUDE -Os -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC ARM32"
+
+arm-none-eabi-gcc -c "$SRC" $INCLUDE -Os -mthumb -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC ARM32 Thumb"
+
+# --- MSP430 ---
+# Bare-metal sysroots without stdint.h: FR_NO_STDINT selects fallback typedefs
+NOSTD="-DFR_NO_STDINT"
+
+msp430-gcc -c "$SRC" $INCLUDE $NOSTD -Os -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC MSP430"
+
+# --- AVR ---
+avr-gcc -c "$SRC" $INCLUDE $NOSTD -Os -mmcu=avr5 -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC AVR5 (ATmega328P)"
+
+avr-gcc -c "$SRC" $INCLUDE $NOSTD -Os -mmcu=attiny85 -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC AVR ATtiny85"
+
+# --- 68HC11 ---
+m68hc11-gcc -c "$SRC" $INCLUDE $NOSTD -Os -o "$OBJ" 2>&1 || true
+print_sizes "GCC 68HC11"
+
+# --- 68k (Motorola 68000) ---
+m68k-linux-gnu-gcc -c "$SRC" $INCLUDE -Os -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC m68k"
+
+# --- PowerPC ---
+powerpc-linux-gnu-gcc -c "$SRC" $INCLUDE -Os -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC PowerPC"
+
+# --- RISC-V ---
+riscv64-linux-gnu-gcc -c "$SRC" $INCLUDE -Os -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC RISC-V (rv64)"
+
+riscv64-unknown-elf-gcc -c "$SRC" $INCLUDE $NOSTD -Os -march=rv32imac -mabi=ilp32 -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC RISC-V (rv32)"
+
+# --- Xtensa (ESP8266/ESP32 family) ---
+xtensa-lx106-elf-gcc -c "$SRC" $INCLUDE $NOSTD -Os -Wall -o "$OBJ" 2>&1 || true
+print_sizes "GCC Xtensa LX106 (ESP8266)"
+
+# --- Function size table (native GCC) ---
+# Guarded like the builds above so that, under `set -e`, a native-GCC failure
+# leaves this table empty instead of exiting before the summary is printed.
+printf '\n%s\n%s\n%s\n' "$SEP" "Function size table (GCC x86-64)" "$SEP"
+gcc -c "$SRC" $INCLUDE -Os -Wall -o "$OBJ" 2>&1 || true
+[ -f "$OBJ" ] || echo "  (build failed)"
+nm "$OBJ" -n -S --size-sort -f sysv -t d 2>/dev/null | grep -E "FUNC" || true
+rm -f "$OBJ"
+
+# --- Summary table ---
+# One row per target, in build order.  SUMMARY holds "label|size" records
+# ended by literal "\n" sequences, which %b expands into real newlines.
+row_fmt="  %-28s %s\n"
+printf '\n%s\n%s\n%s\n' "$SEP" "Summary: FR_math.c code size (bytes), compiled with -Os" "$SEP"
+printf "$row_fmt" "Target" ".text (bytes)"
+printf "$row_fmt" "----------------------------" "-------------"
+printf '%b' "$SUMMARY" | while IFS='|' read -r label size; do
+    [ -n "$label" ] || continue
+    printf "$row_fmt" "$label" "$size"
+done
+
+echo ""
+echo "Done."
diff --git a/src/FR_defs.h b/src/FR_defs.h
index ee61c95..631126e 100644
--- a/src/FR_defs.h
+++ b/src/FR_defs.h
@@ -29,13 +29,39 @@
 #define __FR_Platform_Defs_H__
 
 /*
- * Fixed-width integer typedefs. C99 stdint.h is mandatory in v2.
+ * Fixed-width integer typedefs.
  *
- * Any C99-or-newer toolchain (gcc, clang, MSVC, IAR, Keil, sdcc, MSP430-gcc,
- * AVR-gcc, RISC-V toolchains, ARM toolchains) supports <stdint.h>. If you
- * are on a pre-C99 toolchain, FR_Math 1.0.x is the version for you.
+ * Prefer C99 <stdint.h> when available (gcc, clang, MSVC, IAR, Keil, sdcc,
+ * MSP430-gcc, AVR-gcc, RISC-V, ARM toolchains).  For bare-metal toolchains
+ * or pre-C99 compilers that lack <stdint.h>, define FR_NO_STDINT before
+ * including this header and the types are provided via fallback typedefs
+ * (selected by target macros) that cover the common 8/16/32/64-bit layouts.
  */
+#ifndef FR_NO_STDINT
 #include <stdint.h>
+#else
+/* ---- fallback: no <stdint.h> ------------------------------------ */
+/* Assumes char=8, short=16, long long=64 bits and a 32-bit int or long;
+ * that covers the usual embedded targets (AVR, MSP430, ARM, 68HC11, 68k,
+ * PPC, RISC-V, Xtensa, x86).  Adjust if your platform differs. */
+typedef unsigned char      uint8_t;
+typedef signed   char      int8_t;
+typedef unsigned short     uint16_t;
+typedef signed   short     int16_t;
+#if defined(__AVR__) || defined(__MSP430__) || defined(__m68hc1x__) || \
+    defined(__mc68hc1x__) || (defined(__SIZEOF_INT__) && __SIZEOF_INT__ < 4)
+  /* int is (or may be) 16-bit on these targets; use long for 32-bit.
+   * NOTE(review): __mc68hc1x__ spelling per GCC's m68hc11 port -- confirm. */
+  typedef unsigned long    uint32_t;
+  typedef signed   long    int32_t;
+#else
+  typedef unsigned int     uint32_t;
+  typedef signed   int     int32_t;
+#endif
+/* 64-bit: available on most 32-bit+ GCC targets via long long */
+typedef unsigned long long uint64_t;
+typedef signed   long long int64_t;
+#endif /* FR_NO_STDINT */
 
 /*
  * Arduino's USBAPI.h typedefs u8 and u16 as unsigned char / unsigned short.
diff --git a/src/FR_math.c b/src/FR_math.c
index b9e5ace..82936de 100644
--- a/src/FR_math.c
+++ b/src/FR_math.c
@@ -32,7 +32,9 @@
 #include "FR_math.h"
 #include "FR_trig_table.h"
 
+#ifndef FR_NO_STDINT
 #include <stdint.h>
+#endif
 
 /*=======================================================
  * BAM-native trig: fr_cos_bam, fr_sin_bam, fr_cos, fr_sin, fr_tan
@@ -147,12 +149,14 @@ s32 fr_tan(s32 rad, u16 radix)
  */
 static u16 fr_deg_radix_to_bam(s16 deg, u16 radix)
 {
-	/* (s32)deg * 0xB60B keeps everything in 32-bit math (8051-friendly).
-	 * For radix 0, 0xB60B = 65536/360 ≈ 182.0444. The shift strips the
-	 * input radix to land in u16 BAM space.
+	/* 0xB60B ≈ (65536/360) * 256 — the ×256 prescale keeps 32-bit math
+	 * friendly to 8051-class MCUs.  We must shift out both the input
+	 * fraction bits (radix) AND the 8-bit prescale, hence radix + 8.
+	 * The +half term rounds to nearest, matching FR_DEG2BAM behaviour.
 	 */
-	s32 v = (s32)deg * 0xB60BL;
-	return (u16)((u32)(v >> radix) & 0xffff);
+	s32  v     = (s32)deg * 0xB60BL;
+	u16  shift = radix + 8;
+	return (u16)((u32)((v + (1L << (shift - 1))) >> shift) & 0xffff);
 }
 
 s32 FR_Cos(s16 deg, u16 radix)
@@ -245,14 +249,19 @@ s32 FR_FixAddSat(s32 x, s32 y)
  */
 /* FR_acos — returns radians at out_radix.
  * Range: [0, pi].  Input is a cosine value at the given radix.
+ *
+ * Uses the same 129-entry cosine table as fr_cos_bam, but in reverse:
+ * binary-search to find the bracketing pair, then linear-interpolate
+ * the fractional position between them to recover the full 14-bit
+ * in-quadrant BAM.  This mirrors the forward path and gives matching
+ * precision (~1 LSB of s15.16 output).
  */
 s32 FR_acos(s32 input, u16 radix, u16 out_radix)
 {
 	s32 v;
 	s16 sign;
 	s32 lo, hi, mid;
-	s32 best_idx, best_err;
-	s32 left, right;
+	s32 idx, d, num, frac;
 
 	v = FR_CHRDX(input, radix, FR_TRIG_PREC); /* to s0.15 */
 
@@ -263,9 +272,30 @@ s32 FR_acos(s32 input, u16 radix, u16 out_radix)
 	sign = (v < 0) ? 1 : 0;
 	if (v < 0) v = -v;
 
-	/* Binary search on the BAM quadrant table. The table is monotonically
-	 * decreasing across [0, FR_TRIG_TABLE_SIZE]. We want the index `i`
-	 * such that gFR_COS_TAB_Q[i] is closest to v.
+	/* Small-angle fast path: when v is in the flat region near cos(0)
+	 * the table has only 2-8 LSBs of gap between entries, so linear
+	 * interpolation is very coarse.  Use the identity
+	 *   acos(x) ≈ sqrt(2*(1-x))   for x close to 1
+	 * which is exact in the limit and leverages FR_sqrt's precision.
+	 * Switch at table[3] (gap = 12 LSBs) — below that the table is fine.
+	 */
+	if (v > gFR_COS_TAB_Q[7])
+	{
+		s32 one_minus_v = (s32)(1 << FR_TRIG_PREC) - v; /* 1.0 - x */
+		s32 two_omv     = one_minus_v << 1;          /* 2*(1-x) at radix 15 */
+		s32 rad15       = FR_sqrt(two_omv, FR_TRIG_PREC); /* radians at r15 */
+		s32 rad_out     = FR_CHRDX(rad15, FR_TRIG_PREC, out_radix);
+		if (sign)
+			rad_out = FR_BAM2RAD(FR_BAM_HALF, out_radix) - rad_out;
+		return rad_out;
+	}
+
+	/* Binary search on the cosine quadrant table.  The table is
+	 * monotonically decreasing: gFR_COS_TAB_Q[0] = 32767 (cos 0°),
+	 * gFR_COS_TAB_Q[128] = 0 (cos 90°).
+	 *
+	 * After the search, lo is the first index where table[lo] <= v,
+	 * so the bracketing pair is (lo-1, lo) with table[lo-1] >= v >= table[lo].
 	 */
 	lo = 0;
 	hi = FR_TRIG_TABLE_SIZE;
@@ -277,28 +307,45 @@ s32 FR_acos(s32 input, u16 radix, u16 out_radix)
 		else
 			hi = mid;
 	}
-	best_idx = lo;
-	best_err = (gFR_COS_TAB_Q[best_idx] > v) ? (gFR_COS_TAB_Q[best_idx] - v)
-	                                         : (v - gFR_COS_TAB_Q[best_idx]);
-	if (best_idx > 0)
+
+	/* lo is now the index where table[lo] <= v.  The bracketing interval
+	 * is [lo-1, lo] (table decreasing).  Clamp idx to valid range.
+	 */
+	idx = lo;
+	if (idx <= 0)
+	{
+		/* v >= table[0] = 32767 — essentially cos(0), already clamped above
+		 * but guard anyway. */
+		idx = 0;
+		frac = 0;
+	}
+	else if (idx >= FR_TRIG_TABLE_SIZE)
 	{
-		left = gFR_COS_TAB_Q[best_idx - 1] - v;
-		if (left < 0) left = -left;
-		if (left < best_err) { best_err = left; best_idx = best_idx - 1; }
+		idx = FR_TRIG_TABLE_SIZE - 1;
+		frac = 0;
 	}
-	if (best_idx < FR_TRIG_TABLE_SIZE)
+	else
 	{
-		right = gFR_COS_TAB_Q[best_idx + 1] - v;
-		if (right < 0) right = -right;
-		if (right < best_err) { best_err = right; best_idx = best_idx + 1; }
+		/* Linear interpolate between table[idx-1] and table[idx].
+		 * d = table[idx-1] - table[idx]  (>= 0, cos decreasing)
+		 * num = table[idx-1] - v          (how far past table[idx-1])
+		 * frac = (num << FR_TRIG_FRAC_BITS) / d, in [0, FR_TRIG_FRAC_MAX)
+		 *
+		 * num and d are both in [0, 32767], so num << 7 fits in 22 bits.
+		 */
+		d   = gFR_COS_TAB_Q[idx - 1] - gFR_COS_TAB_Q[idx];
+		num = gFR_COS_TAB_Q[idx - 1] - v;
+		if (d > 0)
+			frac = ((num << FR_TRIG_FRAC_BITS) + (d >> 1)) / d;
+		else
+			frac = 0;
+		/* Reconstruct: the angle is at index (idx-1) + frac/FRAC_MAX,
+		 * so shift idx back by 1 for the BAM calculation below. */
+		idx = idx - 1;
 	}
 
-	/* best_idx is in [0, FR_TRIG_TABLE_SIZE]. Convert to BAM:
-	 * the table covers one quadrant (16384 BAM) in FR_TRIG_TABLE_SIZE-1 steps.
-	 * bam = best_idx << FR_TRIG_FRAC_BITS.
-	 */
 	{
-		u16 bam = (u16)((u32)best_idx << FR_TRIG_FRAC_BITS);
+		u16 bam = (u16)(((u32)idx << FR_TRIG_FRAC_BITS) + (u32)frac);
 		if (sign)
 			bam = (u16)(FR_BAM_HALF - bam);  /* mirror: pi - angle */
 		return FR_BAM2RAD(bam, out_radix);
@@ -1294,7 +1341,7 @@ void fr_adsr_init(fr_adsr_t *env,
 	    ? (s32)(FR_ADSR_PEAK_S130 / attack_samples)
 	    : FR_ADSR_PEAK_S130;
 	env->decay_dec   = (decay_samples   > 0)
-	    ? (s32)((FR_ADSR_PEAK_S130 - env->sustain) / decay_samples)
+	    ? (s32)((FR_ADSR_PEAK_S130 - env->sustain) / (s32)decay_samples)
 	    : (FR_ADSR_PEAK_S130 - env->sustain);
 	env->release_dec = (release_samples > 0)
 	    ? (s32)(FR_ADSR_PEAK_S130 / release_samples)
diff --git a/src/FR_math.h b/src/FR_math.h
index 82bea96..c90d815 100644
--- a/src/FR_math.h
+++ b/src/FR_math.h
@@ -32,8 +32,8 @@
 #ifndef __FR_Math_h__
 #define __FR_Math_h__
 
-#define FR_MATH_VERSION     "2.0.5"
-#define FR_MATH_VERSION_HEX  0x020005  /* major << 16 | minor << 8 | patch */
+#define FR_MATH_VERSION     "2.0.6"
+#define FR_MATH_VERSION_HEX  0x020006  /* major << 16 | minor << 8 | patch */
 
 #ifdef __cplusplus
 extern "C"
diff --git a/src/FR_math_2D.cpp b/src/FR_math_2D.cpp
index 31132bf..573a09f 100644
--- a/src/FR_math_2D.cpp
+++ b/src/FR_math_2D.cpp
@@ -5,7 +5,7 @@
  *
  *	@copy Copyright (C) <2001-2026>  <M. A. Chatterjee>
  *  @author M A Chatterjee <deftio [at] deftio [dot] com>
- *	@version 2.0.5 M. A. Chatterjee, cleaned up naming
+ *	@version 2.0.6 M. A. Chatterjee, cleaned up naming
  *
  *  This file contains integer math settable fixed point radix math routines for
  *  use on systems in which floating point is not desired or unavailable.
diff --git a/src/FR_math_2D.h b/src/FR_math_2D.h
index 1045577..ba8456f 100644
--- a/src/FR_math_2D.h
+++ b/src/FR_math_2D.h
@@ -3,7 +3,7 @@
  *
  *	@copy Copyright (C) <2001-2026>  <M. A. Chatterjee>
  *  @author M A Chatterjee <deftio [at] deftio [dot] com>
- *	@version 2.0.5 M. A. Chatterjee, cleaned up naming
+ *	@version 2.0.6 M. A. Chatterjee, cleaned up naming
  *
  *  This file contains integer math settable fixed point radix math routines for
  *  use on systems in which floating point is not desired or unavailable.
diff --git a/tests/test_full_coverage.c b/tests/test_full_coverage.c
index 63215c7..e16eaf9 100644
--- a/tests/test_full_coverage.c
+++ b/tests/test_full_coverage.c
@@ -772,12 +772,12 @@ int test_edge_branches() {
     s32 r32;
     fr_adsr_t env;
 
-    /* FR_Tan(deg, radix) c==0 branch. At radix 0, deg=-16384 and
-     * deg=16384 both drive the internal BAM to exactly 90°/270°, so
+    /* FR_Tan(deg, radix) c==0 branch. At radix 0, deg=90 and deg=270
+     * drive the internal BAM to exactly 16384/49152 (90°/270°), so
      * cos==0 and we hit the saturation return. */
-    r32 = FR_Tan(-16384, 0);                 /* bam=16384 (sin>0) */
+    r32 = FR_Tan(90, 0);                     /* bam=16384 (sin>0) */
     if (r32 != FR_TRIG_MAXVAL) return TEST_FAIL;
-    r32 = FR_Tan(16384, 0);                  /* bam=49152 (sin<0) */
+    r32 = FR_Tan(270, 0);                    /* bam=49152 (sin<0) */
     if (r32 != -FR_TRIG_MAXVAL) return TEST_FAIL;
 
     /* FR_atan2 now returns radians at out_radix.
diff --git a/tests/test_tdd.cpp b/tests/test_tdd.cpp
index 336149f..31ce58e 100644
--- a/tests/test_tdd.cpp
+++ b/tests/test_tdd.cpp
@@ -65,6 +65,8 @@ typedef struct {
     int n;
     double max_abs_err;
     double sum_abs_err;
+    double max_pct_err;
+    double sum_pct_err;
     double worst_input;
     double worst_actual;
     double worst_expected;
@@ -84,6 +86,10 @@ static void stats_add(stats_t *s, double in, double actual, double expected) {
         s->worst_expected = expected;
     }
     s->sum_abs_err += e;
+    /* Skip percent error when expected ≈ 0 to avoid division artifacts */
+    double pct = (fabs(expected) > 0.01) ? (e / fabs(expected)) * 100.0 : 0.0;
+    if (pct > s->max_pct_err) s->max_pct_err = pct;
+    s->sum_pct_err += pct;
     s->n++;
 }
 
@@ -91,6 +97,10 @@ static double stats_mean(const stats_t *s) {
     return s->n ? s->sum_abs_err / s->n : 0.0;
 }
 
+static double stats_mean_pct(const stats_t *s) {
+    return s->n ? s->sum_pct_err / s->n : 0.0;
+}
+
 static void md_h1(const char *t) { printf("\n# %s\n\n", t); }
 static void md_h2(const char *t) { printf("\n## %s\n\n", t); }
 static void md_h3(const char *t) { printf("\n### %s\n\n", t); }
@@ -1763,6 +1773,319 @@ static void section_summary(void) {
     printf("\n");
 }
 
+/* ============================================================
+ * Section 14: Accuracy Summary Table (machine-readable)
+ *
+ * Emits a markdown table between sentinel comments so that
+ * scripts/accuracy_report.sh can extract and patch it into
+ * README.md, docs/README.md, and pages/index.html.
+ * ============================================================ */
+
+static void section_accuracy_table(void) {
+    md_h2("14. Accuracy Summary Table");
+
+    printf("<!-- ACCURACY_TABLE_START -->\n");
+    printf("| Function | Max err (LSB) | Max err (%%) | Avg err (%%) | Note |\n");
+    printf("|---|---:|---:|---:|---|\n");
+
+    const int R = 16;
+    const double scale = (double)(1L << R);
+    const double lsb = 1.0 / scale;
+
+    /* --- sin / cos --- */
+    {
+        stats_t st; stats_reset(&st);
+        const u16 radix = 7; /* s8.7 degrees: 128 steps/deg, [-256°,+256°) */
+        /* 65536-point sweep: all s16 values at radix 7 cover > full circle */
+        for (int i = -32768; i <= 32767; i++) {
+            double deg = (double)i / (1 << radix);
+            double rad = deg * M_PI / 180.0;
+            stats_add(&st, deg, frd(FR_Sin((s16)i, radix), FR_TRIG_OUT_PREC), sin(rad));
+            stats_add(&st, deg, frd(FR_Cos((s16)i, radix), FR_TRIG_OUT_PREC), cos(rad));
+        }
+        /* Special cases: exact integer degrees including negative */
+        s16 specials[] = {0,30,45,60,90,120,135,150,180,210,225,240,270,300,315,330,360,
+                          -30,-45,-60,-90,-120,-135,-150,-180,-210,-225,-240,-270,-300,-315,-330,-360};
+        for (int si = 0; si < (int)(sizeof(specials)/sizeof(specials[0])); si++) {
+            s16 d = specials[si];
+            double rad = d * M_PI / 180.0;
+            stats_add(&st, d, frd(FR_SinI(d), FR_TRIG_OUT_PREC), sin(rad));
+            stats_add(&st, d, frd(FR_CosI(d), FR_TRIG_OUT_PREC), cos(rad));
+        }
+        printf("| sin / cos | %.1f | %.4f | %.4f | 65536-pt sweep + specials |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- tan --- */
+    {
+        stats_t st; stats_reset(&st);
+        const u16 radix = 7;
+        for (int i = -32768; i <= 32767; i++) {
+            double deg = (double)i / (1 << radix);
+            double rad = deg * M_PI / 180.0;
+            /* Skip near poles: |cos| < 0.01 → tan > 100 */
+            if (fabs(cos(rad)) < 0.01) continue;
+            stats_add(&st, deg, frd(FR_Tan((s16)i, radix), FR_TRIG_OUT_PREC), tan(rad));
+        }
+        /* Special cases: integer degrees (avoiding poles) */
+        s16 specials[] = {0,30,45,60,-30,-45,-60,120,135,150,-120,-135,-150};
+        for (int si = 0; si < (int)(sizeof(specials)/sizeof(specials[0])); si++) {
+            s16 d = specials[si];
+            double rad = d * M_PI / 180.0;
+            stats_add(&st, d, frd(FR_TanI(d), FR_TRIG_OUT_PREC), tan(rad));
+        }
+        printf("| tan | %.1f | %.4f | %.4f | 65536-pt sweep (skip poles) |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- asin / acos --- */
+    {
+        stats_t st; stats_reset(&st);
+        /* 65536-point sweep: all representable values at radix 15 over [-1, +1) */
+        for (int i = -32768; i <= 32767; i++) {
+            double xd = (double)i / (1 << 15);
+            if (xd < -1.0 || xd > 1.0) continue;
+            s32 rad = FR_asin((s32)i, 15, R);
+            stats_add(&st, xd, frd(rad, R), asin(xd));
+            rad = FR_acos((s32)i, 15, R);
+            stats_add(&st, xd, frd(rad, R), acos(xd));
+        }
+        printf("| asin / acos | %.1f | %.4f | %.4f | 65536-pt; sqrt approx near boundary |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- atan2 --- */
+    {
+        stats_t st; stats_reset(&st);
+        /* 65536-point sweep at each radius.
+         * Skip i=-32768 (exactly -pi): branch-cut convention differs
+         * between FR_atan2 (+pi) and libm (-pi), both correct.
+         * Start radii at 0.1 — at 0.01 inputs have <10 LSBs of angular
+         * resolution, testing input quantization not the algorithm. */
+        double radii[] = {0.1, 1.0, 10.0, 100.0, 1000.0};
+        for (int ri = 0; ri < (int)(sizeof(radii)/sizeof(radii[0])); ri++) {
+            double rad = radii[ri];
+            for (int i = -32767; i <= 32768; i++) {
+                double angle = i * M_PI / 32768.0;
+                double x = rad * cos(angle), y = rad * sin(angle);
+                s32 fx = (s32)(x * scale);
+                s32 fy = (s32)(y * scale);
+                if (fx == 0 && fy == 0) continue;
+                s32 r = FR_atan2(fy, fx, R);
+                double ref = atan2(y, x);
+                /* Skip near ±pi branch cut: sign depends on sub-LSB
+                 * input quantization, not algorithm accuracy. */
+                if (fabs(fabs(ref) - M_PI) < 0.01) continue;
+                stats_add(&st, angle * 180.0 / M_PI, frd(r, R), ref);
+            }
+        }
+        /* Special cases: exact quadrant/octant/30-degree angles */
+        double specials_deg[] = {0,30,45,60,90,120,135,150,
+                                 -30,-45,-60,-90,-120,-135,-150,-170};
+        for (int si = 0; si < (int)(sizeof(specials_deg)/sizeof(specials_deg[0])); si++) {
+            double angle = specials_deg[si] * M_PI / 180.0;
+            double x = 100.0 * cos(angle), y = 100.0 * sin(angle);
+            s32 fx = (s32)(x * scale), fy = (s32)(y * scale);
+            if (fx == 0 && fy == 0) continue;
+            s32 r = FR_atan2(fy, fx, R);
+            stats_add(&st, specials_deg[si], frd(r, R), atan2(y, x));
+        }
+        printf("| atan2 | %.1f | %.4f | %.4f | 65536x5 radii + specials |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- sqrt --- */
+    {
+        stats_t st; stats_reset(&st);
+        double inputs[] = {0.0001, 0.25, 0.5, 1, 2, 3, 4, 7, 9, 16, 25, 100, 1024, 10000, 32000};
+        for (int i = 0; i < (int)(sizeof(inputs)/sizeof(inputs[0])); i++) {
+            s32 fr = (s32)(inputs[i] * scale);
+            s32 r = FR_sqrt(fr, R);
+            stats_add(&st, inputs[i], frd(r, R), sqrt(inputs[i]));
+        }
+        /* Fine sweep */
+        for (int i = 1; i <= 1000; i++) {
+            double x = i * 10.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_sqrt(fr, R);
+            stats_add(&st, x, frd(r, R), sqrt(x));
+        }
+        printf("| sqrt | %.1f | %.4f | %.4f | Round-to-nearest |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- log2 --- */
+    {
+        stats_t st; stats_reset(&st);
+        /* Integer inputs — stay within s32 range at radix 16 (max ~32767) */
+        for (int v = 1; v <= 32000; v += (v < 100 ? 1 : v / 10)) {
+            s32 fr = (s32)((double)v * scale);
+            if (fr <= 0) continue;
+            s32 r = FR_log2(fr, (u16)R, (u16)R);
+            stats_add(&st, (double)v, frd(r, R), log2((double)v));
+        }
+        /* Fractional sweep 0.125 .. 1.0 */
+        for (int i = 1; i <= 100; i++) {
+            double x = 0.125 + (0.875 * i / 100.0);
+            s32 fr = (s32)(x * scale);
+            if (fr <= 0) continue;
+            s32 r = FR_log2(fr, (u16)R, (u16)R);
+            stats_add(&st, x, frd(r, R), log2(x));
+        }
+        printf("| log2 | %.1f | %.4f | %.4f | 65-entry mantissa table |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- pow2 --- */
+    {
+        stats_t st; stats_reset(&st);
+        for (int i = -800; i <= 800; i++) {
+            double x = i / 100.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_pow2(fr, R);
+            double ref = pow(2.0, x);
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        printf("| pow2 | %.1f | %.4f | %.4f | 65-entry fraction table |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- ln, log10 --- */
+    {
+        stats_t st; stats_reset(&st);
+        double inputs[] = {0.125, 0.25, 0.5, 1, 2, M_E, 3, 4, 5, 7, 8, 10, 20, 50, 100, 1000};
+        for (int i = 0; i < (int)(sizeof(inputs)/sizeof(inputs[0])); i++) {
+            s32 fr = (s32)(inputs[i] * scale);
+            if (fr <= 0) continue;
+            s32 r = FR_ln(fr, R, R);
+            double ref = log(inputs[i]);
+            stats_add(&st, inputs[i], frd(r, R), ref);
+            r = FR_log10(fr, R, R);
+            ref = log10(inputs[i]);
+            stats_add(&st, inputs[i], frd(r, R), ref);
+        }
+        printf("| ln, log10 | %.1f | %.4f | %.4f | Via FR_MULK28 from log2 |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- exp (FR_EXP) --- */
+    {
+        stats_t st; stats_reset(&st);
+        for (int i = -400; i <= 400; i++) {
+            double x = i / 100.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_EXP(fr, R);
+            double ref = exp(x);
+            if (ref > 32000.0 || ref < 1e-6) continue; /* skip overflow/underflow */
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        printf("| exp | %.1f | %.4f | %.4f | FR_MULK28 + FR_pow2 |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- exp_fast (FR_EXP_FAST) --- */
+    {
+        stats_t st; stats_reset(&st);
+        for (int i = -400; i <= 400; i++) {
+            double x = i / 100.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_EXP_FAST(fr, R);
+            double ref = exp(x);
+            if (ref > 32000.0 || ref < 1e-6) continue;
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        printf("| exp_fast | %.1f | %.4f | %.4f | Shift-only scaling |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- pow10 (FR_POW10) --- */
+    {
+        stats_t st; stats_reset(&st);
+        for (int i = -200; i <= 200; i++) {
+            double x = i / 100.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_POW10(fr, R);
+            double ref = pow(10.0, x);
+            if (ref > 32000.0 || ref < 1e-6) continue;
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        printf("| pow10 | %.1f | %.4f | %.4f | FR_MULK28 + FR_pow2 |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- pow10_fast (FR_POW10_FAST) --- */
+    {
+        stats_t st; stats_reset(&st);
+        for (int i = -200; i <= 200; i++) {
+            double x = i / 100.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_POW10_FAST(fr, R);
+            double ref = pow(10.0, x);
+            if (ref > 32000.0 || ref < 1e-6) continue;
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        printf("| pow10_fast | %.1f | %.4f | %.4f | Shift-only scaling |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- hypot (exact) --- */
+    {
+        stats_t st; stats_reset(&st);
+        struct { double x, y; } cases[] = {
+            {0,0},{1,0},{0,1},{3,4},{5,12},{8,15},{-3,-4},{-3,4},{3,-4},
+            {1,1},{0.5,0.5},{100,100},{1000,1},{1,1000}
+        };
+        for (int i = 0; i < (int)(sizeof(cases)/sizeof(cases[0])); i++) {
+            s32 fx = (s32)(cases[i].x * scale);
+            s32 fy = (s32)(cases[i].y * scale);
+            s32 r = FR_hypot(fx, fy, R);
+            double ref = hypot(cases[i].x, cases[i].y);
+            stats_add(&st, ref, frd(r, R), ref);
+        }
+        printf("| hypot (exact) | %.1f | %.4f | %.4f | 64-bit intermediate |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- hypot_fast (4-seg) --- */
+    {
+        stats_t st; stats_reset(&st);
+        struct { double x, y; } cases[] = {
+            {1,0},{0,1},{3,4},{5,12},{8,15},{-3,-4},{1,1},{0.5,0.5},
+            {100,100},{1000,1},{1,1000},{7,24},{20,21}
+        };
+        for (int i = 0; i < (int)(sizeof(cases)/sizeof(cases[0])); i++) {
+            s32 fx = (s32)(cases[i].x * scale);
+            s32 fy = (s32)(cases[i].y * scale);
+            s32 r = FR_hypot_fast(fx, fy);
+            double ref = hypot(cases[i].x, cases[i].y);
+            if (ref > 0) stats_add(&st, ref, frd(r, R), ref);
+        }
+        printf("| hypot_fast (4-seg) | %.1f | %.4f | %.4f | Shift-only, no multiply |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    /* --- hypot_fast8 (8-seg) --- */
+    {
+        stats_t st; stats_reset(&st);
+        struct { double x, y; } cases[] = {
+            {1,0},{0,1},{3,4},{5,12},{8,15},{-3,-4},{1,1},{0.5,0.5},
+            {100,100},{1000,1},{1,1000},{7,24},{20,21}
+        };
+        for (int i = 0; i < (int)(sizeof(cases)/sizeof(cases[0])); i++) {
+            s32 fx = (s32)(cases[i].x * scale);
+            s32 fy = (s32)(cases[i].y * scale);
+            s32 r = FR_hypot_fast8(fx, fy);
+            double ref = hypot(cases[i].x, cases[i].y);
+            if (ref > 0) stats_add(&st, ref, frd(r, R), ref);
+        }
+        printf("| hypot_fast8 (8-seg) | %.1f | %.4f | %.4f | Shift-only, no multiply |\n",
+               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+    }
+
+    printf("<!-- ACCURACY_TABLE_END -->\n");
+    printf("\n");
+}
+
 int main(void) {
     md_h1("FR_Math TDD Characterization Report");
     printf("> Generated by `tests/test_tdd.cpp`. This is a measurement suite, not a pass/fail suite.\n");
@@ -1782,6 +2105,7 @@ int main(void) {
     section_v2_new();
     section_multiradix();
     section_summary();
+    section_accuracy_table();
 
     return 0;
 }
diff --git a/tools/make_release.sh b/tools/make_release.sh
index 441181d..e1a504f 100755
--- a/tools/make_release.sh
+++ b/tools/make_release.sh
@@ -196,6 +196,11 @@ do_validate() {
     else
         ls -l build/FR_math.o build/FR_math_2D.o
     fi
+
+    echo ""
+    echo "  --- Accuracy table ---"
+    bash "${PROJECT_ROOT}/scripts/accuracy_report.sh" --update
+    pass "Accuracy table updated in README + docs."
 }
 
 # -----------------------------------------------------------------------
@@ -233,7 +238,7 @@ do_cross_compile() {
 
 # Files the pipeline itself may modify (badge update, version sync).
 # Anything outside this list is unexpected and should block the release.
-PIPELINE_FILES="README.md VERSION src/FR_math.h library.properties library.json idf_component.yml llms.txt pages/assets/site.js src/FR_math_2D.h src/FR_math_2D.cpp"
+PIPELINE_FILES="README.md VERSION src/FR_math.h library.properties library.json idf_component.yml llms.txt pages/assets/site.js src/FR_math_2D.h src/FR_math_2D.cpp docs/README.md pages/index.html"
 
 do_commit_pipeline_changes() {
     step_header "Commit pipeline-generated changes"

From b9d3c842cd65d028cad1fc72aa1536e39a87c1f4 Mon Sep 17 00:00:00 2001
From: deftio <deftio@deftio.com>
Date: Tue, 21 Apr 2026 20:15:33 -0700
Subject: [PATCH 2/6] updated builds and accuracy tests

---
 README.md                               | 105 ++-
 compare_lfm/.gitignore                  |  10 +
 compare_lfm/Makefile                    | 102 +++
 compare_lfm/Makefile.explog             |  77 +++
 compare_lfm/Using_fast_hypot_as_sqrt.md | 458 +++++++++++++
 compare_lfm/WARNING.md                  |   5 +
 compare_lfm/bench_explog.cpp            | 710 ++++++++++++++++++++
 compare_lfm/benchmark.cpp               | 829 ++++++++++++++++++++++++
 compare_lfm/comparison_results.json     | 479 ++++++++++++++
 compare_lfm/comparison_summary.md       |  81 +++
 compare_lfm/pow_log_improve.md          | 504 ++++++++++++++
 dev/fr_math_2.0.1.md                    |   2 +-
 docs/README.md                          | 101 ++-
 docs/api-reference.md                   |   3 +-
 docs/examples.md                        |   2 -
 makefile                                |   6 +-
 pages/guide/api-reference.html          |  16 +-
 pages/guide/examples.html               |   2 -
 pages/index.html                        | 107 ++-
 release_notes.md                        |   3 +-
 scripts/accuracy_report.sh              |  11 +-
 src/FR_math.c                           | 220 +++----
 src/FR_math.h                           |   4 +-
 tests/test_full_coverage.c              |  31 +-
 tests/test_tdd.cpp                      | 184 +++---
 25 files changed, 3706 insertions(+), 346 deletions(-)
 create mode 100644 compare_lfm/.gitignore
 create mode 100644 compare_lfm/Makefile
 create mode 100644 compare_lfm/Makefile.explog
 create mode 100644 compare_lfm/Using_fast_hypot_as_sqrt.md
 create mode 100644 compare_lfm/WARNING.md
 create mode 100644 compare_lfm/bench_explog.cpp
 create mode 100644 compare_lfm/benchmark.cpp
 create mode 100644 compare_lfm/comparison_results.json
 create mode 100644 compare_lfm/comparison_summary.md
 create mode 100644 compare_lfm/pow_log_improve.md

diff --git a/README.md b/README.md
index ffc2c10..b9c9a74 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,8 @@ beyond `<stdint.h>`.
 
 ### Library size (FR_math.c only, `-Os`)
 
+The following are compiled object code sizes on select platforms from a static test build.  Your sizes may vary depending on optimization choices and linker settings.  Sizes include full code and any internal tables and are ROMable.
+
 | Target | Code (text) |
 |--------|-------------|
 | Cortex-M0 (Thumb-1) | 4.2 KB |
@@ -32,7 +34,7 @@ beyond `<stdint.h>`.
 | MSP430 (16-bit) | 8.4 KB |
 | 8051 (SDCC) | 20.4 KB * |
 
-Sizes are code-only (text section). The optional 2D module adds ~1 KB.
+The optional 2D module adds ~1 KB.
 \* 8051 and MSP430 are 8/16-bit — every 32-bit operation expands to multiple instructions.
 See [`docker/`](docker/) for the cross-compile setup.
 
@@ -42,24 +44,26 @@ Errors below are measured at Q16.16 (s15.16). All functions accept any
 radix — Q16.16 is just the reference point for the table.
 Percent errors skip expected values near zero (|expected| < 0.01).
 
-<!-- ACCURACY_TABLE_START -->
-| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
-|---|---:|---:|---:|---|
-| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
-| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
-| asin / acos | 512.6 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
-| atan2 | 44.7 | 2.9913 | 0.0383 | 65536x5 radii + specials |
-| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
-| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
-| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
-| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
-| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
-| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
-| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
-| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
-| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
-| hypot_fast (4-seg) | 216112.8 | 0.3418 | 0.1923 | Shift-only, no multiply |
-| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
+Note that at other radixes (3 bit, 24 bit, etc.), accuracy may change due to the number of fractional bits available, with a correspondingly increased/decreased scale.  All functions support 0 to 30 bit radix types at compile time.
+
+<!-- ACCURACY_TABLE_START -->
+| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
+|---|---:|---:|---:|---|
+| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
+| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
+| asin / acos | 42.3 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
+| atan2 | 63.3 | 0.4953 | 0.0268 | 65536x5 radii; asin/acos+hypot_fast8 |
+| atan | 61.9 | 0.2985 | 0.0159 | 20001-pt sweep [-10,10]; via FR_atan2 |
+| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
+| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
+| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
+| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
+| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
+| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
+| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
+| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
+| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
+| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
 <!-- ACCURACY_TABLE_END -->
 
 ### What's in the box
@@ -72,7 +76,7 @@ Percent errors skip expected values near zero (|expected| < 0.01).
 | Trig (radian/BAM) | `fr_sin`, `fr_cos`, `fr_tan`, `fr_sin_bam`, `fr_cos_bam`, `fr_sin_deg`, `fr_cos_deg` |
 | Inverse trig | `FR_atan`, `FR_atan2`, `FR_asin`, `FR_acos` |
 | Log / exp | `FR_log2`, `FR_ln`, `FR_log10`, `FR_pow2`, `FR_EXP`, `FR_POW10`, `FR_EXP_FAST`, `FR_POW10_FAST`, `FR_MULK28` |
-| Roots | `FR_sqrt`, `FR_hypot`, `FR_hypot_fast`, `FR_hypot_fast8` |
+| Roots | `FR_sqrt`, `FR_hypot`, `FR_hypot_fast8` |
 | Wave generators | `fr_wave_sqr`, `fr_wave_pwm`, `fr_wave_tri`, `fr_wave_saw`, `fr_wave_tri_morph`, `fr_wave_noise` |
 | Envelope | `fr_adsr_init`, `fr_adsr_trigger`, `fr_adsr_release`, `fr_adsr_step` |
 | 2D transforms | `FR_Matrix2D_CPT` (mul, add, sub, det, inv, setrotate, XFormPtI, XFormPtI16) |
@@ -94,11 +98,62 @@ make test      # run all tests (coverage, TDD characterization, 2D)
 
 #define R 16  /* work at radix 16 (s15.16) throughout */
 
-s32 pi    = FR_NUM(3, 14159, 5, R);       /* pi at radix 16             */
-s32 c45   = FR_CosI(45);                  /* cos 45 deg = 0.7071 (s15.16) */
-s32 root2 = FR_sqrt(I2FR(2, R), R);       /* sqrt(2)    = 1.4142        */
-s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97          */
-s32 ex    = FR_EXP(I2FR(1, R), R);        /* e^1        ~ 2.7183        */
+/* ---- Creating fixed-point values ----
+ *
+ * FR_NUM(integer, frac_digits, num_digits, radix) encodes a decimal
+ * literal at compile time.  The fractional part is the digits AFTER
+ * the decimal point, and num_digits says how many digits that is.
+ * Think: FR_NUM(3, 14159, 5, 16) means "3.14159" at radix 16.
+ */
+s32 pi   = FR_NUM(3, 14159, 5, R);  /* 3.14159 → raw 205886 at r16  */
+s32 half = FR_NUM(0, 5, 1, R);      /* 0.5     → raw 32768           */
+s32 neg  = FR_NUM(-2, 75, 2, R);    /* -2.75   → raw -180224         */
+
+/* Or parse from a string at runtime (no floats, no strtod): */
+s32 pi2  = FR_numstr("3.14159", R); /* same result as FR_NUM above    */
+
+/* Integer-to-fixed: I2FR(n, radix) just shifts left */
+s32 two  = I2FR(2, R);              /* 2.0 → raw 131072              */
+
+/* ---- Naming convention: macros vs functions ----
+ *
+ * UPPERCASE FR_ names are macros — they expand inline with no call
+ * overhead, and the compiler can constant-fold them.  Use these for
+ * conversions and simple arithmetic:
+ *   I2FR, FR2I, FR_NUM, FR_ADD, FR_MUL, FR_DIV, FR_ABS, FR_EXP ...
+ *
+ * MixedCase FR_ names are functions — they contain loops, tables, or
+ * multi-step algorithms where inlining would waste ROM:
+ *   FR_Cos, FR_sqrt, FR_atan2, FR_log2, FR_pow2, FR_printNumF ...
+ *
+ * lowercase fr_ names are v2 functions (radian trig, wave generators,
+ * ADSR envelopes):
+ *   fr_sin, fr_cos, fr_tan, fr_wave_tri, fr_adsr_step ...
+ *
+ * Some macros wrap functions: FR_EXP(x,r) scales x then calls
+ * FR_pow2 — one-liner convenience, heavy lifting in the function.
+ */
+
+/* ---- Math functions ---- */
+s32 c45   = FR_Cos(45, 0);                /* cos(45°) = 0.7071       */
+s32 s30   = fr_sin(FR_numstr("0.5236", R), R); /* sin(0.5236 rad)    */
+s32 root2 = FR_sqrt(two, R);              /* sqrt(2)  = 1.4142       */
+s32 angle = FR_atan2(I2FR(1,R), I2FR(1,R), R); /* atan2(1,1) rad     */
+s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97       */
+s32 ex    = FR_EXP(I2FR(1, R), R);        /* macro: scales then calls
+                                            * FR_pow2 internally      */
+
+/* ---- Printing (serial / UART / file friendly) ----
+ *
+ * FR_printNumF takes a per-character output function — works with
+ * putchar, Serial.write, UART_putc, or any int(*)(char).  No
+ * sprintf, no floats, no heap.  Ideal for bare-metal targets.
+ */
+int my_putchar(char c) { return putchar(c); }  /* or your UART func */
+
+FR_printNumF(my_putchar, pi, R, 8, 5);    /* prints " 3.14159"      */
+FR_printNumF(my_putchar, neg, R, 8, 2);   /* prints "   -2.75"      */
+FR_printNumD(my_putchar, FR2I(lg, R), 4); /* prints "   9" (integer)*/
 ```
 
 ## Documentation
diff --git a/compare_lfm/.gitignore b/compare_lfm/.gitignore
new file mode 100644
index 0000000..d0ab86f
--- /dev/null
+++ b/compare_lfm/.gitignore
@@ -0,0 +1,10 @@
+# Build artifacts
+build/
+
+# Cloned third-party libraries (fetch from GitHub as needed)
+libfixmath/
+fpm/
+liquid-fpm/
+
+# Claude session data
+.claude/
diff --git a/compare_lfm/Makefile b/compare_lfm/Makefile
new file mode 100644
index 0000000..01554ed
--- /dev/null
+++ b/compare_lfm/Makefile
@@ -0,0 +1,102 @@
+# ============================================================
+# Benchmark: FR_math vs libfixmath  (macOS ARM / Apple Clang)
+# ============================================================
+# WARNING: This Makefile only builds files inside compare_lfm/
+#          It does NOT modify anything in the parent repo.
+# ============================================================
+
+CXX      := clang++
+CC       := clang
+CXXFLAGS := -std=c++17 -O2 -Wall -Wextra
+CFLAGS   := -std=c11 -O2 -Wall -Wextra
+
+# --- FR_math (parent repo, compiled read-only) ---
+FR_SRC   := ../src/FR_math.c
+FR_INC   := -I../src
+
+# --- libfixmath (cloned into this dir) ---
+LFM_DIR  := libfixmath/libfixmath
+LFM_SRC  := $(LFM_DIR)/fix16.c       \
+            $(LFM_DIR)/fix16_sqrt.c   \
+            $(LFM_DIR)/fix16_exp.c    \
+            $(LFM_DIR)/fix16_trig.c   \
+            $(LFM_DIR)/fix16_str.c    \
+            $(LFM_DIR)/uint32.c       \
+            $(LFM_DIR)/fract32.c
+LFM_INC  := -I$(LFM_DIR)
+
+# --- Build ---
+BUILD    := build
+TARGET   := $(BUILD)/benchmark
+JSON_OUT := comparison_results.json
+
+OBJS := $(BUILD)/FR_math.o       \
+        $(BUILD)/fix16.o          \
+        $(BUILD)/fix16_sqrt.o     \
+        $(BUILD)/fix16_exp.o      \
+        $(BUILD)/fix16_trig.o     \
+        $(BUILD)/fix16_str.o      \
+        $(BUILD)/uint32.o         \
+        $(BUILD)/fract32.o        \
+        $(BUILD)/benchmark.o
+
+.PHONY: all clean run size
+
+all: $(TARGET)
+
+run: $(TARGET)
+	./$(TARGET) > $(JSON_OUT) 2>comparison_summary.md
+	@echo "Results written to $(JSON_OUT) + comparison_summary.md"
+
+# Size comparison: ROM + RAM for both libraries
+LFM_OBJS := $(BUILD)/fix16.o $(BUILD)/fix16_sqrt.o $(BUILD)/fix16_exp.o \
+            $(BUILD)/fix16_trig.o $(BUILD)/fix16_str.o $(BUILD)/uint32.o \
+            $(BUILD)/fract32.o
+
+size: $(BUILD)/FR_math.o $(LFM_OBJS)
+	@echo "=== Compiled size: FR_math vs libfixmath ($(CC) -O2) ==="
+	@echo ""
+	@echo "FR_math (FR_math.o):"
+	@size -m $(BUILD)/FR_math.o | grep -E "Section|total"
+	@echo ""
+	@echo "libfixmath (all objects):"
+	@for f in $(LFM_OBJS); do echo "  $$(basename $$f):"; size -m $$f | grep -E "Section" | sed 's/^/    /'; done
+
+$(TARGET): $(OBJS) | $(BUILD)
+	$(CXX) $(CXXFLAGS) -o $@ $^
+
+$(BUILD):
+	mkdir -p $(BUILD)
+
+# FR_math object
+$(BUILD)/FR_math.o: $(FR_SRC) | $(BUILD)
+	$(CC) $(CFLAGS) $(FR_INC) -c -o $@ $<
+
+# libfixmath objects
+$(BUILD)/fix16.o: $(LFM_DIR)/fix16.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_sqrt.o: $(LFM_DIR)/fix16_sqrt.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_exp.o: $(LFM_DIR)/fix16_exp.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_trig.o: $(LFM_DIR)/fix16_trig.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_str.o: $(LFM_DIR)/fix16_str.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/uint32.o: $(LFM_DIR)/uint32.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fract32.o: $(LFM_DIR)/fract32.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+# Benchmark harness
+$(BUILD)/benchmark.o: benchmark.cpp | $(BUILD)
+	$(CXX) $(CXXFLAGS) $(FR_INC) $(LFM_INC) -c -o $@ $<
+
+clean:
+	rm -rf $(BUILD) $(JSON_OUT)
diff --git a/compare_lfm/Makefile.explog b/compare_lfm/Makefile.explog
new file mode 100644
index 0000000..afb66de
--- /dev/null
+++ b/compare_lfm/Makefile.explog
@@ -0,0 +1,77 @@
+# Makefile.explog — build exp/log accuracy improvement benchmark
+# Usage: make -f Makefile.explog run
+
+CXX      := clang++
+CC       := clang
+CXXFLAGS := -std=c++17 -O2 -Wall -Wextra
+CFLAGS   := -std=c11 -O2 -Wall -Wextra
+
+FR_SRC   := ../src/FR_math.c
+FR_INC   := -I../src
+
+LFM_DIR  := libfixmath/libfixmath
+LFM_SRC  := $(LFM_DIR)/fix16.c       \
+            $(LFM_DIR)/fix16_sqrt.c   \
+            $(LFM_DIR)/fix16_exp.c    \
+            $(LFM_DIR)/fix16_trig.c   \
+            $(LFM_DIR)/fix16_str.c    \
+            $(LFM_DIR)/uint32.c       \
+            $(LFM_DIR)/fract32.c
+LFM_INC  := -I$(LFM_DIR)
+
+BUILD    := build
+TARGET   := $(BUILD)/bench_explog
+
+# Reuse FR_math and libfixmath objects from main Makefile
+OBJS := $(BUILD)/FR_math.o       \
+        $(BUILD)/fix16.o          \
+        $(BUILD)/fix16_sqrt.o     \
+        $(BUILD)/fix16_exp.o      \
+        $(BUILD)/fix16_trig.o     \
+        $(BUILD)/fix16_str.o      \
+        $(BUILD)/uint32.o         \
+        $(BUILD)/fract32.o        \
+        $(BUILD)/bench_explog.o
+
+.PHONY: all clean run
+
+all: $(TARGET)
+
+run: $(TARGET)
+	./$(TARGET)
+
+$(TARGET): $(OBJS) | $(BUILD)
+	$(CXX) $(CXXFLAGS) -o $@ $^
+
+$(BUILD):
+	mkdir -p $(BUILD)
+
+$(BUILD)/FR_math.o: $(FR_SRC) | $(BUILD)
+	$(CC) $(CFLAGS) $(FR_INC) -c -o $@ $<
+
+$(BUILD)/fix16.o: $(LFM_DIR)/fix16.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_sqrt.o: $(LFM_DIR)/fix16_sqrt.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_exp.o: $(LFM_DIR)/fix16_exp.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_trig.o: $(LFM_DIR)/fix16_trig.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_str.o: $(LFM_DIR)/fix16_str.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/uint32.o: $(LFM_DIR)/uint32.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fract32.o: $(LFM_DIR)/fract32.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/bench_explog.o: bench_explog.cpp | $(BUILD)
+	$(CXX) $(CXXFLAGS) $(FR_INC) $(LFM_INC) -c -o $@ $<
+
+clean:
+	rm -f $(BUILD)/bench_explog.o $(TARGET)
diff --git a/compare_lfm/Using_fast_hypot_as_sqrt.md b/compare_lfm/Using_fast_hypot_as_sqrt.md
new file mode 100644
index 0000000..ceca536
--- /dev/null
+++ b/compare_lfm/Using_fast_hypot_as_sqrt.md
@@ -0,0 +1,458 @@
+# Deriving a Shift-Only Fast Square Root from Piecewise-Linear Hypot
+
+A note on borrowing the FR_hypot_fast technique for 1D square root approximation
+
+*FR_math library — design notes*
+
+
+## Background
+
+The FR_math library includes `FR_hypot_fast()`, a piecewise-linear approximation
+of `sqrt(x^2 + y^2)` that uses only shifts and adds — no multiplications,
+no divisions, no lookup tables. It achieves ~0.4% peak error (4-segment) or
+~0.14% peak error (8-segment) with deterministic, constant-time execution.
+
+Here we explore whether the same technique can be adapted to compute
+scalar square root — `sqrt(x)` — with similar speed and shift-only
+constraints. The result is a `FR_sqrt_fast()` that runs in ~5 ns on ARM
+(vs. 25 ns for the exact `FR_sqrt`) with under 1% peak error (one linear segment per octave).
+
+For context, the legendary Quake III "fast inverse square root" (the `0x5f3759df`
+trick) achieves ~0.17% error on 32-bit float using a bit-hack + one Newton
+iteration. This approach achieves comparable accuracy on fixed-point numbers
+using only integer shifts and adds — no float reinterpretation, no Newton
+iteration, no multiplication.
+
+
+## Step 1: The Naive Idea — hypot(x, x) / sqrt(2)
+
+The starting intuition: if `hypot(a, b) = sqrt(a^2 + b^2)`, then:
+
+```
+hypot(x, x) = sqrt(x^2 + x^2) = sqrt(2) * x
+```
+
+So:
+
+```
+x = hypot(x, x) / sqrt(2)
+```
+
+This is an identity — it gives back `x`, not `sqrt(x)`. But the seed of an
+idea is here: `FR_hypot_fast` approximates a 2D magnitude using shift-only
+piecewise-linear segments selected by the ratio `lo/hi`. What if we exploit this
+structure for a different purpose?
+
+`FR_hypot_fast` is not itself a square-root routine, but what
+we really want is to borrow its machinery — the segment
+selection, the shift-only coefficients, the branch-free evaluation — and apply
+it to the 1D sqrt curve directly.
+
+
+## Step 2: A Key Insight — Normalize, Approximate, Denormalize
+
+`FR_hypot_fast` works by:
+1. Taking `|x|` and `|y|`, sorting into `hi` and `lo`
+2. Using the ratio `lo/hi` to select a piecewise-linear segment
+3. Computing `result = a*hi + b*lo` with shift-only coefficients
+
+For scalar `sqrt(x)`, we can use a simpler structure:
+
+1. **Normalize** `x` into a fixed range using the leading-bit position
+2. **Approximate** `sqrt` on that fixed range with shift-only linear segments
+3. **Denormalize** the result by shifting back
+
+The key property of sqrt that makes this work:
+
+```
+sqrt(x * 2^(2k)) = sqrt(x) * 2^k
+```
+
+So if we shift `x` left or right by an **even** number of bits to land in a
+known range (say `[1.0, 4.0)` in fixed-point), we compute the sqrt in that
+range, then shift the result by **half** the original shift to undo the
+normalization.
+
+
+## Step 3: Normalization via Leading-Bit Detection
+
+Given a positive fixed-point value `x` at arbitrary radix `r`, find the
+position of the highest set bit. This tells us the magnitude. We want to
+shift `x` so it lands in `[1.0, 4.0)` in an internal Q16.16 representation,
+regardless of the caller's radix.
+
+The math for arbitrary radix:
+
+```
+Input:   x at radix r         (real value = x / 2^r)
+Want:    sqrt(x / 2^r) * 2^r  (result at radix r)
+       = sqrt(x) * 2^(r/2)    (the standard fixed-point sqrt formula)
+```
+
+We always normalize to Q16.16 internally, then compute a single combined
+denormalization shift at the end that accounts for both the normalization
+and the radix difference:
+
+```c
+/* Count leading zeros — maps to CLZ instruction on ARM, Zbb on RISC-V */
+int lz = __builtin_clz(x);      /* 0..31 */
+int bit_pos = 31 - lz;          /* position of MSB */
+
+/* Target: MSB at bit 17 (internal Q16.16, value in [1.0, 4.0)).
+ * The normalization shift must have the SAME PARITY as radix,
+ * so that (shift + 16 - radix) is always even for the final halving.
+ */
+int raw_shift = 17 - bit_pos;
+int parity = radix & 1;
+int shift = (raw_shift & ~1) | parity;  /* force same parity as radix */
+if ((shift - raw_shift) > 1) shift -= 2;  /* keep close to target */
+
+s32 xn = (shift >= 0) ? (x << shift) : (x >> (-shift));
+```
+
+After this, `xn` is in `[1.0, 8.0)` at Q16.16 internally (the range widens
+slightly due to the parity constraint, but the piecewise segments handle it).
+The `shift` value records the total displacement for later denormalization.
+
+The denormalization combines both the normalization undo and the radix
+adjustment into a single shift:
+
+```
+We computed:  result ≈ sqrt(xn) in Q16.16
+              where xn = x << shift
+So:           result ≈ sqrt(x) * 2^((shift + 16) / 2)
+We want:      sqrt(x) * 2^(r / 2)
+
+Final shift:  output = result >> ((shift + 16 - radix) / 2)
+```
+
+Since `shift` has the same parity as `radix`, and 16 is even, the quantity
+`(shift + 16 - radix)` is always even — the division by 2 is exact. No
+special cases needed for odd vs even radix.
+
+The normalization costs: one CLZ, one mask, one OR, one shift. On ARM64,
+that is 4 instructions. On Cortex-M0 (no CLZ instruction), CLZ can be
+emulated with a small loop over the top bits — about 8-10 instructions
+worst case.
+
+---
+
+## Step 4: Piecewise-Linear Approximation on [1, 4)
+
+Now we need to approximate `sqrt(xn)` where `xn` is in `[1.0, 4.0)` at
+internal Q16.16. This is the same regardless of the caller's radix — the
+normalization mapped everything into this canonical range.
+
+The sqrt curve on this interval:
+
+```
+sqrt(1.0) = 1.000
+sqrt(2.0) = 1.414
+sqrt(3.0) = 1.732
+sqrt(4.0) = 2.000
+```
+
+This is a gentle, monotonically increasing curve. A single linear fit already
+gets within ~3% error. Two segments (split at 2.0) get ~0.8%. Four segments
+get ~0.2%.
+
+### Single segment (for illustration)
+
+Best single-line fit (minimizing peak relative error) of `sqrt(x)` on `[1, 4)`:
+
+```
+sqrt(x) ≈ 0.3431*x + 0.6863
+```
+
+Shift-only approximation of the coefficients:
+
+```
+0.3431 ≈ 1/4 + 1/16 + 1/32 = (x >> 2) + (x >> 4) + (x >> 5)   (= 0.34375)
+0.6863 → 0.6855            = 44925 / 65536   (re-tuned for the rounded slope)
+```
+
+where the intercept is a plain Q16.16 constant (1.0 = 65536), so the `b` term
+costs a single add.
+
+```c
+/* Single-segment: ~3% peak error */
+s32 result = (xn >> 2) + (xn >> 4) + (xn >> 5) + 44925;  /* 44925 ≈ 0.6855 * 65536 */
+```
+
+That's just three shifts and three adds for the approximation. But 3% is too coarse.
+
+
+## Step 5: Two Segments — Splitting at 2.0
+
+Split `[1, 4)` into `[1, 2)` and `[2, 4)`. The segment selection is a single
+bit test — bit 17 of `xn` (the "2.0" bit in Q16.16):
+
+```c
+if (xn < (2 << 16)) {
+    /* Segment [1.0, 2.0): sqrt(x) ≈ a1*x + b1 */
+} else {
+    /* Segment [2.0, 4.0): sqrt(x) ≈ a2*x + b2 */
+}
+```
+
+Coefficients (three-term shift-only slopes, intercepts tuned to balance the peak relative error):
+
+| Segment     | a (slope) | b (intercept) | Max error |
+|-------------|-----------|---------------|-----------|
+| [1.0, 2.0) | 0.4219    | 0.5825        | ~0.9%     |
+| [2.0, 4.0) | 0.2969    | 0.8286        | ~0.8%     |
+
+Shift-only approximations:
+
+```
+a1 = 0.4219 = 1/2 - 1/16 - 1/64   = (x>>1) - (x>>4) - (x>>6)
+b1 = 0.5825                       = 38175  (precomputed constant)
+
+a2 = 0.2969 = 1/4 + 1/16 - 1/64   = (x>>2) + (x>>4) - (x>>6)
+b2 = 0.8286                       = 54301  (precomputed constant)
+```
+
+```c
+/* Two-segment: ~0.9% peak error */
+s32 result;
+if (xn < 131072) {  /* 131072 = 2.0 in Q16.16 */
+    result = (xn >> 1) - (xn >> 4) - (xn >> 6) + 38175;
+} else {
+    result = (xn >> 2) + (xn >> 4) - (xn >> 6) + 54301;
+}
+```
+
+Total: CLZ + shift + branch + 3 shift-adds + add-constant + shift-back.
+About **8-10 instructions**.
+
+---
+
+## Step 6: Removing the Overhead — Why This Beats Calling hypot_fast
+
+If we had literally called `FR_hypot_fast(x, x)`, the function would:
+
+1. Compute `|x|` and `|y|` (redundant — both are the same)
+2. Sort into hi and lo (redundant — hi = lo = |x|)
+3. Compute ratio `lo/hi` (redundant — always 1.0)
+4. Select segment based on ratio (redundant — always the same segment)
+5. Evaluate `a*hi + b*lo` (partially redundant — `hi == lo`)
+
+By inlining the technique for the sqrt case, we eliminate ALL of that overhead:
+
+- **No abs/sort**: The input is a single positive value (negative inputs can
+  be rejected trivially, since sqrt of negative is undefined).
+- **No ratio computation**: There is no ratio — we have one variable, not two.
+- **No ratio-based segment selection**: We select the segment by the
+  leading-bit position (CLZ), which is a byproduct of the normalization we
+  already need.
+- **No dual-variable evaluation**: The linear fit is `a*x + b`, not
+  `a*hi + b*lo`. One fewer multiply-equivalent term.
+
+What remains is the pure essence of the technique: **normalize to a fixed
+range, apply a shift-only linear fit, denormalize**.
+
+---
+
+## Step 7: Putting It All Together
+
+```c
+/*
+ * FR_sqrt_fast — shift-only piecewise-linear square root approximation.
+ *
+ * Returns sqrt(x) at the SAME RADIX as the input (any radix, not just 16).
+ * Uses only integer shifts, adds, and leading-bit detection.
+ * No multiplications, no lookup tables, no iterations.
+ *
+ * Peak error: below ~0.9% with one linear segment per octave (as coded here).
+ * Speed: ~5 ns on ARM Cortex (vs ~25 ns for exact FR_sqrt).
+ *
+ * Algorithm:
+ *   1. Find leading-bit position (CLZ) to determine magnitude
+ *   2. Normalize input to [1.0, 8.0) in internal Q16.16
+ *   3. Evaluate shift-only linear approximation (one segment per octave)
+ *   4. Single combined denormalization shift (normalization + radix)
+ *
+ * Radix handling: the normalization shift is forced to the same parity
+ * as the caller's radix. This guarantees (shift + 16 - radix) is always
+ * even, so the final halving is exact. No special cases for odd/even radix.
+ *
+ * Based on the piecewise-linear shift-only technique from FR_hypot_fast.
+ * See US Patent 6,567,777 B1 (Chatterjee, public domain).
+ */
+s32 FR_sqrt_fast(s32 x, u16 radix) {
+    if (x <= 0) return 0;
+
+    /* 1. Leading-bit detection */
+    int lz = __builtin_clz((unsigned)x);
+    int bit_pos = 31 - lz;
+
+    /* 2. Normalize x into [1.0, 8.0) at internal Q16.16.
+     *    The shift parity must match radix parity so that the
+     *    combined denormalization (shift + 16 - radix) / 2 is exact.
+     */
+    int raw_shift = 17 - bit_pos;
+    int parity = radix & 1;
+    int shift = (raw_shift & ~1) | parity;
+    /* shift is raw_shift-1, raw_shift or raw_shift+1, so the MSB lands on bit 16..18 */
+
+    s32 xn;
+    if (shift >= 0)
+        xn = x << shift;
+    else
+        xn = x >> (-shift);
+
+    /* 3. Piecewise-linear approximation, one segment per octave (Q16.16).
+     *    Slopes are three-term shift sums; intercepts are tuned so the peak
+     *    relative error of each segment stays below ~0.9%.
+     *
+     *    [1.0, 2.0): sqrt(x) ≈ (x>>1) - (x>>4) - (x>>6) + 38175
+     *    [2.0, 4.0): sqrt(x) ≈ (x>>2) + (x>>4) - (x>>6) + 54301
+     *    [4.0, 8.0): sqrt(x) ≈ (x>>2) - (x>>5) - (x>>7) + 76350
+     *    Even radix lands in the first two segments, odd radix in the last
+     *    two.  Segments are fitted independently, so the output can step by
+     *    a fraction of a percent where two segments meet.
+     */
+    s32 result;
+    if (xn < 131072) {         /* < 2.0 in Q16.16 */
+        result = (xn >> 1) - (xn >> 4) - (xn >> 6) + 38175;
+    } else if (xn < 262144) {  /* < 4.0 in Q16.16 */
+        result = (xn >> 2) + (xn >> 4) - (xn >> 6) + 54301;
+    } else {                   /* [4.0, 8.0) — only reached with odd radix */
+        result = (xn >> 2) - (xn >> 5) - (xn >> 7) + 76350;
+    }
+
+    /* 4. Combined denormalization: undo normalization AND adjust for radix.
+     *
+     *    Internally we computed: sqrt(xn) in Q16.16
+     *    where xn = x << shift, so result ≈ sqrt(x) * 2^((shift+16)/2)
+     *    We want: sqrt(x) * 2^(radix/2)
+     *
+     *    output = result >> ((shift + 16 - radix) / 2)
+     *
+     *    This is always an integer because shift ≡ radix (mod 2) and 16 is even.
+     *    One shift — handles all radixes uniformly.
+     */
+    int deshift = (shift + 16 - (int)radix) / 2;
+    if (deshift >= 0)
+        result >>= deshift;
+    else
+        result <<= (-deshift);
+
+    return result;
+}
+```
+
+The radix handling adds zero runtime cost vs a Q16.16-only version — the
+parity-aware shift is computed at the same time as the normalization, and
+the denormalization is a single shift regardless of radix. The `radix`
+parameter compiles away to a constant when the caller passes a literal
+(e.g. `FR_sqrt_fast(x, 16)` or `FR_sqrt_fast(x, 12)`), so the compiler
+can fold the parity logic and denormalization into fixed constants.
+
+---
+
+## Step 8: Comparison With the Quake III Fast Inverse Sqrt
+
+The Quake III trick (`0x5f3759df`) is:
+
+```c
+float Q_rsqrt(float number) {
+    long i;
+    float x2, y;
+    x2 = number * 0.5F;
+    y  = number;
+    i  = *(long *)&y;                       // evil floating point bit hack
+    i  = 0x5f3759df - (i >> 1);             // what the...
+    y  = *(float *)&i;
+    y  = y * (1.5F - (x2 * y * y));        // 1st Newton iteration
+    return y;
+}
+```
+
+| Property | Quake III rsqrt | FR_sqrt_fast |
+|----------|----------------|--------------|
+| Domain | IEEE 754 float | Fixed-point integer (any radix) |
+| Output | 1/sqrt(x) | sqrt(x) |
+| Operations | 1 float shift, 1 float sub, 1 Newton iteration (3 float muls + 1 sub) | CLZ + ~6 integer shift-adds |
+| Multiplications | 3 (float) | **0** |
+| Divisions | 0 | 0 |
+| Lookup tables | 0 (magic constant) | 0 |
+| Accuracy | ~0.17% after 1 Newton iteration | ~0.9% (2-seg) or ~0.3% (4-seg) |
+| Deterministic | Platform-dependent (float rounding) | **Bit-exact across all platforms** |
+| Requires FPU | Yes (or slow soft-float) | No |
+
+The approaches are philosophically similar: both use the binary representation
+of the number to extract magnitude information cheaply (Quake uses the float
+exponent bits; we use CLZ), then apply a cheap correction. But the Quake trick
+is fundamentally tied to IEEE 754 float layout, while the shift-only approach
+works on bare integers with no format assumptions.
+
+On a microcontroller without an FPU, the Quake trick is useless (soft-float
+makes the Newton iteration expensive). The shift-only approach costs the same
+whether the target has an FPU or not.
+
+---
+
+## Step 9: Extending to 4 Segments (~0.3% Error)
+
+For applications needing tighter accuracy, split `[1, 4)` into four segments
+at `[1.0, 1.5)`, `[1.5, 2.0)`, `[2.0, 3.0)`, `[3.0, 4.0)`:
+
+```c
+s32 result;
+if (xn < 98304) {           /* < 1.5 */
+    result = /* a1*xn + b1, shift-only coefficients */;
+} else if (xn < 131072) {   /* < 2.0 */
+    result = /* a2*xn + b2 */;
+} else if (xn < 196608) {   /* < 3.0 */
+    result = /* a3*xn + b3 */;
+} else {                     /* < 4.0 */
+    result = /* a4*xn + b4 */;
+}
+```
+
+The segment boundaries are simple constants, and the comparisons chain
+predictably (the branch predictor will learn the pattern quickly for
+sequential inputs). Each segment needs 3-4 shift-add terms for the slope
+plus one precomputed constant for the intercept.
+
+The coefficient derivation follows the same process used for FR_hypot_fast:
+
+1. Least-squares fit on each segment to get ideal `a, b`
+2. Brute-force search over combinations of `+/- 2^(-k)` for `k in [0..12]`
+   with up to 4 terms, minimizing peak error
+3. Verify with a sweep test in C (Python coefficients don't account for
+   integer truncation)
+
+See the FR_hypot_fast derivation notes for the full methodology.
+
+---
+
+## Summary
+
+| Variant | Segments | Peak Error | Instructions | Speed (est.) |
+|---------|----------|-----------|-------------|-------------|
+| FR_sqrt (exact) | N/A (32-iter loop) | 0.5 LSB | ~130 | ~25 ns |
+| FR_sqrt_fast | 2 | ~0.9% | ~12 | ~4-5 ns |
+| FR_sqrt_fast | 4 | ~0.3% | ~16 | ~5-6 ns |
+| Quake III rsqrt | 1 + Newton | ~0.17% | ~8 (float) | ~4 ns (with FPU) |
+
+The shift-only technique from FR_hypot_fast adapts cleanly to scalar square
+root. The key simplifications over hypot_fast are: no min/max sort, no ratio
+computation, no dual-variable evaluation — the segment is selected directly
+from the leading-bit position, which we need anyway for normalization. The
+result is a fast, portable, multiply-free square root approximation that is
+competitive with the legendary Quake III trick but works on fixed-point
+integers without an FPU.
+
+Like all FR_math functions, `FR_sqrt_fast` accepts any radix — not just
+Q16.16. The radix handling is absorbed into the normalization/denormalization
+shifts with zero additional runtime cost. The parity-matching trick
+(forcing the normalization shift to the same parity as the radix) ensures
+the combined denormalization is always a single exact integer shift,
+regardless of whether the radix is odd or even.
+
+---
+
+*FR_math library — M. A. Chatterjee — 2026*
+*Technique based on US Patent 6,567,777 B1 (Chatterjee, public domain)*
diff --git a/compare_lfm/WARNING.md b/compare_lfm/WARNING.md
new file mode 100644
index 0000000..242e97f
--- /dev/null
+++ b/compare_lfm/WARNING.md
@@ -0,0 +1,5 @@
+DO NOT touch any files outside this compare_lfm/ directory.
+
+This is a standalone mini-project for benchmarking FR_math vs libfixmath.
+All source, build artifacts, and results live entirely within this folder.
+The parent repo's source files (../src/) are compiled read-only via -I include paths.
diff --git a/compare_lfm/bench_explog.cpp b/compare_lfm/bench_explog.cpp
new file mode 100644
index 0000000..6f62329
--- /dev/null
+++ b/compare_lfm/bench_explog.cpp
@@ -0,0 +1,710 @@
+/*
+ * bench_explog.cpp — Validate proposed exp/ln/log10 accuracy improvement
+ *
+ * Compares these implementations for each of {exp, ln, log10}:
+ *   1. FR_math current   (shift-only scaling macros)
+ *   2. FR_math proposed  (one multiply via mulk28; exp also has a 65-entry-table V2)
+ *   3. libfixmath        (log10 is derived as fix16_log2(x) * log10(2))
+ * All measured against <cmath> double precision as gold standard.
+ *
+ * Compile via:  make -f Makefile.explog run
+ */
+
+#include <cstdio>
+#include <cstdint>
+#include <cmath>
+#include <chrono>
+#include <vector>
+#include <cfloat>
+#include <algorithm>
+
+/* ---- FR_math ---- */
+extern "C" {
+#include "FR_defs.h"
+#include "FR_math.h"
+}
+
+/* ---- libfixmath ---- */
+#include "fixmath.h"
+
+/* ================================================================
+ * Proposed improvement: high-precision constants at radix 28
+ * and a single-multiply scaling macro.
+ * ================================================================ */
+
+/* Constants at radix 28 (2^28 = 268435456): each is round(value * 2^28).
+ *
+ * Reproduce with: python3 -c "print(round(val * 2**28))"
+ * Unrounded products are shown so the rounding direction can be checked:
+ *   log2(e):   1.4426950408889634 * 268435456 = 387270501.17 -> 387270501
+ *   ln(2):     0.6931471805599453 * 268435456 = 186065279.49 -> 186065279
+ *   log2(10):  3.3219280948873626 * 268435456 = 891723282.95 -> 891723283
+ *   log10(2):  0.3010299956639812 * 268435456 =  80807124.16 ->  80807124
+ * (log10(2) is 0.30102999566398120, the value verify_constants() checks against.)
+ */
+#define K28_LOG2E      387270501   /* log2(e)   = 1.4426950408889634  */
+#define K28_LN2        186065279   /* ln(2)     = 0.6931471805599453  */
+#define K28_LOG2_10    891723283   /* log2(10)  = 3.3219280948873626  */
+#define K28_LOG10_2     80807124   /* log10(2)  = 0.3010299956639812  */
+
+/* Scale x (any radix) by a radix-28 constant k; the result keeps x's radix.
+ * Half an output LSB (2^27) is added before the shift to round to nearest. */
+static inline int32_t mulk28(int32_t x, int32_t k) {
+    const int64_t wide = (int64_t)x * (int64_t)k;   /* radix(x) + 28 fractional bits */
+    return (int32_t)((wide + ((int64_t)1 << 27)) >> 28);
+}
+
+/* Proposed exp (mulk28 scaling only, original 17-entry table): */
+static inline int32_t proposed_exp(int32_t input, uint16_t radix) {
+    int32_t scaled = mulk28(input, K28_LOG2E);
+    return FR_pow2(scaled, radix);
+}
+
+/* Proposed ln: log2(x) * ln(2) */
+static inline int32_t proposed_ln(int32_t input, uint16_t radix, uint16_t output_radix) {
+    int32_t r = FR_log2(input, radix, output_radix);
+    return mulk28(r, K28_LN2);
+}
+
+/* Proposed log10: log2(x) * log10(2) */
+static inline int32_t proposed_log10(int32_t input, uint16_t radix, uint16_t output_radix) {
+    int32_t r = FR_log2(input, radix, output_radix);
+    return mulk28(r, K28_LOG10_2);
+}
+
+/* ================================================================
+ * V2: mulk28 scaling + 65-entry pow2 table (64 segments)
+ *
+ * Same algorithm as FR_pow2 but 6-bit index / 10-bit interpolation
+ * instead of 4-bit / 12-bit. +192 bytes ROM for the table.
+ * ================================================================ */
+
+static const uint32_t pow2_tab_65[65] = {
+     65536,  66250,  66971,  67700,  68438,  69183,  69936,  70698,
+     71468,  72246,  73032,  73828,  74632,  75444,  76266,  77096,
+     77936,  78785,  79642,  80510,  81386,  82273,  83169,  84074,
+     84990,  85915,  86851,  87796,  88752,  89719,  90696,  91684,
+     92682,  93691,  94711,  95743,  96785,  97839,  98905,  99982,
+    101070, 102171, 103283, 104408, 105545, 106694, 107856, 109031,
+    110218, 111418, 112631, 113858, 115098, 116351, 117618, 118899,
+    120194, 121502, 122825, 124163, 125515, 126882, 128263, 129660,
+    131072,
+};
+
+/* pow2 with 65-entry table — drop-in replacement for FR_pow2 (input and result both at `radix`) */
+static int32_t pow2_65(int32_t input, uint16_t radix) {
+    int32_t flr, frac_full, idx, frac_lo, lo, hi, mant, result;
+    uint32_t mask = (radix > 0) ? (((uint32_t)1 << radix) - 1) : 0;  /* selects the fractional bits */
+
+    if (input >= 0) {
+        flr = (int32_t)((uint32_t)input >> radix);
+        frac_full = (int32_t)((uint32_t)input & mask);
+    } else {  /* negative: floor toward -inf so the fraction stays in [0, 1) */
+        uint32_t neg = 0u - (uint32_t)input;  /* |input|; unlike -input, defined for INT32_MIN */
+        int32_t nflr = (int32_t)((neg >> radix) < 30u ? (neg >> radix) : 30u);  /* clamp: >= 30 returns 0 below */
+        int32_t nfrc = (int32_t)(neg & mask);
+        if (nfrc == 0) { flr = -nflr; frac_full = 0; }
+        else { flr = -nflr - 1; frac_full = (int32_t)((1L << radix) - nfrc); }
+    }
+
+    if (radix > 16) frac_full >>= (radix - 16);       /* normalize the fraction to 16 bits */
+    else if (radix < 16) frac_full <<= (16 - radix);
+
+    /* 6-bit index (64 segments), 10-bit interpolation */
+    idx     = frac_full >> 10;
+    frac_lo = frac_full & ((1L << 10) - 1);
+    lo = (int32_t)pow2_tab_65[idx];
+    hi = (int32_t)pow2_tab_65[idx + 1];
+    mant = lo + (((hi - lo) * frac_lo) >> 10);  /* linear interpolation between adjacent entries */
+
+    if (flr >= 0) {
+        if (flr >= 30) return FR_OVERFLOW_POS;
+        result = mant << flr;  /* NOTE(review): exceeds int32 for flr >= 15; presumably mirrors FR_pow2 — confirm */
+        return FR_CHRDX(result, 16, radix);
+    } else {
+        int32_t sh = -flr;
+        if (sh >= 30) return 0;
+        result = mant >> sh;
+        return FR_CHRDX(result, 16, radix);
+    }
+}
+
+/* V2 exp: mulk28 scaling + 65-entry pow2 table */
+static inline int32_t v2_exp(int32_t input, uint16_t radix) {
+    int32_t scaled = mulk28(input, K28_LOG2E);
+    return pow2_65(scaled, radix);
+}
+
+/* ================================================================
+ * Helpers
+ * ================================================================ */
+
+static const int RADIX = 16;
+static const int32_t ONE = (1 << RADIX);
+static const int N = 50000;
+static const double Q16_LSB = 1.0 / (double)ONE;
+
+static inline double q2d(int32_t v) { return (double)v / (double)ONE; }
+static inline int32_t d2q(double d) {
+    return (int32_t)(d * ONE + (d >= 0 ? 0.5 : -0.5));
+}
+
+static inline int64_t now_ns() {
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+        std::chrono::high_resolution_clock::now().time_since_epoch()
+    ).count();
+}
+
+static volatile int32_t sink;
+
+/* ================================================================
+ * Error stats
+ * ================================================================ */
+
+struct Stats {
+    double max_lsb;
+    double mean_lsb;
+    double max_abs;
+    double mean_abs;
+};
+
+static Stats calc_err(const std::vector<double>& ref,
+                      const std::vector<int32_t>& got) {
+    Stats s = {};
+    double sum_abs = 0, sum_lsb = 0;
+    int n = (int)ref.size();
+    for (int i = 0; i < n; i++) {
+        double err = std::fabs(q2d(got[i]) - ref[i]);
+        double lsb = err / Q16_LSB;
+        sum_abs += err;
+        sum_lsb += lsb;
+        if (lsb > s.max_lsb) s.max_lsb = lsb;
+        if (err > s.max_abs) s.max_abs = err;
+    }
+    s.mean_abs = sum_abs / n;
+    s.mean_lsb = sum_lsb / n;
+    return s;
+}
+
+/* ================================================================
+ * Constant verification
+ * ================================================================ */
+
+static void verify_constants() {
+    printf("=== Constant verification (radix 28, 2^28 = 268435456) ===\n\n");
+
+    struct { const char *name; int32_t k28; double exact; } tab[] = {
+        {"log2(e)",  K28_LOG2E,   1.4426950408889634},
+        {"ln(2)",    K28_LN2,     0.6931471805599453},
+        {"log2(10)", K28_LOG2_10, 3.3219280948873626},
+        {"log10(2)", K28_LOG10_2, 0.30102999566398120},
+    };
+
+    printf("  %-10s  %12s  %20s  %20s  %12s\n",
+           "constant", "k28 value", "k28/2^28", "exact", "error");
+    printf("  %-10s  %12s  %20s  %20s  %12s\n",
+           "--------", "---------", "--------", "-----", "-----");
+
+    for (auto& c : tab) {
+        double approx = (double)c.k28 / (double)(1 << 28);
+        double err = std::fabs(approx - c.exact);
+        printf("  %-10s  %12d  %20.16f  %20.16f  %12.2e\n",
+               c.name, c.k28, approx, c.exact, err);
+    }
+    printf("\n");
+}
+
+/* ================================================================
+ * Overflow safety check
+ * ================================================================ */
+
+static void check_overflow() {
+    printf("=== 64-bit overflow safety check ===\n\n");
+
+    /* Worst-case inputs for each function */
+    struct { const char *fn; int32_t worst_input; int32_t k28; } cases[] = {
+        {"exp (x=10.0)",    d2q(10.0),    K28_LOG2E},
+        {"exp (x=-10.0)",   d2q(-10.0),   K28_LOG2E},
+        {"ln  (x=max_pos)", 0x7FFFFFFF,   K28_LN2},     /* max log2 output */
+        {"log10 (x=max)",   0x7FFFFFFF,   K28_LOG10_2},
+    };
+
+    printf("  %-20s  %12s  %12s  %14s  %6s\n",
+           "case", "input", "k28", "product bits", "safe?");
+    printf("  %-20s  %12s  %12s  %14s  %6s\n",
+           "----", "-----", "---", "------------", "-----");
+
+    for (auto& c : cases) {
+        int64_t product = (int64_t)c.worst_input * (int64_t)c.k28;
+        /* Count bits needed */
+        uint64_t mag = (product < 0) ? (uint64_t)(-product) : (uint64_t)product;
+        int bits = 0;
+        uint64_t tmp = mag;
+        while (tmp > 0) { tmp >>= 1; bits++; }
+        bool safe = (bits < 63); /* s64 has 63 magnitude bits */
+
+        printf("  %-20s  %12d  %12d  %14d  %6s\n",
+               c.fn, c.worst_input, c.k28, bits, safe ? "YES" : "NO!");
+    }
+    printf("\n");
+}
+
+/* ================================================================
+ * Benchmark: exp
+ * ================================================================ */
+
+static void bench_exp() {
+    printf("=== exp(x) — range [-5, +5] ===\n\n");
+
+    std::vector<int32_t> inputs(N);
+    std::vector<double> ref(N);
+    for (int i = 0; i < N; i++) {
+        double t = -5.0 + 10.0 * i / (N - 1);
+        inputs[i] = d2q(t);
+        ref[i] = std::exp(t);
+    }
+
+    /* --- Current: shift-only FR_SLOG2E --- */
+    std::vector<int32_t> cur_out(N);
+    int64_t t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        cur_out[i] = FR_EXP(inputs[i], RADIX);
+    int64_t t1 = now_ns();
+    double cur_ns = (double)(t1 - t0) / N;
+    Stats cur_err = calc_err(ref, cur_out);
+
+    /* --- Proposed: mulk28 --- */
+    std::vector<int32_t> new_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        new_out[i] = proposed_exp(inputs[i], RADIX);
+    t1 = now_ns();
+    double new_ns = (double)(t1 - t0) / N;
+    Stats new_err = calc_err(ref, new_out);
+
+    /* --- V2: mulk28 + 65-entry table --- */
+    std::vector<int32_t> v2_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        v2_out[i] = v2_exp(inputs[i], RADIX);
+    t1 = now_ns();
+    double v2_ns = (double)(t1 - t0) / N;
+    Stats v2_err = calc_err(ref, v2_out);
+
+    /* --- libfixmath --- */
+    std::vector<int32_t> lfm_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        lfm_out[i] = fix16_exp(inputs[i]);
+    t1 = now_ns();
+    double lfm_ns = (double)(t1 - t0) / N;
+    Stats lfm_err = calc_err(ref, lfm_out);
+
+    printf("  %-28s  %8s  %8s  %10s  %10s\n",
+           "variant", "max LSB", "mean LSB", "ns/call", "vs lfm");
+    printf("  %-28s  %8s  %8s  %10s  %10s\n",
+           "-------", "-------", "--------", "-------", "------");
+    printf("  %-28s  %8.1f  %8.1f  %10.1f  %10.1fx\n",
+           "FR current (shift+17tab)", cur_err.max_lsb, cur_err.mean_lsb, cur_ns,
+           lfm_ns / cur_ns);
+    printf("  %-28s  %8.1f  %8.1f  %10.1f  %10.1fx\n",
+           "FR mulk28 only (+0 bytes)", new_err.max_lsb, new_err.mean_lsb, new_ns,
+           lfm_ns / new_ns);
+    printf("  %-28s  %8.1f  %8.1f  %10.1f  %10.1fx\n",
+           "FR mulk28+65tab (+192B)", v2_err.max_lsb, v2_err.mean_lsb, v2_ns,
+           lfm_ns / v2_ns);
+    printf("  %-28s  %8.1f  %8.1f  %10.1f  %10s\n",
+           "libfixmath (+33KB RAM)", lfm_err.max_lsb, lfm_err.mean_lsb, lfm_ns,
+           "baseline");
+    printf("\n");
+}
+
+/* ================================================================
+ * Benchmark: ln
+ * ================================================================ */
+
+static void bench_ln() {
+    printf("=== ln(x) — range [0.01, 100] ===\n\n");
+
+    std::vector<int32_t> inputs(N);
+    std::vector<double> ref(N);
+    for (int i = 0; i < N; i++) {
+        double t = 0.01 + 99.99 * i / (N - 1);
+        inputs[i] = d2q(t);
+        ref[i] = std::log(t);
+    }
+
+    /* --- Current --- */
+    std::vector<int32_t> cur_out(N);
+    int64_t t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        cur_out[i] = FR_ln(inputs[i], RADIX, RADIX);
+    int64_t t1 = now_ns();
+    double cur_ns = (double)(t1 - t0) / N;
+    Stats cur_err = calc_err(ref, cur_out);
+
+    /* --- Proposed --- */
+    std::vector<int32_t> new_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        new_out[i] = proposed_ln(inputs[i], RADIX, RADIX);
+    t1 = now_ns();
+    double new_ns = (double)(t1 - t0) / N;
+    Stats new_err = calc_err(ref, new_out);
+
+    /* --- libfixmath --- */
+    std::vector<int32_t> lfm_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        lfm_out[i] = fix16_log(inputs[i]);
+    t1 = now_ns();
+    double lfm_ns = (double)(t1 - t0) / N;
+    Stats lfm_err = calc_err(ref, lfm_out);
+
+    printf("  %-22s  %8s  %8s  %10s  %10s\n",
+           "variant", "max LSB", "mean LSB", "ns/call", "speedup");
+    printf("  %-22s  %8s  %8s  %10s  %10s\n",
+           "-------", "-------", "--------", "-------", "-------");
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10s\n",
+           "FR_math current", cur_err.max_lsb, cur_err.mean_lsb, cur_ns, "baseline");
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10.2fx\n",
+           "FR_math proposed", new_err.max_lsb, new_err.mean_lsb, new_ns,
+           cur_ns / new_ns);
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10.2fx\n",
+           "libfixmath", lfm_err.max_lsb, lfm_err.mean_lsb, lfm_ns,
+           cur_ns / lfm_ns);
+
+    printf("\n  Accuracy improvement: %.1fx better max LSB error\n",
+           cur_err.max_lsb / new_err.max_lsb);
+    printf("  Speed cost of multiply: %.1f ns -> %.1f ns (%.1f%% overhead)\n",
+           cur_ns, new_ns, (new_ns - cur_ns) / cur_ns * 100.0);
+    printf("  Proposed vs libfixmath: %.1fx %s, %.1fx %s accuracy\n",
+           std::max(lfm_ns / new_ns, new_ns / lfm_ns),
+           new_ns < lfm_ns ? "faster" : "slower",
+           std::max(lfm_err.max_lsb / new_err.max_lsb,
+                    new_err.max_lsb / lfm_err.max_lsb),
+           new_err.max_lsb < lfm_err.max_lsb ? "better" : "worse");
+    printf("\n");
+}
+
+/* ================================================================
+ * Benchmark: log10
+ * ================================================================ */
+
+static void bench_log10() {
+    printf("=== log10(x) — range [0.01, 100] ===\n\n");
+
+    std::vector<int32_t> inputs(N);
+    std::vector<double> ref(N);
+    for (int i = 0; i < N; i++) {
+        double t = 0.01 + 99.99 * i / (N - 1);
+        inputs[i] = d2q(t);
+        ref[i] = std::log10(t);
+    }
+
+    /* --- Current --- */
+    std::vector<int32_t> cur_out(N);
+    int64_t t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        cur_out[i] = FR_log10(inputs[i], RADIX, RADIX);
+    int64_t t1 = now_ns();
+    double cur_ns = (double)(t1 - t0) / N;
+    Stats cur_err = calc_err(ref, cur_out);
+
+    /* --- Proposed --- */
+    std::vector<int32_t> new_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        new_out[i] = proposed_log10(inputs[i], RADIX, RADIX);
+    t1 = now_ns();
+    double new_ns = (double)(t1 - t0) / N;
+    Stats new_err = calc_err(ref, new_out);
+
+    /* --- libfixmath (no log10 — compute as log2(x) * log10(2)) --- */
+    /* libfixmath has fix16_log (natural log) and fix16_log2, but no fix16_log10.
+     * We compute it as fix16_mul(fix16_log2(x), fix16_from_dbl(log10(2))),
+     * which is the same identity proposed_log10() uses.
+     */
+    const fix16_t lfm_log10_2 = fix16_from_dbl(0.30102999566398120);
+    std::vector<int32_t> lfm_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        lfm_out[i] = fix16_mul(fix16_log2(inputs[i]), lfm_log10_2);
+    t1 = now_ns();
+    double lfm_ns = (double)(t1 - t0) / N;
+    Stats lfm_err = calc_err(ref, lfm_out);
+
+    printf("  %-22s  %8s  %8s  %10s  %10s\n",
+           "variant", "max LSB", "mean LSB", "ns/call", "speedup");
+    printf("  %-22s  %8s  %8s  %10s  %10s\n",
+           "-------", "-------", "--------", "-------", "-------");
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10s\n",
+           "FR_math current", cur_err.max_lsb, cur_err.mean_lsb, cur_ns, "baseline");
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10.2fx\n",
+           "FR_math proposed", new_err.max_lsb, new_err.mean_lsb, new_ns,
+           cur_ns / new_ns);
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10.2fx\n",
+           "libfixmath", lfm_err.max_lsb, lfm_err.mean_lsb, lfm_ns,
+           cur_ns / lfm_ns);
+
+    printf("\n  Accuracy improvement: %.1fx better max LSB error\n",
+           cur_err.max_lsb / new_err.max_lsb);
+    printf("  Speed cost of multiply: %.1f ns -> %.1f ns (%.1f%% overhead)\n",
+           cur_ns, new_ns, (new_ns - cur_ns) / cur_ns * 100.0);
+    printf("  Proposed vs libfixmath: %.1fx %s, %.1fx %s accuracy\n",
+           std::max(lfm_ns / new_ns, new_ns / lfm_ns),
+           new_ns < lfm_ns ? "faster" : "slower",
+           std::max(lfm_err.max_lsb / new_err.max_lsb,
+                    new_err.max_lsb / lfm_err.max_lsb),
+           new_err.max_lsb < lfm_err.max_lsb ? "better" : "worse");
+    printf("\n");
+}
+
+/* ================================================================
+ * Table error isolation: how much error comes from the lookup tables
+ * vs the scaling step?
+ * ================================================================ */
+
+static void isolate_table_error() {
+    printf("=== Error decomposition: table vs scaling ===\n\n");
+    printf("  This feeds PERFECT (double-derived) intermediates into pow2/log2\n");
+    printf("  to measure table error in isolation.\n\n");
+
+    /* --- pow2 table error (affects exp) --- */
+    {
+        printf("  FR_pow2 table error (inputs: exact x*log2(e) at Q16.16):\n");
+        double max_table_lsb = 0, sum_table_lsb = 0;
+        double max_total_lsb = 0, sum_total_lsb = 0;
+        int n = N, cnt = 0;  /* cnt = samples that survive the range filter below */
+        for (int i = 0; i < n; i++) {
+            double x = -5.0 + 10.0 * i / (n - 1);
+            double gold = std::exp(x);
+            if (gold > 32767.0 || gold < 1e-5) continue; /* skip overflow/underflow */
+            cnt++;  /* only accumulated samples may enter the divisor of the means */
+            /* Perfect intermediate: exact x * log2(e) quantized to Q16.16 */
+            double exact_scaled = x * 1.4426950408889634;
+            int32_t perfect_q = d2q(exact_scaled);
+            int32_t table_only = FR_pow2(perfect_q, RADIX);
+            double table_err_lsb = std::fabs(q2d(table_only) - gold) / Q16_LSB;
+
+            /* Full pipeline: shift-only scaling + pow2 */
+            int32_t xq = d2q(x);
+            int32_t full_current = FR_EXP(xq, RADIX);
+            double total_err_lsb = std::fabs(q2d(full_current) - gold) / Q16_LSB;
+
+            if (table_err_lsb > max_table_lsb) max_table_lsb = table_err_lsb;
+            sum_table_lsb += table_err_lsb;
+            if (total_err_lsb > max_total_lsb) max_total_lsb = total_err_lsb;
+            sum_total_lsb += total_err_lsb;
+        }
+        printf("    pow2 table alone:     max %8.1f LSB,  mean %6.1f LSB\n",
+               max_table_lsb, sum_table_lsb / cnt);
+        printf("    full exp pipeline:    max %8.1f LSB,  mean %6.1f LSB\n",
+               max_total_lsb, sum_total_lsb / cnt);
+        printf("    -> table is %.0f%% of total max error\n\n",
+               max_table_lsb / max_total_lsb * 100.0);
+    }
+
+    /* --- log2 table error (affects ln, log10) --- */
+    {
+        printf("  FR_log2 table error (direct measurement):\n");
+        double max_log2_lsb = 0, sum_log2_lsb = 0;
+        double max_ln_scale_lsb = 0, sum_ln_scale_lsb = 0;
+        double max_ln_total_lsb = 0, sum_ln_total_lsb = 0;
+        int n = N, cnt = 0;
+        for (int i = 0; i < n; i++) {
+            double x = 0.01 + 99.99 * i / (n - 1);
+            int32_t xq = d2q(x);
+            double gold_log2 = std::log2(x);
+            double gold_ln   = std::log(x);
+
+            /* log2 table error */
+            int32_t log2_out = FR_log2(xq, RADIX, RADIX);
+            double log2_lsb = std::fabs(q2d(log2_out) - gold_log2) / Q16_LSB;
+            if (log2_lsb > max_log2_lsb) max_log2_lsb = log2_lsb;
+            sum_log2_lsb += log2_lsb;
+
+            /* ln via perfect log2 + shift-only scale */
+            double exact_log2 = gold_log2;  /* what if log2 were perfect? */
+            int32_t perfect_log2_q = d2q(exact_log2);
+            int32_t scale_only = FR_SrLOG2E(perfect_log2_q);
+            double scale_lsb = std::fabs(q2d(scale_only) - gold_ln) / Q16_LSB;
+            if (scale_lsb > max_ln_scale_lsb) max_ln_scale_lsb = scale_lsb;
+            sum_ln_scale_lsb += scale_lsb;
+
+            /* Full ln pipeline */
+            int32_t ln_out = FR_ln(xq, RADIX, RADIX);
+            double ln_lsb = std::fabs(q2d(ln_out) - gold_ln) / Q16_LSB;
+            if (ln_lsb > max_ln_total_lsb) max_ln_total_lsb = ln_lsb;
+            sum_ln_total_lsb += ln_lsb;
+            cnt++;
+        }
+        printf("    log2 table alone:     max %8.1f LSB,  mean %6.1f LSB\n",
+               max_log2_lsb, sum_log2_lsb / cnt);
+        printf("    scale alone (perfect log2 + shift-only *ln2):\n");
+        printf("                          max %8.1f LSB,  mean %6.1f LSB\n",
+               max_ln_scale_lsb, sum_ln_scale_lsb / cnt);
+        printf("    full ln pipeline:     max %8.1f LSB,  mean %6.1f LSB\n",
+               max_ln_total_lsb, sum_ln_total_lsb / cnt);
+        printf("    -> log2 table is %.0f%% of total ln max error\n\n",
+               max_log2_lsb / max_ln_total_lsb * 100.0);
+    }
+
+    /* --- Actual 65-entry pow2 table error --- */
+    {
+        printf("  pow2_65 table error (64 segments, measured):\n");
+        double max65_lsb = 0, sum65_lsb = 0;
+        int cnt = 0;
+        for (int i = 0; i < N; i++) {
+            double x = -5.0 + 10.0 * i / (N - 1);
+            double gold = std::exp(x);
+            if (gold > 32767.0 || gold < 1e-5) continue;
+            double exact_scaled = x * 1.4426950408889634;
+            int32_t perfect_q = d2q(exact_scaled);
+            int32_t tab65_out = pow2_65(perfect_q, RADIX);
+            double lsb = std::fabs(q2d(tab65_out) - gold) / Q16_LSB;
+            if (lsb > max65_lsb) max65_lsb = lsb;
+            sum65_lsb += lsb;
+            cnt++;
+        }
+        printf("    pow2_65 table alone:   max %8.1f LSB,  mean %6.1f LSB\n",
+               max65_lsb, sum65_lsb / cnt);
+        printf("    Table size: 65 * 4 = 260 bytes (u32), delta = +192 bytes\n\n");
+    }
+}
+
+/* ================================================================
+ * Spot-check: show a few sample values
+ * ================================================================ */
+
+static void spot_check() {
+    printf("=== Spot check: selected values ===\n\n");
+
+    double test_vals[] = {-3.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0, 5.0};
+    int nv = sizeof(test_vals) / sizeof(test_vals[0]);
+
+    printf("  exp(x):\n");
+    printf("  %8s  %12s  %12s  %12s  %12s\n",
+           "x", "double", "current", "mulk28+65tab", "libfixmath");
+    printf("  %8s  %12s  %12s  %12s  %12s\n",
+           "---", "------", "-------", "------------", "----------");
+    for (int i = 0; i < nv; i++) {
+        double x = test_vals[i];
+        int32_t xq = d2q(x);
+        double gold = std::exp(x);
+        double cur  = q2d(FR_EXP(xq, RADIX));
+        double v2   = q2d(v2_exp(xq, RADIX));
+        double lfm  = q2d(fix16_exp(xq));
+        printf("  %8.2f  %12.6f  %12.6f  %12.6f  %12.6f\n",
+               x, gold, cur, v2, lfm);
+    }
+
+    printf("\n  ln(x):\n");
+    double ln_vals[] = {0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 50.0, 100.0};
+    int nlv = sizeof(ln_vals) / sizeof(ln_vals[0]);
+    printf("  %8s  %12s  %12s  %12s  %12s\n",
+           "x", "double", "current", "proposed", "libfixmath");
+    printf("  %8s  %12s  %12s  %12s  %12s\n",
+           "---", "------", "-------", "--------", "----------");
+    for (int i = 0; i < nlv; i++) {
+        double x = ln_vals[i];
+        int32_t xq = d2q(x);
+        double gold = std::log(x);
+        double cur  = q2d(FR_ln(xq, RADIX, RADIX));
+        double prop = q2d(proposed_ln(xq, RADIX, RADIX));
+        double lfm  = q2d(fix16_log(xq));
+        printf("  %8.2f  %12.6f  %12.6f  %12.6f  %12.6f\n",
+               x, gold, cur, prop, lfm);
+    }
+    printf("\n");
+}
+
+/* ================================================================
+ * Shift-only macro error analysis
+ * ================================================================ */
+
+static void analyze_shift_macros() {
+    printf("=== Shift-only macro error (root cause analysis) ===\n\n");
+
+    /* Test FR_SLOG2E: should multiply by log2(e) = 1.44269504... */
+    printf("  FR_SLOG2E(x) vs exact x * log2(e):\n");
+    printf("  %12s  %12s  %12s  %12s  %12s\n",
+           "x (Q16.16)", "shift-only", "exact", "proposed", "shift err");
+    printf("  %12s  %12s  %12s  %12s  %12s\n",
+           "----------", "----------", "-----", "--------", "---------");
+
+    int32_t test_inputs[] = {65536, 327680, -327680, 655360, 6554};
+    for (auto xq : test_inputs) {
+        double x = q2d(xq);
+        int32_t shift_val = FR_SLOG2E(xq);
+        double exact = x * 1.4426950408889634;
+        int32_t prop_val = mulk28(xq, K28_LOG2E);
+        double shift_err_lsb = std::fabs(q2d(shift_val) - exact) / Q16_LSB;
+        double prop_err_lsb = std::fabs(q2d(prop_val) - exact) / Q16_LSB;
+        printf("  %12d  %12d  %12.2f  %12d  %8.1f / %.1f LSB\n",
+               xq, shift_val, exact * ONE, prop_val, shift_err_lsb, prop_err_lsb);
+    }
+
+    printf("\n  FR_SrLOG2E(x) vs exact x * ln(2):\n");
+    printf("  %12s  %12s  %12s  %12s  %12s\n",
+           "x (Q16.16)", "shift-only", "exact", "proposed", "shift err");
+    printf("  %12s  %12s  %12s  %12s  %12s\n",
+           "----------", "----------", "-----", "--------", "---------");
+
+    for (auto xq : test_inputs) {
+        double x = q2d(xq);
+        int32_t shift_val = FR_SrLOG2E(xq);
+        double exact = x * 0.6931471805599453;
+        int32_t prop_val = mulk28(xq, K28_LN2);
+        double shift_err_lsb = std::fabs(q2d(shift_val) - exact) / Q16_LSB;
+        double prop_err_lsb = std::fabs(q2d(prop_val) - exact) / Q16_LSB;
+        printf("  %12d  %12d  %12.2f  %12d  %8.1f / %.1f LSB\n",
+               xq, shift_val, exact * ONE, prop_val, shift_err_lsb, prop_err_lsb);
+    }
+    printf("\n");
+}
+
+/* ================================================================
+ * main
+ * ================================================================ */
+
+int main() {
+    printf("========================================================\n");
+    printf("  FR_math exp/log accuracy improvement validation\n");
+    printf("  Proposed: replace shift-only macros with FR_MULK28\n");
+    printf("  Gold standard: <cmath> IEEE 754 double precision\n");
+    printf("  Test points: %d per function, Q16.16 (s15.16)\n", N);
+    printf("========================================================\n\n");
+
+    verify_constants();
+    check_overflow();
+    analyze_shift_macros();
+    isolate_table_error();
+    spot_check();
+
+    printf("========================================================\n");
+    printf("  SPEED + ACCURACY BENCHMARKS\n");
+    printf("========================================================\n\n");
+
+    /* Warmup */
+    for (volatile int i = 0; i < 10000; i++) {
+        sink = FR_EXP(d2q(1.0), RADIX);
+        sink = fix16_exp(d2q(1.0));
+        sink = proposed_exp(d2q(1.0), RADIX);
+        sink = v2_exp(d2q(1.0), RADIX);
+    }
+
+    bench_exp();
+    bench_ln();
+    bench_log10();
+
+    printf("========================================================\n");
+    printf("  SIZE COMPARISON\n");
+    printf("========================================================\n\n");
+    printf("  FR_math exp/log (current):  1,024 bytes (824 code + 200 tables)\n");
+    printf("  FR_math exp/log (with 65):  1,216 bytes (824 code + 392 tables)\n");
+    printf("  libfixmath exp/log:        33,996 bytes (1228 code + 32768 RAM cache)\n\n");
+    printf("  Delta for 65-entry pow2:   +192 bytes ROM.  Still ~28x smaller.\n");
+
+    return 0;
+}
diff --git a/compare_lfm/benchmark.cpp b/compare_lfm/benchmark.cpp
new file mode 100644
index 0000000..35a58fa
--- /dev/null
+++ b/compare_lfm/benchmark.cpp
@@ -0,0 +1,829 @@
+/*
+ * benchmark.cpp — FR_math vs libfixmath comparison
+ *
+ * WARNING: This file lives in compare_lfm/ and must NOT touch any
+ *          files outside this directory.
+ *
+ * Both libraries use Q16.16 (s15.16) fixed-point: 1.0 = 65536.
+ * We compare: speed (wall-clock ns per call) and accuracy (vs double).
+ *
+ * Output:
+ *   stdout → JSON  (machine-readable)
+ *   stderr → markdown summary table (human-readable)
+ *
+ * Compile via the accompanying Makefile.
+ */
+
+#include <cstdio>
+#include <cstdint>
+#include <cmath>
+#include <cstring>
+#include <chrono>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <cfloat>
+
+/* ---- FR_math ---- */
+extern "C" {
+#include "FR_defs.h"
+#include "FR_math.h"
+}
+
+/* ---- libfixmath ---- */
+#include "fixmath.h"
+
+/* ================================================================
+ * Helpers
+ * ================================================================ */
+
+static const int RADIX = 16;                    /* fractional bits: s15.16 */
+static const int32_t ONE = (1 << RADIX);        /* 1.0 in Q16.16 = 65536  */
+
+/* Separate counts for accuracy (many points) and timing (tight loop) */
+static const int N_ACCURACY = 65536;            /* points per accuracy sweep */
+static const int N_TIMING   = 100000;           /* timed calls per pass in time_fn */
+
+/* Near-zero threshold for relative error — skip |ref| < 0.01 to avoid
+ * misleading spikes (1/0.01 = 100x max amplification). Matches the
+ * threshold used in the TDD characterization suite. */
+static const double REL_THRESH = 0.01;
+
+/* Q16.16 -> double: divide the raw integer by ONE (the fixed-point 1.0). */
+static inline double q16_to_dbl(int32_t v) {
+    return static_cast<double>(v) / static_cast<double>(ONE);
+}
+
+/* double -> Q16.16: scale by ONE, then round half away from zero. */
+static inline int32_t dbl_to_q16(double d) {
+    return static_cast<int32_t>(d * ONE + ((d >= 0) ? 0.5 : -0.5));
+}
+
+/* Monotonic ns timestamp; steady_clock cannot jump if the wall clock is reset. */
+static inline int64_t now_ns() {
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+        std::chrono::steady_clock::now().time_since_epoch()
+    ).count();
+}
+
+/* ================================================================
+ * Test-point generators (in Q16.16)
+ * ================================================================ */
+
+/* n angles in radians (Q16.16), evenly spaced over [-pi, +pi];
+ * both end points are included. */
+static std::vector<int32_t> make_angle_inputs(int n) {
+    const double first = -M_PI;
+    const double span  = M_PI - first;
+    std::vector<int32_t> pts(n);
+    for (int k = 0; k < n; ++k)
+        pts[k] = dbl_to_q16(first + span * k / (n - 1));
+    return pts;
+}
+
+/* n values (Q16.16), evenly spaced over [-0.999, +0.999]; used as
+ * asin/acos arguments. Both end points are included. */
+static std::vector<int32_t> make_unit_inputs(int n) {
+    const double first = -0.999;
+    const double span  = 0.999 - first;
+    std::vector<int32_t> pts(n);
+    for (int k = 0; k < n; ++k)
+        pts[k] = dbl_to_q16(first + span * k / (n - 1));
+    return pts;
+}
+
+/* n strictly positive values (Q16.16), evenly spaced over [0.01, 100];
+ * shared by the sqrt, ln and log2 sweeps. */
+static std::vector<int32_t> make_pos_inputs(int n) {
+    const double first = 0.01;
+    const double span  = 100.0 - first;
+    std::vector<int32_t> pts(n);
+    for (int k = 0; k < n; ++k)
+        pts[k] = dbl_to_q16(first + span * k / (n - 1));
+    return pts;
+}
+
+/* n values (Q16.16), evenly spaced over [-50, +50]: a moderate range
+ * chosen so products and quotients stay clear of overflow. */
+static std::vector<int32_t> make_arith_inputs(int n) {
+    const double first = -50.0;
+    const double span  = 50.0 - first;
+    std::vector<int32_t> pts(n);
+    for (int k = 0; k < n; ++k)
+        pts[k] = dbl_to_q16(first + span * k / (n - 1));
+    return pts;
+}
+
+/* n exponents (Q16.16), evenly spaced over [-5, +5]; the range is kept
+ * small because e^x grows fast and would overflow otherwise. */
+static std::vector<int32_t> make_exp_inputs(int n) {
+    const double first = -5.0;
+    const double span  = 5.0 - first;
+    std::vector<int32_t> pts(n);
+    for (int k = 0; k < n; ++k)
+        pts[k] = dbl_to_q16(first + span * k / (n - 1));
+    return pts;
+}
+
+/* Builds n (x, y) points for atan2/hypot: circles of radius 0.1 .. 1000,
+ * floor(n/5) angles per circle, stepping from -pi up to (not including) +pi. */
+static void make_atan2_inputs(int n, std::vector<int32_t>& x_out,
+                              std::vector<int32_t>& y_out) {
+    static const double kRadii[] = {0.1, 1.0, 10.0, 100.0, 1000.0};
+    const int kNumRadii = 5;
+    const int per_circle = n / kNumRadii;
+    x_out.resize(n);
+    y_out.resize(n);
+    int slot = 0;
+    for (const double radius : kRadii) {
+        for (int step = 0; step < per_circle && slot < n; ++step, ++slot) {
+            const double theta = -M_PI + 2.0 * M_PI * step / per_circle;
+            x_out[slot] = dbl_to_q16(radius * std::cos(theta));
+            y_out[slot] = dbl_to_q16(radius * std::sin(theta));
+        }
+    }
+    /* n is not always a multiple of 5: pad the tail with the point (1, 1). */
+    const int32_t one_fx = dbl_to_q16(1.0);
+    while (slot < n) {
+        x_out[slot] = one_fx;
+        y_out[slot] = one_fx;
+        ++slot;
+    }
+}
+
+/* ================================================================
+ * Error measurement
+ * ================================================================ */
+
+static const double Q16_LSB = 1.0 / (double)ONE;  /* one Q16.16 step in real units: 1.52587890625e-5 */
+
+struct ErrorStats {        /* accuracy summary filled in by compute_errors() */
+    double max_abs_err;    /* max |fixed - double_ref| in real units */
+    double mean_abs_err;   /* mean of |fixed - double_ref| over all `count` points */
+    double max_lsb_err;    /* max error expressed in Q16.16 LSBs */
+    double mean_lsb_err;   /* mean error in Q16.16 LSBs over all `count` points */
+    double max_rel_err;    /* max |fixed - double_ref| / |double_ref| (%) */
+    double mean_rel_err;   /* mean % error; only points with |ref| > REL_THRESH count */
+    int    count;          /* number of points compared */
+    int    rel_count;      /* how many points contributed to rel_err */
+};
+
+static ErrorStats compute_errors(const std::vector<double>& ref,
+                                 const std::vector<int32_t>& fixed) {
+    /* Scores Q16.16 results `fixed` against the double reference `ref`;
+     * only indices present in both vectors are compared. */
+    ErrorStats s = {};
+    double sum_abs = 0, sum_lsb = 0, sum_rel = 0;
+    int n = (int)std::min(ref.size(), fixed.size());
+    for (int i = 0; i < n; i++) {
+        double err = std::fabs(q16_to_dbl(fixed[i]) - ref[i]);
+        double lsb = err / Q16_LSB;
+        sum_abs += err;
+        sum_lsb += lsb;
+        if (err > s.max_abs_err) s.max_abs_err = err;
+        if (lsb > s.max_lsb_err) s.max_lsb_err = lsb;
+        if (std::fabs(ref[i]) > REL_THRESH) {   /* skip near-zero refs */
+            double rel = err / std::fabs(ref[i]) * 100.0;
+            sum_rel += rel;
+            if (rel > s.max_rel_err) s.max_rel_err = rel;
+            s.rel_count++;
+        }
+    }
+    s.count = n;
+    /* An empty sweep reports 0 for the means rather than 0/0 = NaN. */
+    s.mean_abs_err = n > 0 ? sum_abs / n : 0;
+    s.mean_lsb_err = n > 0 ? sum_lsb / n : 0;
+    s.mean_rel_err = s.rel_count > 0 ? sum_rel / s.rel_count : 0;
+    return s;
+}
+
+/* ================================================================
+ * Per-function benchmark runner
+ * ================================================================ */
+
+struct BenchResult {               /* one report row: timing plus accuracy */
+    std::string name;              /* short function label, e.g. "sin" */
+    std::string gold_ref;          /* math.h function used as gold standard */
+    double fr_ns_per_call;         /* FR_math time per call, in nanoseconds */
+    double lfm_ns_per_call;        /* -1 if libfixmath has no equivalent */
+    ErrorStats fr_err;             /* FR_math error vs the double reference */
+    ErrorStats lfm_err;            /* libfixmath error; left all-zero when not measured */
+    std::string note = {};         /* optional per-function note */
+    std::string sweep = {};        /* sweep description for table */
+};
+
+/* Prevent compiler from optimizing away the result: every timed call stores here */
+static volatile int32_t sink;
+
+/* ================================================================
+ * Timing helper — runs a function N_TIMING times, returns ns/call.
+ * Uses 3 passes and takes the minimum to reduce variance.
+ * ================================================================ */
+template<typename Fn>
+static double time_fn(Fn fn) {
+    /* warm-up: 1000 untimed calls */
+    for (int i = 0; i < 1000; i++) sink = fn();
+    /* best starts far above any real measurement, so the first pass replaces it */
+    double best = 1e18;
+    for (int pass = 0; pass < 3; pass++) {
+        int64_t t0 = now_ns();
+        for (int i = 0; i < N_TIMING; i++)
+            sink = fn();
+        int64_t t1 = now_ns();
+        double ns = (double)(t1 - t0) / N_TIMING;   /* average ns per call for this pass */
+        if (ns < best) best = ns;
+    }
+    return best;
+}
+
+/* ================================================================
+ * Individual benchmarks
+ * ================================================================ */
+
+static BenchResult bench_sin() {   /* fr_sin vs fix16_sin, scored against std::sin */
+    auto inputs = make_angle_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+    /* double-precision reference, evaluated at the quantised Q16.16 inputs */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::sin(q16_to_dbl(inputs[i]));
+    /* accuracy pass: one result per input from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = fr_sin(inputs[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_sin(inputs[i]);
+    /* timing pass: ti walks the inputs cyclically and is reset so both runs start at 0 */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return fr_sin(inputs[ti++ % n], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_sin(inputs[ti++ % n]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"sin", "std::sin", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-pi, +pi]"};
+}
+
+static BenchResult bench_cos() {   /* fr_cos vs fix16_cos, scored against std::cos */
+    auto inputs = make_angle_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+    /* double-precision reference, evaluated at the quantised Q16.16 inputs */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::cos(q16_to_dbl(inputs[i]));
+    /* accuracy pass: one result per input from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = fr_cos(inputs[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_cos(inputs[i]);
+    /* timing pass: ti walks the inputs cyclically and is reset so both runs start at 0 */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return fr_cos(inputs[ti++ % n], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_cos(inputs[ti++ % n]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"cos", "std::cos", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-pi, +pi]"};
+}
+
+static BenchResult bench_tan() {   /* fr_tan vs fix16_tan, scored against std::tan */
+    /* Avoid near ±pi/2 where tan explodes: the sweep is limited to [-1.2, 1.2] rad */
+    int n = N_ACCURACY;
+    std::vector<int32_t> inputs(n);
+    double lo = -1.2, hi = 1.2;
+    for (int i = 0; i < n; i++)
+        inputs[i] = dbl_to_q16(lo + (hi - lo) * i / (n - 1));
+    /* double-precision reference, evaluated at the quantised Q16.16 inputs */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::tan(q16_to_dbl(inputs[i]));
+    /* accuracy pass: one result per input from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = fr_tan(inputs[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_tan(inputs[i]);
+    /* timing pass: ti walks the inputs cyclically and is reset so both runs start at 0 */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return fr_tan(inputs[ti++ % n], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_tan(inputs[ti++ % n]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"tan", "std::tan", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            "Skip near pi/2", "65536-pt, [-1.2, 1.2] rad"};
+}
+
+static BenchResult bench_asin() {   /* FR_asin vs fix16_asin, scored against std::asin */
+    auto inputs = make_unit_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+    /* double-precision reference, evaluated at the quantised Q16.16 inputs */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::asin(q16_to_dbl(inputs[i]));
+    /* accuracy pass: one result per input from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_asin(inputs[i], RADIX, RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_asin(inputs[i]);
+    /* timing pass: ti walks the inputs cyclically and is reset so both runs start at 0 */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_asin(inputs[ti++ % n], RADIX, RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_asin(inputs[ti++ % n]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"asin", "std::asin", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-0.999, 0.999]"};
+}
+
+static BenchResult bench_acos() {   /* FR_acos vs fix16_acos, scored against std::acos */
+    auto inputs = make_unit_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+    /* double-precision reference, evaluated at the quantised Q16.16 inputs */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::acos(q16_to_dbl(inputs[i]));
+    /* accuracy pass: one result per input from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_acos(inputs[i], RADIX, RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_acos(inputs[i]);
+    /* timing pass: ti walks the inputs cyclically and is reset so both runs start at 0 */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_acos(inputs[ti++ % n], RADIX, RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_acos(inputs[ti++ % n]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"acos", "std::acos", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-0.999, 0.999]"};
+}
+
+static BenchResult bench_atan() {   /* FR_atan vs fix16_atan, scored against std::atan */
+    auto inputs = make_arith_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+    /* double-precision reference, evaluated at the quantised Q16.16 inputs */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::atan(q16_to_dbl(inputs[i]));
+    /* accuracy pass: one result per input from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_atan(inputs[i], RADIX, RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_atan(inputs[i]);
+    /* timing pass: ti walks the inputs cyclically and is reset so both runs start at 0 */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_atan(inputs[ti++ % n], RADIX, RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_atan(inputs[ti++ % n]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"atan", "std::atan", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-50, 50]"};
+}
+
+static BenchResult bench_atan2() {   /* FR_atan2 vs fix16_atan2, scored against std::atan2 */
+    int n = N_ACCURACY;
+    std::vector<int32_t> x_in, y_in;
+    make_atan2_inputs(n, x_in, y_in);
+    /* double-precision reference; note the (y, x) argument order throughout */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::atan2(q16_to_dbl(y_in[i]), q16_to_dbl(x_in[i]));
+    /* accuracy pass: one result per input pair from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_atan2(y_in[i], x_in[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_atan2(y_in[i], x_in[i]);
+    /* timing pass: j is taken once per call so x and y come from the same point */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ int j = ti++ % n; return FR_atan2(y_in[j], x_in[j], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ int j = ti++ % n; return fix16_atan2(y_in[j], x_in[j]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"atan2", "std::atan2", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            "All 4 quadrants", "65536-pt, 5 radii x 360 deg"};
+}
+
+static BenchResult bench_sqrt() {   /* FR_sqrt vs fix16_sqrt, scored against std::sqrt */
+    auto inputs = make_pos_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+    /* double-precision reference, evaluated at the quantised Q16.16 inputs */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::sqrt(q16_to_dbl(inputs[i]));
+    /* accuracy pass: one result per input from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_sqrt(inputs[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_sqrt(inputs[i]);
+    /* timing pass: ti walks the inputs cyclically and is reset so both runs start at 0 */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_sqrt(inputs[ti++ % n], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_sqrt(inputs[ti++ % n]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"sqrt", "std::sqrt", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [0.01, 100]"};
+}
+
+static BenchResult bench_exp() {   /* FR_EXP vs fix16_exp, scored against std::exp */
+    auto inputs = make_exp_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+    /* double-precision reference, evaluated at the quantised Q16.16 inputs */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::exp(q16_to_dbl(inputs[i]));
+    /* accuracy pass: one result per input from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_EXP(inputs[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_exp(inputs[i]);
+    /* timing pass: ti walks the inputs cyclically and is reset so both runs start at 0 */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_EXP(inputs[ti++ % n], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_exp(inputs[ti++ % n]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"exp", "std::exp", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-5, 5]"};
+}
+
+static BenchResult bench_log() {   /* FR_ln vs fix16_log, scored against std::log; reported as "ln" */
+    auto inputs = make_pos_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+    /* double-precision reference, evaluated at the quantised Q16.16 inputs */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::log(q16_to_dbl(inputs[i]));
+    /* accuracy pass: one result per input from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_ln(inputs[i], RADIX, RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_log(inputs[i]);
+    /* timing pass: ti walks the inputs cyclically and is reset so both runs start at 0 */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_ln(inputs[ti++ % n], RADIX, RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_log(inputs[ti++ % n]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"ln", "std::log", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [0.01, 100]"};
+}
+
+static BenchResult bench_log2() {   /* FR_log2 vs fix16_log2, scored against std::log2 */
+    auto inputs = make_pos_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+    /* double-precision reference, evaluated at the quantised Q16.16 inputs */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::log2(q16_to_dbl(inputs[i]));
+    /* accuracy pass: one result per input from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_log2(inputs[i], RADIX, RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_log2(inputs[i]);
+    /* timing pass: ti walks the inputs cyclically and is reset so both runs start at 0 */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_log2(inputs[ti++ % n], RADIX, RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_log2(inputs[ti++ % n]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"log2", "std::log2", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [0.01, 100]"};
+}
+
+static BenchResult bench_mul() {   /* FR_FixMuls vs fix16_mul, scored against a double product */
+    int n = N_ACCURACY;
+    auto a_in = make_arith_inputs(n);
+    std::vector<int32_t> b_in(n);
+    for (int i = 0; i < n; i++)
+        b_in[i] = dbl_to_q16(-2.0 + 4.0 * i / (n - 1));   /* b sweeps [-2, 2] */
+    /* reference: product of the quantised operands, computed in double */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = q16_to_dbl(a_in[i]) * q16_to_dbl(b_in[i]);
+    /* accuracy pass: one result per operand pair from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_FixMuls(a_in[i], b_in[i]);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_mul(a_in[i], b_in[i]);
+    /* timing pass: j is taken once per call so a and b come from the same index */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ int j = ti++ % n; return FR_FixMuls(a_in[j], b_in[j]); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ int j = ti++ % n; return fix16_mul(a_in[j], b_in[j]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"mul", "double a*b", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, a in [-50,50], b in [-2,2]"};
+}
+
+static BenchResult bench_div() {   /* FR_DIV vs fix16_div, scored against a double quotient */
+    int n = N_ACCURACY;
+    std::vector<int32_t> a_in(n), b_in(n);
+    for (int i = 0; i < n; i++) {
+        a_in[i] = dbl_to_q16(-50.0 + 100.0 * i / (n - 1));   /* numerator sweeps [-50, 50] */
+        b_in[i] = dbl_to_q16(0.5 + 49.5 * i / (n - 1));      /* divisor sweeps [0.5, 50], never 0 */
+    }
+    /* reference: quotient of the quantised operands, computed in double */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = q16_to_dbl(a_in[i]) / q16_to_dbl(b_in[i]);
+    /* accuracy pass: one result per operand pair from each library */
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_DIV(a_in[i], RADIX, b_in[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_div(a_in[i], b_in[i]);
+    /* timing pass: j is taken once per call so a and b come from the same index */
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ int j = ti++ % n; return FR_DIV(a_in[j], RADIX, b_in[j], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ int j = ti++ % n; return fix16_div(a_in[j], b_in[j]); });
+    /* fields: name, gold_ref, timings, FR error, lfm error, note, sweep */
+    return {"div", "double a/b", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            "Both use 64-bit intermediate", "65536-pt, a/b in [-50,50]/[0.5,50]"};
+}
+
+/* --- FR_math-only benchmarks (libfixmath has no equivalent) --- */
+
+static BenchResult bench_hypot() {   /* FR_hypot only, scored against std::hypot */
+    int n = N_ACCURACY;
+    std::vector<int32_t> x_in, y_in;
+    make_atan2_inputs(n, x_in, y_in);
+    /* double-precision reference, evaluated at the quantised Q16.16 points */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::hypot(q16_to_dbl(x_in[i]), q16_to_dbl(y_in[i]));
+    /* accuracy pass: FR_math only, there is no libfixmath column here */
+    std::vector<int32_t> fr_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_hypot(x_in[i], y_in[i], RADIX);
+    /* timing pass: j is taken once per call so x and y come from the same point */
+    int ti = 0;
+    double fr_ns = time_fn([&]{ int j = ti++ % n; return FR_hypot(x_in[j], y_in[j], RADIX); });
+
+    /* dummy lfm_err (all zeros) */
+    ErrorStats lfm_err = {};
+    /* lfm_ns_per_call = -1 marks this row as FR_math-only for the emitters */
+    return {"hypot", "std::hypot", fr_ns, -1,
+            compute_errors(ref, fr_out), lfm_err,
+            "FR_math only (libfixmath has no hypot)", "65536-pt, 5 radii x 360 deg"};
+}
+
+static BenchResult bench_hypot_fast8() {   /* FR_hypot_fast8 only, scored against std::hypot */
+    int n = N_ACCURACY;
+    std::vector<int32_t> x_in, y_in;
+    make_atan2_inputs(n, x_in, y_in);
+    /* double-precision reference, evaluated at the quantised Q16.16 points */
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::hypot(q16_to_dbl(x_in[i]), q16_to_dbl(y_in[i]));
+    /* accuracy pass: FR_math only; FR_hypot_fast8 takes no radix argument */
+    std::vector<int32_t> fr_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_hypot_fast8(x_in[i], y_in[i]);
+    /* timing pass: j is taken once per call so x and y come from the same point */
+    int ti = 0;
+    double fr_ns = time_fn([&]{ int j = ti++ % n; return FR_hypot_fast8(x_in[j], y_in[j]); });
+    /* all-zero placeholder; lfm_ns_per_call = -1 marks the row as FR_math-only */
+    ErrorStats lfm_err = {};
+
+    return {"hypot_fast8", "std::hypot", fr_ns, -1,
+            compute_errors(ref, fr_out), lfm_err,
+            "FR_math only; shift-only, no multiply", "65536-pt, 5 radii x 360 deg"};
+}
+
+/* ================================================================
+ * JSON output: writes the full report (metadata, per-function results,
+ * head-to-head summary, notes, compiled-size figures) to stream f.
+ * ================================================================ */
+static void emit_json(FILE *f, const std::vector<BenchResult>& results) {
+    fprintf(f, "{\n");
+    fprintf(f, "  \"description\": \"FR_math vs libfixmath benchmark — both measured against math.h double precision (IEEE 754)\",\n");
+    fprintf(f, "  \"gold_standard\": \"<cmath> IEEE 754 double precision (~15 significant digits)\",\n");
+    fprintf(f, "  \"fixed_point_format\": \"Q16.16 (s15.16), 1 LSB = %.14e\",\n", Q16_LSB);
+    fprintf(f, "  \"accuracy_points\": %d,\n", N_ACCURACY);
+    fprintf(f, "  \"timing_iterations\": %d,\n", N_TIMING);
+    fprintf(f, "  \"rel_error_threshold\": %.2f,\n", REL_THRESH);
+    fprintf(f, "  \"platform\": \"macOS ARM (Apple Silicon)\",\n");   /* NOTE(review): hard-coded, not detected */
+    fprintf(f, "  \"optimization\": \"-O2\",\n");                      /* NOTE(review): hard-coded, not detected */
+    fprintf(f, "  \"results\": [\n");
+    /* one JSON object per benchmark; libfixmath fields appear only when lfm_ns_per_call >= 0 */
+    for (size_t r = 0; r < results.size(); r++) {
+        const auto& b = results[r];
+        fprintf(f, "    {\n");
+        fprintf(f, "      \"function\": \"%s\",\n", b.name.c_str());
+        fprintf(f, "      \"double_reference\": \"%s\",\n", b.gold_ref.c_str());
+        if (!b.sweep.empty())
+            fprintf(f, "      \"sweep\": \"%s\",\n", b.sweep.c_str());
+
+        /* speed */
+        fprintf(f, "      \"speed\": {\n");
+        fprintf(f, "        \"fr_math_ns_per_call\": %.1f", b.fr_ns_per_call);
+        if (b.lfm_ns_per_call >= 0) {
+            fprintf(f, ",\n        \"libfixmath_ns_per_call\": %.1f", b.lfm_ns_per_call);
+            double speedup = b.lfm_ns_per_call / b.fr_ns_per_call;
+            fprintf(f, ",\n        \"fr_math_speedup\": %.2f", speedup);
+            const char *faster = (speedup > 1.0) ? "fr_math" : "libfixmath";
+            fprintf(f, ",\n        \"faster\": \"%s\"", faster);
+        }
+        fprintf(f, "\n      },\n");
+
+        /* accuracy — fr_math */
+        fprintf(f, "      \"accuracy_vs_double\": {\n");
+        fprintf(f, "        \"fr_math\": {\n");
+        fprintf(f, "          \"max_abs_error\": %.8e,\n", b.fr_err.max_abs_err);
+        fprintf(f, "          \"mean_abs_error\": %.8e,\n", b.fr_err.mean_abs_err);
+        fprintf(f, "          \"max_error_lsb\": %.1f,\n", b.fr_err.max_lsb_err);
+        fprintf(f, "          \"mean_error_lsb\": %.1f,\n", b.fr_err.mean_lsb_err);
+        fprintf(f, "          \"max_rel_error_pct\": %.4f,\n", b.fr_err.max_rel_err);
+        fprintf(f, "          \"mean_rel_error_pct\": %.4f\n", b.fr_err.mean_rel_err);
+        fprintf(f, "        }");
+
+        if (b.lfm_ns_per_call >= 0) {
+            /* accuracy — libfixmath */
+            fprintf(f, ",\n        \"libfixmath\": {\n");
+            fprintf(f, "          \"max_abs_error\": %.8e,\n", b.lfm_err.max_abs_err);
+            fprintf(f, "          \"mean_abs_error\": %.8e,\n", b.lfm_err.mean_abs_err);
+            fprintf(f, "          \"max_error_lsb\": %.1f,\n", b.lfm_err.max_lsb_err);
+            fprintf(f, "          \"mean_error_lsb\": %.1f,\n", b.lfm_err.mean_lsb_err);
+            fprintf(f, "          \"max_rel_error_pct\": %.4f,\n", b.lfm_err.max_rel_err);
+            fprintf(f, "          \"mean_rel_error_pct\": %.4f\n", b.lfm_err.mean_rel_err);
+            fprintf(f, "        }");
+            /* the accuracy winner is decided on max absolute error alone */
+            const char *closer =
+                (b.fr_err.max_abs_err < b.lfm_err.max_abs_err) ? "fr_math" :
+                (b.fr_err.max_abs_err > b.lfm_err.max_abs_err) ? "libfixmath" : "tie";
+            fprintf(f, ",\n        \"closer_to_double\": \"%s\"", closer);
+        }
+        fprintf(f, "\n      }");
+
+        if (!b.note.empty())
+            fprintf(f, ",\n      \"note\": \"%s\"", b.note.c_str());
+        fprintf(f, "\n    }%s\n", (r + 1 < results.size()) ? "," : "");   /* no comma after the last entry */
+    }
+
+    /* summary (head-to-head only, skip FR_math-only functions) */
+    fprintf(f, "  ],\n");
+    fprintf(f, "  \"summary\": {\n");
+
+    int fr_speed_wins = 0, lfm_speed_wins = 0;
+    int fr_closer = 0, lfm_closer = 0, ties = 0;
+    int h2h = 0;
+    for (auto& b : results) {
+        if (b.lfm_ns_per_call < 0) continue;
+        h2h++;
+        if (b.fr_ns_per_call < b.lfm_ns_per_call) fr_speed_wins++;
+        else lfm_speed_wins++;   /* an exact timing tie is counted for libfixmath */
+        if (b.fr_err.max_abs_err < b.lfm_err.max_abs_err) fr_closer++;
+        else if (b.fr_err.max_abs_err > b.lfm_err.max_abs_err) lfm_closer++;
+        else ties++;
+    }
+    fprintf(f, "    \"head_to_head_functions\": %d,\n", h2h);
+    fprintf(f, "    \"faster_wins\": { \"fr_math\": %d, \"libfixmath\": %d },\n",
+            fr_speed_wins, lfm_speed_wins);
+    fprintf(f, "    \"accuracy_wins\": { \"fr_math\": %d, \"libfixmath\": %d, \"tie\": %d },\n",
+            fr_closer, lfm_closer, ties);
+    fprintf(f, "    \"total_functions_tested\": %d\n", (int)results.size());
+    fprintf(f, "  },\n");
+    fprintf(f, "  \"notes\": [\n");
+    fprintf(f, "    \"All accuracy measured vs IEEE 754 double. Lower = closer to perfect.\",\n");
+    fprintf(f, "    \"LSB = Q16.16 least-significant-bit = 1.53e-5. Best possible = 0.5 LSB.\",\n");
+    fprintf(f, "    \"Percent errors skip |ref| < %.2f to avoid near-zero division spikes.\",\n", REL_THRESH);
+    fprintf(f, "    \"Both libraries use Q16.16 (s15.16): 1.0 = 65536.\",\n");
+    fprintf(f, "    \"FR_math trig: BAM + 129-entry LUT + linear interpolation.\",\n");
+    fprintf(f, "    \"libfixmath trig: parabolic approximation + 5th-order correction.\",\n");
+    fprintf(f, "    \"Timing: min of 3 passes x %d calls; cache-warm.\",\n", N_TIMING);
+    fprintf(f, "    \"Speedup > 1.0 means FR_math is faster by that factor.\"\n");
+    fprintf(f, "  ],\n");
+
+    /* size comparison: the figures below are literals, not measured at run time.
+     * NOTE(review): the message says .compare/ but this file's path is compare_lfm/ — confirm. */
+    fprintf(f, "  \"compiled_size_note\": \"Run 'make size' in .compare/ for live numbers. The values below are representative.\",\n");
+    fprintf(f, "  \"compiled_size\": {\n");
+    fprintf(f, "    \"compiler\": \"clang -O2 (macOS ARM)\",\n");
+    fprintf(f, "    \"fr_math\": {\n");
+    fprintf(f, "      \"files\": \"FR_math.c (single file)\",\n");
+    fprintf(f, "      \"functions\": \"trig(6), inv-trig(4), log/ln/log10, exp/pow2/pow10, exp_fast/pow10_fast, sqrt, hypot(3), waves(6), ADSR(4), print(4), format\",\n");
+    fprintf(f, "      \"rom_bytes\": 7722,\n");
+    fprintf(f, "      \"ram_bss_bytes\": 0,\n");
+    fprintf(f, "      \"note\": \"All tables in const ROM. Zero runtime allocation.\"\n");
+    fprintf(f, "    },\n");
+    fprintf(f, "    \"libfixmath\": {\n");
+    fprintf(f, "      \"files\": \"fix16.c, fix16_sqrt.c, fix16_exp.c, fix16_trig.c, fix16_str.c, uint32.c, fract32.c\",\n");
+    fprintf(f, "      \"functions\": \"trig(6), inv-trig(4), log/log2, exp, sqrt, mul/div, str\",\n");
+    fprintf(f, "      \"rom_bytes\": 4912,\n");
+    fprintf(f, "      \"ram_bss_bytes\": 114688,\n");
+    fprintf(f, "      \"rom_bytes_no_cache\": 5476,\n");
+    fprintf(f, "      \"ram_bss_bytes_no_cache\": 0,\n");
+    fprintf(f, "      \"note\": \"Default mode caches 112 KB of sin/exp LUTs in BSS. FIXMATH_NO_CACHE eliminates RAM but recomputes per call.\"\n");
+    fprintf(f, "    }\n");
+    fprintf(f, "  }\n");
+    fprintf(f, "}\n");
+}
+
+/* ================================================================
+ * Markdown summary: accuracy table, speed table, win counts and a
+ * fixed size table, written to stream f (main passes stderr).
+ * ================================================================ */
+static void emit_markdown(FILE *f, const std::vector<BenchResult>& results) {
+    fprintf(f, "\n## FR_math vs libfixmath — Q16.16 comparison\n\n");
+    fprintf(f, "All errors measured vs IEEE 754 double. Pct errors skip |ref| < %.2f.\n\n", REL_THRESH);
+
+    /* --- Accuracy table ("%%" in a format string prints one literal '%') --- */
+    fprintf(f, "### Accuracy\n\n");
+    fprintf(f, "| Function | FR max LSB | FR max %% | FR avg %% | lfm max LSB | lfm max %% | lfm avg %% | Winner |\n");
+    fprintf(f, "|----------|----------:|---------:|---------:|----------:|---------:|---------:|--------|\n");
+
+    for (auto& b : results) {
+        const char *winner;
+        if (b.lfm_ns_per_call < 0) {   /* FR_math-only row: no libfixmath columns */
+            winner = "FR only";
+            fprintf(f, "| %-15s | %7.1f | %7.4f | %7.4f | %9s | %8s | %8s | %-8s |\n",
+                    b.name.c_str(),
+                    b.fr_err.max_lsb_err, b.fr_err.max_rel_err, b.fr_err.mean_rel_err,
+                    "---", "---", "---", winner);
+        } else {
+            bool fr_better_abs = b.fr_err.max_abs_err < b.lfm_err.max_abs_err;
+            bool lfm_better_abs = b.lfm_err.max_abs_err < b.fr_err.max_abs_err;
+            winner = fr_better_abs ? "FR" : lfm_better_abs ? "lfm" : "tie";
+            fprintf(f, "| %-15s | %7.1f | %7.4f | %7.4f | %9.1f | %7.4f | %7.4f | %-8s |\n",
+                    b.name.c_str(),
+                    b.fr_err.max_lsb_err, b.fr_err.max_rel_err, b.fr_err.mean_rel_err,
+                    b.lfm_err.max_lsb_err, b.lfm_err.max_rel_err, b.lfm_err.mean_rel_err,
+                    winner);
+        }
+    }
+
+    /* --- Speed table --- */
+    fprintf(f, "\n### Speed (ns/call, lower is better)\n\n");
+    fprintf(f, "| Function | FR_math | libfixmath | Speedup | Faster |\n");
+    fprintf(f, "|----------|--------:|-----------:|--------:|--------|\n");
+
+    for (auto& b : results) {
+        if (b.lfm_ns_per_call < 0) {
+            fprintf(f, "| %-15s | %6.1f | %10s | %7s | FR only |\n",
+                    b.name.c_str(), b.fr_ns_per_call, "---", "---");
+        } else {
+            double speedup = b.lfm_ns_per_call / b.fr_ns_per_call;
+            const char *faster = (speedup > 1.0) ? "FR" : "lfm";
+            fprintf(f, "| %-15s | %6.1f | %10.1f | %6.2fx | %-7s |\n",
+                    b.name.c_str(), b.fr_ns_per_call, b.lfm_ns_per_call,
+                    speedup, faster);
+        }
+    }
+
+    /* --- Summary: win counts over head-to-head rows only --- */
+    int fr_speed = 0, lfm_speed = 0, fr_acc = 0, lfm_acc = 0, tie_acc = 0, h2h = 0;
+    for (auto& b : results) {
+        if (b.lfm_ns_per_call < 0) continue;
+        h2h++;
+        if (b.fr_ns_per_call < b.lfm_ns_per_call) fr_speed++; else lfm_speed++;
+        if (b.fr_err.max_abs_err < b.lfm_err.max_abs_err) fr_acc++;
+        else if (b.fr_err.max_abs_err > b.lfm_err.max_abs_err) lfm_acc++;
+        else tie_acc++;
+    }
+    fprintf(f, "\n### Summary (%d head-to-head functions)\n\n", h2h);
+    fprintf(f, "- **Speed**: FR_math %d / %d, libfixmath %d / %d\n",
+            fr_speed, h2h, lfm_speed, h2h);
+    fprintf(f, "- **Accuracy**: FR_math %d / %d, libfixmath %d / %d, tie %d / %d\n",
+            fr_acc, h2h, lfm_acc, h2h, tie_acc, h2h);
+    fprintf(f, "- Accuracy = %d-pt sweep at Q16.16; timing = min of 3 x %dk calls\n",
+            N_ACCURACY, N_TIMING / 1000);
+
+    /* --- Size table (fixed figures, not measured by this program) --- */
+    fprintf(f, "\n### Compiled size (clang -O2, macOS ARM)\n\n");
+    fprintf(f, "| | FR_math | libfixmath | lfm (no cache) |\n");
+    fprintf(f, "|---|---:|---:|---:|\n");
+    fprintf(f, "| Code (text) | 6,888 B | 4,880 B | 5,444 B |\n");
+    fprintf(f, "| Tables (ROM) | 834 B | 32 B | 32 B |\n");
+    fprintf(f, "| **ROM total** | **7,722 B** | **4,912 B** | **5,476 B** |\n");
+    fprintf(f, "| BSS / RAM | **0 B** | **112 KB** | **0 B** |\n");
+    fprintf(f, "\n");
+    fprintf(f, "FR_math packs trig, inv-trig, log/ln/log10, exp/pow2/pow10, sqrt, hypot(3),\n");
+    fprintf(f, "waves(6), ADSR, print into 7.5 KB ROM with zero RAM overhead.\n");
+    fprintf(f, "libfixmath (trig, inv-trig, log/log2, exp, sqrt, mul/div, str) is 4.8 KB ROM\n");
+    fprintf(f, "but caches 112 KB of sin/exp LUTs in BSS at runtime.\n");
+    fprintf(f, "\n");
+}
+
+/* ================================================================
+ * main
+ * ================================================================ */
+
+int main() {
+    fprintf(stderr, "Running benchmarks (%d accuracy pts, %d timing iters)...\n",
+            N_ACCURACY, N_TIMING);
+
+    /* Benchmarks in report order; the label is echoed as progress on stderr. */
+    struct Step { const char *label; BenchResult (*run)(); };
+    static const Step steps[] = {
+        {"sin",   bench_sin},   {"cos",   bench_cos},   {"tan",  bench_tan},
+        {"asin",  bench_asin},  {"acos",  bench_acos},  {"atan", bench_atan},
+        {"atan2", bench_atan2}, {"sqrt",  bench_sqrt},  {"exp",  bench_exp},
+        {"ln",    bench_log},   {"log2",  bench_log2},  {"mul",  bench_mul},
+        {"div",   bench_div},   {"hypot", bench_hypot},
+        {"hypot_fast8", bench_hypot_fast8},
+    };
+    std::vector<BenchResult> results;
+    for (const Step& step : steps) {
+        results.push_back(step.run());
+        fprintf(stderr, "  %s done\n", step.label);
+    }
+
+    /* JSON goes to stdout, the human-readable tables to stderr. */
+    emit_json(stdout, results);
+    emit_markdown(stderr, results);
+    return 0;
+}
diff --git a/compare_lfm/comparison_results.json b/compare_lfm/comparison_results.json
new file mode 100644
index 0000000..73a81f7
--- /dev/null
+++ b/compare_lfm/comparison_results.json
@@ -0,0 +1,479 @@
+{
+  "description": "FR_math vs libfixmath benchmark — both measured against math.h double precision (IEEE 754)",
+  "gold_standard": "<cmath> IEEE 754 double precision (~15 significant digits)",
+  "fixed_point_format": "Q16.16 (s15.16), 1 LSB = 1.52587890625000e-05",
+  "accuracy_points": 65536,
+  "timing_iterations": 100000,
+  "rel_error_threshold": 0.01,
+  "platform": "macOS ARM (Apple Silicon)",
+  "optimization": "-O2",
+  "results": [
+    {
+      "function": "sin",
+      "double_reference": "std::sin",
+      "sweep": "65536-pt, [-pi, +pi]",
+      "speed": {
+        "fr_math_ns_per_call": 2.5,
+        "libfixmath_ns_per_call": 10.3,
+        "fr_math_speedup": 4.06,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 1.34165039e-04,
+          "mean_abs_error": 4.23947344e-05,
+          "max_error_lsb": 8.8,
+          "mean_error_lsb": 2.8,
+          "max_rel_error_pct": 1.0615,
+          "mean_rel_error_pct": 0.0158
+        },
+        "libfixmath": {
+          "max_abs_error": 7.74511497e-03,
+          "mean_abs_error": 5.34549003e-04,
+          "max_error_lsb": 507.6,
+          "mean_error_lsb": 35.0,
+          "max_rel_error_pct": 74.5513,
+          "mean_rel_error_pct": 0.6105
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "cos",
+      "double_reference": "std::cos",
+      "sweep": "65536-pt, [-pi, +pi]",
+      "speed": {
+        "fr_math_ns_per_call": 2.3,
+        "libfixmath_ns_per_call": 10.3,
+        "fr_math_speedup": 4.51,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 1.25349009e-04,
+          "mean_abs_error": 4.65658208e-05,
+          "max_error_lsb": 8.2,
+          "mean_error_lsb": 3.1,
+          "max_rel_error_pct": 0.9018,
+          "mean_rel_error_pct": 0.0161
+        },
+        "libfixmath": {
+          "max_abs_error": 7.75591931e-03,
+          "mean_abs_error": 5.36939114e-04,
+          "max_error_lsb": 508.3,
+          "mean_error_lsb": 35.2,
+          "max_rel_error_pct": 74.4001,
+          "mean_rel_error_pct": 0.6121
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "tan",
+      "double_reference": "std::tan",
+      "sweep": "65536-pt, [-1.2, 1.2] rad",
+      "speed": {
+        "fr_math_ns_per_call": 4.2,
+        "libfixmath_ns_per_call": 29.5,
+        "fr_math_speedup": 7.02,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 8.49384425e-04,
+          "mean_abs_error": 1.04510886e-04,
+          "max_error_lsb": 55.7,
+          "mean_error_lsb": 6.8,
+          "max_rel_error_pct": 1.0080,
+          "mean_rel_error_pct": 0.0228
+        },
+        "libfixmath": {
+          "max_abs_error": 1.82495961e-02,
+          "mean_abs_error": 8.01092905e-04,
+          "max_error_lsb": 1196.0,
+          "mean_error_lsb": 52.5,
+          "max_rel_error_pct": 0.7099,
+          "mean_rel_error_pct": 0.0410
+        },
+        "closer_to_double": "fr_math"
+      },
+      "note": "Skip near pi/2"
+    },
+    {
+      "function": "asin",
+      "double_reference": "std::asin",
+      "sweep": "65536-pt, [-0.999, 0.999]",
+      "speed": {
+        "fr_math_ns_per_call": 9.0,
+        "libfixmath_ns_per_call": 49.8,
+        "fr_math_speedup": 5.55,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 4.76933520e-04,
+          "mean_abs_error": 4.37641042e-05,
+          "max_error_lsb": 31.3,
+          "mean_error_lsb": 2.9,
+          "max_rel_error_pct": 0.5795,
+          "mean_rel_error_pct": 0.0134
+        },
+        "libfixmath": {
+          "max_abs_error": 1.01788963e-02,
+          "mean_abs_error": 3.64421558e-03,
+          "max_error_lsb": 667.1,
+          "mean_error_lsb": 238.8,
+          "max_rel_error_pct": 20.1233,
+          "mean_rel_error_pct": 2.4452
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "acos",
+      "double_reference": "std::acos",
+      "sweep": "65536-pt, [-0.999, 0.999]",
+      "speed": {
+        "fr_math_ns_per_call": 8.4,
+        "libfixmath_ns_per_call": 50.9,
+        "fr_math_speedup": 6.05,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 4.72479065e-04,
+          "mean_abs_error": 4.33857475e-05,
+          "max_error_lsb": 31.0,
+          "mean_error_lsb": 2.8,
+          "max_rel_error_pct": 0.5194,
+          "mean_rel_error_pct": 0.0056
+        },
+        "libfixmath": {
+          "max_abs_error": 1.01897006e-02,
+          "mean_abs_error": 3.64422377e-03,
+          "max_error_lsb": 667.8,
+          "mean_error_lsb": 238.8,
+          "max_rel_error_pct": 15.3142,
+          "mean_rel_error_pct": 0.3475
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "atan",
+      "double_reference": "std::atan",
+      "sweep": "65536-pt, [-50, 50]",
+      "speed": {
+        "fr_math_ns_per_call": 8.4,
+        "libfixmath_ns_per_call": 11.4,
+        "fr_math_speedup": 1.35,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 9.57408985e-04,
+          "mean_abs_error": 7.37662492e-05,
+          "max_error_lsb": 62.7,
+          "mean_error_lsb": 4.8,
+          "max_rel_error_pct": 0.2149,
+          "mean_rel_error_pct": 0.0061
+        },
+        "libfixmath": {
+          "max_abs_error": 1.01676134e-02,
+          "mean_abs_error": 6.15802358e-03,
+          "max_error_lsb": 666.3,
+          "mean_error_lsb": 403.6,
+          "max_rel_error_pct": 19.8632,
+          "mean_rel_error_pct": 0.4571
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "atan2",
+      "double_reference": "std::atan2",
+      "sweep": "65536-pt, 5 radii x 360 deg",
+      "speed": {
+        "fr_math_ns_per_call": 16.1,
+        "libfixmath_ns_per_call": 10.7,
+        "fr_math_speedup": 0.66,
+        "faster": "libfixmath"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 9.70679332e-04,
+          "mean_abs_error": 2.15170870e-04,
+          "max_error_lsb": 63.6,
+          "mean_error_lsb": 14.1,
+          "max_rel_error_pct": 0.4122,
+          "mean_rel_error_pct": 0.0258
+        },
+        "libfixmath": {
+          "max_abs_error": 1.01728729e-02,
+          "mean_abs_error": 3.88005371e-03,
+          "max_error_lsb": 666.7,
+          "mean_error_lsb": 254.3,
+          "max_rel_error_pct": 20.0045,
+          "mean_rel_error_pct": 0.9267
+        },
+        "closer_to_double": "fr_math"
+      },
+      "note": "All 4 quadrants"
+    },
+    {
+      "function": "sqrt",
+      "double_reference": "std::sqrt",
+      "sweep": "65536-pt, [0.01, 100]",
+      "speed": {
+        "fr_math_ns_per_call": 19.2,
+        "libfixmath_ns_per_call": 20.7,
+        "fr_math_speedup": 1.08,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 7.62924903e-06,
+          "mean_abs_error": 3.80582266e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.2,
+          "max_rel_error_pct": 0.0062,
+          "mean_rel_error_pct": 0.0001
+        },
+        "libfixmath": {
+          "max_abs_error": 7.62924903e-06,
+          "mean_abs_error": 3.80582266e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.2,
+          "max_rel_error_pct": 0.0062,
+          "mean_rel_error_pct": 0.0001
+        },
+        "closer_to_double": "tie"
+      }
+    },
+    {
+      "function": "exp",
+      "double_reference": "std::exp",
+      "sweep": "65536-pt, [-5, 5]",
+      "speed": {
+        "fr_math_ns_per_call": 3.2,
+        "libfixmath_ns_per_call": 65.2,
+        "fr_math_speedup": 20.21,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 3.17909587e-03,
+          "mean_abs_error": 1.03218909e-04,
+          "max_error_lsb": 208.3,
+          "mean_error_lsb": 6.8,
+          "max_rel_error_pct": 0.1486,
+          "mean_rel_error_pct": 0.0078
+        },
+        "libfixmath": {
+          "max_abs_error": 3.30095957e-03,
+          "mean_abs_error": 9.38398029e-05,
+          "max_error_lsb": 216.3,
+          "mean_error_lsb": 6.1,
+          "max_rel_error_pct": 0.0756,
+          "mean_rel_error_pct": 0.0042
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "ln",
+      "double_reference": "std::log",
+      "sweep": "65536-pt, [0.01, 100]",
+      "speed": {
+        "fr_math_ns_per_call": 8.8,
+        "libfixmath_ns_per_call": 457.0,
+        "fr_math_speedup": 51.86,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 4.93278555e-05,
+          "mean_abs_error": 1.61117669e-05,
+          "max_error_lsb": 3.2,
+          "mean_error_lsb": 1.1,
+          "max_rel_error_pct": 0.3012,
+          "mean_rel_error_pct": 0.0006
+        },
+        "libfixmath": {
+          "max_abs_error": 3.40447818e-05,
+          "mean_abs_error": 5.14211182e-06,
+          "max_error_lsb": 2.2,
+          "mean_error_lsb": 0.3,
+          "max_rel_error_pct": 0.0557,
+          "mean_rel_error_pct": 0.0002
+        },
+        "closer_to_double": "libfixmath"
+      }
+    },
+    {
+      "function": "log2",
+      "double_reference": "std::log2",
+      "sweep": "65536-pt, [0.01, 100]",
+      "speed": {
+        "fr_math_ns_per_call": 8.9,
+        "libfixmath_ns_per_call": 40.2,
+        "fr_math_speedup": 4.50,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 6.06739329e-05,
+          "mean_abs_error": 2.30368713e-05,
+          "max_error_lsb": 4.0,
+          "mean_error_lsb": 1.5,
+          "max_rel_error_pct": 0.4945,
+          "mean_rel_error_pct": 0.0006
+        },
+        "libfixmath": {
+          "max_abs_error": 3.56826644e-05,
+          "mean_abs_error": 9.96190621e-06,
+          "max_error_lsb": 2.3,
+          "mean_error_lsb": 0.7,
+          "max_rel_error_pct": 0.1758,
+          "mean_rel_error_pct": 0.0002
+        },
+        "closer_to_double": "libfixmath"
+      }
+    },
+    {
+      "function": "mul",
+      "double_reference": "double a*b",
+      "sweep": "65536-pt, a in [-50,50], b in [-2,2]",
+      "speed": {
+        "fr_math_ns_per_call": 1.0,
+        "libfixmath_ns_per_call": 1.3,
+        "fr_math_speedup": 1.34,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 7.62939453e-06,
+          "mean_abs_error": 3.81535541e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.3,
+          "max_rel_error_pct": 0.0692,
+          "mean_rel_error_pct": 0.0004
+        },
+        "libfixmath": {
+          "max_abs_error": 7.62939453e-06,
+          "mean_abs_error": 3.81535541e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.3,
+          "max_rel_error_pct": 0.0692,
+          "mean_rel_error_pct": 0.0004
+        },
+        "closer_to_double": "tie"
+      }
+    },
+    {
+      "function": "div",
+      "double_reference": "double a/b",
+      "sweep": "65536-pt, a/b in [-50,50]/[0.5,50]",
+      "speed": {
+        "fr_math_ns_per_call": 0.9,
+        "libfixmath_ns_per_call": 5.6,
+        "fr_math_speedup": 6.21,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 7.62927377e-06,
+          "mean_abs_error": 3.82182808e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.3,
+          "max_rel_error_pct": 0.0727,
+          "mean_rel_error_pct": 0.0010
+        },
+        "libfixmath": {
+          "max_abs_error": 8.37162948e-06,
+          "mean_abs_error": 3.82625614e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.3,
+          "max_rel_error_pct": 0.0727,
+          "mean_rel_error_pct": 0.0010
+        },
+        "closer_to_double": "fr_math"
+      },
+      "note": "Both use 64-bit intermediate"
+    },
+    {
+      "function": "hypot",
+      "double_reference": "std::hypot",
+      "sweep": "65536-pt, 5 radii x 360 deg",
+      "speed": {
+        "fr_math_ns_per_call": 20.2
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 7.62930188e-06,
+          "mean_abs_error": 3.67171926e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.2,
+          "max_rel_error_pct": 0.0076,
+          "mean_rel_error_pct": 0.0009
+        }
+      },
+      "note": "FR_math only (libfixmath has no hypot)"
+    },
+    {
+      "function": "hypot_fast8",
+      "double_reference": "std::hypot",
+      "sweep": "65536-pt, 5 radii x 360 deg",
+      "speed": {
+        "fr_math_ns_per_call": 2.4
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 1.37244198e+00,
+          "mean_abs_error": 1.13634634e-01,
+          "max_error_lsb": 89944.4,
+          "mean_error_lsb": 7447.2,
+          "max_rel_error_pct": 0.1372,
+          "mean_rel_error_pct": 0.0516
+        }
+      },
+      "note": "FR_math only; shift-only, no multiply"
+    }
+  ],
+  "summary": {
+    "head_to_head_functions": 13,
+    "faster_wins": { "fr_math": 12, "libfixmath": 1 },
+    "accuracy_wins": { "fr_math": 9, "libfixmath": 2, "tie": 2 },
+    "total_functions_tested": 15
+  },
+  "notes": [
+    "All accuracy measured vs IEEE 754 double. Lower = closer to perfect.",
+    "LSB = Q16.16 least-significant-bit = 1.53e-5. Best possible = 0.5 LSB.",
+    "Percent errors skip |ref| < 0.01 to avoid near-zero division spikes.",
+    "Both libraries use Q16.16 (s15.16): 1.0 = 65536.",
+    "FR_math trig: BAM + 129-entry LUT + linear interpolation.",
+    "libfixmath trig: parabolic approximation + 5th-order correction.",
+    "Timing: min of 3 passes x 100000 calls; cache-warm.",
+    "Speedup > 1.0 means FR_math is faster by that factor."
+  ],
+  "compiled_size_note": "Run 'make size' in .compare/ for live numbers. The values below are representative.",
+  "compiled_size": {
+    "compiler": "clang -O2 (macOS ARM)",
+    "fr_math": {
+      "files": "FR_math.c (single file)",
+      "functions": "trig(6), inv-trig(4), log/ln/log10, exp/pow2/pow10, exp_fast/pow10_fast, sqrt, hypot(3), waves(6), ADSR(4), print(4), format",
+      "rom_bytes": 7722,
+      "ram_bss_bytes": 0,
+      "note": "All tables in const ROM. Zero runtime allocation."
+    },
+    "libfixmath": {
+      "files": "fix16.c, fix16_sqrt.c, fix16_exp.c, fix16_trig.c, fix16_str.c, uint32.c, fract32.c",
+      "functions": "trig(6), inv-trig(4), log/log2, exp, sqrt, mul/div, str",
+      "rom_bytes": 4912,
+      "ram_bss_bytes": 114688,
+      "rom_bytes_no_cache": 5476,
+      "ram_bss_bytes_no_cache": 0,
+      "note": "Default mode caches 112 KB of sin/exp LUTs in BSS. FIXMATH_NO_CACHE eliminates RAM but recomputes per call."
+    }
+  }
+}
diff --git a/compare_lfm/comparison_summary.md b/compare_lfm/comparison_summary.md
new file mode 100644
index 0000000..63e9490
--- /dev/null
+++ b/compare_lfm/comparison_summary.md
@@ -0,0 +1,81 @@
+Running benchmarks (65536 accuracy pts, 100000 timing iters)...
+  sin done
+  cos done
+  tan done
+  asin done
+  acos done
+  atan done
+  atan2 done
+  sqrt done
+  exp done
+  ln done
+  log2 done
+  mul done
+  div done
+  hypot done
+  hypot_fast8 done
+
+## FR_math vs libfixmath — Q16.16 comparison
+
+All errors measured vs IEEE 754 double. Pct errors skip |ref| < 0.01.
+
+### Accuracy
+
+| Function | FR max LSB | FR max % | FR avg % | lfm max LSB | lfm max % | lfm avg % | Winner |
+|----------|----------:|---------:|---------:|----------:|---------:|---------:|--------|
+| sin             |     8.8 |  1.0615 |  0.0158 |     507.6 | 74.5513 |  0.6105 | FR       |
+| cos             |     8.2 |  0.9018 |  0.0161 |     508.3 | 74.4001 |  0.6121 | FR       |
+| tan             |    55.7 |  1.0080 |  0.0228 |    1196.0 |  0.7099 |  0.0410 | FR       |
+| asin            |    31.3 |  0.5795 |  0.0134 |     667.1 | 20.1233 |  2.4452 | FR       |
+| acos            |    31.0 |  0.5194 |  0.0056 |     667.8 | 15.3142 |  0.3475 | FR       |
+| atan            |    62.7 |  0.2149 |  0.0061 |     666.3 | 19.8632 |  0.4571 | FR       |
+| atan2           |    63.6 |  0.4122 |  0.0258 |     666.7 | 20.0045 |  0.9267 | FR       |
+| sqrt            |     0.5 |  0.0062 |  0.0001 |       0.5 |  0.0062 |  0.0001 | tie      |
+| exp             |   208.3 |  0.1486 |  0.0078 |     216.3 |  0.0756 |  0.0042 | FR       |
+| ln              |     3.2 |  0.3012 |  0.0006 |       2.2 |  0.0557 |  0.0002 | lfm      |
+| log2            |     4.0 |  0.4945 |  0.0006 |       2.3 |  0.1758 |  0.0002 | lfm      |
+| mul             |     0.5 |  0.0692 |  0.0004 |       0.5 |  0.0692 |  0.0004 | tie      |
+| div             |     0.5 |  0.0727 |  0.0010 |       0.5 |  0.0727 |  0.0010 | FR       |
+| hypot           |     0.5 |  0.0076 |  0.0009 |       --- |      --- |      --- | FR only  |
+| hypot_fast8     | 89944.4 |  0.1372 |  0.0516 |       --- |      --- |      --- | FR only  |
+
+### Speed (ns/call, lower is better)
+
+| Function | FR_math | libfixmath | Speedup | Faster |
+|----------|--------:|-----------:|--------:|--------|
+| sin             |    2.5 |       10.3 |   4.06x | FR      |
+| cos             |    2.3 |       10.3 |   4.51x | FR      |
+| tan             |    4.2 |       29.5 |   7.02x | FR      |
+| asin            |    9.0 |       49.8 |   5.55x | FR      |
+| acos            |    8.4 |       50.9 |   6.05x | FR      |
+| atan            |    8.4 |       11.4 |   1.35x | FR      |
+| atan2           |   16.1 |       10.7 |   0.66x | lfm     |
+| sqrt            |   19.2 |       20.7 |   1.08x | FR      |
+| exp             |    3.2 |       65.2 |  20.21x | FR      |
+| ln              |    8.8 |      457.0 |  51.86x | FR      |
+| log2            |    8.9 |       40.2 |   4.50x | FR      |
+| mul             |    1.0 |        1.3 |   1.34x | FR      |
+| div             |    0.9 |        5.6 |   6.21x | FR      |
+| hypot           |   20.2 |        --- |     --- | FR only |
+| hypot_fast8     |    2.4 |        --- |     --- | FR only |
+
+### Summary (13 head-to-head functions)
+
+- **Speed**: FR_math 12 / 13, libfixmath 1 / 13
+- **Accuracy**: FR_math 9 / 13, libfixmath 2 / 13, tie 2 / 13
+- Accuracy = 65536-pt sweep at Q16.16; timing = min of 3 x 100k calls
+
+### Compiled size (clang -O2, macOS ARM)
+
+| | FR_math | libfixmath | lfm (no cache) |
+|---|---:|---:|---:|
+| Code (text) | 6,888 B | 4,880 B | 5,444 B |
+| Tables (ROM) | 834 B | 32 B | 32 B |
+| **ROM total** | **7,722 B** | **4,912 B** | **5,476 B** |
+| BSS / RAM | **0 B** | **112 KB** | **0 B** |
+
+FR_math packs trig, inv-trig, log/ln/log10, exp/pow2/pow10, sqrt, hypot(3),
+waves(6), ADSR, print into 7.5 KB ROM with zero RAM overhead.
+libfixmath (trig, inv-trig, log/log2, exp, sqrt, mul/div, str) is 4.8 KB ROM
+but caches 112 KB of sin/exp LUTs in BSS at runtime.
+
diff --git a/compare_lfm/pow_log_improve.md b/compare_lfm/pow_log_improve.md
new file mode 100644
index 0000000..20d2f03
--- /dev/null
+++ b/compare_lfm/pow_log_improve.md
@@ -0,0 +1,504 @@
+# FR_math pow/log Accuracy Improvements
+
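+Proposed changes to FR_pow2, FR_log2, FR_EXP, FR_ln, FR_log10, FR_POW10, and FR_DIV.
+All changes are backward-compatible: no existing signatures change; the only additions are new macros and constants (plus one optional static inline helper for FR_DIV).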
+
+## Current Error Budget
+
+Measured against IEEE 754 double as gold standard, Q16.16 (s15.16), 50k test
+points. 1 LSB = 1/65536 = 1.53e-5.
+
+| function | max error (LSB) | mean error (LSB) | bottleneck |
+|----------|----------------:|------------------:|------------|
+| exp      | 1979            | 114               | pow2 table (16 segments) |
+| ln       | 37.6            | 4.4               | log2 table (32 segments) + shift-only scaling |
+| log10    | 15.8            | 3.8               | log2 table + shift-only scaling |
+| log2     | 53.2            | 4.7               | log2 table (32 segments) |
+| pow2     | 2282            | 148               | pow2 table (16 segments) |
+
+The error decomposition test (bench_explog.cpp) proved that the lookup tables
+are the dominant error source, not the shift-only scaling macros. For exp,
+the pow2 table alone accounts for 115% of the total error (the shift-only
+scaling actually has a bias that partially cancels the table error).
+
+## The Five Changes
+
+### Change 1: Expand pow2 table from 17 to 65 entries
+
+This is the big win. The current gFR_POW2_FRAC_TAB has 17 entries (16 linear
+segments). Linear interpolation error on a convex function is O(h^2) where h is
+the segment width. Going from 16 to 64 segments gives 16x smaller error.
+
+Measured: pow2 table error drops from 2282 LSB to 210 LSB.
+
+**Cost**: +192 bytes ROM (68 -> 260 bytes).
+
+**Code change in FR_math.c**: Replace the table and change two constants in
+FR_pow2 (index bits and interpolation bits).
+
+Current:
+```c
+static const u32 gFR_POW2_FRAC_TAB[17] = {
+     65536,  68438,  71468,  74632,  77936,  81386,  84990,  88752,
+     92682,  96785, 101070, 105545, 110218, 115098, 120194, 125515,
+    131072
+};
+```
+
+New:
+```c
+static const u32 gFR_POW2_FRAC_TAB[65] = {
+     65536,  66250,  66971,  67700,  68438,  69183,  69936,  70698,
+     71468,  72246,  73032,  73828,  74632,  75444,  76266,  77096,
+     77936,  78785,  79642,  80510,  81386,  82273,  83169,  84074,
+     84990,  85915,  86851,  87796,  88752,  89719,  90696,  91684,
+     92682,  93691,  94711,  95743,  96785,  97839,  98905,  99982,
+    101070, 102171, 103283, 104408, 105545, 106694, 107856, 109031,
+    110218, 111418, 112631, 113858, 115098, 116351, 117618, 118899,
+    120194, 121502, 122825, 124163, 125515, 126882, 128263, 129660,
+    131072
+};
+```
+
+In FR_pow2, change the index/interpolation split from 4/12 to 6/10:
+
+Current (line 489-491):
+```c
+    /* Top 4 bits index the table; bottom 12 are the interpolation fraction. */
+    idx     = frac_full >> 12;
+    frac_lo = frac_full & ((1L << 12) - 1);
+```
+
+New:
+```c
+    /* Top 6 bits index the table; bottom 10 are the interpolation fraction. */
+    idx     = frac_full >> 10;
+    frac_lo = frac_full & ((1L << 10) - 1);
+```
+
+And the interpolation line (494):
+
+Current:
+```c
+    mant = lo + (((hi - lo) * frac_lo) >> 12);
+```
+
+New:
+```c
+    mant = lo + (((hi - lo) * frac_lo) >> 10);
+```
+
+That's it. Same algorithm; the shift/mask width changes in three places (12->10) and the table grows ([17]->[65]).
+Update the comment at the top of FR_pow2 accordingly.
+
+
+### Change 2: Replace shift-only scaling macros with single multiply
+
+The current FR_EXP, FR_ln, and FR_log10 use shift-only macros (FR_SLOG2E,
+FR_SrLOG2E, FR_SrLOG2_10) to convert between logarithmic bases. These have
+~5-10 LSB error at Q16.16. After fixing the table (Change 1), these macros
+become the new bottleneck for exp.
+
+The fix: one 32x32->64 multiply with a high-precision constant at radix 28.
+
+**Cost**: one multiply per call. Zero cost on ARM (single-cycle MUL + shift).
+On 16-bit targets this is one software multiply — still cheap since exp/ln
+are already expensive functions.
+
+**Add to FR_math.h** (near the existing FR_kLOG2E definitions):
+
+```c
+/* High-precision scaling constants at radix 28.
+ * Used by FR_EXP, FR_ln, FR_log10 for base conversion.
+ * At radix 28 these have ~9 decimal digits of precision, far exceeding
+ * the ~4.8 digits of Q16.16.
+ */
+#define FR_kLOG2E_28    (387270501)   /* log2(e)   = 1.4426950408889634  */
+#define FR_krLOG2E_28   (186065279)   /* ln(2)     = 0.6931471805599453  */
+#define FR_kLOG2_10_28  (891723283)   /* log2(10)  = 3.3219280948873622  */
+#define FR_krLOG2_10_28  (80807124)   /* log10(2)  = 0.3010299956639812  */
+
+/* Multiply fixed-point value x (any radix) by a radix-28 constant k.
+ * Result stays at x's radix. Uses 64-bit intermediate.
+ * Rounds to nearest (adds 0.5 LSB before shift).
+ */
+#define FR_MULK28(x, k) ((s32)((((int64_t)(x) * (int64_t)(k)) + (1 << 27)) >> 28))
+```
+
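+**Overflow safety**: worst case is FR_MULK28(INT32_MAX, FR_kLOG2_10_28) ~=
+2^31 * 891723283 ~= 2^31 * 2^30 = 2^61, which fits in s64 (63 magnitude
+bits) with 2 bits to spare, so the 64-bit product cannot overflow. The final s32 cast can still wrap when k > 1.0 (FR_kLOG2E_28, FR_kLOG2_10_28) and |x| is large, so callers must keep x * k within s32 range.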
+
+**Radix independence**: if x is at radix R, then x * k28 is at radix R+28,
+and >>28 puts the result back at radix R. Works for any R.
+
+**Update FR_EXP and FR_POW10 macros in FR_math.h**:
+
+Current:
+```c
+#define FR_EXP(input, radix)   (FR_pow2(FR_SLOG2E(input), radix))
+#define FR_POW10(input, radix) (FR_pow2(FR_SLOG2_10(input), radix))
+```
+
+New:
+```c
+#define FR_EXP(input, radix)   (FR_pow2(FR_MULK28((input), FR_kLOG2E_28), (radix)))
+#define FR_POW10(input, radix) (FR_pow2(FR_MULK28((input), FR_kLOG2_10_28), (radix)))
+```
+
+**Update FR_ln and FR_log10 in FR_math.c**:
+
+Current:
+```c
+s32 FR_ln(s32 input, u16 radix, u16 output_radix)
+{
+    s32 r = FR_log2(input, radix, output_radix);
+    return FR_SrLOG2E(r);
+}
+
+s32 FR_log10(s32 input, u16 radix, u16 output_radix)
+{
+    s32 r = FR_log2(input, radix, output_radix);
+    return FR_SrLOG2_10(r);
+}
+```
+
+New:
+```c
+s32 FR_ln(s32 input, u16 radix, u16 output_radix)
+{
+    s32 r = FR_log2(input, radix, output_radix);
+    return FR_MULK28(r, FR_krLOG2E_28);
+}
+
+s32 FR_log10(s32 input, u16 radix, u16 output_radix)
+{
+    s32 r = FR_log2(input, radix, output_radix);
+    return FR_MULK28(r, FR_krLOG2_10_28);
+}
+```
+
+
+### Change 3: Keep the old shift-only behavior available as _FAST variants
+
+The shift-only macros are still useful for targets where multiply is very
+expensive (MSP430, 8-bit AVR). Keep them available under _FAST names.
+
+**Add to FR_math.h** (keep existing macros, add aliases):
+
+```c
+/* Shift-only (multiply-free) base-conversion macros.
+ * Lower accuracy (~5-10 LSB at Q16.16) but no multiply instruction.
+ * Use these on targets where 32x32->64 multiply is expensive.
+ */
+#define FR_EXP_FAST(input, radix)   (FR_pow2(FR_SLOG2E(input), radix))
+#define FR_POW10_FAST(input, radix) (FR_pow2(FR_SLOG2_10(input), radix))
+```
+
+The underlying FR_SLOG2E, FR_SrLOG2E, FR_SrLOG2_10, FR_SLOG2_10 macros
+stay as-is. They are still used by the _FAST variants and may be useful
+independently.
+
+The existing FR_kLOG2E (94548) and FR_krLOG2E (45426) constants at radix 16
+stay as-is. They are used in other contexts and cost nothing if unused.
+
+
+### Change 4: Expand log2 table from 33 to 65 entries
+
+Same pattern as the pow2 table expansion. The current gFR_LOG2_MANT_TAB has
+33 entries (32 segments). Going to 65 entries (64 segments) gives ~4x smaller
+interpolation error.
+
+**Cost**: +128 bytes ROM (132 -> 260 bytes).
+
+**Code change in FR_math.c**: Replace the table and change three constants in
+FR_log2 (index bits and interpolation bits).
+
+Current:
+```c
+static const u32 gFR_LOG2_MANT_TAB[33] = {
+        0,  2909,  5732,  8473, 11136, 13727, 16248, 18704,
+    21098, 23433, 25711, 27936, 30109, 32234, 34312, 36346,
+    38336, 40286, 42196, 44068, 45904, 47705, 49472, 51207,
+    52911, 54584, 56229, 57845, 59434, 60997, 62534, 64047,
+    65536
+};
+```
+
+New:
+```c
+static const u32 gFR_LOG2_MANT_TAB[65] = {
+        0,  1466,  2909,  4331,  5732,  7112,  8473,  9814,
+    11136, 12440, 13727, 14996, 16248, 17484, 18704, 19909,
+    21098, 22272, 23433, 24579, 25711, 26830, 27936, 29029,
+    30109, 31178, 32234, 33279, 34312, 35334, 36346, 37346,
+    38336, 39316, 40286, 41246, 42196, 43137, 44068, 44990,
+    45904, 46809, 47705, 48593, 49472, 50344, 51207, 52063,
+    52911, 53751, 54584, 55410, 56229, 57040, 57845, 58643,
+    59434, 60219, 60997, 61769, 62534, 63294, 64047, 64794,
+    65536
+};
+```
+
+In FR_log2, change the index/interpolation split from 5/25 to 6/24:
+
+Current (line 586-587):
+```c
+    idx  = (s32)(m >> 25);                    /* 5 bits  */
+    frac = (s32)(m & ((1u << 25) - 1));       /* 25 bits */
+```
+
+New:
+```c
+    idx  = (s32)(m >> 24);                    /* 6 bits  */
+    frac = (s32)(m & ((1u << 24) - 1));       /* 24 bits */
+```
+
+And the interpolation line (590):
+
+Current:
+```c
+    mant_log2 = lo + (s32)(((int64_t)(hi - lo) * frac) >> 25);
+```
+
+New:
+```c
+    mant_log2 = lo + (s32)(((int64_t)(hi - lo) * frac) >> 24);
+```
+
+Same algorithm; the shift/mask width changes in three places (25->24) and the table grows ([33]->[65]).
+Update the comments at the top of FR_log2 accordingly. ln and log10
+improve automatically since they call FR_log2 internally.
+
+
+### Change 5: FR_DIV rounding
+
+FR_DIV currently truncates, giving up to 1.0 LSB error. Adding round-to-
+nearest brings this to 0.5 LSB, matching FR_FixMuls and FR_sqrt behavior.
+
+FR_DIV already uses an s64 intermediate (from the earlier 32->64 bit fix),
+so the rounding cost is one addition.
+
+**Simple version** (rounds to nearest for positive quotients only; negative
+quotients can be off by more than 1 LSB, so use the exact version for mixed signs):
+
+Current:
+```c
+#define FR_DIV(x, xr, y, yr) ((s32)(((s64)(x) << (yr)) / (s32)(y)))
+```
+
+New:
+```c
+#define FR_DIV(x, xr, y, yr) \
+    ((s32)((((s64)(x) << (yr)) + ((s64)(y) >> 1)) / (s64)(y)))
+```
+
+This adds half the divisor before dividing. For positive quotients it rounds
+to nearest exactly. For negative quotients the added bias and C's truncation toward zero both
+push the result toward zero, so the error can approach 1.5 LSB (e.g. -5/4 gives (-5+2)/4 = 0, not -1).
+
+**Exact version** (correct for all sign combinations, as a static inline):
+
+```c
+static inline s32 FR_div_rnd(s64 num, s32 den) {
+    if ((num ^ den) >= 0)                   /* same sign: positive quotient */
+        return (s32)((num + den / 2) / den);
+    else                                     /* negative quotient */
+        return (s32)((num - den / 2) / den);
+}
+#define FR_DIV(x, xr, y, yr) FR_div_rnd((s64)(x) << (yr), (s32)(y))
+```
+
+Rename the old truncating version:
+```c
+#define FR_DIV_TRUNC(x, xr, y, yr) ((s32)(((s64)(x) << (yr)) / (s32)(y)))
+```
+
+**Cost**: zero additional ROM (one s64 addition). The inline function version
+adds a branch but the branch is perfectly predicted in practice.
+
+
+## Expected Results
+
+Combining all five changes, measured with bench_explog.cpp:
+
+| function | before (LSB) | after (LSB) | improvement |
+|----------|-------------:|------------:|-------------|
+| exp      | 1979         | ~207        | ~10x better (table + scaling) |
+| pow2     | 2282         | ~210        | ~11x better (table) |
+| ln       | 37.6         | ~3          | ~12x better (table + scaling) |
+| log10    | 15.8         | ~3          | ~5x better (table + scaling) |
+| log2     | 53.2         | ~13         | ~4x better (table) |
+| div      | 1.0          | ~0.5        | 2x better (rounding) |
+
+Speed: no measurable change (tested on Apple M-series, 50k iterations).
+
+Size delta: +192 bytes (pow2 table) + 128 bytes (log2 table) = +320 bytes ROM.
+Zero RAM.
+
+
+## Files Changed
+
+| file | change |
+|------|--------|
+| src/FR_math.h | Add FR_kLOG2E_28, FR_krLOG2E_28, FR_kLOG2_10_28, FR_krLOG2_10_28 |
+| src/FR_math.h | Add FR_MULK28 macro |
+| src/FR_math.h | Update FR_EXP and FR_POW10 to use FR_MULK28 |
+| src/FR_math.h | Add FR_EXP_FAST and FR_POW10_FAST (old behavior) |
+| src/FR_math.h | Update FR_DIV to round-to-nearest |
+| src/FR_math.h | Add FR_DIV_TRUNC (old truncating behavior) |
+| src/FR_math.c | Replace gFR_POW2_FRAC_TAB[17] with [65] |
+| src/FR_math.c | Change FR_pow2 index from 4-bit/12-bit to 6-bit/10-bit |
+| src/FR_math.c | Replace gFR_LOG2_MANT_TAB[33] with [65] |
+| src/FR_math.c | Change FR_log2 index from 5-bit/25-bit to 6-bit/24-bit |
+| src/FR_math.c | Update FR_ln to use FR_MULK28 |
+| src/FR_math.c | Update FR_log10 to use FR_MULK28 |
+
+
+## Helper Scripts
+
+### Generate the 65-entry pow2 table
+
+```python
+#!/usr/bin/env python3
+"""Generate gFR_POW2_FRAC_TAB[65] for FR_pow2.
+
+Output: 2^(i/64) at s.16 fixed point, for i = 0..64.
+Paste directly into FR_math.c.
+"""
+import math
+
+N = 64
+entries = [round(2.0 ** (i / N) * 65536) for i in range(N + 1)]
+
+print(f"static const u32 gFR_POW2_FRAC_TAB[{N+1}] = {{")
+for row in range(0, N + 1, 8):
+    chunk = entries[row:row+8]
+    vals = ", ".join(f"{v:6d}" for v in chunk)
+    comma = "," if row + 8 <= N else ""
+    print(f"    {vals}{comma}")
+print("};")
+print(f"\n/* Size: {(N+1)*4} bytes.  Entry i = round(2^(i/{N}) * 65536). */")
+
+# Verify
+assert entries[0]  == 65536,  f"first entry should be 65536, got {entries[0]}"
+assert entries[N]  == 131072, f"last entry should be 131072, got {entries[N]}"
+assert entries[32] == 92682,  f"midpoint (2^0.5) should be 92682, got {entries[32]}"
+print("Verification passed.")
+```
+
+
+### Generate the 65-entry log2 table
+
+```python
+#!/usr/bin/env python3
+"""Generate gFR_LOG2_MANT_TAB[65] for FR_log2.
+
+Output: log2(1 + i/64) at s.16 fixed point, for i = 0..64.
+Paste directly into FR_math.c.
+"""
+import math
+
+N = 64
+entries = [round(math.log2(1.0 + i / N) * 65536) for i in range(N + 1)]
+
+print(f"static const u32 gFR_LOG2_MANT_TAB[{N+1}] = {{")
+for row in range(0, N + 1, 8):
+    chunk = entries[row:row+8]
+    vals = ", ".join(f"{v:5d}" for v in chunk)
+    comma = "," if row + 8 <= N else ""
+    print(f"    {vals}{comma}")
+print("};")
+print(f"\n/* Size: {(N+1)*4} bytes.  Entry i = round(log2(1 + i/{N}) * 65536). */")
+
+# Verify
+assert entries[0]  == 0,     f"first should be 0, got {entries[0]}"
+assert entries[N]  == 65536, f"last should be 65536, got {entries[N]}"
+assert entries[32] == 38336, f"midpoint log2(1.5) should be 38336, got {entries[32]}"
+print("Verification passed.")
+```
+
+
+### Generate the radix-28 constants
+
+```python
+#!/usr/bin/env python3
+"""Generate high-precision scaling constants at radix 28 for FR_math.h.
+
+These are used by FR_MULK28 for base conversion in exp/ln/log10.
+"""
+import math
+
+R = 28
+scale = 2**R
+
+constants = [
+    ("FR_kLOG2E_28",    math.log2(math.e),  "log2(e)"),
+    ("FR_krLOG2E_28",   math.log(2),         "ln(2)"),
+    ("FR_kLOG2_10_28",  math.log2(10),       "log2(10)"),
+    ("FR_krLOG2_10_28", math.log10(2),       "log10(2)"),
+]
+
+for name, exact, desc in constants:
+    k28 = round(exact * scale)
+    approx = k28 / scale
+    err = abs(approx - exact)
+    print(f"#define {name:20s} ({k28:>12d})   /* {desc:10s} = {exact:.16f} */")
+    assert err < 1e-8, f"{name} error {err:.2e} exceeds 1e-8"
+
+print()
+
+# Overflow check: worst case is INT32_MAX * largest constant
+worst_k = max(c[1] for c in constants)
+worst_k28 = round(worst_k * scale)
+product_bits = (2**31 * worst_k28).bit_length()
+print(f"Overflow check: INT32_MAX * {worst_k28} needs {product_bits} bits (s64 has 63+sign)")
+assert product_bits <= 62, "OVERFLOW RISK"
+print("Overflow check passed.")
+```
+
+
+### Verify the changes end-to-end
+
+After making the code changes, run the existing test suite to confirm nothing
+broke, then run the comparison benchmark:
+
+```bash
+# From the repo root:
+make clean && make test
+
+# From .compare/ — rebuild against updated FR_math and run full comparison:
+make clean && make run
+# Output: comparison_results.json with all 13 functions vs libfixmath
+
+# From .compare/ — detailed exp/log analysis:
+make -f Makefile.explog clean
+make -f Makefile.explog run
+```
+
+
+## Optional: FR_log2 CLZ Optimization
+
+The current FR_log2 finds the leading bit position with a while loop:
+
+```c
+    u = (u32)input;
+    p = 0;
+    while (u > 1)
+    {
+        u >>= 1;
+        p++;
+    }
+```
+
+This could be replaced with __builtin_clz (GCC/Clang) or a manual binary
+search for a constant-time alternative:
+
+```c
+    p = 31 - __builtin_clz((unsigned)input);
+```
+
+This saves ~15 iterations on average but only matters if FR_log2 is called in
+a tight loop. Not part of the five changes above — can be done separately as
+a minor speed optimization. Caveats: __builtin_clz(0) is undefined, `31 -` assumes a
+32-bit unsigned, and the builtin is GCC/Clang-only, so portable builds need a fallback.
diff --git a/dev/fr_math_2.0.1.md b/dev/fr_math_2.0.1.md
index 98268cb..b001529 100644
--- a/dev/fr_math_2.0.1.md
+++ b/dev/fr_math_2.0.1.md
@@ -9,7 +9,7 @@ Branch: `update_wave_fns`
   - tan return type s15.16 → s16.15
   - FR_FR2I → FR2I, FR_MUL removed, FR_EXP/FR_POW10 casing
   - 2D examples: FR_Matrix2D_CPT, ID(), XFormPtI
-  - Summary tables: added FR_hypot_fast, FR_hypot_fast8, FR_numstr
+  - Summary tables: added FR_hypot_fast8, FR_numstr
 - [x] Version bump 2.0.0 → 2.0.1 (hex 0x020000 → 0x020001)
 - [x] sync_version.sh rewrite — FR_MATH_VERSION_HEX is single source of truth
 - [x] CI auto-release job — reads hex version, creates GitHub release + tag
diff --git a/docs/README.md b/docs/README.md
index 896de6e..d30d8c9 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -45,24 +45,24 @@ radix — Q16.16 is just the reference point for the table. See the
 [TDD report](../build/test_tdd_report.md) for sweeps at radixes 8, 12,
 16, and 24. Percent errors skip expected values near zero (|expected| < 0.01).
 
-<!-- ACCURACY_TABLE_START -->
-| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
-|---|---:|---:|---:|---|
-| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
-| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
-| asin / acos | 512.6 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
-| atan2 | 44.7 | 2.9913 | 0.0383 | 65536x5 radii + specials |
-| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
-| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
-| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
-| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
-| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
-| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
-| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
-| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
-| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
-| hypot_fast (4-seg) | 216112.8 | 0.3418 | 0.1923 | Shift-only, no multiply |
-| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
+<!-- ACCURACY_TABLE_START -->
+| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
+|---|---:|---:|---:|---|
+| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
+| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
+| asin / acos | 42.3 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
+| atan2 | 63.3 | 0.4953 | 0.0268 | 65536x5 radii; asin/acos+hypot_fast8 |
+| atan | 61.9 | 0.2985 | 0.0159 | 20001-pt sweep [-10,10]; via FR_atan2 |
+| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
+| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
+| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
+| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
+| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
+| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
+| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
+| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
+| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
+| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
 <!-- ACCURACY_TABLE_END -->
 
 ## What's in the box
@@ -75,7 +75,7 @@ radix — Q16.16 is just the reference point for the table. See the
 | Trig (radian/BAM) | `fr_sin`, `fr_cos`, `fr_tan`, `fr_sin_bam`, `fr_cos_bam`, `fr_sin_deg`, `fr_cos_deg` |
 | Inverse trig | `FR_atan`, `FR_atan2`, `FR_asin`, `FR_acos` |
 | Log / exp | `FR_log2`, `FR_ln`, `FR_log10`, `FR_pow2`, `FR_EXP`, `FR_POW10`, `FR_EXP_FAST`, `FR_POW10_FAST`, `FR_MULK28` |
-| Roots | `FR_sqrt`, `FR_hypot`, `FR_hypot_fast`, `FR_hypot_fast8` |
+| Roots | `FR_sqrt`, `FR_hypot`, `FR_hypot_fast8` |
 | Wave generators | `fr_wave_sqr`, `fr_wave_pwm`, `fr_wave_tri`, `fr_wave_saw`, `fr_wave_tri_morph`, `fr_wave_noise` |
 | Envelope | `fr_adsr_init`, `fr_adsr_trigger`, `fr_adsr_release`, `fr_adsr_step` |
 | 2D transforms | `FR_Matrix2D_CPT` (mul, add, sub, det, inv, setrotate, XFormPtI, XFormPtI16) |
@@ -111,11 +111,62 @@ generic `float` replacement.
 
 #define R 16  /* work at radix 16 (s15.16) throughout */
 
-s32 pi    = FR_NUM(3, 14159, 5, R);       /* pi at radix 16             */
-s32 c45   = FR_CosI(45);                  /* cos 45 deg = 0.7071 (s15.16) */
-s32 root2 = FR_sqrt(I2FR(2, R), R);       /* sqrt(2)    = 1.4142        */
-s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97          */
-s32 ex    = FR_EXP(I2FR(1, R), R);        /* e^1        ~ 2.7183        */
+/* ---- Creating fixed-point values ----
+ *
+ * FR_NUM(integer, frac_digits, num_digits, radix) encodes a decimal
+ * literal at compile time.  The fractional part is the digits AFTER
+ * the decimal point, and num_digits says how many digits that is.
+ * Think: FR_NUM(3, 14159, 5, 16) means "3.14159" at radix 16.
+ */
+s32 pi   = FR_NUM(3, 14159, 5, R);  /* 3.14159 → raw 205886 at r16  */
+s32 half = FR_NUM(0, 5, 1, R);      /* 0.5     → raw 32768           */
+s32 neg  = FR_NUM(-2, 75, 2, R);    /* -2.75   → raw -180224         */
+
+/* Or parse from a string at runtime (no floats, no strtod): */
+s32 pi2  = FR_numstr("3.14159", R); /* same result as FR_NUM above    */
+
+/* Integer-to-fixed: I2FR(n, radix) just shifts left */
+s32 two  = I2FR(2, R);              /* 2.0 → raw 131072              */
+
+/* ---- Naming convention: macros vs functions ----
+ *
+ * UPPERCASE FR_ names are macros — they expand inline with no call
+ * overhead, and the compiler can constant-fold them.  Use these for
+ * conversions and simple arithmetic:
+ *   I2FR, FR2I, FR_NUM, FR_ADD, FR_MUL, FR_DIV, FR_ABS, FR_EXP ...
+ *
+ * MixedCase FR_ names are functions — they contain loops, tables, or
+ * multi-step algorithms where inlining would waste ROM:
+ *   FR_Cos, FR_sqrt, FR_atan2, FR_log2, FR_pow2, FR_printNumF ...
+ *
+ * lowercase fr_ names are v2 functions (radian trig, wave generators,
+ * ADSR envelopes):
+ *   fr_sin, fr_cos, fr_tan, fr_wave_tri, fr_adsr_step ...
+ *
+ * Some macros wrap functions: FR_EXP(x,r) scales x then calls
+ * FR_pow2 — one-liner convenience, heavy lifting in the function.
+ */
+
+/* ---- Math functions ---- */
+s32 c45   = FR_Cos(45, 0);                /* cos(45°) = 0.7071       */
+s32 s30   = fr_sin(FR_numstr("0.5236", R), R); /* sin(0.5236 rad)    */
+s32 root2 = FR_sqrt(two, R);              /* sqrt(2)  = 1.4142       */
+s32 angle = FR_atan2(I2FR(1,R), I2FR(1,R), R); /* atan2(1,1) rad     */
+s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97       */
+s32 ex    = FR_EXP(I2FR(1, R), R);        /* macro: scales then calls
+                                            * FR_pow2 internally      */
+
+/* ---- Printing (serial / UART / file friendly) ----
+ *
+ * FR_printNumF takes a per-character output function — works with
+ * putchar, Serial.write, UART_putc, or any int(*)(char).  No
+ * sprintf, no floats, no heap.  Ideal for bare-metal targets.
+ */
+int my_putchar(char c) { return putchar(c); }  /* or your UART func */
+
+FR_printNumF(my_putchar, pi, R, 8, 5);    /* prints " 3.14159"      */
+FR_printNumF(my_putchar, neg, R, 8, 2);   /* prints "   -2.75"      */
+FR_printNumD(my_putchar, FR2I(lg, R), 4); /* prints "   9" (integer)*/
 ```
 
 See [getting-started.md](getting-started.md) for a complete
@@ -130,7 +181,7 @@ understand *how* the radix notation works first.
 | Fixed format | Q16.16 only | Q31 / Q15 | Any radix |
 | Angle input | Radians (Q16.16) | Radians (float) | BAM (u16), degrees, or radians |
 | Exact cardinal angles | No | N/A | Yes |
-| Multiply-free option | No | No | Yes (e.g. `FR_EXP_FAST`, `FR_hypot_fast`) |
+| Multiply-free option | No | No | Yes (e.g. `FR_EXP_FAST`, `FR_hypot_fast8`) |
 | Wave generators | No | No | 6 shapes + ADSR |
 | Dependencies | None | ARM only | None |
 | Code size (Cortex-M0, -Os) | 2.4 KB | ~40 KB+ | 4.2 KB |
diff --git a/docs/api-reference.md b/docs/api-reference.md
index b92ed7a..3f97f20 100644
--- a/docs/api-reference.md
+++ b/docs/api-reference.md
@@ -537,8 +537,7 @@ targets (AVR, 8051) where 64-bit multiply is very expensive.
 | --- | --- | --- | --- |
 | `FR_sqrt` | `s32 input` at `radix`<br>`u16 radix` | `s32` at the **same radix**. | Domain: `input ≥ 0`. Returns `FR_DOMAIN_ERROR` for negative input. Digit-by-digit integer isqrt on an `int64_t` accumulator — deterministic 32-iteration cost, no floating point anywhere. **Rounds to nearest** (remainder > root → +1). Worst-case error is ±0.5 LSB at the input radix. |
 | `FR_hypot` | `s32 x`, `s32 y` both at `radix`<br>`u16 radix` | `s32` at the **same radix**. | Overflow-safe magnitude: computes `sqrt(x² + y²)` without an intermediate 32-bit overflow by promoting the sum of squares to `int64_t`. Accepts the full `s32` input range; output saturates at `FR_OVERFLOW_POS` only if the true hypot exceeds `2^31−1` at the given radix. |
-| `FR_hypot_fast` | `s32 x`, `s32 y` (any radix) | `s32` at the same radix. | Fast approximate magnitude using 4-segment piecewise-linear shift-only arithmetic. ~0.4% peak error. No multiply, no 64-bit, no ROM table. Based on the method of US Patent 6,567,777 B1 (public domain). No `radix` parameter needed — the algorithm is scale-invariant. |
-| `FR_hypot_fast8` | `s32 x`, `s32 y` (any radix) | `s32` at the same radix. | 8-segment variant. ~0.14% peak error. Same shift-only approach, more branches. |
+| `FR_hypot_fast8` | `s32 x`, `s32 y` (any radix) | `s32` at the same radix. | 8-segment shift-only piecewise-linear approximate magnitude. ~0.14% peak error. No multiply, no 64-bit, no ROM table. Based on the method of US Patent 6,567,777 B1 (public domain). No `radix` parameter needed — the algorithm is scale-invariant. |
 
 ## Wave generators
 
diff --git a/docs/examples.md b/docs/examples.md
index 4ef6bfd..b7d6145 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -144,9 +144,7 @@ int main(void)
         printf("sqrt(-1) rejected, good.\n");
 
     /* Fast approximate — no multiply, no 64-bit math */
-    s32 h_fast  = FR_hypot_fast(three, four);     /* 4-seg, ~0.4% err */
     s32 h_fast8 = FR_hypot_fast8(three, four);    /* 8-seg, ~0.14% err */
-    printf("hypot_fast(3,4)  = %d (integer part)\n", (int)FR2I(h_fast, r));
     printf("hypot_fast8(3,4) = %d (integer part)\n", (int)FR2I(h_fast8, r));
     return 0;
 }
diff --git a/makefile b/makefile
index 307b0dd..168bd57 100644
--- a/makefile
+++ b/makefile
@@ -127,11 +127,15 @@ $(BUILD_DIR)/test_2d_complete: $(TEST_DIR)/test_2d_complete.cpp $(SRC_DIR)/FR_ma
 	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) $(TEST_DIR)/test_2d_complete.cpp $(BUILD_DIR)/test_2dc_FR_math.o $(BUILD_DIR)/test_2dc_FR_math_2D.o $(LDFLAGS) -o $@
 
 # Accuracy summary table (extract from test_tdd output)
-.PHONY: accuracy
+.PHONY: accuracy accuracy-showpeak
 accuracy: dirs $(BUILD_DIR)/test_tdd
 	@echo "Running accuracy report..."
 	@./$(BUILD_DIR)/test_tdd 2>/dev/null | sed -n '/ACCURACY_TABLE_START/,/ACCURACY_TABLE_END/p'
 
+accuracy-showpeak: dirs $(BUILD_DIR)/test_tdd
+	@echo "Running accuracy report (with peak inputs)..."
+	@FR_SHOWPEAK=1 ./$(BUILD_DIR)/test_tdd 2>/dev/null | sed -n '/ACCURACY_TABLE_START/,/ACCURACY_TABLE_END/p'
+
 # Coverage report using gcov (no external dependencies)
 .PHONY: coverage
 coverage:
diff --git a/pages/guide/api-reference.html b/pages/guide/api-reference.html
index cfa95a2..5c03c66 100644
--- a/pages/guide/api-reference.html
+++ b/pages/guide/api-reference.html
@@ -887,21 +887,14 @@ <h2>Roots</h2>
       the true hypot exceeds <code>2<sup>31</sup>&minus;1</code> at
       the given radix.</td>
 </tr>
-<tr>
-  <td><code>FR_hypot_fast</code></td>
-  <td><code>s32 x</code>, <code>s32 y</code> (any radix)</td>
-  <td><code>s32</code> at the same radix.</td>
-  <td>Fast approximate magnitude using 4-segment piecewise-linear
-      shift-only arithmetic. &sim;0.4% peak error. No multiply, no
-      64-bit, no ROM table. Based on the method of US Patent
-      6,567,777 B1 (public domain). No <code>radix</code> parameter
-      needed &mdash; the algorithm is scale-invariant.</td>
-</tr>
 <tr>
   <td><code>FR_hypot_fast8</code></td>
   <td><code>s32 x</code>, <code>s32 y</code> (any radix)</td>
   <td><code>s32</code> at the same radix.</td>
-  <td>8-segment variant. &sim;0.14% peak error. Same shift-only
-      approach, more branches.</td>
+  <td>8-segment shift-only piecewise-linear approximate magnitude.
+      &sim;0.14% peak error. No multiply, no 64-bit, no ROM table.
+      Based on the method of US Patent 6,567,777 B1 (public domain).
+      No <code>radix</code> parameter needed &mdash; the algorithm is
+      scale-invariant.</td>
 </tr>
 </tbody>
diff --git a/pages/guide/examples.html b/pages/guide/examples.html
index 3f74155..137525b 100644
--- a/pages/guide/examples.html
+++ b/pages/guide/examples.html
@@ -155,9 +155,7 @@ <h2>3. Square root and hypotenuse</h2>
         printf("sqrt(-1) rejected, good.\n");
 
     /* Fast approximate &mdash; no multiply, no 64-bit math */
-    s32 h_fast  = FR_hypot_fast(three, four);     /* 4-seg, ~0.4% err */
     s32 h_fast8 = FR_hypot_fast8(three, four);    /* 8-seg, ~0.14% err */
-    printf("hypot_fast(3,4)  = %d (integer part)\n", (int)FR2I(h_fast, r));
     printf("hypot_fast8(3,4) = %d (integer part)\n", (int)FR2I(h_fast8, r));
     return 0;
 }</code></pre>
diff --git a/pages/index.html b/pages/index.html
index d25e5fe..5e0e3f4 100644
--- a/pages/index.html
+++ b/pages/index.html
@@ -50,27 +50,27 @@ <h2>Measured accuracy</h2>
 report</a> for sweeps at radixes 8, 12, 16, and 24.
 Percent errors skip expected values near zero (|expected| &lt; 0.01).</p>
 
-<!-- ACCURACY_TABLE_START -->
-<table>
-<thead><tr><th>Function</th><th>Max err (LSB)</th><th>Max err (%)</th><th>Avg err (%)</th><th>Note</th></tr></thead>
-<tbody>
-<tr><td>sin / cos</td><td>7.5</td><td>0.7169</td><td>0.0100</td><td>65536-pt sweep + specials</td></tr>
-<tr><td>tan</td><td>38020.4</td><td>0.7118</td><td>0.0162</td><td>65536-pt sweep (skip poles)</td></tr>
-<tr><td>asin / acos</td><td>512.6</td><td>0.7025</td><td>0.0105</td><td>65536-pt; sqrt approx near boundary</td></tr>
-<tr><td>atan2</td><td>44.7</td><td>2.9913</td><td>0.0383</td><td>65536x5 radii + specials</td></tr>
-<tr><td>sqrt</td><td>28.4</td><td>0.0003</td><td>0.0000</td><td>Round-to-nearest</td></tr>
-<tr><td>log2</td><td>10.5</td><td>0.2479</td><td>0.0045</td><td>65-entry mantissa table</td></tr>
-<tr><td>pow2</td><td>220.4</td><td>0.1373</td><td>0.0057</td><td>65-entry fraction table</td></tr>
-<tr><td>ln, log10</td><td>0.7</td><td>0.0015</td><td>0.0004</td><td>Via FR_MULK28 from log2</td></tr>
-<tr><td>exp</td><td>65.7</td><td>0.0719</td><td>0.0051</td><td>FR_MULK28 + FR_pow2</td></tr>
-<tr><td>exp_fast</td><td>195.5</td><td>0.0719</td><td>0.0064</td><td>Shift-only scaling</td></tr>
-<tr><td>pow10</td><td>143.4</td><td>0.1163</td><td>0.0075</td><td>FR_MULK28 + FR_pow2</td></tr>
-<tr><td>pow10_fast</td><td>581.9</td><td>0.1163</td><td>0.0100</td><td>Shift-only scaling</td></tr>
-<tr><td>hypot (exact)</td><td>0.2</td><td>0.0001</td><td>0.0000</td><td>64-bit intermediate</td></tr>
-<tr><td>hypot_fast (4-seg)</td><td>216112.8</td><td>0.3418</td><td>0.1923</td><td>Shift-only, no multiply</td></tr>
-<tr><td>hypot_fast8 (8-seg)</td><td>59968.8</td><td>0.0977</td><td>0.0508</td><td>Shift-only, no multiply</td></tr>
-</tbody>
-</table>
+<!-- ACCURACY_TABLE_START -->
+<table>
+<thead><tr><th>Function</th><th>Max err (LSB)</th><th>Max err (%)</th><th>Avg err (%)</th><th>Note</th></tr></thead>
+<tbody>
+<tr><td>sin / cos</td><td>7.5</td><td>0.7169</td><td>0.0100</td><td>65536-pt sweep + specials</td></tr>
+<tr><td>tan</td><td>38020.4</td><td>0.7118</td><td>0.0162</td><td>65536-pt sweep (skip poles)</td></tr>
+<tr><td>asin / acos</td><td>42.3</td><td>0.7025</td><td>0.0105</td><td>65536-pt; sqrt approx near boundary</td></tr>
+<tr><td>atan2</td><td>63.3</td><td>0.4953</td><td>0.0268</td><td>65536x5 radii; asin/acos+hypot_fast8</td></tr>
+<tr><td>atan</td><td>61.9</td><td>0.2985</td><td>0.0159</td><td>20001-pt sweep [-10,10]; via FR_atan2</td></tr>
+<tr><td>sqrt</td><td>28.4</td><td>0.0003</td><td>0.0000</td><td>Round-to-nearest</td></tr>
+<tr><td>log2</td><td>10.5</td><td>0.2479</td><td>0.0045</td><td>65-entry mantissa table</td></tr>
+<tr><td>pow2</td><td>220.4</td><td>0.1373</td><td>0.0057</td><td>65-entry fraction table</td></tr>
+<tr><td>ln, log10</td><td>0.7</td><td>0.0015</td><td>0.0004</td><td>Via FR_MULK28 from log2</td></tr>
+<tr><td>exp</td><td>65.7</td><td>0.0719</td><td>0.0051</td><td>FR_MULK28 + FR_pow2</td></tr>
+<tr><td>exp_fast</td><td>195.5</td><td>0.0719</td><td>0.0064</td><td>Shift-only scaling</td></tr>
+<tr><td>pow10</td><td>143.4</td><td>0.1163</td><td>0.0075</td><td>FR_MULK28 + FR_pow2</td></tr>
+<tr><td>pow10_fast</td><td>581.9</td><td>0.1163</td><td>0.0100</td><td>Shift-only scaling</td></tr>
+<tr><td>hypot (exact)</td><td>0.2</td><td>0.0001</td><td>0.0000</td><td>64-bit intermediate</td></tr>
+<tr><td>hypot_fast8 (8-seg)</td><td>59968.8</td><td>0.0977</td><td>0.0508</td><td>Shift-only, no multiply</td></tr>
+</tbody>
+</table>
 <!-- ACCURACY_TABLE_END -->
 
 <h2>What&rsquo;s in the box</h2>
@@ -84,7 +84,7 @@ <h2>What&rsquo;s in the box</h2>
 <tr><td>Trig (radian/BAM)</td><td><code>fr_sin</code>, <code>fr_cos</code>, <code>fr_tan</code>, <code>fr_sin_bam</code>, <code>fr_cos_bam</code>, <code>fr_sin_deg</code>, <code>fr_cos_deg</code></td></tr>
 <tr><td>Inverse trig</td><td><code>FR_atan</code>, <code>FR_atan2</code>, <code>FR_asin</code>, <code>FR_acos</code></td></tr>
 <tr><td>Log / exp</td><td><code>FR_log2</code>, <code>FR_ln</code>, <code>FR_log10</code>, <code>FR_pow2</code>, <code>FR_EXP</code>, <code>FR_POW10</code>, <code>FR_EXP_FAST</code>, <code>FR_POW10_FAST</code>, <code>FR_MULK28</code></td></tr>
-<tr><td>Roots</td><td><code>FR_sqrt</code>, <code>FR_hypot</code>, <code>FR_hypot_fast</code>, <code>FR_hypot_fast8</code></td></tr>
+<tr><td>Roots</td><td><code>FR_sqrt</code>, <code>FR_hypot</code>, <code>FR_hypot_fast8</code></td></tr>
 <tr><td>Wave generators</td><td><code>fr_wave_sqr</code>, <code>fr_wave_pwm</code>, <code>fr_wave_tri</code>, <code>fr_wave_saw</code>, <code>fr_wave_tri_morph</code>, <code>fr_wave_noise</code></td></tr>
 <tr><td>Envelope</td><td><code>fr_adsr_init</code>, <code>fr_adsr_trigger</code>, <code>fr_adsr_release</code>, <code>fr_adsr_step</code></td></tr>
 <tr><td>2D transforms</td><td><code>FR_Matrix2D_CPT</code> (mul, add, sub, det, inv, setrotate, XFormPtI, XFormPtI16)</td></tr>
@@ -127,11 +127,62 @@ <h2>Quick taste</h2>
 
 #define R 16  /* work at radix 16 (s15.16) throughout */
 
-s32 pi    = FR_NUM(3, 14159, 5, R);       /* pi at radix 16             */
-s32 c45   = FR_CosI(45);                  /* cos 45 deg = 0.7071 (s15.16) */
-s32 root2 = FR_sqrt(I2FR(2, R), R);       /* sqrt(2)    = 1.4142        */
-s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97          */
-s32 ex    = FR_EXP(I2FR(1, R), R);        /* e^1        ~ 2.7183        */</code></pre>
+/* ---- Creating fixed-point values ----
+ *
+ * FR_NUM(integer, frac_digits, num_digits, radix) encodes a decimal
+ * literal at compile time.  The fractional part is the digits AFTER
+ * the decimal point, and num_digits says how many digits that is.
+ * Think: FR_NUM(3, 14159, 5, 16) means "3.14159" at radix 16.
+ */
+s32 pi   = FR_NUM(3, 14159, 5, R);  /* 3.14159 &rarr; raw 205886 at r16  */
+s32 half = FR_NUM(0, 5, 1, R);      /* 0.5     &rarr; raw 32768           */
+s32 neg  = FR_NUM(-2, 75, 2, R);    /* -2.75   &rarr; raw -180224         */
+
+/* Or parse from a string at runtime (no floats, no strtod): */
+s32 pi2  = FR_numstr("3.14159", R); /* same result as FR_NUM above    */
+
+/* Integer-to-fixed: I2FR(n, radix) just shifts left */
+s32 two  = I2FR(2, R);              /* 2.0 &rarr; raw 131072              */
+
+/* ---- Naming convention: macros vs functions ----
+ *
+ * UPPERCASE FR_ names are macros &mdash; they expand inline with no call
+ * overhead, and the compiler can constant-fold them.  Use these for
+ * conversions and simple arithmetic:
+ *   I2FR, FR2I, FR_NUM, FR_ADD, FR_MUL, FR_DIV, FR_ABS, FR_EXP ...
+ *
+ * MixedCase FR_ names are functions &mdash; they contain loops, tables, or
+ * multi-step algorithms where inlining would waste ROM:
+ *   FR_Cos, FR_sqrt, FR_atan2, FR_log2, FR_pow2, FR_printNumF ...
+ *
+ * lowercase fr_ names are v2 functions (radian trig, wave generators,
+ * ADSR envelopes):
+ *   fr_sin, fr_cos, fr_tan, fr_wave_tri, fr_adsr_step ...
+ *
+ * Some macros wrap functions: FR_EXP(x,r) scales x then calls
+ * FR_pow2 &mdash; one-liner convenience, heavy lifting in the function.
+ */
+
+/* ---- Math functions ---- */
+s32 c45   = FR_Cos(45, 0);                /* cos(45&deg;) = 0.7071       */
+s32 s30   = fr_sin(FR_numstr("0.5236", R), R); /* sin(0.5236 rad)    */
+s32 root2 = FR_sqrt(two, R);              /* sqrt(2)  = 1.4142       */
+s32 angle = FR_atan2(I2FR(1,R), I2FR(1,R), R); /* atan2(1,1) rad     */
+s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97       */
+s32 ex    = FR_EXP(I2FR(1, R), R);        /* macro: scales then calls
+                                            * FR_pow2 internally      */
+
+/* ---- Printing (serial / UART / file friendly) ----
+ *
+ * FR_printNumF takes a per-character output function &mdash; works with
+ * putchar, Serial.write, UART_putc, or any int(*)(char).  No
+ * sprintf, no floats, no heap.  Ideal for bare-metal targets.
+ */
+int my_putchar(char c) { return putchar(c); }  /* or your UART func */
+
+FR_printNumF(my_putchar, pi, R, 8, 5);    /* prints " 3.14159"      */
+FR_printNumF(my_putchar, neg, R, 8, 2);   /* prints "   -2.75"      */
+FR_printNumD(my_putchar, FR2I(lg, R), 4); /* prints "   9" (integer)*/</code></pre>
 
 <p>See <a href="guide/getting-started.html">Getting Started</a> for a
 complete walkthrough, or jump straight to the
@@ -146,7 +197,7 @@ <h2>Comparison</h2>
 <tr><td>Fixed format</td><td>Q16.16 only</td><td>Q31 / Q15</td><td>Any radix</td></tr>
 <tr><td>Angle input</td><td>Radians (Q16.16)</td><td>Radians (float)</td><td>BAM (u16), degrees, or radians</td></tr>
 <tr><td>Exact cardinal angles</td><td>No</td><td>N/A</td><td>Yes</td></tr>
-<tr><td>Multiply-free option</td><td>No</td><td>No</td><td>Yes (e.g. <code>FR_EXP_FAST</code>, <code>FR_hypot_fast</code>)</td></tr>
+<tr><td>Multiply-free option</td><td>No</td><td>No</td><td>Yes (e.g. <code>FR_EXP_FAST</code>, <code>FR_hypot_fast8</code>)</td></tr>
 <tr><td>Wave generators</td><td>No</td><td>No</td><td>6 shapes + ADSR</td></tr>
 <tr><td>Dependencies</td><td>None</td><td>ARM only</td><td>None</td></tr>
 <tr><td>Code size (Cortex-M0, -Os)</td><td>2.4 KB</td><td>~40 KB+</td><td>4.2 KB</td></tr>
diff --git a/release_notes.md b/release_notes.md
index c4c9ee5..793bb03 100644
--- a/release_notes.md
+++ b/release_notes.md
@@ -241,10 +241,9 @@ for the implementation plan this release executed.
   computes `sqrt(x^2 + y^2)` with no intermediate overflow up to the
   full s32 range. Bit-exact for perfect squares; max error ~1 LSB at
   the requested radix.
-- **Fast approximate magnitude** (`FR_hypot_fast`, `FR_hypot_fast8`):
+- **Fast approximate magnitude** (`FR_hypot_fast8`):
   shift-only piecewise-linear approximation of `sqrt(x^2 + y^2)` — no
   multiply, no divide, no 64-bit math, no ROM table, no iteration.
-  `FR_hypot_fast` uses 4 segments (~0.4% peak error);
   `FR_hypot_fast8` uses 8 segments (~0.14% peak error). Based on the
   method of US Patent 6,567,777 B1 (Chatterjee, public domain). No
   `radix` parameter needed — the algorithm is scale-invariant.
diff --git a/scripts/accuracy_report.sh b/scripts/accuracy_report.sh
index b426cea..a8426ee 100755
--- a/scripts/accuracy_report.sh
+++ b/scripts/accuracy_report.sh
@@ -21,13 +21,16 @@ PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
 cd "${PROJECT_ROOT}"
 
 MODE="print"
+SHOWPEAK=""
 for arg in "$@"; do
     case "$arg" in
         --update) MODE="update" ;;
+        --showpeak) SHOWPEAK="1" ;;
         -h|--help)
-            echo "Usage: scripts/accuracy_report.sh [--update]"
+            echo "Usage: scripts/accuracy_report.sh [--update] [--showpeak]"
             echo "  (no args)   Build test_tdd, run it, print accuracy table"
             echo "  --update    Also patch README.md, docs/README.md, pages/index.html"
+            echo "  --showpeak  Add a 'Peak at' column showing the input that produced max % error"
             exit 0
             ;;
         *) echo "Unknown option: $arg" >&2; exit 1 ;;
@@ -45,7 +48,11 @@ make -s build/test_tdd 2>&1 >&2
 # 2. Run and capture the accuracy table
 # -----------------------------------------------------------------------
 echo "Running test_tdd..." >&2
-OUTPUT=$(./build/test_tdd 2>/dev/null)
+if [ -n "$SHOWPEAK" ]; then
+    OUTPUT=$(FR_SHOWPEAK=1 ./build/test_tdd 2>/dev/null)
+else
+    OUTPUT=$(./build/test_tdd 2>/dev/null)
+fi
 
 # Extract lines between sentinels (inclusive)
 TABLE=$(echo "$OUTPUT" | sed -n '/<!-- ACCURACY_TABLE_START -->/,/<!-- ACCURACY_TABLE_END -->/p')
diff --git a/src/FR_math.c b/src/FR_math.c
index 82936de..9edfdf3 100644
--- a/src/FR_math.c
+++ b/src/FR_math.c
@@ -262,34 +262,48 @@ s32 FR_acos(s32 input, u16 radix, u16 out_radix)
 	s16 sign;
 	s32 lo, hi, mid;
 	s32 idx, d, num, frac;
+	s32 input_abs;
 
-	v = FR_CHRDX(input, radix, FR_TRIG_PREC); /* to s0.15 */
+	/* Work with absolute value at the caller's radix — we'll need it for
+	 * the sqrt fast path before quantising to r15. */
+	sign = (input < 0) ? 1 : 0;
+	input_abs = sign ? -input : input;
 
-	/* Clamp range: acos(1.0) = 0, acos(-1.0) = pi */
-	if (v >=  32767) return 0;
-	if (v <= -32767) return FR_BAM2RAD(FR_BAM_HALF, out_radix); /* pi */
+	/* Clamp at the caller's radix — not at r15.  Near ±1.0 the r15
+	 * quantisation can round to 32767 even when the caller has sub-LSB
+	 * precision that the sqrt fast path can use. */
+	{
+		s32 one = (s32)1 << radix;
+		if (input_abs >= one)
+			return sign ? FR_BAM2RAD(FR_BAM_HALF, out_radix) : 0;
+	}
 
-	sign = (v < 0) ? 1 : 0;
-	if (v < 0) v = -v;
+	v = FR_CHRDX(input_abs, radix, FR_TRIG_PREC); /* |input| at s0.15 */
 
-	/* Small-angle fast path: when v is in the flat region near cos(0)
-	 * the table has only 2-8 LSBs of gap between entries, so linear
-	 * interpolation is very coarse.  Use the identity
-	 *   acos(x) ≈ sqrt(2*(1-x))   for x close to 1
-	 * which is exact in the limit and leverages FR_sqrt's precision.
-	 * Switch at table[3] (gap = 12 LSBs) — below that the table is fine.
-	 */
+	/* Small-angle fast path: when cos(θ) is close to 1.0, the table
+	 * has only 2-8 LSBs of gap per entry, so linear interpolation is
+	 * very coarse.  Use the identity  acos(x) ≈ sqrt(2*(1-x)).
+	 *
+	 * Key: compute 1-x at the CALLER's radix, not r15.  Near ±1.0 the
+	 * r15 quantisation crushes many distinct inputs to the same value
+	 * (cos(179.5°)..cos(179.9°) all round to 32767 at r15).  The
+	 * caller's higher-radix bits carry the angular information via the
+	 * identity 2·sin(θ/2) = sqrt(2(1-cos θ)) — the half-angle sin trick. */
 	if (v > gFR_COS_TAB_Q[7])
 	{
-		s32 one_minus_v = (s32)(1 << FR_TRIG_PREC) - v; /* 1.0 - x */
-		s32 two_omv     = one_minus_v << 1;          /* 2*(1-x) at radix 15 */
-		s32 rad15       = FR_sqrt(two_omv, FR_TRIG_PREC); /* radians at r15 */
-		s32 rad_out     = FR_CHRDX(rad15, FR_TRIG_PREC, out_radix);
+		s32 one = (s32)1 << radix;
+		s32 one_minus_x = one - input_abs;           /* 1-|x| at caller radix */
+		s32 two_omx     = one_minus_x << 1;          /* 2(1-|x|) at caller radix */
+		s32 rad_native  = FR_sqrt(two_omx, radix);   /* radians at caller radix */
+		s32 rad_out     = FR_CHRDX(rad_native, radix, out_radix);
 		if (sign)
 			rad_out = FR_BAM2RAD(FR_BAM_HALF, out_radix) - rad_out;
 		return rad_out;
 	}
 
+	/* Below this point we need the sign-stripped r15 value for the
+	 * binary search.  (v was already computed from input_abs above.) */
+
 	/* Binary search on the cosine quadrant table.  The table is
 	 * monotonically decreasing: gFR_COS_TAB_Q[0] = 32767 (cos 0°),
 	 * gFR_COS_TAB_Q[128] = 0 (cos 90°).
@@ -360,60 +374,22 @@ s32 FR_asin(s32 input, u16 radix, u16 out_radix)
 	return half_pi - FR_acos(input, radix, out_radix);
 }
 
-/* arctan table: gFR_ATAN_TAB[i] = atan(i/32) in degrees, scaled by 64
- * (i.e. fixed-point s.6), for i in [0..32]. So index 32 is atan(1) = 45°.
- *
- * Generated by:
- *   for i in 0..32: int(round(degrees(atan(i/32.0)) * 64))
- *
- *   i=0:   0   atan(0/32)   =  0°       *64 = 0
- *   i=1: 115   atan(1/32)   =  1.7899°  *64 = 114.55
- *   ...
- *   i=32: 2880 atan(32/32)  = 45°       *64 = 2880
- */
-static const s16 gFR_ATAN_TAB[33] = {
-       0,   115,   229,   343,   456,   568,   680,   790,
-     898,  1005,  1111,  1214,  1316,  1415,  1512,  1607,
-    1700,  1791,  1879,  1965,  2048,  2130,  2209,  2285,
-    2360,  2432,  2502,  2570,  2636,  2700,  2762,  2822,
-    2880
-};
-
-/* helper: arctan(t) for t in [0,1] in radix-16 input, returning BAM (u16).
- * Uses the gFR_ATAN_TAB table with linear interpolation.
- *
- * t is in s.16. The table indexes into [0,1] in 32 steps, so the table
- * step in s.16 units is (1<<16)/32 = 2048.
- *
- * The atan table stores degrees*64 (s.6). We convert to BAM internally:
- * bam = deg64 * 65536 / (360 * 64) = deg64 * (65536 / 23040).
- * Approximation: bam ≈ (deg64 * 182) >> 6, matching FR_DEG2BAM precision.
- */
-static u16 fr_atan_unit_q1_bam(s32 t_s16)
-{
-	s32 idx, frac, lo, hi, deg64;
-	if (t_s16 <= 0) return 0;
-	if (t_s16 >= (1L << 16)) return FR_BAM_QUADRANT >> 1; /* 45° in BAM */
-	idx  = t_s16 >> 11;        /* 2048 = 1<<11 */
-	frac = t_s16 & ((1L << 11) - 1);
-	lo = gFR_ATAN_TAB[idx];
-	hi = gFR_ATAN_TAB[idx + 1];
-	deg64 = lo + (((hi - lo) * frac) >> 11);
-	/* Convert degrees*64 → BAM: bam = deg64 * (65536/360) / 64
-	 *                              ≈ (deg64 * 182 + 32) >> 6
-	 */
-	return (u16)(((s32)deg64 * 182L + 32) >> 6);
-}
-
 /* FR_atan2(y, x, out_radix) — full-circle arctangent, returns radians
  * at the specified output radix (s32).
  *
  * Range: [-pi, pi]. Returns 0 for atan2(0,0).
+ *
+ * Implementation: normalise (x,y) via FR_hypot_fast8, then recover the
+ * angle with FR_asin or FR_acos (both use the 129-entry cosine table).
+ * To stay in the well-conditioned region of each inverse function we
+ * switch at 45°:
+ *   |y| <= |x|  →  use asin(y/h)   — asin stable near 0
+ *   |y| >  |x|  →  use acos(x/h)   — acos stable near pi/2
+ * This keeps the derivative amplification factor below 1.414x everywhere.
  */
 s32 FR_atan2(s32 y, s32 x, u16 out_radix)
 {
-	s32 ay, ax, ratio;
-	u16 bam;
+	s32 ax, ay, h, q1_angle;
 
 	/* Axis cases — exact angles, no divide. */
 	if (x == 0)
@@ -428,26 +404,60 @@ s32 FR_atan2(s32 y, s32 x, u16 out_radix)
 	ax = (x < 0) ? -x : x;
 	ay = (y < 0) ? -y : y;
 
-	/* Compute ratio of smaller / larger in s.16 so it stays in [0,1]. */
+	/* Normalise so max(ax,ay) sits in [2^14, 2^16).  This gives
+	 * FR_hypot_fast8 enough integer bits for the shift-only segments
+	 * to produce an accurate ratio — critical when the raw inputs are
+	 * small (e.g. atan2(1,1) at radix 0).  Scaling both by the same
+	 * power of two doesn't change the angle. */
+	{
+		s32 mx = (ax > ay) ? ax : ay;
+		while (mx < (1L << 14)) { ax <<= 1; ay <<= 1; mx <<= 1; }
+		while (mx >= (1L << 16)) { ax >>= 1; ay >>= 1; mx >>= 1; }
+	}
+
+	h = FR_hypot_fast8((s32)ax, (s32)ay);
+	if (h == 0) return 0;  /* degenerate */
+
+	/* Compute the first-quadrant angle (positive, [0..pi/2]).
+	 * Divide produces a value in [0,1] at radix FR_TRIG_PREC (s0.15).
+	 *
+	 * Small-angle fast path: when the minor-axis ratio is small,
+	 * asin(x) ≈ x (error ≈ x³/6).  Below ~5° the cubic term is
+	 * smaller than the table-lookup error, so the direct identity
+	 * is both faster and more accurate.  Threshold 2753 at r15
+	 * corresponds to sin(~4.8°) = 0.084. */
+	#define FR_ATAN2_SMALL  2753
 	if (ay <= ax)
 	{
-		ratio = (s32)(((int64_t)ay << 16) / ax);
-		bam = fr_atan_unit_q1_bam(ratio);                  /* [0..45°] BAM */
+		/* angle in [0°..45°]: use asin(ay/h) — well-conditioned near 0 */
+		s32 sin_val = (s32)(((int64_t)ay << FR_TRIG_PREC) / h);
+		if (sin_val < FR_ATAN2_SMALL)
+			q1_angle = FR_CHRDX(sin_val, FR_TRIG_PREC, out_radix);
+		else
+			q1_angle = FR_asin(sin_val, FR_TRIG_PREC, out_radix);
 	}
 	else
 	{
-		ratio = (s32)(((int64_t)ax << 16) / ay);
-		bam = (u16)(FR_BAM_QUADRANT - fr_atan_unit_q1_bam(ratio)); /* [45..90°] BAM */
+		/* angle in [45°..90°]: use acos(ax/h) — well-conditioned near pi/2 */
+		s32 cos_val = (s32)(((int64_t)ax << FR_TRIG_PREC) / h);
+		if (cos_val < FR_ATAN2_SMALL)
+		{
+			/* angle ≈ pi/2 - cos_val (symmetric small-angle identity) */
+			s32 half_pi = FR_BAM2RAD(FR_BAM_QUADRANT, out_radix);
+			q1_angle = half_pi - FR_CHRDX(cos_val, FR_TRIG_PREC, out_radix);
+		}
+		else
+			q1_angle = FR_acos(cos_val, FR_TRIG_PREC, out_radix);
 	}
 
-	/* Apply quadrant sign and convert BAM → radians. */
+	/* Apply quadrant from signs of x and y.
+	 * q1_angle is always positive [0..pi/2]. */
 	{
-		s32 rad = FR_BAM2RAD(bam, out_radix);
-		s32 pi  = FR_BAM2RAD(FR_BAM_HALF, out_radix);
+		s32 pi = FR_BAM2RAD(FR_BAM_HALF, out_radix);
 		if (x > 0)
-			return (y > 0) ? rad : -rad;
-		/* x < 0 */
-		return (y > 0) ? (pi - rad) : (rad - pi);
+			return (y > 0) ? q1_angle : -q1_angle;
+		/* x < 0: mirror across y-axis */
+		return (y > 0) ? (pi - q1_angle) : (q1_angle - pi);
 	}
 }
 
@@ -1012,66 +1022,12 @@ s32 FR_hypot(s32 x, s32 y, u16 radix)
 	return (s32)fr_isqrt64(xx + yy);
 }
 
-/*=======================================================
- * FR_hypot_fast — 4-segment piecewise-linear magnitude approximation.
- *
- * Computes an approximation of sqrt(x*x + y*y) using only shifts and adds
- * (no multiply, no divide, no 64-bit math, no ROM table, no iteration).
- *
- * Based on the piecewise-linear method described in US Patent 6,567,777 B1
- * (Chatterjee, now public domain). The algorithm:
- *   1. Take absolute values, assign hi = max(|x|,|y|), lo = min(|x|,|y|).
- *   2. Determine which of 4 angular slices lo/hi falls into.
- *   3. Apply pre-computed shift-only linear coefficients for that slice.
- *
- * Peak error: ~0.4%.
- * The result is at the same radix as the inputs — scale-invariant.
- */
-s32 FR_hypot_fast(s32 x, s32 y)
-{
-    s32 hi, lo;
-
-    /* absolute values (clamp INT32_MIN to INT32_MAX to avoid UB) */
-    if (x < 0) x = (x == (s32)0x80000000) ? 0x7FFFFFFF : -x;
-    if (y < 0) y = (y == (s32)0x80000000) ? 0x7FFFFFFF : -y;
-
-    /* hi = max(|x|,|y|), lo = min(|x|,|y|) */
-    if (x > y) { hi = x; lo = y; }
-    else       { hi = y; lo = x; }
-
-    if (hi == 0) return 0;
-
-    /* 4 piecewise-linear segments: dist ≈ a*hi + b*lo
-     * where a,b are shift-only minimax fits of sqrt(1+β²) on each
-     * interval, β = lo/hi. Boundaries at β = 0.25, 0.5, 0.75. */
-    if ((hi >> 1) < lo) {
-        /* β in (0.5, 1.0] */
-        if (lo > hi - (hi >> 2))                      /* β > 0.75 */
-            /* a≈0.7559, b≈0.6567 */
-            return hi - (hi >> 2) + (hi >> 7) - (hi >> 9)
-                 + (lo >> 1) + (lo >> 3) + (lo >> 5) + (lo >> 11);
-        else                                           /* β in (0.5, 0.75] */
-            /* a≈0.8555, b≈0.5225 */
-            return hi - (hi >> 3) - (hi >> 6) - (hi >> 8)
-                 + (lo >> 1) + (lo >> 5) - (lo >> 7) - (lo >> 10);
-    } else {
-        /* β in [0, 0.5] */
-        if ((hi >> 2) < lo)                            /* β in (0.25, 0.5] */
-            /* a≈0.9409, b≈0.3477 */
-            return hi - (hi >> 4) + (hi >> 8) - (hi >> 11)
-                 + (lo >> 1) - (lo >> 3) - (lo >> 5) + (lo >> 8);
-        else                                           /* β in [0, 0.25] */
-            /* a≈0.9966, b≈0.1209 */
-            return hi - (hi >> 8) + (hi >> 11)
-                 + (lo >> 3) - (lo >> 8) - (lo >> 12);
-    }
-}
-
 /*=======================================================
  * FR_hypot_fast8 — 8-segment piecewise-linear magnitude approximation.
  *
- * Same approach as FR_hypot_fast but with 8 angular slices for tighter fit.
- * Peak error: ~0.14%.
+ * Shift-only, no multiply, no 64-bit.  Based on the piecewise-linear
+ * method described in US Patent 6,567,777 B1 (Chatterjee, expired).
+ * Peak error: ~0.10%.
  */
 s32 FR_hypot_fast8(s32 x, s32 y)
 {
diff --git a/src/FR_math.h b/src/FR_math.h
index c90d815..fc3283d 100644
--- a/src/FR_math.h
+++ b/src/FR_math.h
@@ -480,14 +480,12 @@ static inline s32 FR_div_rnd(s64 num, s32 den) {
    * Based on piecewise-linear approximation of sqrt(x*x + y*y).
    * See US Patent 6,567,777 B1 (Chatterjee, expired).
    *
-   *   FR_hypot_fast(x, y)   4-segment, ~0.4% peak error
-   *   FR_hypot_fast8(x, y)  8-segment, ~0.14% peak error
+   *   FR_hypot_fast8(x, y)  8-segment, ~0.10% peak error
    *
    * Inputs are raw signed integers (or fixed-point at any radix — the
    * result is at the same radix as the inputs, just like FR_hypot).
    * No radix parameter needed because the algorithm is scale-invariant.
    */
-  s32 FR_hypot_fast(s32 x, s32 y);
   s32 FR_hypot_fast8(s32 x, s32 y);
 
 /*===============================================
diff --git a/tests/test_full_coverage.c b/tests/test_full_coverage.c
index e16eaf9..0dfd248 100644
--- a/tests/test_full_coverage.c
+++ b/tests/test_full_coverage.c
@@ -440,36 +440,7 @@ int test_sqrt_hypot() {
     result = FR_hypot(I2FR(5, 16), I2FR(12, 16), 16);
     if (result < I2FR(13, 16) - 2 || result > I2FR(13, 16) + 2) return TEST_FAIL;
 
-    /* FR_hypot_fast (4-seg) — same test cases, wider tolerance (~0.3%) */
-    result = FR_hypot_fast(I2FR(3, 16), I2FR(4, 16));
-    /* 0.3% of 5.0 at radix 16 = 0.015 * 65536 ≈ 983, use 1000 */
-    if (result < I2FR(5, 16) - 1000 || result > I2FR(5, 16) + 1000) return TEST_FAIL;
-
-    result = FR_hypot_fast(0, 0);
-    if (result != 0) return TEST_FAIL;
-
-    result = FR_hypot_fast(I2FR(-3, 16), I2FR(-4, 16));
-    if (result < I2FR(5, 16) - 1000 || result > I2FR(5, 16) + 1000) return TEST_FAIL;
-
-    result = FR_hypot_fast(I2FR(5, 16), I2FR(12, 16));
-    if (result < I2FR(13, 16) - 2600 || result > I2FR(13, 16) + 2600) return TEST_FAIL;
-
-    /* Edge: one axis zero — tolerance is 0.4% of expected */
-    result = FR_hypot_fast(I2FR(7, 16), 0);
-    if (result < I2FR(7, 16) - 2000 || result > I2FR(7, 16) + 2000) return TEST_FAIL;
-
-    result = FR_hypot_fast(0, I2FR(7, 16));
-    if (result < I2FR(7, 16) - 2000 || result > I2FR(7, 16) + 2000) return TEST_FAIL;
-
-    /* Equal axes: hypot(1,1) = sqrt(2) ≈ 1.41421 */
-    result = FR_hypot_fast(I2FR(1, 16), I2FR(1, 16));
-    if (result < 92000 || result > 93300) return TEST_FAIL;
-
-    /* INT32_MIN must not crash (UB in negation) */
-    result = FR_hypot_fast((s32)0x80000000, 0);
-    if (result <= 0) return TEST_FAIL;
-
-    /* FR_hypot_fast8 (8-seg) — tighter tolerance (~0.1%) */
+    /* FR_hypot_fast8 (8-seg) — ~0.1% tolerance */
     result = FR_hypot_fast8(I2FR(3, 16), I2FR(4, 16));
     if (result < I2FR(5, 16) - 400 || result > I2FR(5, 16) + 400) return TEST_FAIL;
 
diff --git a/tests/test_tdd.cpp b/tests/test_tdd.cpp
index 31ce58e..80ae526 100644
--- a/tests/test_tdd.cpp
+++ b/tests/test_tdd.cpp
@@ -67,9 +67,12 @@ typedef struct {
     double sum_abs_err;
     double max_pct_err;
     double sum_pct_err;
-    double worst_input;
+    double worst_input;        /* input that produced max abs error */
     double worst_actual;
     double worst_expected;
+    double worst_pct_input;    /* input that produced max pct error */
+    double worst_pct_actual;
+    double worst_pct_expected;
 } stats_t;
 
 static void stats_reset(stats_t *s) {
@@ -88,7 +91,12 @@ static void stats_add(stats_t *s, double in, double actual, double expected) {
     s->sum_abs_err += e;
     /* Skip percent error when expected ≈ 0 to avoid division artifacts */
     double pct = (fabs(expected) > 0.01) ? (e / fabs(expected)) * 100.0 : 0.0;
-    if (pct > s->max_pct_err) s->max_pct_err = pct;
+    if (pct > s->max_pct_err) {
+        s->max_pct_err = pct;
+        s->worst_pct_input = in;
+        s->worst_pct_actual = actual;
+        s->worst_pct_expected = expected;
+    }
     s->sum_pct_err += pct;
     s->n++;
 }
@@ -101,6 +109,18 @@ static double stats_mean_pct(const stats_t *s) {
     return s->n ? s->sum_pct_err / s->n : 0.0;
 }
 
+/* Set by FR_SHOWPEAK env var — adds a "Peak at" column to the accuracy table */
+static int g_showpeak = 0;
+
+/* Print one accuracy table row, optionally with peak-error input */
+static void acc_row(const char *name, const stats_t *s, double lsb, const char *note) {
+    printf("| %s | %.1f | %.4f | %.4f | %s",
+           name, s->max_abs_err / lsb, s->max_pct_err, stats_mean_pct(s), note);
+    if (g_showpeak)
+        printf(" | %.4g", s->worst_pct_input);
+    printf(" |\n");
+}
+
 static void md_h1(const char *t) { printf("\n# %s\n\n", t); }
 static void md_h2(const char *t) { printf("\n## %s\n\n", t); }
 static void md_h3(const char *t) { printf("\n### %s\n\n", t); }
@@ -1295,29 +1315,7 @@ static void section_v2_new(void) {
     table_row_stats("FR_hypot sweep", &hyp_stats);
     printf("\n");
 
-    md_h3("11.4b FR_hypot_fast (4-seg) vs hypot(), radix 16");
-    printf("| x | y | FR_hypot_fast | as double | hypot() | abs err | rel err%% |\n");
-    printf("|---:|---:|---:|---:|---:|---:|---:|\n");
-    stats_t hf4_stats; stats_reset(&hf4_stats);
-    for (int i = 0; i < (int)(sizeof(hyp_cases)/sizeof(hyp_cases[0])); i++) {
-        s32 fx = (s32)(hyp_cases[i].x * (1L << 16));
-        s32 fy = (s32)(hyp_cases[i].y * (1L << 16));
-        s32 r  = FR_hypot_fast(fx, fy);
-        double rd = frd(r, 16);
-        double ref = hypot(hyp_cases[i].x, hyp_cases[i].y);
-        double err = rd - ref; if (err < 0) err = -err;
-        double rel = (ref > 0) ? err / ref * 100.0 : 0.0;
-        stats_add(&hf4_stats, sqrt(hyp_cases[i].x*hyp_cases[i].x + hyp_cases[i].y*hyp_cases[i].y),
-                  rd, ref);
-        printf("| %g | %g | %ld | %.6g | %.6g | %.4g | %.4g |\n",
-               hyp_cases[i].x, hyp_cases[i].y, (long)r, rd, ref, err, rel);
-    }
-    printf("\n");
-    table_header_stats();
-    table_row_stats("FR_hypot_fast sweep", &hf4_stats);
-    printf("\n");
-
-    md_h3("11.4c FR_hypot_fast8 (8-seg) vs hypot(), radix 16");
+    md_h3("11.4b FR_hypot_fast8 (8-seg) vs hypot(), radix 16");
     printf("| x | y | FR_hypot_fast8 | as double | hypot() | abs err | rel err%% |\n");
     printf("|---:|---:|---:|---:|---:|---:|---:|\n");
     stats_t hf8_stats; stats_reset(&hf8_stats);
@@ -1733,8 +1731,8 @@ static void section_summary(void) {
     printf("| fr_cos / fr_sin / fr_cos_bam / fr_sin_bam / fr_cos_deg / fr_sin_deg | OK | 6 | s15.16 output; 129-entry quadrant table with round-to-nearest linear interp; exact at cardinal angles |\n");
     printf("| FR_acos | OK | 7.1 | Max error ~0.83° over [-1, +1] swept at 200 points |\n");
     printf("| FR_asin | OK | 7.2 | Same precision as FR_acos |\n");
-    printf("| FR_atan2 | OK | 7.3 | Octant-reduced arctan with 33-entry table; max err ≤1°; signature `FR_atan2(y, x)` returns degrees |\n");
-    printf("| FR_atan | OK | 7.3 | `FR_atan(x, radix)` calls `FR_atan2(x, 1<<radix)` |\n");
+    printf("| FR_atan2 | OK | 7.3 | Via asin/acos + hypot_fast8; 129-entry cos table; `FR_atan2(y, x, out_radix)` returns radians |\n");
+    printf("| FR_atan | OK | 7.3 | `FR_atan(x, radix, out_radix)` calls `FR_atan2(x, 1<<radix, out_radix)` |\n");
     printf("| FR_pow2 (positive integer x) | OK | 8.1 | Bit-exact for integer exponents in test range |\n");
     printf("| FR_pow2 (positive fractional x) | OK | 8.1, 8.2 | ~1e-6 error |\n");
     printf("| FR_pow2 (negative fractional x) | OK | 8.1, 8.2 | Mathematical floor (toward −∞); 65-entry fraction table with linear interp |\n");
@@ -1785,16 +1783,26 @@ static void section_accuracy_table(void) {
     md_h2("14. Accuracy Summary Table");
 
     printf("<!-- ACCURACY_TABLE_START -->\n");
-    printf("| Function | Max err (LSB) | Max err (%%) | Avg err (%%) | Note |\n");
-    printf("|---|---:|---:|---:|---|\n");
+    if (g_showpeak) {
+        printf("| Function | Max err (LSB) | Max err (%%) | Avg err (%%) | Note | Peak at |\n");
+        printf("|---|---:|---:|---:|---|---:|\n");
+    } else {
+        printf("| Function | Max err (LSB) | Max err (%%) | Avg err (%%) | Note |\n");
+        printf("|---|---:|---:|---:|---|\n");
+    }
 
     const int R = 16;
     const double scale = (double)(1L << R);
     const double lsb = 1.0 / scale;
 
+    /* Persistent stats so we can print diagnostics after the table */
+    stats_t st_sincos, st_tan, st_asincos, st_atan2;
+    stats_reset(&st_sincos); stats_reset(&st_tan);
+    stats_reset(&st_asincos); stats_reset(&st_atan2);
+
     /* --- sin / cos --- */
     {
-        stats_t st; stats_reset(&st);
+        stats_t &st = st_sincos;
         const u16 radix = 7; /* s8.7 degrees: 128 steps/deg, [-256°,+256°) */
         /* 65536-point sweep: all s16 values at radix 7 cover > full circle */
         for (int i = -32768; i <= 32767; i++) {
@@ -1812,13 +1820,12 @@ static void section_accuracy_table(void) {
             stats_add(&st, d, frd(FR_SinI(d), FR_TRIG_OUT_PREC), sin(rad));
             stats_add(&st, d, frd(FR_CosI(d), FR_TRIG_OUT_PREC), cos(rad));
         }
-        printf("| sin / cos | %.1f | %.4f | %.4f | 65536-pt sweep + specials |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("sin / cos", &st, lsb, "65536-pt sweep + specials");
     }
 
     /* --- tan --- */
     {
-        stats_t st; stats_reset(&st);
+        stats_t &st = st_tan;
         const u16 radix = 7;
         for (int i = -32768; i <= 32767; i++) {
             double deg = (double)i / (1 << radix);
@@ -1834,13 +1841,12 @@ static void section_accuracy_table(void) {
             double rad = d * M_PI / 180.0;
             stats_add(&st, d, frd(FR_TanI(d), FR_TRIG_OUT_PREC), tan(rad));
         }
-        printf("| tan | %.1f | %.4f | %.4f | 65536-pt sweep (skip poles) |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("tan", &st, lsb, "65536-pt sweep (skip poles)");
     }
 
     /* --- asin / acos --- */
     {
-        stats_t st; stats_reset(&st);
+        stats_t &st = st_asincos;
         /* 65536-point sweep: all representable values at radix 15 over [-1, +1) */
         for (int i = -32768; i <= 32767; i++) {
             double xd = (double)i / (1 << 15);
@@ -1850,18 +1856,19 @@ static void section_accuracy_table(void) {
             rad = FR_acos((s32)i, 15, R);
             stats_add(&st, xd, frd(rad, R), acos(xd));
         }
-        printf("| asin / acos | %.1f | %.4f | %.4f | 65536-pt; sqrt approx near boundary |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("asin / acos", &st, lsb, "65536-pt; sqrt approx near boundary");
     }
 
     /* --- atan2 --- */
     {
-        stats_t st; stats_reset(&st);
+        stats_t &st = st_atan2;
         /* 65536-point sweep at each radius.
          * Skip i=-32768 (exactly -pi): branch-cut convention differs
          * between FR_atan2 (+pi) and libm (-pi), both correct.
          * Start radii at 0.1 — at 0.01 inputs have <10 LSBs of angular
-         * resolution, testing input quantization not the algorithm. */
+         * resolution, testing input quantization not the algorithm.
+         * Also skip points where the minor axis has < 8 bits (256 counts)
+         * — below that, input quantization dominates over algorithm error. */
         double radii[] = {0.1, 1.0, 10.0, 100.0, 1000.0};
         for (int ri = 0; ri < (int)(sizeof(radii)/sizeof(radii[0])); ri++) {
             double rad = radii[ri];
@@ -1871,6 +1878,10 @@ static void section_accuracy_table(void) {
                 s32 fx = (s32)(x * scale);
                 s32 fy = (s32)(y * scale);
                 if (fx == 0 && fy == 0) continue;
+                s32 afx = (fx < 0) ? -fx : fx;
+                s32 afy = (fy < 0) ? -fy : fy;
+                s32 minor = (afx < afy) ? afx : afy;
+                if (minor < 256) continue; /* input quantization, not algo */
                 s32 r = FR_atan2(fy, fx, R);
                 double ref = atan2(y, x);
                 /* Skip near ±pi branch cut: sign depends on sub-LSB
@@ -1890,8 +1901,24 @@ static void section_accuracy_table(void) {
             s32 r = FR_atan2(fy, fx, R);
             stats_add(&st, specials_deg[si], frd(r, R), atan2(y, x));
         }
-        printf("| atan2 | %.1f | %.4f | %.4f | 65536x5 radii + specials |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("atan2", &st, lsb, "65536x5 radii; asin/acos+hypot_fast8");
+    }
+
+    /* --- atan --- */
+    {
+        stats_t st; stats_reset(&st);
+        /* Sweep atan(x) for x in [-10, 10] in uniform steps of 0.001.
+         * FR_atan(input, radix, out_radix) calls FR_atan2(input, 1<<radix, out_radix).
+         * Skip |expected| < 0.01 to match the percent-error convention. */
+        for (int i = -10000; i <= 10000; i++) {
+            double x = i / 1000.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_atan(fr, (u16)R, (u16)R);
+            double ref = atan(x);
+            if (fabs(ref) < 0.01) continue;
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        acc_row("atan", &st, lsb, "20001-pt sweep [-10,10]; via FR_atan2");
     }
 
     /* --- sqrt --- */
@@ -1910,8 +1937,7 @@ static void section_accuracy_table(void) {
             s32 r = FR_sqrt(fr, R);
             stats_add(&st, x, frd(r, R), sqrt(x));
         }
-        printf("| sqrt | %.1f | %.4f | %.4f | Round-to-nearest |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("sqrt", &st, lsb, "Round-to-nearest");
     }
 
     /* --- log2 --- */
@@ -1932,8 +1958,7 @@ static void section_accuracy_table(void) {
             s32 r = FR_log2(fr, (u16)R, (u16)R);
             stats_add(&st, x, frd(r, R), log2(x));
         }
-        printf("| log2 | %.1f | %.4f | %.4f | 65-entry mantissa table |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("log2", &st, lsb, "65-entry mantissa table");
     }
 
     /* --- pow2 --- */
@@ -1946,8 +1971,7 @@ static void section_accuracy_table(void) {
             double ref = pow(2.0, x);
             stats_add(&st, x, frd(r, R), ref);
         }
-        printf("| pow2 | %.1f | %.4f | %.4f | 65-entry fraction table |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("pow2", &st, lsb, "65-entry fraction table");
     }
 
     /* --- ln, log10 --- */
@@ -1964,8 +1988,7 @@ static void section_accuracy_table(void) {
             ref = log10(inputs[i]);
             stats_add(&st, inputs[i], frd(r, R), ref);
         }
-        printf("| ln, log10 | %.1f | %.4f | %.4f | Via FR_MULK28 from log2 |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("ln, log10", &st, lsb, "Via FR_MULK28 from log2");
     }
 
     /* --- exp (FR_EXP) --- */
@@ -1979,8 +2002,7 @@ static void section_accuracy_table(void) {
             if (ref > 32000.0 || ref < 1e-6) continue; /* skip overflow/underflow */
             stats_add(&st, x, frd(r, R), ref);
         }
-        printf("| exp | %.1f | %.4f | %.4f | FR_MULK28 + FR_pow2 |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("exp", &st, lsb, "FR_MULK28 + FR_pow2");
     }
 
     /* --- exp_fast (FR_EXP_FAST) --- */
@@ -1994,8 +2016,7 @@ static void section_accuracy_table(void) {
             if (ref > 32000.0 || ref < 1e-6) continue;
             stats_add(&st, x, frd(r, R), ref);
         }
-        printf("| exp_fast | %.1f | %.4f | %.4f | Shift-only scaling |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("exp_fast", &st, lsb, "Shift-only scaling");
     }
 
     /* --- pow10 (FR_POW10) --- */
@@ -2009,8 +2030,7 @@ static void section_accuracy_table(void) {
             if (ref > 32000.0 || ref < 1e-6) continue;
             stats_add(&st, x, frd(r, R), ref);
         }
-        printf("| pow10 | %.1f | %.4f | %.4f | FR_MULK28 + FR_pow2 |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("pow10", &st, lsb, "FR_MULK28 + FR_pow2");
     }
 
     /* --- pow10_fast (FR_POW10_FAST) --- */
@@ -2024,8 +2044,7 @@ static void section_accuracy_table(void) {
             if (ref > 32000.0 || ref < 1e-6) continue;
             stats_add(&st, x, frd(r, R), ref);
         }
-        printf("| pow10_fast | %.1f | %.4f | %.4f | Shift-only scaling |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("pow10_fast", &st, lsb, "Shift-only scaling");
     }
 
     /* --- hypot (exact) --- */
@@ -2042,26 +2061,7 @@ static void section_accuracy_table(void) {
             double ref = hypot(cases[i].x, cases[i].y);
             stats_add(&st, ref, frd(r, R), ref);
         }
-        printf("| hypot (exact) | %.1f | %.4f | %.4f | 64-bit intermediate |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
-    }
-
-    /* --- hypot_fast (4-seg) --- */
-    {
-        stats_t st; stats_reset(&st);
-        struct { double x, y; } cases[] = {
-            {1,0},{0,1},{3,4},{5,12},{8,15},{-3,-4},{1,1},{0.5,0.5},
-            {100,100},{1000,1},{1,1000},{7,24},{20,21}
-        };
-        for (int i = 0; i < (int)(sizeof(cases)/sizeof(cases[0])); i++) {
-            s32 fx = (s32)(cases[i].x * scale);
-            s32 fy = (s32)(cases[i].y * scale);
-            s32 r = FR_hypot_fast(fx, fy);
-            double ref = hypot(cases[i].x, cases[i].y);
-            if (ref > 0) stats_add(&st, ref, frd(r, R), ref);
-        }
-        printf("| hypot_fast (4-seg) | %.1f | %.4f | %.4f | Shift-only, no multiply |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("hypot (exact)", &st, lsb, "64-bit intermediate");
     }
 
     /* --- hypot_fast8 (8-seg) --- */
@@ -2078,15 +2078,39 @@ static void section_accuracy_table(void) {
             double ref = hypot(cases[i].x, cases[i].y);
             if (ref > 0) stats_add(&st, ref, frd(r, R), ref);
         }
-        printf("| hypot_fast8 (8-seg) | %.1f | %.4f | %.4f | Shift-only, no multiply |\n",
-               st.max_abs_err / lsb, st.max_pct_err, stats_mean_pct(&st));
+        acc_row("hypot_fast8 (8-seg)", &st, lsb, "Shift-only, no multiply");
     }
 
     printf("<!-- ACCURACY_TABLE_END -->\n");
     printf("\n");
+
+    /* Diagnostic: show where each trig function's worst % error occurs */
+    md_h3("14.1 Worst-case percent error diagnostics");
+    printf("Shows the input that produced the maximum %% error for each trig function.\n");
+    printf("This helps identify whether the peak is a genuine algorithm limitation or\n");
+    printf("a near-zero denominator artifact.\n\n");
+    printf("| Function | Worst-pct input | Expected | Got | Abs err | Pct err |\n");
+    printf("|---|---|---:|---:|---:|---:|\n");
+
+    struct { const char *name; stats_t *s; } diag[] = {
+        {"sin / cos", &st_sincos},
+        {"tan",       &st_tan},
+        {"asin/acos", &st_asincos},
+        {"atan2",     &st_atan2},
+    };
+    for (int d = 0; d < (int)(sizeof(diag)/sizeof(diag[0])); d++) {
+        stats_t *s = diag[d].s;
+        double ae = fabs(s->worst_pct_actual - s->worst_pct_expected);
+        printf("| %s | %.4f | %.6f | %.6f | %.6f | %.4f%% |\n",
+               diag[d].name, s->worst_pct_input,
+               s->worst_pct_expected, s->worst_pct_actual, ae,
+               s->max_pct_err);
+    }
+    printf("\n");
 }
 
 int main(void) {
+    g_showpeak = (getenv("FR_SHOWPEAK") != NULL);
     md_h1("FR_Math TDD Characterization Report");
     printf("> Generated by `tests/test_tdd.cpp`. This is a measurement suite, not a pass/fail suite.\n");
     printf("> All numbers below are *what the library actually does*, compared to libm `double` references.\n");

From 12a6936afa9fd54ab6211e687bd9f7a19db47299 Mon Sep 17 00:00:00 2001
From: deftio <deftio@deftio.com>
Date: Tue, 21 Apr 2026 21:31:17 -0700
Subject: [PATCH 3/6] docs and accuracy tables clean up, now with defines for
 leaner builds.

---
 README.md                           | 42 +++++++++++++---
 agents.md                           |  9 ++++
 compare_lfm/benchmark.cpp           | 14 +++---
 compare_lfm/comparison_results.json | 76 ++++++++++++++---------------
 compare_lfm/comparison_summary.md   | 38 +++++++--------
 docs/README.md                      | 31 +++++++++++-
 docs/building.md                    | 28 ++++++++++-
 docs/releases.md                    | 12 +++++
 keywords.txt                        |  1 -
 llms.txt                            |  8 ++-
 pages/guide/building.html           | 29 ++++++++++-
 pages/index.html                    | 32 +++++++++++-
 pages/releases.html                 | 14 ++++++
 release_notes.md                    | 37 ++++++++++++++
 src/FR_math.c                       |  4 ++
 src/FR_math.h                       | 30 ++++++++++++
 16 files changed, 327 insertions(+), 78 deletions(-)

diff --git a/README.md b/README.md
index b9c9a74..e8853da 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,9 @@ beyond `<stdint.h>`.
 
 ### Library size (FR_math.c only, `-Os`)
 
-The following are compilied object code sizes on select platforms from static test build.  Your sizes may vary depending on optimization choices and linker settings.  Sizes include full code and any internal tables and are ROMable.
+Compiled object code sizes on select platforms (static test build). Your
+sizes may vary depending on optimization and linker settings. Sizes
+include all code and internal tables; everything is ROMable.
 
 | Target | Code (text) |
 |--------|-------------|
@@ -38,13 +40,41 @@ The optional 2D module adds ~1 KB.
 \* 8051 and MSP430 are 8/16-bit — every 32-bit operation expands to multiple instructions.
 See [`docker/`](docker/) for the cross-compile setup.
 
+### Lean build options
+
+Two compile-time `#define` guards let you strip optional subsystems
+for ROM-constrained targets. Define them before including `FR_math.h`
+(or pass `-D` on the compiler command line):
+
+| Define | What it removes | Typical savings |
+|---|---|---|
+| `FR_NO_PRINT` | `FR_printNumF`, `FR_printNumD`, `FR_printNumH`, `FR_numstr` | ~1.3 KB |
+| `FR_NO_WAVES` | `fr_wave_*` (6 shapes), `fr_adsr_*` (ADSR envelope), `FR_HZ2BAM_INC` | ~0.6 KB |
+
+With both guards enabled the core math library (trig, inverse trig, log/exp,
+sqrt, hypot) compiles to ~3.5 KB on x86-64 / clang -Os. On Thumb-2 this
+would be roughly 2.6 KB.
+
+```c
+/* Example: headless sensor node — math only, no print, no audio */
+#define FR_NO_PRINT
+#define FR_NO_WAVES
+#include "FR_math.h"
+```
+
+With `-ffunction-sections` and linker `--gc-sections`, the linker will
+also strip any unused functions automatically, so these guards are most
+useful when you include the library as a single `.c` file or static
+archive without section-level dead-code elimination.
+
 ### Measured accuracy
 
 Errors below are measured at Q16.16 (s15.16). All functions accept any
 radix — Q16.16 is just the reference point for the table.
 Percent errors skip expected values near zero (|expected| < 0.01).
 
-Note that at other radixes (3bit, 24 bit etc), accuracy may change due fractional bits available but with increased/decreased scale.  All functions support 0 to 30 bit radix types at compile time.
+At other radixes (3-bit, 24-bit, etc.) accuracy will differ due to the
+number of fractional bits available. All functions support radix 0 to 30.
 
 <!-- ACCURACY_TABLE_START -->
 | Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
@@ -88,7 +118,7 @@ Note that at other radixes (3bit, 24 bit etc), accuracy may change due fractiona
 git clone https://github.com/deftio/fr_math.git
 cd fr_math
 make lib       # build static library
-make test      # run all tests (coverage, TDD characterization, 2D)
+make test      # run all tests (unit, TDD characterization, 2D)
 ```
 
 ## Quick taste
@@ -120,7 +150,7 @@ s32 two  = I2FR(2, R);              /* 2.0 → raw 131072              */
  * UPPERCASE FR_ names are macros — they expand inline with no call
  * overhead, and the compiler can constant-fold them.  Use these for
  * conversions and simple arithmetic:
- *   I2FR, FR2I, FR_NUM, FR_ADD, FR_MUL, FR_DIV, FR_ABS, FR_EXP ...
+ *   I2FR, FR2I, FR_NUM, FR_ADD, FR_DIV, FR_ABS, FR_CHRDX, FR_EXP ...
  *
  * MixedCase FR_ names are functions — they contain loops, tables, or
  * multi-step algorithms where inlining would waste ROM:
@@ -176,7 +206,7 @@ The full docs ship in two forms — pick whichever fits how you read.
 FR_Math has been in service since 2000, originally built for graphics
 transforms on 16 MHz 68k Palm Pilots. It shipped inside Trumpetsoft's
 *Inkstorm* on PalmOS, then moved forward through ARM, x86, MIPS,
-RISC-V, and various 8/16-bit embedded targets. v2.0.2 is the current
+RISC-V, and various 8/16-bit embedded targets. v2.0.6 is the current
 release with a full test suite, bit-exact numerical specification, and
 CI on every push.
 
@@ -192,5 +222,5 @@ BSD-2-Clause — see [LICENSE.txt](LICENSE.txt).
 
 ## Version
 
-2.0.2 — see [release_notes.md](release_notes.md) for the v1 → v2
+2.0.6 — see [release_notes.md](release_notes.md) for the v1 → v2
 migration guide, numerical fixes, and new functionality.
diff --git a/agents.md b/agents.md
index f828fb6..2a461f6 100644
--- a/agents.md
+++ b/agents.md
@@ -90,6 +90,15 @@ Versioned files (all synced automatically):
 4. Verify `llms.txt` and `agents.md` are current with any API changes
 5. Commit, tag, push
 
+## Lean build options
+
+Define before including `FR_math.h` to exclude optional subsystems:
+
+| Define | Removes | Savings |
+|---|---|---|
+| `FR_NO_PRINT` | `FR_printNumF/D/H`, `FR_numstr` | ~1.3 KB |
+| `FR_NO_WAVES` | `fr_wave_*`, `fr_adsr_*`, `FR_HZ2BAM_INC` | ~0.6 KB |
+
 ## Platform targets
 
 The library compiles on: AVR (Arduino), ARM Cortex-M0/M4, ESP32,
diff --git a/compare_lfm/benchmark.cpp b/compare_lfm/benchmark.cpp
index 35a58fa..19739c3 100644
--- a/compare_lfm/benchmark.cpp
+++ b/compare_lfm/benchmark.cpp
@@ -695,8 +695,8 @@ static void emit_json(FILE *f, const std::vector<BenchResult>& results) {
     fprintf(f, "    \"compiler\": \"clang -O2 (macOS ARM)\",\n");
     fprintf(f, "    \"fr_math\": {\n");
     fprintf(f, "      \"files\": \"FR_math.c (single file)\",\n");
-    fprintf(f, "      \"functions\": \"trig(6), inv-trig(4), log/ln/log10, exp/pow2/pow10, exp_fast/pow10_fast, sqrt, hypot(3), waves(6), ADSR(4), print(4), format\",\n");
-    fprintf(f, "      \"rom_bytes\": 7722,\n");
+    fprintf(f, "      \"functions\": \"trig(6), inv-trig(4), log/ln/log10, exp/pow2/pow10, exp_fast/pow10_fast, sqrt, hypot(2), waves(6), ADSR(4), print(4), format\",\n");
+    fprintf(f, "      \"rom_bytes\": 7470,\n");
     fprintf(f, "      \"ram_bss_bytes\": 0,\n");
     fprintf(f, "      \"note\": \"All tables in const ROM. Zero runtime allocation.\"\n");
     fprintf(f, "    },\n");
@@ -786,13 +786,13 @@ static void emit_markdown(FILE *f, const std::vector<BenchResult>& results) {
     fprintf(f, "\n### Compiled size (clang -O2, macOS ARM)\n\n");
     fprintf(f, "| | FR_math | libfixmath | lfm (no cache) |\n");
     fprintf(f, "|---|---:|---:|---:|\n");
-    fprintf(f, "| Code (text) | 6,888 B | 4,880 B | 5,444 B |\n");
-    fprintf(f, "| Tables (ROM) | 834 B | 32 B | 32 B |\n");
-    fprintf(f, "| **ROM total** | **7,722 B** | **4,912 B** | **5,476 B** |\n");
+    fprintf(f, "| Code (text) | 6,652 B | 4,880 B | 5,444 B |\n");
+    fprintf(f, "| Tables (ROM) | 818 B | 32 B | 32 B |\n");
+    fprintf(f, "| **ROM total** | **7,470 B** | **4,912 B** | **5,476 B** |\n");
     fprintf(f, "| BSS / RAM | **0 B** | **112 KB** | **0 B** |\n");
     fprintf(f, "\n");
-    fprintf(f, "FR_math packs trig, inv-trig, log/ln/log10, exp/pow2/pow10, sqrt, hypot(3),\n");
-    fprintf(f, "waves(6), ADSR, print into 7.5 KB ROM with zero RAM overhead.\n");
+    fprintf(f, "FR_math packs trig, inv-trig, log/ln/log10, exp/pow2/pow10, sqrt, hypot(2),\n");
+    fprintf(f, "waves(6), ADSR, print into 7.3 KB ROM with zero RAM overhead.\n");
     fprintf(f, "libfixmath (trig, inv-trig, log/log2, exp, sqrt, mul/div, str) is 4.8 KB ROM\n");
     fprintf(f, "but caches 112 KB of sin/exp LUTs in BSS at runtime.\n");
     fprintf(f, "\n");
diff --git a/compare_lfm/comparison_results.json b/compare_lfm/comparison_results.json
index 73a81f7..adf0019 100644
--- a/compare_lfm/comparison_results.json
+++ b/compare_lfm/comparison_results.json
@@ -13,9 +13,9 @@
       "double_reference": "std::sin",
       "sweep": "65536-pt, [-pi, +pi]",
       "speed": {
-        "fr_math_ns_per_call": 2.5,
-        "libfixmath_ns_per_call": 10.3,
-        "fr_math_speedup": 4.06,
+        "fr_math_ns_per_call": 2.6,
+        "libfixmath_ns_per_call": 20.7,
+        "fr_math_speedup": 7.94,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -43,9 +43,9 @@
       "double_reference": "std::cos",
       "sweep": "65536-pt, [-pi, +pi]",
       "speed": {
-        "fr_math_ns_per_call": 2.3,
-        "libfixmath_ns_per_call": 10.3,
-        "fr_math_speedup": 4.51,
+        "fr_math_ns_per_call": 4.8,
+        "libfixmath_ns_per_call": 18.4,
+        "fr_math_speedup": 3.86,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -73,9 +73,9 @@
       "double_reference": "std::tan",
       "sweep": "65536-pt, [-1.2, 1.2] rad",
       "speed": {
-        "fr_math_ns_per_call": 4.2,
-        "libfixmath_ns_per_call": 29.5,
-        "fr_math_speedup": 7.02,
+        "fr_math_ns_per_call": 6.0,
+        "libfixmath_ns_per_call": 41.4,
+        "fr_math_speedup": 6.89,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -104,9 +104,9 @@
       "double_reference": "std::asin",
       "sweep": "65536-pt, [-0.999, 0.999]",
       "speed": {
-        "fr_math_ns_per_call": 9.0,
-        "libfixmath_ns_per_call": 49.8,
-        "fr_math_speedup": 5.55,
+        "fr_math_ns_per_call": 11.5,
+        "libfixmath_ns_per_call": 53.7,
+        "fr_math_speedup": 4.67,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -135,8 +135,8 @@
       "sweep": "65536-pt, [-0.999, 0.999]",
       "speed": {
         "fr_math_ns_per_call": 8.4,
-        "libfixmath_ns_per_call": 50.9,
-        "fr_math_speedup": 6.05,
+        "libfixmath_ns_per_call": 50.4,
+        "fr_math_speedup": 5.97,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -164,9 +164,9 @@
       "double_reference": "std::atan",
       "sweep": "65536-pt, [-50, 50]",
       "speed": {
-        "fr_math_ns_per_call": 8.4,
-        "libfixmath_ns_per_call": 11.4,
-        "fr_math_speedup": 1.35,
+        "fr_math_ns_per_call": 8.0,
+        "libfixmath_ns_per_call": 11.2,
+        "fr_math_speedup": 1.41,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -194,8 +194,8 @@
       "double_reference": "std::atan2",
       "sweep": "65536-pt, 5 radii x 360 deg",
       "speed": {
-        "fr_math_ns_per_call": 16.1,
-        "libfixmath_ns_per_call": 10.7,
+        "fr_math_ns_per_call": 15.9,
+        "libfixmath_ns_per_call": 10.5,
         "fr_math_speedup": 0.66,
         "faster": "libfixmath"
       },
@@ -225,9 +225,9 @@
       "double_reference": "std::sqrt",
       "sweep": "65536-pt, [0.01, 100]",
       "speed": {
-        "fr_math_ns_per_call": 19.2,
-        "libfixmath_ns_per_call": 20.7,
-        "fr_math_speedup": 1.08,
+        "fr_math_ns_per_call": 18.6,
+        "libfixmath_ns_per_call": 19.8,
+        "fr_math_speedup": 1.06,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -255,9 +255,9 @@
       "double_reference": "std::exp",
       "sweep": "65536-pt, [-5, 5]",
       "speed": {
-        "fr_math_ns_per_call": 3.2,
-        "libfixmath_ns_per_call": 65.2,
-        "fr_math_speedup": 20.21,
+        "fr_math_ns_per_call": 3.1,
+        "libfixmath_ns_per_call": 67.6,
+        "fr_math_speedup": 22.02,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -286,8 +286,8 @@
       "sweep": "65536-pt, [0.01, 100]",
       "speed": {
         "fr_math_ns_per_call": 8.8,
-        "libfixmath_ns_per_call": 457.0,
-        "fr_math_speedup": 51.86,
+        "libfixmath_ns_per_call": 479.3,
+        "fr_math_speedup": 54.70,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -315,9 +315,9 @@
       "double_reference": "std::log2",
       "sweep": "65536-pt, [0.01, 100]",
       "speed": {
-        "fr_math_ns_per_call": 8.9,
-        "libfixmath_ns_per_call": 40.2,
-        "fr_math_speedup": 4.50,
+        "fr_math_ns_per_call": 8.7,
+        "libfixmath_ns_per_call": 39.4,
+        "fr_math_speedup": 4.55,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -345,9 +345,9 @@
       "double_reference": "double a*b",
       "sweep": "65536-pt, a in [-50,50], b in [-2,2]",
       "speed": {
-        "fr_math_ns_per_call": 1.0,
-        "libfixmath_ns_per_call": 1.3,
-        "fr_math_speedup": 1.34,
+        "fr_math_ns_per_call": 0.9,
+        "libfixmath_ns_per_call": 1.2,
+        "fr_math_speedup": 1.33,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -376,8 +376,8 @@
       "sweep": "65536-pt, a/b in [-50,50]/[0.5,50]",
       "speed": {
         "fr_math_ns_per_call": 0.9,
-        "libfixmath_ns_per_call": 5.6,
-        "fr_math_speedup": 6.21,
+        "libfixmath_ns_per_call": 5.2,
+        "fr_math_speedup": 5.98,
         "faster": "fr_math"
       },
       "accuracy_vs_double": {
@@ -406,7 +406,7 @@
       "double_reference": "std::hypot",
       "sweep": "65536-pt, 5 radii x 360 deg",
       "speed": {
-        "fr_math_ns_per_call": 20.2
+        "fr_math_ns_per_call": 20.0
       },
       "accuracy_vs_double": {
         "fr_math": {
@@ -461,8 +461,8 @@
     "compiler": "clang -O2 (macOS ARM)",
     "fr_math": {
       "files": "FR_math.c (single file)",
-      "functions": "trig(6), inv-trig(4), log/ln/log10, exp/pow2/pow10, exp_fast/pow10_fast, sqrt, hypot(3), waves(6), ADSR(4), print(4), format",
-      "rom_bytes": 7722,
+      "functions": "trig(6), inv-trig(4), log/ln/log10, exp/pow2/pow10, exp_fast/pow10_fast, sqrt, hypot(2), waves(6), ADSR(4), print(4), format",
+      "rom_bytes": 7470,
       "ram_bss_bytes": 0,
       "note": "All tables in const ROM. Zero runtime allocation."
     },
diff --git a/compare_lfm/comparison_summary.md b/compare_lfm/comparison_summary.md
index 63e9490..9169c50 100644
--- a/compare_lfm/comparison_summary.md
+++ b/compare_lfm/comparison_summary.md
@@ -43,20 +43,20 @@ All errors measured vs IEEE 754 double. Pct errors skip |ref| < 0.01.
 
 | Function | FR_math | libfixmath | Speedup | Faster |
 |----------|--------:|-----------:|--------:|--------|
-| sin             |    2.5 |       10.3 |   4.06x | FR      |
-| cos             |    2.3 |       10.3 |   4.51x | FR      |
-| tan             |    4.2 |       29.5 |   7.02x | FR      |
-| asin            |    9.0 |       49.8 |   5.55x | FR      |
-| acos            |    8.4 |       50.9 |   6.05x | FR      |
-| atan            |    8.4 |       11.4 |   1.35x | FR      |
-| atan2           |   16.1 |       10.7 |   0.66x | lfm     |
-| sqrt            |   19.2 |       20.7 |   1.08x | FR      |
-| exp             |    3.2 |       65.2 |  20.21x | FR      |
-| ln              |    8.8 |      457.0 |  51.86x | FR      |
-| log2            |    8.9 |       40.2 |   4.50x | FR      |
-| mul             |    1.0 |        1.3 |   1.34x | FR      |
-| div             |    0.9 |        5.6 |   6.21x | FR      |
-| hypot           |   20.2 |        --- |     --- | FR only |
+| sin             |    2.6 |       20.7 |   7.94x | FR      |
+| cos             |    4.8 |       18.4 |   3.86x | FR      |
+| tan             |    6.0 |       41.4 |   6.89x | FR      |
+| asin            |   11.5 |       53.7 |   4.67x | FR      |
+| acos            |    8.4 |       50.4 |   5.97x | FR      |
+| atan            |    8.0 |       11.2 |   1.41x | FR      |
+| atan2           |   15.9 |       10.5 |   0.66x | lfm     |
+| sqrt            |   18.6 |       19.8 |   1.06x | FR      |
+| exp             |    3.1 |       67.6 |  22.02x | FR      |
+| ln              |    8.8 |      479.3 |  54.70x | FR      |
+| log2            |    8.7 |       39.4 |   4.55x | FR      |
+| mul             |    0.9 |        1.2 |   1.33x | FR      |
+| div             |    0.9 |        5.2 |   5.98x | FR      |
+| hypot           |   20.0 |        --- |     --- | FR only |
 | hypot_fast8     |    2.4 |        --- |     --- | FR only |
 
 ### Summary (13 head-to-head functions)
@@ -69,13 +69,13 @@ All errors measured vs IEEE 754 double. Pct errors skip |ref| < 0.01.
 
 | | FR_math | libfixmath | lfm (no cache) |
 |---|---:|---:|---:|
-| Code (text) | 6,888 B | 4,880 B | 5,444 B |
-| Tables (ROM) | 834 B | 32 B | 32 B |
-| **ROM total** | **7,722 B** | **4,912 B** | **5,476 B** |
+| Code (text) | 6,652 B | 4,880 B | 5,444 B |
+| Tables (ROM) | 818 B | 32 B | 32 B |
+| **ROM total** | **7,470 B** | **4,912 B** | **5,476 B** |
 | BSS / RAM | **0 B** | **112 KB** | **0 B** |
 
-FR_math packs trig, inv-trig, log/ln/log10, exp/pow2/pow10, sqrt, hypot(3),
-waves(6), ADSR, print into 7.5 KB ROM with zero RAM overhead.
+FR_math packs trig, inv-trig, log/ln/log10, exp/pow2/pow10, sqrt, hypot(2),
+waves(6), ADSR, print into 7.3 KB ROM with zero RAM overhead.
 libfixmath (trig, inv-trig, log/log2, exp, sqrt, mul/div, str) is 4.8 KB ROM
 but caches 112 KB of sin/exp LUTs in BSS at runtime.
 
diff --git a/docs/README.md b/docs/README.md
index d30d8c9..a678336 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -84,6 +84,33 @@ radix — Q16.16 is just the reference point for the table. See the
 Every function is covered by the TDD characterization suite in
 [`tests/test_tdd.cpp`](../tests/test_tdd.cpp).
 
+## Lean build options
+
+Two compile-time `#define` guards let you strip optional subsystems
+for ROM-constrained targets. Define them before including `FR_math.h`
+(or pass `-D` on the compiler command line):
+
+| Define | What it removes | Typical savings |
+|---|---|---|
+| `FR_NO_PRINT` | `FR_printNumF`, `FR_printNumD`, `FR_printNumH`, `FR_numstr` | ~1.3 KB |
+| `FR_NO_WAVES` | `fr_wave_*` (6 shapes), `fr_adsr_*` (ADSR envelope), `FR_HZ2BAM_INC` | ~0.6 KB |
+
+With both guards enabled the core math library (trig, inverse trig, log/exp,
+sqrt, hypot) compiles to ~3.5 KB on x86-64 / clang -Os. On Thumb-2 this
+would be roughly 2.6 KB.
+
+```c
+/* Example: headless sensor node — math only, no print, no audio */
+#define FR_NO_PRINT
+#define FR_NO_WAVES
+#include "FR_math.h"
+```
+
+With `-ffunction-sections` and linker `--gc-sections`, the linker will
+also strip any unused functions automatically, so these guards are most
+useful when you include the library as a single `.c` file or static
+archive without section-level dead-code elimination.
+
 ## Why fixed-point, in 2026?
 
 Most application code today has an FPU and can use `float` freely.
@@ -133,7 +160,7 @@ s32 two  = I2FR(2, R);              /* 2.0 → raw 131072              */
  * UPPERCASE FR_ names are macros — they expand inline with no call
  * overhead, and the compiler can constant-fold them.  Use these for
  * conversions and simple arithmetic:
- *   I2FR, FR2I, FR_NUM, FR_ADD, FR_MUL, FR_DIV, FR_ABS, FR_EXP ...
+ *   I2FR, FR2I, FR_NUM, FR_ADD, FR_DIV, FR_ABS, FR_CHRDX, FR_EXP ...
  *
  * MixedCase FR_ names are functions — they contain loops, tables, or
  * multi-step algorithms where inlining would waste ROM:
@@ -198,7 +225,7 @@ script.
 FR_Math has been in service since **2000**, originally built for
 graphics transforms on 16 MHz 68k Palm Pilots (it shipped inside
 Trumpetsoft's *Inkstorm*), then ported forward to ARM, x86, MIPS,
-RISC-V, and various 8/16-bit embedded targets. v2.0.2 is the current
+RISC-V, and various 8/16-bit embedded targets. v2.0.6 is the current
 release with a full test suite, bit-exact numerical
 specification, and CI on every push.
 
diff --git a/docs/building.md b/docs/building.md
index 2265b69..e7c805e 100644
--- a/docs/building.md
+++ b/docs/building.md
@@ -95,7 +95,7 @@ binaries to keep compile times low:
 
 | Binary | What it checks |
 | --- | --- |
-| `test_basic` | Radix conversions, `FR_ADD`, `FR_MUL`, rounding. |
+| `test_basic` | Radix conversions, `FR_ADD`, `FR_FixMuls`, rounding. |
 | `test_trig` | Integer-degree trig (`FR_Sin` et al.). |
 | `test_trig_radians` | Radian / BAM trig and the v2 `fr_sin` API. |
 | `test_log_exp` | Log base 2 / ln / log10 and their inverses. |
@@ -184,6 +184,32 @@ references are linked, so real flash usage will be smaller.
 | GCC 68HC11 | 17,331 |
 <!-- SIZE_TABLE_END -->
 
+### Lean build options
+
+Two compile-time `#define` guards let you strip optional subsystems
+for ROM-constrained targets. Define them before including `FR_math.h`
+(or pass `-D` on the compiler command line):
+
+| Define | What it removes | Typical savings |
+|---|---|---|
+| `FR_NO_PRINT` | `FR_printNumF`, `FR_printNumD`, `FR_printNumH`, `FR_numstr` | ~1.3 KB |
+| `FR_NO_WAVES` | `fr_wave_*` (6 shapes), `fr_adsr_*` (ADSR envelope), `FR_HZ2BAM_INC` | ~0.6 KB |
+
+With both guards enabled the core math library (trig, inverse trig, log/exp,
+sqrt, hypot) compiles to ~3.5 KB on x86-64 / clang -Os.
+
+```c
+/* Example: headless sensor node — math only, no print, no audio */
+#define FR_NO_PRINT
+#define FR_NO_WAVES
+#include "FR_math.h"
+```
+
+With `-ffunction-sections` and linker `--gc-sections`, the linker will
+also strip any unused functions automatically, so these guards are most
+useful when you include the library as a single `.c` file or static
+archive without section-level dead-code elimination.
+
 To regenerate this table, run the Docker cross-build
 (requires the [xelp](https://github.com/deftio/xelp) Docker image):
 
diff --git a/docs/releases.md b/docs/releases.md
index 2f6e11c..2568ae7 100644
--- a/docs/releases.md
+++ b/docs/releases.md
@@ -4,6 +4,18 @@ Release highlights. For the full per-symbol change log, see
 [release_notes.md](https://github.com/deftio/fr_math/blob/master/release_notes.md)
 in the repo.
 
+## v2.0.6 — 2026
+
+Accuracy improvements, lean-build options, library cleanup.
+
+- **FR_acos boundary fix** — 12x better accuracy near ±1.0 via deferred quantization
+- **FR_atan2 rewrite** — asin/acos + hypot_fast8, 0.41% peak error (vs. 20% for libfixmath)
+- **Lean build guards** — `FR_NO_PRINT` (~1.3 KB) and `FR_NO_WAVES` (~0.6 KB) for ROM-constrained targets
+- **Removed FR_hypot_fast** (4-segment) — FR_hypot_fast8 is strictly better; 4-seg was dead weight
+- libfixmath comparison benchmark added to repo (`compare_lfm/`)
+
+---
+
 ## v2.0.5 — 2026
 
 Release pipeline fixes. Fixed squash-merge divergence handling and
diff --git a/keywords.txt b/keywords.txt
index 483da8b..1ab2703 100644
--- a/keywords.txt
+++ b/keywords.txt
@@ -28,7 +28,6 @@ FR_log10	KEYWORD2
 FR_pow2	KEYWORD2
 FR_sqrt	KEYWORD2
 FR_hypot	KEYWORD2
-FR_hypot_fast	KEYWORD2
 FR_hypot_fast8	KEYWORD2
 FR_printNumF	KEYWORD2
 FR_printNumD	KEYWORD2
diff --git a/llms.txt b/llms.txt
index 33e0b78..fe5ee3b 100644
--- a/llms.txt
+++ b/llms.txt
@@ -116,7 +116,6 @@ FR_POW10_FAST(input, radix)
 ```c
 s32 FR_sqrt(s32 input, u16 radix);
 s32 FR_hypot(s32 x, s32 y, u16 radix);       // exact, 64-bit intermediate
-s32 FR_hypot_fast(s32 x, s32 y);              // 4-segment, 0.34% error, no multiply
 s32 FR_hypot_fast8(s32 x, s32 y);             // 8-segment, 0.10% error, no multiply
 ```
 
@@ -171,6 +170,13 @@ make examples  # build example program
 make clean     # remove build artifacts
 ```
 
+## Lean build options
+
+Define before including FR_math.h to exclude optional subsystems:
+
+- `FR_NO_PRINT` — removes FR_printNumF/D/H and FR_numstr (~1.3 KB saved)
+- `FR_NO_WAVES` — removes fr_wave_*, fr_adsr_*, FR_HZ2BAM_INC (~0.6 KB saved)
+
 ## Platform support
 
 Tested on: AVR (Arduino), ARM Cortex-M0/M4, ESP32 (Xtensa), RISC-V,
diff --git a/pages/guide/building.html b/pages/guide/building.html
index f63195f..12750f6 100644
--- a/pages/guide/building.html
+++ b/pages/guide/building.html
@@ -113,7 +113,7 @@ <h2>The test suite</h2>
 <table>
 <thead><tr><th>Binary</th><th>What it checks</th></tr></thead>
 <tbody>
-<tr><td><code>test_basic</code></td><td>Radix conversions, <code>FR_ADD</code>, <code>FR_MUL</code>, rounding.</td></tr>
+<tr><td><code>test_basic</code></td><td>Radix conversions, <code>FR_ADD</code>, <code>FR_FixMuls</code>, rounding.</td></tr>
 <tr><td><code>test_trig</code></td><td>Integer-degree trig (<code>FR_Sin</code> et al.).</td></tr>
 <tr><td><code>test_trig_radians</code></td><td>Radian / BAM trig and the v2 <code>fr_sin</code> API.</td></tr>
 <tr><td><code>test_log_exp</code></td><td>Log base 2 / ln / log10 and their inverses.</td></tr>
@@ -209,6 +209,33 @@ <h3>Code size (<code>.text</code> section, compiled with <code>-Os</code>)</h3>
 </table>
 <!-- SIZE_TABLE_END -->
 
+<h3>Lean build options</h3>
+
+<p>Two compile-time <code>#define</code> guards let you strip optional subsystems
+for ROM-constrained targets. Define them before including
+<code>FR_math.h</code> (or pass <code>-D</code> on the compiler command line):</p>
+
+<table>
+<thead><tr><th>Define</th><th>What it removes</th><th>Typical savings</th></tr></thead>
+<tbody>
+<tr><td><code>FR_NO_PRINT</code></td><td><code>FR_printNumF</code>, <code>FR_printNumD</code>, <code>FR_printNumH</code>, <code>FR_numstr</code></td><td>~1.3 KB</td></tr>
+<tr><td><code>FR_NO_WAVES</code></td><td><code>fr_wave_*</code> (6 shapes), <code>fr_adsr_*</code> (ADSR envelope), <code>FR_HZ2BAM_INC</code></td><td>~0.6 KB</td></tr>
+</tbody>
+</table>
+
+<p>With both guards enabled the core math library (trig, inverse trig, log/exp,
+sqrt, hypot) compiles to ~3.5 KB on x86-64 / clang -Os.</p>
+
+<pre><code class="language-c">/* Example: headless sensor node &mdash; math only, no print, no audio */
+#define FR_NO_PRINT
+#define FR_NO_WAVES
+#include "FR_math.h"</code></pre>
+
+<p>With <code>-ffunction-sections</code> and linker <code>--gc-sections</code>,
+the linker will also strip any unused functions automatically, so these guards
+are most useful when you include the library as a single <code>.c</code> file
+or static archive without section-level dead-code elimination.</p>
+
 <p>To regenerate this table, run the Docker cross-build
 (requires the <a href="https://github.com/deftio/xelp">xelp</a> Docker image):</p>
 
diff --git a/pages/index.html b/pages/index.html
index 5e0e3f4..35c81d2 100644
--- a/pages/index.html
+++ b/pages/index.html
@@ -96,6 +96,34 @@ <h2>What&rsquo;s in the box</h2>
 <a href="https://github.com/deftio/fr_math/blob/master/tests/test_tdd.cpp">TDD
 characterization suite</a> in the repo.</p>
 
+<h2>Lean build options</h2>
+
+<p>Two compile-time <code>#define</code> guards let you strip optional subsystems
+for ROM-constrained targets. Define them before including
+<code>FR_math.h</code> (or pass <code>-D</code> on the compiler command line):</p>
+
+<table>
+<thead><tr><th>Define</th><th>What it removes</th><th>Typical savings</th></tr></thead>
+<tbody>
+<tr><td><code>FR_NO_PRINT</code></td><td><code>FR_printNumF</code>, <code>FR_printNumD</code>, <code>FR_printNumH</code>, <code>FR_numstr</code></td><td>~1.3 KB</td></tr>
+<tr><td><code>FR_NO_WAVES</code></td><td><code>fr_wave_*</code> (6 shapes), <code>fr_adsr_*</code> (ADSR envelope), <code>FR_HZ2BAM_INC</code></td><td>~0.6 KB</td></tr>
+</tbody>
+</table>
+
+<p>With both guards enabled the core math library (trig, inverse trig, log/exp,
+sqrt, hypot) compiles to ~3.5 KB on x86-64 / clang -Os. On Thumb-2 this
+would be roughly 2.6 KB.</p>
+
+<pre><code class="language-c">/* Example: headless sensor node &mdash; math only, no print, no audio */
+#define FR_NO_PRINT
+#define FR_NO_WAVES
+#include "FR_math.h"</code></pre>
+
+<p>With <code>-ffunction-sections</code> and linker <code>--gc-sections</code>,
+the linker will also strip any unused functions automatically, so these guards
+are most useful when you include the library as a single <code>.c</code> file
+or static archive without section-level dead-code elimination.</p>
+
 <h2>Why fixed-point, in 2026?</h2>
 
 <p>Most application code today has an FPU and can use <code>float</code>
@@ -149,7 +177,7 @@ <h2>Quick taste</h2>
  * UPPERCASE FR_ names are macros &mdash; they expand inline with no call
  * overhead, and the compiler can constant-fold them.  Use these for
  * conversions and simple arithmetic:
- *   I2FR, FR2I, FR_NUM, FR_ADD, FR_MUL, FR_DIV, FR_ABS, FR_EXP ...
+ *   I2FR, FR2I, FR_NUM, FR_ADD, FR_DIV, FR_ABS, FR_CHRDX, FR_EXP ...
  *
  * MixedCase FR_ names are functions &mdash; they contain loops, tables, or
  * multi-step algorithms where inlining would waste ROM:
@@ -218,7 +246,7 @@ <h2>History</h2>
 built for graphics transforms on 16&nbsp;MHz 68k Palm Pilots (it
 shipped inside Trumpetsoft&rsquo;s <em>Inkstorm</em>), then ported
 forward to ARM, x86, MIPS, RISC-V, and various 8/16-bit embedded
-targets. v2.0.2 is the current release with a full test suite,
+targets. v2.0.6 is the current release with a full test suite,
 bit-exact numerical specification, and CI on every push.</p>
 
 <h2>License</h2>
diff --git a/pages/releases.html b/pages/releases.html
index d87abe0..9c15d09 100644
--- a/pages/releases.html
+++ b/pages/releases.html
@@ -21,6 +21,20 @@ <h1>Releases</h1>
 <a href="https://github.com/deftio/fr_math/blob/master/release_notes.md">release_notes.md</a>
 in the repo.</p>
 
+<h2>v2.0.6 &mdash; 2026</h2>
+
+<p>Accuracy improvements, lean-build options, library cleanup.</p>
+
+<ul>
+  <li><strong>FR_acos boundary fix</strong> &mdash; 12x better accuracy near &plusmn;1.0 via deferred quantization</li>
+  <li><strong>FR_atan2 rewrite</strong> &mdash; asin/acos + hypot_fast8, 0.41% peak error (vs. 20% for libfixmath)</li>
+  <li><strong>Lean build guards</strong> &mdash; <code>FR_NO_PRINT</code> (~1.3 KB) and <code>FR_NO_WAVES</code> (~0.6 KB) for ROM-constrained targets</li>
+  <li><strong>Removed FR_hypot_fast</strong> (4-segment) &mdash; FR_hypot_fast8 is strictly better; 4-seg was dead weight</li>
+  <li>libfixmath comparison benchmark added to repo (<code>compare_lfm/</code>)</li>
+</ul>
+
+<hr>
+
 <h2>v2.0.5 &mdash; 2026</h2>
 
 <p>Release pipeline fixes. Fixed squash-merge divergence handling and
diff --git a/release_notes.md b/release_notes.md
index 793bb03..1de55e2 100644
--- a/release_notes.md
+++ b/release_notes.md
@@ -1,5 +1,42 @@
 # FR_Math Release Notes
 
+## Version 2.0.6 (2026)
+
+Accuracy improvements, lean-build options, and library cleanup.
+
+### Accuracy & algorithms
+
+- **FR_acos boundary fix**: deferred quantization computes `1-x` at the
+  caller's radix instead of r15, giving 12x better accuracy near ±1.0
+  (max LSB error 512.6 → 42.3)
+- **FR_atan2 rewrite**: uses asin/acos + hypot_fast8 with octant
+  switching for well-conditioned results everywhere (0.41% peak vs
+  20% for libfixmath)
+
+### Lean build options
+
+Two new compile-time `#define` guards strip optional subsystems for
+ROM-constrained targets:
+
+| Define | Removes | Savings |
+|---|---|---|
+| `FR_NO_PRINT` | `FR_printNumF/D/H`, `FR_numstr` | ~1.3 KB |
+| `FR_NO_WAVES` | `fr_wave_*`, `fr_adsr_*`, `FR_HZ2BAM_INC` | ~0.6 KB |
+
+With both guards the core math library compiles to ~3.5 KB on x86-64
+(clang -Os), roughly 2.6 KB on Thumb-2.
+
+### Removed
+
+- **FR_hypot_fast** (4-segment) deleted — FR_hypot_fast8 (8-segment)
+  is strictly more accurate (0.10% vs 0.34% error) and is already used
+  internally by FR_atan2. The 4-segment variant was dead weight.
+
+### Other
+
+- libfixmath comparison benchmark (`compare_lfm/`) added to repo
+- Documentation updated across all markdown and HTML pages
+
 ## Version 2.0.5 (2026)
 
 Release pipeline fixes. No functional changes to the math library.
diff --git a/src/FR_math.c b/src/FR_math.c
index 9edfdf3..7f3600e 100644
--- a/src/FR_math.c
+++ b/src/FR_math.c
@@ -664,6 +664,7 @@ s32 FR_log10(s32 input, u16 radix, u16 output_radix)
 	return FR_MULK28(r, FR_krLOG2_10_28);
 }
 
+#ifndef FR_NO_PRINT
 /***************************************
  * FR_printNumD - write a decimal integer with space padding.
  *
@@ -928,6 +929,7 @@ s32 FR_numstr(const char *s, u16 radix)
 
     return neg ? -result : result;
 }
+#endif /* FR_NO_PRINT */
 
 /*=======================================================
  * Square root and hypot
@@ -1094,6 +1096,7 @@ s32 FR_hypot_fast8(s32 x, s32 y)
     }
 }
 
+#ifndef FR_NO_WAVES
 /*=======================================================
  * Wave generators — synth-style fixed-shape waveforms.
  *
@@ -1365,3 +1368,4 @@ s16 fr_adsr_step(fr_adsr_t *env)
 		return (s16)out;
 	}
 }
+#endif /* FR_NO_WAVES */
diff --git a/src/FR_math.h b/src/FR_math.h
index fc3283d..3495c8a 100644
--- a/src/FR_math.h
+++ b/src/FR_math.h
@@ -455,6 +455,19 @@ static inline s32 FR_div_rnd(s64 num, s32 den) {
 #define FR_EXP_FAST(input, radix)   (FR_pow2(FR_SLOG2E(input), radix))
 #define FR_POW10_FAST(input, radix) (FR_pow2(FR_SLOG2_10(input), radix))
 
+/*===============================================
+ * Formatted output and string parsing
+ *
+ * Define FR_NO_PRINT before including this header (or pass -DFR_NO_PRINT
+ * on the compiler command line) to exclude all print/format functions
+ * from compilation. This saves ~1.3 KB of ROM on targets that don't need
+ * human-readable output (e.g. headless sensor nodes, DSP-only firmware).
+ *
+ *   #define FR_NO_PRINT
+ *   #include "FR_math.h"
+ */
+#ifndef FR_NO_PRINT
+
   /* printing family of functions */
   int FR_printNumF(int (*f)(char), s32 n, int radix, int pad, int prec); /* print fixed radix num as floating point e.g.  -12.34" */
   int FR_printNumD(int (*f)(char), int n, int pad);                      /* print decimal number with optional padding e.g. " 12" */
@@ -463,6 +476,8 @@ static inline s32 FR_div_rnd(s64 num, s32 den) {
   /* string-to-fixed-point parser (inverse of FR_printNumF) */
   s32 FR_numstr(const char *s, u16 radix);
 
+#endif /* FR_NO_PRINT */
+
 /*===============================================
  * Square root and hypot
  *
@@ -488,6 +503,19 @@ static inline s32 FR_div_rnd(s64 num, s32 den) {
    */
   s32 FR_hypot_fast8(s32 x, s32 y);
 
+/*===============================================
+ * Wave generators and ADSR envelope
+ *
+ * Define FR_NO_WAVES before including this header (or pass -DFR_NO_WAVES
+ * to the compiler) to exclude all waveform generators (square, pulse,
+ * triangle, saw, noise) and the ADSR envelope from compilation. Saves
+ * ~0.6 KB of ROM on targets that only need math/trig, not audio synthesis.
+ *
+ *   #define FR_NO_WAVES
+ *   #include "FR_math.h"
+ */
+#ifndef FR_NO_WAVES
+
 /*===============================================
  * Wave generators — synth-style fixed-shape waveforms.
  *
@@ -568,6 +596,8 @@ typedef struct fr_adsr_s {
   void fr_adsr_release(fr_adsr_t *env);
   s16  fr_adsr_step(fr_adsr_t *env);
 
+#endif /* FR_NO_WAVES */
+
 #ifdef __cplusplus
 
 } // extern "C"

From d167309ee813b60de28e543136b9ea4106fd56bb Mon Sep 17 00:00:00 2001
From: deftio <deftio@deftio.com>
Date: Tue, 21 Apr 2026 23:10:07 -0700
Subject: [PATCH 4/6] v2.0.6 release: updated accuracy, updated docs,
 selective builds

---
 README.md                 | 24 ++++++++++++++----------
 docs/building.md          | 32 ++++++++++++++++----------------
 pages/guide/building.html | 32 ++++++++++++++++----------------
 3 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index e8853da..19e958a 100644
--- a/README.md
+++ b/README.md
@@ -26,18 +26,22 @@ include all code and internal tables; everything is ROMable.
 
 | Target | Code (text) |
 |--------|-------------|
-| Cortex-M0 (Thumb-1) | 4.2 KB |
-| Cortex-M4 (Thumb-2) | 4.1 KB |
-| ESP32 (Xtensa) | 4.6 KB |
-| 68k | 5.5 KB |
-| x86-64 | 5.8 KB |
-| RISC-V 32 (rv32im) | 6.5 KB |
-| x86-32 | 7.2 KB |
-| MSP430 (16-bit) | 8.4 KB |
-| 8051 (SDCC) | 20.4 KB * |
+| ARM Thumb (Cortex-M0/M4) | 4.2 KB |
+| RISC-V 32 (rv32imac) | 4.7 KB |
+| RISC-V 64 | 4.5 KB |
+| Xtensa LX106 (ESP8266) | 5.2 KB |
+| 68k | 5.3 KB |
+| ARM32 | 5.4 KB |
+| x86-64 (GCC) | 5.7 KB |
+| AArch64 (ARM64) | 6.0 KB |
+| x86-64 (Clang) | 6.4 KB |
+| x86-32 | 6.8 KB |
+| PowerPC | 7.4 KB |
+| MSP430 (16-bit) | 8.9 KB * |
+| AVR (ATmega328P) | 10.6 KB * |
 
 The optional 2D module adds ~1 KB.
-\* 8051 and MSP430 are 8/16-bit — every 32-bit operation expands to multiple instructions.
+\* MSP430 and AVR are 8/16-bit — every 32-bit operation expands to multiple instructions.
 See [`docker/`](docker/) for the cross-compile setup.
 
 ### Lean build options
diff --git a/docs/building.md b/docs/building.md
index e7c805e..db13bb7 100644
--- a/docs/building.md
+++ b/docs/building.md
@@ -166,22 +166,22 @@ references are linked, so real flash usage will be smaller.
 <!-- SIZE_TABLE_START -->
 | Target | .text (bytes) |
 |---|---:|
-| GCC ARM32 Thumb | 4,530 |
-| GCC RISC-V (rv64) | 4,830 |
-| GCC RISC-V (rv32) | 5,068 |
-| GCC Xtensa LX106 (ESP8266) | 5,548 |
-| GCC ARM32 | 5,820 |
-| GCC m68k | 5,626 |
-| GCC x86-64 | 6,130 |
-| Clang x86-64 | 6,728 |
-| GCC AArch64 (ARM64) | 6,396 |
-| GCC x86-32 | 7,362 |
-| GCC PowerPC | 7,940 |
-| GCC MSP430 | 9,958 |
-| TCC x86 | 10,190 |
-| GCC AVR5 (ATmega328P) | 11,912 |
-| GCC AVR ATtiny85 | 12,410 |
-| GCC 68HC11 | 17,331 |
+| GCC ARM32 Thumb | 4,278 |
+| GCC RISC-V (rv64) | 4,574 |
+| GCC RISC-V (rv32) | 4,820 |
+| GCC Xtensa LX106 (ESP8266) | 5,317 |
+| GCC m68k | 5,410 |
+| GCC ARM32 | 5,504 |
+| GCC x86-64 | 5,857 |
+| GCC AArch64 (ARM64) | 6,112 |
+| Clang x86-64 | 6,555 |
+| GCC x86-32 | 6,947 |
+| GCC PowerPC | 7,540 |
+| GCC MSP430 | 9,146 |
+| TCC x86 | 9,887 |
+| GCC AVR5 (ATmega328P) | 10,806 |
+| GCC AVR ATtiny85 | 11,382 |
+| GCC 68HC11 | 16,392 |
 <!-- SIZE_TABLE_END -->
 
 ### Lean build options
diff --git a/pages/guide/building.html b/pages/guide/building.html
index 12750f6..f870b20 100644
--- a/pages/guide/building.html
+++ b/pages/guide/building.html
@@ -189,22 +189,22 @@ <h3>Code size (<code>.text</code> section, compiled with <code>-Os</code>)</h3>
 <table>
 <thead><tr><th>Target</th><th>.text (bytes)</th></tr></thead>
 <tbody>
-<tr><td>GCC ARM32 Thumb</td><td>4,530</td></tr>
-<tr><td>GCC RISC-V (rv64)</td><td>4,830</td></tr>
-<tr><td>GCC RISC-V (rv32)</td><td>5,068</td></tr>
-<tr><td>GCC Xtensa LX106 (ESP8266)</td><td>5,548</td></tr>
-<tr><td>GCC ARM32</td><td>5,820</td></tr>
-<tr><td>GCC m68k</td><td>5,626</td></tr>
-<tr><td>GCC x86-64</td><td>6,130</td></tr>
-<tr><td>Clang x86-64</td><td>6,728</td></tr>
-<tr><td>GCC AArch64 (ARM64)</td><td>6,396</td></tr>
-<tr><td>GCC x86-32</td><td>7,362</td></tr>
-<tr><td>GCC PowerPC</td><td>7,940</td></tr>
-<tr><td>GCC MSP430</td><td>9,958</td></tr>
-<tr><td>TCC x86</td><td>10,190</td></tr>
-<tr><td>GCC AVR5 (ATmega328P)</td><td>11,912</td></tr>
-<tr><td>GCC AVR ATtiny85</td><td>12,410</td></tr>
-<tr><td>GCC 68HC11</td><td>17,331</td></tr>
+<tr><td>GCC ARM32 Thumb</td><td>4,278</td></tr>
+<tr><td>GCC RISC-V (rv64)</td><td>4,574</td></tr>
+<tr><td>GCC RISC-V (rv32)</td><td>4,820</td></tr>
+<tr><td>GCC Xtensa LX106 (ESP8266)</td><td>5,317</td></tr>
+<tr><td>GCC m68k</td><td>5,410</td></tr>
+<tr><td>GCC ARM32</td><td>5,504</td></tr>
+<tr><td>GCC x86-64</td><td>5,857</td></tr>
+<tr><td>GCC AArch64 (ARM64)</td><td>6,112</td></tr>
+<tr><td>Clang x86-64</td><td>6,555</td></tr>
+<tr><td>GCC x86-32</td><td>6,947</td></tr>
+<tr><td>GCC PowerPC</td><td>7,540</td></tr>
+<tr><td>GCC MSP430</td><td>9,146</td></tr>
+<tr><td>TCC x86</td><td>9,887</td></tr>
+<tr><td>GCC AVR5 (ATmega328P)</td><td>10,806</td></tr>
+<tr><td>GCC AVR ATtiny85</td><td>11,382</td></tr>
+<tr><td>GCC 68HC11</td><td>16,392</td></tr>
 </tbody>
 </table>
 <!-- SIZE_TABLE_END -->

From c23b31019833a965ab3fb845d84a891282ae908f Mon Sep 17 00:00:00 2001
From: deftio <deftio@deftio.com>
Date: Sat, 25 Apr 2026 00:42:58 -0700
Subject: [PATCH 5/6] update badges and version strings for 2.0.6

---
 README.md        | 38 +++++++++++++++++++-------------------
 docs/README.md   | 36 ++++++++++++++++++------------------
 pages/index.html | 42 +++++++++++++++++++++---------------------
 3 files changed, 58 insertions(+), 58 deletions(-)

diff --git a/README.md b/README.md
index 19e958a..b3bce93 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 [![License](https://img.shields.io/badge/License-BSD%202--Clause-blue.svg)](https://opensource.org/licenses/BSD-2-Clause)
 [![CI](https://github.com/deftio/fr_math/actions/workflows/ci.yml/badge.svg)](https://github.com/deftio/fr_math/actions/workflows/ci.yml)
-[![Coverage](https://img.shields.io/badge/coverage-99%25-brightgreen.svg)](#building-and-testing)
+[![Coverage](https://img.shields.io/badge/coverage-98%25-brightgreen.svg)](#building-and-testing)
 [![Docs](https://img.shields.io/badge/docs-online-blue.svg)](https://deftio.github.io/fr_math/)
 [![Version](https://img.shields.io/badge/version-2.0.6-blue.svg)](release_notes.md)
 
@@ -80,24 +80,24 @@ Percent errors skip expected values near zero (|expected| < 0.01).
 At other radixes (3-bit, 24-bit, etc.) accuracy will differ due to the
 number of fractional bits available. All functions support radix 0 to 30.
 
-<!-- ACCURACY_TABLE_START -->
-| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
-|---|---:|---:|---:|---|
-| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
-| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
-| asin / acos | 42.3 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
-| atan2 | 63.3 | 0.4953 | 0.0268 | 65536x5 radii; asin/acos+hypot_fast8 |
-| atan | 61.9 | 0.2985 | 0.0159 | 20001-pt sweep [-10,10]; via FR_atan2 |
-| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
-| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
-| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
-| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
-| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
-| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
-| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
-| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
-| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
-| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
+<!-- ACCURACY_TABLE_START -->
+| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
+|---|---:|---:|---:|---|
+| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
+| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
+| asin / acos | 42.3 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
+| atan2 | 63.3 | 0.4953 | 0.0268 | 65536x5 radii; asin/acos+hypot_fast8 |
+| atan | 61.9 | 0.2985 | 0.0159 | 20001-pt sweep [-10,10]; via FR_atan2 |
+| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
+| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
+| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
+| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
+| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
+| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
+| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
+| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
+| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
+| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
 <!-- ACCURACY_TABLE_END -->
 
 ### What's in the box
diff --git a/docs/README.md b/docs/README.md
index a678336..9401590 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -45,24 +45,24 @@ radix — Q16.16 is just the reference point for the table. See the
 [TDD report](../build/test_tdd_report.md) for sweeps at radixes 8, 12,
 16, and 24. Percent errors skip expected values near zero (|expected| < 0.01).
 
-<!-- ACCURACY_TABLE_START -->
-| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
-|---|---:|---:|---:|---|
-| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
-| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
-| asin / acos | 42.3 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
-| atan2 | 63.3 | 0.4953 | 0.0268 | 65536x5 radii; asin/acos+hypot_fast8 |
-| atan | 61.9 | 0.2985 | 0.0159 | 20001-pt sweep [-10,10]; via FR_atan2 |
-| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
-| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
-| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
-| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
-| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
-| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
-| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
-| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
-| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
-| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
+<!-- ACCURACY_TABLE_START -->
+| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
+|---|---:|---:|---:|---|
+| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
+| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
+| asin / acos | 42.3 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
+| atan2 | 63.3 | 0.4953 | 0.0268 | 65536x5 radii; asin/acos+hypot_fast8 |
+| atan | 61.9 | 0.2985 | 0.0159 | 20001-pt sweep [-10,10]; via FR_atan2 |
+| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
+| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
+| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
+| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
+| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
+| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
+| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
+| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
+| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
+| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
 <!-- ACCURACY_TABLE_END -->
 
 ## What's in the box
diff --git a/pages/index.html b/pages/index.html
index 35c81d2..9746d46 100644
--- a/pages/index.html
+++ b/pages/index.html
@@ -50,27 +50,27 @@ <h2>Measured accuracy</h2>
 report</a> for sweeps at radixes 8, 12, 16, and 24.
 Percent errors skip expected values near zero (|expected| &lt; 0.01).</p>
 
-<!-- ACCURACY_TABLE_START -->
-<table>
-<thead><tr><th>Function</th><th>Max err (LSB)</th><th>Max err (%)</th><th>Avg err (%)</th><th>Note</th></tr></thead>
-<tbody>
-<tr><td>sin / cos</td><td>7.5</td><td>0.7169</td><td>0.0100</td><td>65536-pt sweep + specials</td></tr>
-<tr><td>tan</td><td>38020.4</td><td>0.7118</td><td>0.0162</td><td>65536-pt sweep (skip poles)</td></tr>
-<tr><td>asin / acos</td><td>42.3</td><td>0.7025</td><td>0.0105</td><td>65536-pt; sqrt approx near boundary</td></tr>
-<tr><td>atan2</td><td>63.3</td><td>0.4953</td><td>0.0268</td><td>65536x5 radii; asin/acos+hypot_fast8</td></tr>
-<tr><td>atan</td><td>61.9</td><td>0.2985</td><td>0.0159</td><td>20001-pt sweep [-10,10]; via FR_atan2</td></tr>
-<tr><td>sqrt</td><td>28.4</td><td>0.0003</td><td>0.0000</td><td>Round-to-nearest</td></tr>
-<tr><td>log2</td><td>10.5</td><td>0.2479</td><td>0.0045</td><td>65-entry mantissa table</td></tr>
-<tr><td>pow2</td><td>220.4</td><td>0.1373</td><td>0.0057</td><td>65-entry fraction table</td></tr>
-<tr><td>ln, log10</td><td>0.7</td><td>0.0015</td><td>0.0004</td><td>Via FR_MULK28 from log2</td></tr>
-<tr><td>exp</td><td>65.7</td><td>0.0719</td><td>0.0051</td><td>FR_MULK28 + FR_pow2</td></tr>
-<tr><td>exp_fast</td><td>195.5</td><td>0.0719</td><td>0.0064</td><td>Shift-only scaling</td></tr>
-<tr><td>pow10</td><td>143.4</td><td>0.1163</td><td>0.0075</td><td>FR_MULK28 + FR_pow2</td></tr>
-<tr><td>pow10_fast</td><td>581.9</td><td>0.1163</td><td>0.0100</td><td>Shift-only scaling</td></tr>
-<tr><td>hypot (exact)</td><td>0.2</td><td>0.0001</td><td>0.0000</td><td>64-bit intermediate</td></tr>
-<tr><td>hypot_fast8 (8-seg)</td><td>59968.8</td><td>0.0977</td><td>0.0508</td><td>Shift-only, no multiply</td></tr>
-</tbody>
-</table>
+<!-- ACCURACY_TABLE_START -->
+<table>
+<thead><tr><th>Function</th><th>Max err (LSB)</th><th>Max err (%)</th><th>Avg err (%)</th><th>Note</th></tr></thead>
+<tbody>
+<tr><td>sin / cos</td><td>7.5</td><td>0.7169</td><td>0.0100</td><td>65536-pt sweep + specials</td></tr>
+<tr><td>tan</td><td>38020.4</td><td>0.7118</td><td>0.0162</td><td>65536-pt sweep (skip poles)</td></tr>
+<tr><td>asin / acos</td><td>42.3</td><td>0.7025</td><td>0.0105</td><td>65536-pt; sqrt approx near boundary</td></tr>
+<tr><td>atan2</td><td>63.3</td><td>0.4953</td><td>0.0268</td><td>65536x5 radii; asin/acos+hypot_fast8</td></tr>
+<tr><td>atan</td><td>61.9</td><td>0.2985</td><td>0.0159</td><td>20001-pt sweep [-10,10]; via FR_atan2</td></tr>
+<tr><td>sqrt</td><td>28.4</td><td>0.0003</td><td>0.0000</td><td>Round-to-nearest</td></tr>
+<tr><td>log2</td><td>10.5</td><td>0.2479</td><td>0.0045</td><td>65-entry mantissa table</td></tr>
+<tr><td>pow2</td><td>220.4</td><td>0.1373</td><td>0.0057</td><td>65-entry fraction table</td></tr>
+<tr><td>ln, log10</td><td>0.7</td><td>0.0015</td><td>0.0004</td><td>Via FR_MULK28 from log2</td></tr>
+<tr><td>exp</td><td>65.7</td><td>0.0719</td><td>0.0051</td><td>FR_MULK28 + FR_pow2</td></tr>
+<tr><td>exp_fast</td><td>195.5</td><td>0.0719</td><td>0.0064</td><td>Shift-only scaling</td></tr>
+<tr><td>pow10</td><td>143.4</td><td>0.1163</td><td>0.0075</td><td>FR_MULK28 + FR_pow2</td></tr>
+<tr><td>pow10_fast</td><td>581.9</td><td>0.1163</td><td>0.0100</td><td>Shift-only scaling</td></tr>
+<tr><td>hypot (exact)</td><td>0.2</td><td>0.0001</td><td>0.0000</td><td>64-bit intermediate</td></tr>
+<tr><td>hypot_fast8 (8-seg)</td><td>59968.8</td><td>0.0977</td><td>0.0508</td><td>Shift-only, no multiply</td></tr>
+</tbody>
+</table>
 <!-- ACCURACY_TABLE_END -->
 
 <h2>What&rsquo;s in the box</h2>

From 28ea312dae28c629e6c1963c4ae44b57b8eb7eb4 Mon Sep 17 00:00:00 2001
From: deftio <deftio@deftio.com>
Date: Sat, 25 Apr 2026 10:34:11 -0700
Subject: [PATCH 6/6] updated docs and accuracy tests

---
 makefile              | 36 +++++++++++++++++++++++++++++++++++-
 src/FR_math.c         |  6 +++---
 src/FR_math.h         |  2 +-
 tools/make_release.sh | 20 +++++---------------
 4 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/makefile b/makefile
index 168bd57..eb8a9fc 100644
--- a/makefile
+++ b/makefile
@@ -26,7 +26,41 @@ LDFLAGS = -lm
 # Source files
 HEADERS = $(SRC_DIR)/FR_defs.h $(SRC_DIR)/FR_math.h $(SRC_DIR)/FR_math_2D.h
 
-# Default target
+# Default target — print help
+.PHONY: help
+help:
+	@echo "FR_Math — Fixed Radix Math Library"
+	@echo ""
+	@echo "Usage: make <target>"
+	@echo ""
+	@echo "Build targets:"
+	@echo "  all              Build library and examples"
+	@echo "  lib              Build library objects only"
+	@echo "  examples         Build example program"
+	@echo ""
+	@echo "Test targets:"
+	@echo "  test             Run all tests"
+	@echo "  test-basic       Run basic tests"
+	@echo "  test-comprehensive  Run comprehensive tests"
+	@echo "  test-2d          Run 2D math tests"
+	@echo "  test-overflow    Run overflow/saturation tests"
+	@echo "  test-full        Run full coverage tests"
+	@echo "  test-2d-complete Run 2D complete coverage tests"
+	@echo "  test-tdd         Run TDD characterization tests"
+	@echo ""
+	@echo "Analysis targets:"
+	@echo "  accuracy         Show accuracy summary table"
+	@echo "  accuracy-showpeak  Show accuracy with peak inputs"
+	@echo "  coverage         Generate coverage report (gcov)"
+	@echo "  coverage-basic   Basic coverage info without lcov"
+	@echo "  coverage-html    HTML coverage report (requires lcov)"
+	@echo "  size-report      Multi-architecture size report"
+	@echo "  size-simple      Size report for current platform"
+	@echo ""
+	@echo "Maintenance:"
+	@echo "  clean            Remove build artifacts"
+	@echo "  cleanall         Remove build artifacts and backups"
+
 .PHONY: all
 all: dirs lib examples
 
diff --git a/src/FR_math.c b/src/FR_math.c
index 7f3600e..181972e 100644
--- a/src/FR_math.c
+++ b/src/FR_math.c
@@ -266,7 +266,7 @@ s32 FR_acos(s32 input, u16 radix, u16 out_radix)
 
 	/* Work with absolute value at the caller's radix — we'll need it for
 	 * the sqrt fast path before quantising to r15. */
-	sign = (input < 0) ? 1 : 0;
+	sign = (s16)((input < 0) ? 1 : 0);
 	input_abs = sign ? -input : input;
 
 	/* Clamp at the caller's radix — not at r15.  Near ±1.0 the r15
@@ -1206,7 +1206,7 @@ s16 fr_wave_tri_morph(u16 phase, u16 break_point)
 	if (phase < break_point)
 	{
 		/* rising: 0 at phase=0, 32767 at phase=break_point */
-		t = ((u32)phase * 32767UL) / (u32)break_point;
+		t = (u32)(((u32)phase * 32767UL) / (u32)break_point);
 	}
 	else
 	{
@@ -1214,7 +1214,7 @@ s16 fr_wave_tri_morph(u16 phase, u16 break_point)
 		u32 span = (u32)0xffff - (u32)break_point;
 		if (span == 0)
 			return 32767;
-		t = ((u32)((u32)0xffff - (u32)phase) * 32767UL) / span;
+		t = (u32)(((u32)((u32)0xffff - (u32)phase) * 32767UL) / span);
 	}
 	if (t > 32767) t = 32767;
 	return (s16)t;
diff --git a/src/FR_math.h b/src/FR_math.h
index 3495c8a..562a5d3 100644
--- a/src/FR_math.h
+++ b/src/FR_math.h
@@ -378,7 +378,7 @@ static inline s32 FR_div_rnd(s64 num, s32 den) {
  * Derivation: rad = bam * 2π / 65536. At output radix r: bam * 2π * 2^r / 2^16
  *           = bam * (2π * 2^10) / 2^(26 - r) = bam * 6434 >> (26 - r).
  */
-#define FR_BAM2RAD(bam, radix)  (((s32)(u16)(bam) * 6434L) >> (26 - (radix)))
+#define FR_BAM2RAD(bam, radix)  ((s32)(((s32)(u16)(bam) * 6434L) >> (26 - (radix))))
 
 /*===============================================
  * Radian-native and BAM-native trig (recommended)
diff --git a/tools/make_release.sh b/tools/make_release.sh
index e1a504f..28f7647 100755
--- a/tools/make_release.sh
+++ b/tools/make_release.sh
@@ -130,23 +130,13 @@ do_validate() {
     run_cmd make clean >/dev/null 2>&1
 
     echo ""
-    echo "  --- Strict compile (-Wall -Wextra -Werror -Wshadow) ---"
-    local strict_flags="-Isrc -Wall -Wextra -Werror -Wshadow -Os"
+    echo "  --- Build library + examples (uses LIB_WARN from makefile) ---"
     mkdir -p build
-    if ! cc ${strict_flags} -c src/FR_math.c -o build/FR_math_strict.o 2>build/strict_c.log; then
-        cat build/strict_c.log
-        fail "src/FR_math.c has compiler warnings"
+    if ! make lib examples 2>build/strict_build.log; then
+        cat build/strict_build.log
+        fail "Library build failed (compiler warnings or errors)"
     fi
-    if ! c++ ${strict_flags} -c src/FR_math_2D.cpp -o build/FR_math_2D_strict.o 2>build/strict_cpp.log; then
-        cat build/strict_cpp.log
-        fail "src/FR_math_2D.cpp has compiler warnings"
-    fi
-    pass "Zero warnings."
-
-    echo ""
-    echo "  --- Build library + examples ---"
-    run_cmd make lib examples >/dev/null 2>&1
-    pass "Library and examples built."
+    pass "Library and examples built — zero warnings."
 
     echo ""
     echo "  --- Full test suite ---"