diff --git a/README.md b/README.md
index e1b4e87..b3bce93 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 [![License](https://img.shields.io/badge/License-BSD%202--Clause-blue.svg)](https://opensource.org/licenses/BSD-2-Clause)
 [![CI](https://github.com/deftio/fr_math/actions/workflows/ci.yml/badge.svg)](https://github.com/deftio/fr_math/actions/workflows/ci.yml)
-[![Coverage](https://img.shields.io/badge/coverage-99%25-brightgreen.svg)](#building-and-testing)
+[![Coverage](https://img.shields.io/badge/coverage-98%25-brightgreen.svg)](#building-and-testing)
 [![Docs](https://img.shields.io/badge/docs-online-blue.svg)](https://deftio.github.io/fr_math/)
-[![Version](https://img.shields.io/badge/version-2.0.5-blue.svg)](release_notes.md)
+[![Version](https://img.shields.io/badge/version-2.0.6-blue.svg)](release_notes.md)
 
 # FR_Math: A C Language Fixed-Point Math Library for Embedded Systems
 
@@ -20,37 +20,85 @@ beyond `<stdint.h>`.
 
 ### Library size (FR_math.c only, `-Os`)
 
+Compiled object code sizes on select platforms (static test build). Your
+sizes may vary depending on optimization and linker settings. Sizes
+include all code and internal tables; everything is ROMable.
+
 | Target | Code (text) |
 |--------|-------------|
-| Cortex-M0 (Thumb-1) | 4.2 KB |
-| Cortex-M4 (Thumb-2) | 4.1 KB |
-| ESP32 (Xtensa) | 4.6 KB |
-| 68k | 5.5 KB |
-| x86-64 | 5.8 KB |
-| RISC-V 32 (rv32im) | 6.5 KB |
-| x86-32 | 7.2 KB |
-| MSP430 (16-bit) | 8.4 KB |
-| 8051 (SDCC) | 20.4 KB * |
-
-Sizes are code-only (text section). The optional 2D module adds ~1 KB.
-\* 8051 and MSP430 are 8/16-bit — every 32-bit operation expands to multiple instructions.
+| ARM Thumb (Cortex-M0/M4) | 4.2 KB |
+| RISC-V 32 (rv32imac) | 4.7 KB |
+| RISC-V 64 | 4.5 KB |
+| Xtensa LX106 (ESP8266) | 5.2 KB |
+| 68k | 5.3 KB |
+| ARM32 | 5.4 KB |
+| x86-64 (GCC) | 5.7 KB |
+| AArch64 (ARM64) | 6.0 KB |
+| x86-64 (Clang) | 6.4 KB |
+| x86-32 | 6.8 KB |
+| PowerPC | 7.4 KB |
+| MSP430 (16-bit) | 8.9 KB * |
+| AVR (ATmega328P) | 10.6 KB * |
+
+The optional 2D module adds ~1 KB.
+\* MSP430 and AVR are 8/16-bit — every 32-bit operation expands to multiple instructions.
 See [`docker/`](docker/) for the cross-compile setup.
 
+### Lean build options
+
+Two compile-time `#define` guards let you strip optional subsystems
+for ROM-constrained targets. Define them before including `FR_math.h`
+(or pass `-D` on the compiler command line):
+
+| Define | What it removes | Typical savings |
+|---|---|---|
+| `FR_NO_PRINT` | `FR_printNumF`, `FR_printNumD`, `FR_printNumH`, `FR_numstr` | ~1.3 KB |
+| `FR_NO_WAVES` | `fr_wave_*` (6 shapes), `fr_adsr_*` (ADSR envelope), `FR_HZ2BAM_INC` | ~0.6 KB |
+
+With both guards enabled the core math library (trig, inverse trig, log/exp,
+sqrt, hypot) compiles to ~3.5 KB on x86-64 / clang -Os. On Thumb-2 this
+would be roughly 2.6 KB.
+
+```c
+/* Example: headless sensor node — math only, no print, no audio */
+#define FR_NO_PRINT
+#define FR_NO_WAVES
+#include "FR_math.h"
+```
+
+With `-ffunction-sections` and linker `--gc-sections`, the linker will
+also strip any unused functions automatically, so these guards are most
+useful when you include the library as a single `.c` file or static
+archive without section-level dead-code elimination.
+
 ### Measured accuracy
 
 Errors below are measured at Q16.16 (s15.16). All functions accept any
 radix — Q16.16 is just the reference point for the table.
-
-| Function | Max error | Note |
-|---|---|---|
-| sin / cos | 5 LSB (~7.7e-5) | Exact at 0, 90, 180, 270 |
-| sqrt | ≤ 0.5 LSB | Round-to-nearest |
-| log2 | ≤ 4 LSB | 65-entry mantissa table |
-| pow2 | ≤ 1 LSB (integers exact) | 65-entry fraction table |
-| ln, log10 | ≤ 4 LSB | Via FR_MULK28 from log2 |
-| hypot (exact) | ≤ 0.5 LSB | 64-bit intermediate |
-| hypot_fast (4-seg) | 0.34% | Shift-only, no multiply |
-| hypot_fast8 (8-seg) | 0.10% | Shift-only, no multiply |
+Percent errors skip expected values near zero (|expected| < 0.01).
+
+At other radixes (3-bit, 24-bit, etc.) accuracy will differ due to the
+number of fractional bits available. All functions support radix 0 to 30.
+
+<!-- ACCURACY_TABLE_START -->
+| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
+|---|---:|---:|---:|---|
+| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
+| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
+| asin / acos | 42.3 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
+| atan2 | 63.3 | 0.4953 | 0.0268 | 65536x5 radii; asin/acos+hypot_fast8 |
+| atan | 61.9 | 0.2985 | 0.0159 | 20001-pt sweep [-10,10]; via FR_atan2 |
+| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
+| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
+| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
+| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
+| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
+| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
+| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
+| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
+| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
+| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
+<!-- ACCURACY_TABLE_END -->
 
 ### What's in the box
 
@@ -62,7 +110,7 @@ radix — Q16.16 is just the reference point for the table.
 | Trig (radian/BAM) | `fr_sin`, `fr_cos`, `fr_tan`, `fr_sin_bam`, `fr_cos_bam`, `fr_sin_deg`, `fr_cos_deg` |
 | Inverse trig | `FR_atan`, `FR_atan2`, `FR_asin`, `FR_acos` |
 | Log / exp | `FR_log2`, `FR_ln`, `FR_log10`, `FR_pow2`, `FR_EXP`, `FR_POW10`, `FR_EXP_FAST`, `FR_POW10_FAST`, `FR_MULK28` |
-| Roots | `FR_sqrt`, `FR_hypot`, `FR_hypot_fast`, `FR_hypot_fast8` |
+| Roots | `FR_sqrt`, `FR_hypot`, `FR_hypot_fast8` |
 | Wave generators | `fr_wave_sqr`, `fr_wave_pwm`, `fr_wave_tri`, `fr_wave_saw`, `fr_wave_tri_morph`, `fr_wave_noise` |
 | Envelope | `fr_adsr_init`, `fr_adsr_trigger`, `fr_adsr_release`, `fr_adsr_step` |
 | 2D transforms | `FR_Matrix2D_CPT` (mul, add, sub, det, inv, setrotate, XFormPtI, XFormPtI16) |
@@ -74,7 +122,7 @@ radix — Q16.16 is just the reference point for the table.
 git clone https://github.com/deftio/fr_math.git
 cd fr_math
 make lib       # build static library
-make test      # run all tests (coverage, TDD characterization, 2D)
+make test      # run all tests (unit, TDD characterization, 2D)
 ```
 
 ## Quick taste
@@ -84,11 +132,62 @@ make test      # run all tests (coverage, TDD characterization, 2D)
 
 #define R 16  /* work at radix 16 (s15.16) throughout */
 
-s32 pi    = FR_NUM(3, 14159, 5, R);       /* pi at radix 16             */
-s32 c45   = FR_CosI(45);                  /* cos 45 deg = 0.7071 (s15.16) */
-s32 root2 = FR_sqrt(I2FR(2, R), R);       /* sqrt(2)    = 1.4142        */
-s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97          */
-s32 ex    = FR_EXP(I2FR(1, R), R);        /* e^1        ~ 2.7183        */
+/* ---- Creating fixed-point values ----
+ *
+ * FR_NUM(integer, frac_digits, num_digits, radix) encodes a decimal
+ * literal at compile time.  The fractional part is the digits AFTER
+ * the decimal point, and num_digits says how many digits that is.
+ * Think: FR_NUM(3, 14159, 5, 16) means "3.14159" at radix 16.
+ */
+s32 pi   = FR_NUM(3, 14159, 5, R);  /* 3.14159 → raw 205886 at r16  */
+s32 half = FR_NUM(0, 5, 1, R);      /* 0.5     → raw 32768           */
+s32 neg  = FR_NUM(-2, 75, 2, R);    /* -2.75   → raw -180224         */
+
+/* Or parse from a string at runtime (no floats, no strtod): */
+s32 pi2  = FR_numstr("3.14159", R); /* same result as FR_NUM above    */
+
+/* Integer-to-fixed: I2FR(n, radix) just shifts left */
+s32 two  = I2FR(2, R);              /* 2.0 → raw 131072              */
+
+/* ---- Naming convention: macros vs functions ----
+ *
+ * UPPERCASE FR_ names are macros — they expand inline with no call
+ * overhead, and the compiler can constant-fold them.  Use these for
+ * conversions and simple arithmetic:
+ *   I2FR, FR2I, FR_NUM, FR_ADD, FR_DIV, FR_ABS, FR_CHRDX, FR_EXP ...
+ *
+ * MixedCase FR_ names are functions — they contain loops, tables, or
+ * multi-step algorithms where inlining would waste ROM:
+ *   FR_Cos, FR_sqrt, FR_atan2, FR_log2, FR_pow2, FR_printNumF ...
+ *
+ * lowercase fr_ names are v2 functions (radian trig, wave generators,
+ * ADSR envelopes):
+ *   fr_sin, fr_cos, fr_tan, fr_wave_tri, fr_adsr_step ...
+ *
+ * Some macros wrap functions: FR_EXP(x,r) scales x then calls
+ * FR_pow2 — one-liner convenience, heavy lifting in the function.
+ */
+
+/* ---- Math functions ---- */
+s32 c45   = FR_Cos(45, 0);                /* cos(45°) = 0.7071       */
+s32 s30   = fr_sin(FR_numstr("0.5236", R), R); /* sin(0.5236 rad)    */
+s32 root2 = FR_sqrt(two, R);              /* sqrt(2)  = 1.4142       */
+s32 angle = FR_atan2(I2FR(1,R), I2FR(1,R), R); /* atan2(1,1) rad     */
+s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97       */
+s32 ex    = FR_EXP(I2FR(1, R), R);        /* macro: scales then calls
+                                            * FR_pow2 internally      */
+
+/* ---- Printing (serial / UART / file friendly) ----
+ *
+ * FR_printNumF takes a per-character output function — works with
+ * putchar, Serial.write, UART_putc, or any int(*)(char).  No
+ * sprintf, no floats, no heap.  Ideal for bare-metal targets.
+ */
+int my_putchar(char c) { return putchar(c); }  /* or your UART func */
+
+FR_printNumF(my_putchar, pi, R, 8, 5);    /* prints " 3.14159"      */
+FR_printNumF(my_putchar, neg, R, 8, 2);   /* prints "   -2.75"      */
+FR_printNumD(my_putchar, FR2I(lg, R), 4); /* prints "   9" (integer)*/
 ```
 
 ## Documentation
@@ -111,7 +210,7 @@ The full docs ship in two forms — pick whichever fits how you read.
 FR_Math has been in service since 2000, originally built for graphics
 transforms on 16 MHz 68k Palm Pilots. It shipped inside Trumpetsoft's
 *Inkstorm* on PalmOS, then moved forward through ARM, x86, MIPS,
-RISC-V, and various 8/16-bit embedded targets. v2.0.2 is the current
+RISC-V, and various 8/16-bit embedded targets. v2.0.6 is the current
 release with a full test suite, bit-exact numerical specification, and
 CI on every push.
 
@@ -127,5 +226,5 @@ BSD-2-Clause — see [LICENSE.txt](LICENSE.txt).
 
 ## Version
 
-2.0.2 — see [release_notes.md](release_notes.md) for the v1 → v2
+2.0.6 — see [release_notes.md](release_notes.md) for the v1 → v2
 migration guide, numerical fixes, and new functionality.
diff --git a/VERSION b/VERSION
index e010258..157e54f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.0.5
+2.0.6
diff --git a/agents.md b/agents.md
index f828fb6..2a461f6 100644
--- a/agents.md
+++ b/agents.md
@@ -90,6 +90,15 @@ Versioned files (all synced automatically):
 4. Verify `llms.txt` and `agents.md` are current with any API changes
 5. Commit, tag, push
 
+## Lean build options
+
+Define before including `FR_math.h` to exclude optional subsystems:
+
+| Define | Removes | Savings |
+|---|---|---|
+| `FR_NO_PRINT` | `FR_printNumF/D/H`, `FR_numstr` | ~1.3 KB |
+| `FR_NO_WAVES` | `fr_wave_*`, `fr_adsr_*`, `FR_HZ2BAM_INC` | ~0.6 KB |
+
 ## Platform targets
 
 The library compiles on: AVR (Arduino), ARM Cortex-M0/M4, ESP32,
diff --git a/compare_lfm/.gitignore b/compare_lfm/.gitignore
new file mode 100644
index 0000000..d0ab86f
--- /dev/null
+++ b/compare_lfm/.gitignore
@@ -0,0 +1,10 @@
+# Build artifacts
+build/
+
+# Cloned third-party libraries (fetch from GitHub as needed)
+libfixmath/
+fpm/
+liquid-fpm/
+
+# Claude session data
+.claude/
diff --git a/compare_lfm/Makefile b/compare_lfm/Makefile
new file mode 100644
index 0000000..01554ed
--- /dev/null
+++ b/compare_lfm/Makefile
@@ -0,0 +1,102 @@
+# ============================================================
+# Benchmark: FR_math vs libfixmath  (macOS ARM / Apple Clang)
+# ============================================================
+# WARNING: This Makefile only builds files inside .compare/
+#          It does NOT modify anything in the parent repo.
+# ============================================================
+
+CXX      := clang++
+CC       := clang
+CXXFLAGS := -std=c++17 -O2 -Wall -Wextra
+CFLAGS   := -std=c11 -O2 -Wall -Wextra
+
+# --- FR_math (parent repo, compiled read-only) ---
+FR_SRC   := ../src/FR_math.c
+FR_INC   := -I../src
+
+# --- libfixmath (cloned into this dir) ---
+LFM_DIR  := libfixmath/libfixmath
+LFM_SRC  := $(LFM_DIR)/fix16.c       \
+            $(LFM_DIR)/fix16_sqrt.c   \
+            $(LFM_DIR)/fix16_exp.c    \
+            $(LFM_DIR)/fix16_trig.c   \
+            $(LFM_DIR)/fix16_str.c    \
+            $(LFM_DIR)/uint32.c       \
+            $(LFM_DIR)/fract32.c
+LFM_INC  := -I$(LFM_DIR)
+
+# --- Build ---
+BUILD    := build
+TARGET   := $(BUILD)/benchmark
+JSON_OUT := comparison_results.json
+
+OBJS := $(BUILD)/FR_math.o       \
+        $(BUILD)/fix16.o          \
+        $(BUILD)/fix16_sqrt.o     \
+        $(BUILD)/fix16_exp.o      \
+        $(BUILD)/fix16_trig.o     \
+        $(BUILD)/fix16_str.o      \
+        $(BUILD)/uint32.o         \
+        $(BUILD)/fract32.o        \
+        $(BUILD)/benchmark.o
+
+.PHONY: all clean run size
+
+all: $(TARGET)
+
+run: $(TARGET)
+	./$(TARGET) > $(JSON_OUT) 2>comparison_summary.md
+	@echo "Results written to $(JSON_OUT) + comparison_summary.md"
+
+# Size comparison: ROM + RAM for both libraries
+LFM_OBJS := $(BUILD)/fix16.o $(BUILD)/fix16_sqrt.o $(BUILD)/fix16_exp.o \
+            $(BUILD)/fix16_trig.o $(BUILD)/fix16_str.o $(BUILD)/uint32.o \
+            $(BUILD)/fract32.o
+
+size: $(BUILD)/FR_math.o $(LFM_OBJS)
+	@echo "=== Compiled size: FR_math vs libfixmath ($(CC) -O2) ==="
+	@echo ""
+	@echo "FR_math (FR_math.o):"
+	@size -m $(BUILD)/FR_math.o | grep -E "Section|total"
+	@echo ""
+	@echo "libfixmath (all objects):"
+	@for f in $(LFM_OBJS); do echo "  $$(basename $$f):"; size -m $$f | grep -E "Section" | sed 's/^/    /'; done
+
+$(TARGET): $(OBJS) | $(BUILD)
+	$(CXX) $(CXXFLAGS) -o $@ $^
+
+$(BUILD):
+	mkdir -p $(BUILD)
+
+# FR_math object
+$(BUILD)/FR_math.o: $(FR_SRC) | $(BUILD)
+	$(CC) $(CFLAGS) $(FR_INC) -c -o $@ $<
+
+# libfixmath objects
+$(BUILD)/fix16.o: $(LFM_DIR)/fix16.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_sqrt.o: $(LFM_DIR)/fix16_sqrt.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_exp.o: $(LFM_DIR)/fix16_exp.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_trig.o: $(LFM_DIR)/fix16_trig.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_str.o: $(LFM_DIR)/fix16_str.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/uint32.o: $(LFM_DIR)/uint32.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fract32.o: $(LFM_DIR)/fract32.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+# Benchmark harness
+$(BUILD)/benchmark.o: benchmark.cpp | $(BUILD)
+	$(CXX) $(CXXFLAGS) $(FR_INC) $(LFM_INC) -c -o $@ $<
+
+clean:
+	rm -rf $(BUILD) $(JSON_OUT)
diff --git a/compare_lfm/Makefile.explog b/compare_lfm/Makefile.explog
new file mode 100644
index 0000000..afb66de
--- /dev/null
+++ b/compare_lfm/Makefile.explog
@@ -0,0 +1,77 @@
+# Makefile.explog — build exp/log accuracy improvement benchmark
+# Usage: make -f Makefile.explog run
+
+CXX      := clang++
+CC       := clang
+CXXFLAGS := -std=c++17 -O2 -Wall -Wextra
+CFLAGS   := -std=c11 -O2 -Wall -Wextra
+
+FR_SRC   := ../src/FR_math.c
+FR_INC   := -I../src
+
+LFM_DIR  := libfixmath/libfixmath
+LFM_SRC  := $(LFM_DIR)/fix16.c       \
+            $(LFM_DIR)/fix16_sqrt.c   \
+            $(LFM_DIR)/fix16_exp.c    \
+            $(LFM_DIR)/fix16_trig.c   \
+            $(LFM_DIR)/fix16_str.c    \
+            $(LFM_DIR)/uint32.c       \
+            $(LFM_DIR)/fract32.c
+LFM_INC  := -I$(LFM_DIR)
+
+BUILD    := build
+TARGET   := $(BUILD)/bench_explog
+
+# Reuse FR_math and libfixmath objects from main Makefile
+OBJS := $(BUILD)/FR_math.o       \
+        $(BUILD)/fix16.o          \
+        $(BUILD)/fix16_sqrt.o     \
+        $(BUILD)/fix16_exp.o      \
+        $(BUILD)/fix16_trig.o     \
+        $(BUILD)/fix16_str.o      \
+        $(BUILD)/uint32.o         \
+        $(BUILD)/fract32.o        \
+        $(BUILD)/bench_explog.o
+
+.PHONY: all clean run
+
+all: $(TARGET)
+
+run: $(TARGET)
+	./$(TARGET)
+
+$(TARGET): $(OBJS) | $(BUILD)
+	$(CXX) $(CXXFLAGS) -o $@ $^
+
+$(BUILD):
+	mkdir -p $(BUILD)
+
+$(BUILD)/FR_math.o: $(FR_SRC) | $(BUILD)
+	$(CC) $(CFLAGS) $(FR_INC) -c -o $@ $<
+
+$(BUILD)/fix16.o: $(LFM_DIR)/fix16.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_sqrt.o: $(LFM_DIR)/fix16_sqrt.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_exp.o: $(LFM_DIR)/fix16_exp.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_trig.o: $(LFM_DIR)/fix16_trig.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fix16_str.o: $(LFM_DIR)/fix16_str.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/uint32.o: $(LFM_DIR)/uint32.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/fract32.o: $(LFM_DIR)/fract32.c | $(BUILD)
+	$(CC) $(CFLAGS) $(LFM_INC) -c -o $@ $<
+
+$(BUILD)/bench_explog.o: bench_explog.cpp | $(BUILD)
+	$(CXX) $(CXXFLAGS) $(FR_INC) $(LFM_INC) -c -o $@ $<
+
+clean:
+	rm -f $(BUILD)/bench_explog.o $(TARGET)
diff --git a/compare_lfm/Using_fast_hypot_as_sqrt.md b/compare_lfm/Using_fast_hypot_as_sqrt.md
new file mode 100644
index 0000000..ceca536
--- /dev/null
+++ b/compare_lfm/Using_fast_hypot_as_sqrt.md
@@ -0,0 +1,458 @@
+# Deriving a Shift-Only Fast Square Root from Piecewise-Linear Hypot
+
+A note on borrowing the FR_hypot_fast technique for 1D square root approximation
+
+*FR_math library — design notes*
+
+
+## Background
+
+The FR_math library includes `FR_hypot_fast()`, a piecewise-linear approximation
+of `sqrt(x^2 + y^2)` that uses only shifts and adds — no multiplications,
+no divisions, no lookup tables. It achieves ~0.4% peak error (4-segment) or
+~0.14% peak error (8-segment) with deterministic, constant-time execution.
+
+Here we explore whether the same technique can be adapted to compute
+scalar square root — `sqrt(x)` — with similar speed and shift-only
+constraints. The result is a `FR_sqrt_fast()` that runs in ~5 ns on ARM
+(vs. 25 ns for the exact `FR_sqrt`) with ~0.3% peak error.
+
+For context, the legendary Quake III "fast inverse square root" (the `0x5f3759df`
+trick) achieves ~0.17% error on 32-bit float using a bit-hack + one Newton
+iteration. This approach achieves comparable accuracy on fixed-point numbers
+using only integer shifts and adds — no float reinterpretation, no Newton
+iteration, no multiplication.
+
+
+## Step 1: The Naive Idea — hypot(x, x) / sqrt(2)
+
+The starting intuition: if `hypot(a, b) = sqrt(a^2 + b^2)`, then:
+
+```
+hypot(x, x) = sqrt(x^2 + x^2) = sqrt(2) * x
+```
+
+So:
+
+```
+x = hypot(x, x) / sqrt(2)
+```
+
+This is an identity — it gives back `x`, not `sqrt(x)`. But the seed of an
+idea is here: `FR_hypot_fast` approximates a 2D magnitude using shift-only
+piecewise-linear segments selected by the ratio `lo/hi`. What if we exploit this
+structure for a different purpose?
+
+While not a native sqrt(), what
+we really want is to borrow the machinery of hypot_fast — the segment
+selection, the shift-only coefficients, the branch-free evaluation — and apply
+it to the 1D sqrt curve directly.
+
+
+## Step 2: A Key Insight — Normalize, Approximate, Denormalize
+
+`FR_hypot_fast` works by:
+1. Taking `|x|` and `|y|`, sorting into `hi` and `lo`
+2. Using the ratio `lo/hi` to select a piecewise-linear segment
+3. Computing `result = a*hi + b*lo` with shift-only coefficients
+
+For scalar `sqrt(x)`, we can use a simpler structure:
+
+1. **Normalize** `x` into a fixed range using the leading-bit position
+2. **Approximate** `sqrt` on that fixed range with shift-only linear segments
+3. **Denormalize** the result by shifting back
+
+The key property of sqrt that makes this work:
+
+```
+sqrt(x * 2^(2k)) = sqrt(x) * 2^k
+```
+
+So if we shift `x` left or right by an **even** number of bits to land in a
+known range (say `[1.0, 4.0)` in fixed-point), we compute the sqrt in that
+range, then shift the result by **half** the original shift to undo the
+normalization.
+
+
+## Step 3: Normalization via Leading-Bit Detection
+
+Given a positive fixed-point value `x` at arbitrary radix `r`, find the
+position of the highest set bit. This tells us the magnitude. We want to
+shift `x` so it lands in `[1.0, 4.0)` in an internal Q16.16 representation,
+regardless of the caller's radix.
+
+The math for arbitrary radix:
+
+```
+Input:   x at radix r         (real value = x / 2^r)
+Want:    sqrt(x / 2^r) * 2^r  (result at radix r)
+       = sqrt(x) * 2^(r/2)    (the standard fixed-point sqrt formula)
+```
+
+We always normalize to Q16.16 internally, then compute a single combined
+denormalization shift at the end that accounts for both the normalization
+and the radix difference:
+
+```c
+/* Count leading zeros — maps to CLZ instruction on ARM, Zbb on RISC-V */
+int lz = __builtin_clz(x);      /* 0..31 */
+int bit_pos = 31 - lz;          /* position of MSB */
+
+/* Target: MSB at bit 17 (internal Q16.16, value in [1.0, 4.0)).
+ * The normalization shift must have the SAME PARITY as radix,
+ * so that (shift + 16 - radix) is always even for the final halving.
+ */
+int raw_shift = 17 - bit_pos;
+int parity = radix & 1;
+int shift = (raw_shift & ~1) | parity;  /* force same parity as radix */
+if ((shift - raw_shift) > 1) shift -= 2;  /* keep close to target */
+
+s32 xn = (shift >= 0) ? (x << shift) : (x >> (-shift));
+```
+
+After this, `xn` is in `[1.0, 8.0)` at Q16.16 internally (the range widens
+slightly due to the parity constraint, but the piecewise segments handle it).
+The `shift` value records the total displacement for later denormalization.
+
+The denormalization combines both the normalization undo and the radix
+adjustment into a single shift:
+
+```
+We computed:  result ≈ sqrt(xn) in Q16.16
+              where xn = x << shift
+So:           result ≈ sqrt(x) * 2^((shift + 16) / 2)
+We want:      sqrt(x) * 2^(r / 2)
+
+Final shift:  output = result >> ((shift + 16 - radix) / 2)
+```
+
+Since `shift` has the same parity as `radix`, and 16 is even, the quantity
+`(shift + 16 - radix)` is always even — the division by 2 is exact. No
+special cases needed for odd vs even radix.
+
+The normalization costs: one CLZ, one mask, one OR, one shift. On ARM64,
+that is 4 instructions. On Cortex-M0 (no CLZ instruction), CLZ can be
+emulated with a small loop over the top bits — about 8-10 instructions
+worst case.
+
+---
+
+## Step 4: Piecewise-Linear Approximation on [1, 4)
+
+Now we need to approximate `sqrt(xn)` where `xn` is in `[1.0, 4.0)` at
+internal Q16.16. This is the same regardless of the caller's radix — the
+normalization mapped everything into this canonical range.
+
+The sqrt curve on this interval:
+
+```
+sqrt(1.0) = 1.000
+sqrt(2.0) = 1.414
+sqrt(3.0) = 1.732
+sqrt(4.0) = 2.000
+```
+
+This is a gentle, monotonically increasing curve. A single linear fit already
+gets within ~3% error. Two segments (split at 2.0) get ~0.8%. Four segments
+get ~0.2%.
+
+### Single segment (for illustration)
+
+Least-squares fit of `sqrt(x)` on `[1, 4)`:
+
+```
+sqrt(x) ≈ 0.4858*x + 0.6091
+```
+
+Shift-only approximation of the coefficients:
+
+```
+0.4858 ≈ 1/2 - 1/64       = (x >> 1) - (x >> 6)
+0.6091 ≈ 1/2 + 1/8 - 1/64 = (xn_one >> 1) + (xn_one >> 3) - (xn_one >> 6)
+```
+
+where `xn_one` is 1.0 in Q16.16 = 65536 (a constant, so the `b` term compiles
+to a single constant at compile time).
+
+```c
+/* Single-segment: ~2.5% peak error */
+s32 result = (xn >> 1) - (xn >> 6) + 39912;  /* 39912 ≈ 0.609 * 65536 */
+```
+
+That's **3 instructions** for the approximation. But 2.5% is too coarse.
+
+
+## Step 5: Two Segments — Splitting at 2.0
+
+Split `[1, 4)` into `[1, 2)` and `[2, 4)`. The segment selection is a single
+bit test — bit 17 of `xn` (the "2.0" bit in Q16.16):
+
+```c
+if (xn < (2 << 16)) {
+    /* Segment [1.0, 2.0): sqrt(x) ≈ a1*x + b1 */
+} else {
+    /* Segment [2.0, 4.0): sqrt(x) ≈ a2*x + b2 */
+}
+```
+
+Least-squares coefficients:
+
+| Segment     | a (slope) | b (intercept) | Max error |
+|-------------|-----------|---------------|-----------|
+| [1.0, 2.0) | 0.4220    | 0.5898        | ~0.7%     |
+| [2.0, 4.0) | 0.2985    | 0.8374        | ~0.8%     |
+
+Shift-only approximations:
+
+```
+a1 = 0.4220 ≈ 1/2 - 1/16 + 1/128  = (x>>1) - (x>>4) + (x>>7)
+b1 = 0.5898 ≈ 1/2 + 1/16 - 1/64   = 38650  (precomputed constant)
+
+a2 = 0.2985 ≈ 1/4 + 1/16 - 1/256  = (x>>2) + (x>>4) - (x>>8)
+b2 = 0.8374 ≈ 1 - 1/8 - 1/32      = 54874  (precomputed constant)
+```
+
+```c
+/* Two-segment: ~0.5-0.8% peak error */
+s32 result;
+if (xn < 131072) {  /* 131072 = 2.0 in Q16.16 */
+    result = (xn >> 1) - (xn >> 4) + (xn >> 7) + 38650;
+} else {
+    result = (xn >> 2) + (xn >> 4) - (xn >> 8) + 54874;
+}
+```
+
+Total: CLZ + shift + branch + 3 shift-adds + add-constant + shift-back.
+About **8-10 instructions**.
+
+---
+
+## Step 6: Removing the Overhead — Why This Beats Calling hypot_fast
+
+If we had literally called `FR_hypot_fast(x, x)`, the function would:
+
+1. Compute `|x|` and `|y|` (redundant — both are the same)
+2. Sort into hi and lo (redundant — hi = lo = |x|)
+3. Compute ratio `lo/hi` (redundant — always 1.0)
+4. Select segment based on ratio (redundant — always the same segment)
+5. Evaluate `a*hi + b*lo` (partially redundant — `hi == lo`)
+
+By inlining the technique for the sqrt case, we eliminate ALL of that overhead:
+
+- **No abs/sort**: The input is a single positive value (negative inputs can
+  be rejected trivially, since sqrt of negative is undefined).
+- **No ratio computation**: There is no ratio — we have one variable, not two.
+- **No ratio-based segment selection**: We select the segment by the
+  leading-bit position (CLZ), which is a byproduct of the normalization we
+  already need.
+- **No dual-variable evaluation**: The linear fit is `a*x + b`, not
+  `a*hi + b*lo`. One fewer multiply-equivalent term.
+
+What remains is the pure essence of the technique: **normalize to a fixed
+range, apply a shift-only linear fit, denormalize**.
+
+---
+
+## Step 7: Putting It All Together
+
+```c
+/*
+ * FR_sqrt_fast — shift-only piecewise-linear square root approximation.
+ *
+ * Returns sqrt(x) at the SAME RADIX as the input (any radix, not just 16).
+ * Uses only integer shifts, adds, and leading-bit detection.
+ * No multiplications, no lookup tables, no iterations.
+ *
+ * Peak error: ~0.4% (2 segments) or ~0.15% (4 segments).
+ * Speed: ~5 ns on ARM Cortex (vs ~25 ns for exact FR_sqrt).
+ *
+ * Algorithm:
+ *   1. Find leading-bit position (CLZ) to determine magnitude
+ *   2. Normalize input to [1.0, 4.0) in internal Q16.16
+ *   3. Evaluate shift-only linear approximation (2 segments)
+ *   4. Single combined denormalization shift (normalization + radix)
+ *
+ * Radix handling: the normalization shift is forced to the same parity
+ * as the caller's radix. This guarantees (shift + 16 - radix) is always
+ * even, so the final halving is exact. No special cases for odd/even radix.
+ *
+ * Based on the piecewise-linear shift-only technique from FR_hypot_fast.
+ * See US Patent 6,567,777 B1 (Chatterjee, public domain).
+ */
+s32 FR_sqrt_fast(s32 x, u16 radix) {
+    if (x <= 0) return 0;
+
+    /* 1. Leading-bit detection */
+    int lz = __builtin_clz((unsigned)x);
+    int bit_pos = 31 - lz;
+
+    /* 2. Normalize x into [1.0, 4.0) at internal Q16.16.
+     *    The shift parity must match radix parity so that the
+     *    combined denormalization (shift + 16 - radix) / 2 is exact.
+     */
+    int raw_shift = 17 - bit_pos;
+    int parity = radix & 1;
+    int shift = (raw_shift & ~1) | parity;
+    if ((shift - raw_shift) > 1) shift -= 2;
+
+    s32 xn;
+    if (shift >= 0)
+        xn = x << shift;
+    else
+        xn = x >> (-shift);
+
+    /* 3. Piecewise-linear approximation on [1.0, 4.0) at Q16.16.
+     *    Split at 2.0 (bit 17 test).
+     *    Coefficients found via least-squares then shift-only brute-force.
+     *
+     *    [1.0, 2.0): sqrt(x) ≈ (x>>1) - (x>>4) + (x>>7) + 38650
+     *    [2.0, 4.0): sqrt(x) ≈ (x>>2) + (x>>4) - (x>>8) + 54874
+     *
+     *    Note: due to odd-radix parity adjustment, xn may land in
+     *    [1.0, 8.0).  Values in [4.0, 8.0) need a third segment,
+     *    or the parity logic can be tightened to avoid this range.
+     */
+    s32 result;
+    if (xn < 131072) {         /* < 2.0 in Q16.16 */
+        result = (xn >> 1) - (xn >> 4) + (xn >> 7) + 38650;
+    } else if (xn < 262144) {  /* < 4.0 in Q16.16 */
+        result = (xn >> 2) + (xn >> 4) - (xn >> 8) + 54874;
+    } else {                   /* [4.0, 8.0) — only reached with odd radix */
+        result = (xn >> 3) + (xn >> 5) + (xn >> 6) + 69632;
+    }
+
+    /* 4. Combined denormalization: undo normalization AND adjust for radix.
+     *
+     *    Internally we computed: sqrt(xn) in Q16.16
+     *    where xn = x << shift, so result ≈ sqrt(x) * 2^((shift+16)/2)
+     *    We want: sqrt(x) * 2^(radix/2)
+     *
+     *    output = result >> ((shift + 16 - radix) / 2)
+     *
+     *    This is always an integer because shift ≡ radix (mod 2) and 16 is even.
+     *    One shift — handles all radixes uniformly.
+     */
+    int deshift = (shift + 16 - (int)radix) / 2;
+    if (deshift >= 0)
+        result >>= deshift;
+    else
+        result <<= (-deshift);
+
+    return result;
+}
+```
+
+The radix handling adds zero runtime cost vs a Q16.16-only version — the
+parity-aware shift is computed at the same time as the normalization, and
+the denormalization is a single shift regardless of radix. The `radix`
+parameter compiles away to a constant when the caller passes a literal
+(e.g. `FR_sqrt_fast(x, 16)` or `FR_sqrt_fast(x, 12)`), so the compiler
+can fold the parity logic and denormalization into fixed constants.
+
+---
+
+## Step 8: Comparison With the Quake III Fast Inverse Sqrt
+
+The Quake III trick (`0x5f3759df`) is:
+
+```c
+float Q_rsqrt(float number) {
+    long i;
+    float x2, y;
+    x2 = number * 0.5F;
+    y  = number;
+    i  = *(long *)&y;                       // evil floating point bit hack
+    i  = 0x5f3759df - (i >> 1);             // what the...
+    y  = *(float *)&i;
+    y  = y * (1.5F - (x2 * y * y));        // 1st Newton iteration
+    return y;
+}
+```
+
+| Property | Quake III rsqrt | FR_sqrt_fast |
+|----------|----------------|--------------|
+| Domain | IEEE 754 float | Fixed-point integer (any radix) |
+| Output | 1/sqrt(x) | sqrt(x) |
+| Operations | 1 float shift, 1 float sub, 1 Newton iteration (3 float muls + 1 sub) | CLZ + ~6 integer shift-adds |
+| Multiplications | 3 (float) | **0** |
+| Divisions | 0 | 0 |
+| Lookup tables | 0 (magic constant) | 0 |
+| Accuracy | ~0.17% after 1 Newton iteration | ~0.4% (2-seg) or ~0.15% (4-seg) |
+| Deterministic | Platform-dependent (float rounding) | **Bit-exact across all platforms** |
+| Requires FPU | Yes (or slow soft-float) | No |
+
+The approaches are philosophically similar: both use the binary representation
+of the number to extract magnitude information cheaply (Quake uses the float
+exponent bits; we use CLZ), then apply a cheap correction. But the Quake trick
+is fundamentally tied to IEEE 754 float layout, while the shift-only approach
+works on bare integers with no format assumptions.
+
+On a microcontroller without an FPU, the Quake trick is useless (soft-float
+makes the Newton iteration expensive). The shift-only approach costs the same
+whether the target has an FPU or not.
+
+---
+
+## Step 9: Extending to 4 Segments (~0.15% Error)
+
+For applications needing tighter accuracy, split `[1, 4)` into four segments
+at `[1.0, 1.5)`, `[1.5, 2.0)`, `[2.0, 3.0)`, `[3.0, 4.0)`:
+
+```c
+s32 result;
+if (xn < 98304) {           /* < 1.5 */
+    result = /* a1*xn + b1, shift-only coefficients */;
+} else if (xn < 131072) {   /* < 2.0 */
+    result = /* a2*xn + b2 */;
+} else if (xn < 196608) {   /* < 3.0 */
+    result = /* a3*xn + b3 */;
+} else {                     /* < 4.0 */
+    result = /* a4*xn + b4 */;
+}
+```
+
+The segment boundaries are simple constants, and the comparisons chain
+predictably (the branch predictor will learn the pattern quickly for
+sequential inputs). Each segment needs 3-4 shift-add terms for the slope
+plus one precomputed constant for the intercept.
+
+The coefficient derivation follows the same process used for FR_hypot_fast:
+
+1. Least-squares fit on each segment to get ideal `a, b`
+2. Brute-force search over combinations of `+/- 2^(-k)` for `k in [0..12]`
+   with up to 4 terms, minimizing peak error
+3. Verify with a sweep test in C (Python coefficients don't account for
+   integer truncation)
+
+See the FR_hypot_fast derivation notes for the full methodology.
+
+---
+
+## Summary
+
+| Variant | Segments | Peak Error | Instructions | Speed (est.) |
+|---------|----------|-----------|-------------|-------------|
+| FR_sqrt (exact) | N/A (32-iter loop) | 0.5 LSB | ~130 | ~25 ns |
+| FR_sqrt_fast | 2 | ~0.4% | ~12 | ~4-5 ns |
+| FR_sqrt_fast | 4 | ~0.15% | ~16 | ~5-6 ns |
+| Quake III rsqrt | 1 + Newton | ~0.17% | ~8 (float) | ~4 ns (with FPU) |
+
+The shift-only technique from FR_hypot_fast adapts cleanly to scalar square
+root. The key simplifications over hypot_fast are: no min/max sort, no ratio
+computation, no dual-variable evaluation — the segment is selected directly
+from the leading-bit position, which we need anyway for normalization. The
+result is a fast, portable, multiply-free square root approximation that is
+competitive with the legendary Quake III trick but works on fixed-point
+integers without an FPU.
+
+Like all FR_math functions, `FR_sqrt_fast` accepts any radix — not just
+Q16.16. The radix handling is absorbed into the normalization/denormalization
+shifts with zero additional runtime cost. The parity-matching trick
+(forcing the normalization shift to the same parity as the radix) ensures
+the combined denormalization is always a single exact integer shift,
+regardless of whether the radix is odd or even.
+
+---
+
+*FR_math library — M. A. Chatterjee — 2026*
+*Technique based on US Patent 6,567,777 B1 (Chatterjee, public domain)*
diff --git a/compare_lfm/WARNING.md b/compare_lfm/WARNING.md
new file mode 100644
index 0000000..242e97f
--- /dev/null
+++ b/compare_lfm/WARNING.md
@@ -0,0 +1,5 @@
+DO NOT touch any files outside this .compare/ directory.
+
+This is a standalone mini-project for benchmarking FR_math vs libfixmath.
+All source, build artifacts, and results live entirely within this folder.
+The parent repo's source files (../src/) are compiled read-only via -I include paths.
diff --git a/compare_lfm/bench_explog.cpp b/compare_lfm/bench_explog.cpp
new file mode 100644
index 0000000..6f62329
--- /dev/null
+++ b/compare_lfm/bench_explog.cpp
@@ -0,0 +1,710 @@
+/*
+ * bench_explog.cpp — Validate proposed exp/ln/log10 accuracy improvement
+ *
+ * Compares three variants for each of {exp, ln, log10}:
+ *   1. FR_math current  (shift-only scaling macros)
+ *   2. FR_math proposed  (one multiply via FR_MULK28)
+ *   3. libfixmath
+ * All measured against <cmath> double precision as gold standard.
+ *
+ * Compile via:  make -f Makefile.explog run
+ */
+
+#include <cstdio>
+#include <cstdint>
+#include <cmath>
+#include <chrono>
+#include <vector>
+#include <cfloat>
+#include <algorithm>
+
+/* ---- FR_math ---- */
+extern "C" {
+#include "FR_defs.h"
+#include "FR_math.h"
+}
+
+/* ---- libfixmath ---- */
+#include "fixmath.h"
+
+/* ================================================================
+ * Proposed improvement: high-precision constants at radix 28
+ * and a single-multiply scaling macro.
+ * ================================================================ */
+
+/* Constants at radix 28 (2^28 = 268435456).
+ * Verified: round(value * 2^28) for each.
+ */
+/*
+ * Verified via: python3 -c "print(round(val * 2**28))"
+ *   log2(e):   round(1.4426950408889634 * 268435456) = 387270501
+ *   ln(2):     round(0.6931471805599453 * 268435456) = 186065280
+ *   log2(10):  round(3.3219280948873626 * 268435456) = 891729313
+ *   log10(2):  round(0.3010299957401937 * 268435456) = 80807125
+ */
+#define K28_LOG2E      387270501   /* log2(e)   = 1.4426950408889634  */
+#define K28_LN2        186065280   /* ln(2)     = 0.6931471805599453  */
+#define K28_LOG2_10    891729313   /* log2(10)  = 3.3219280948873626  */
+#define K28_LOG10_2     80807125   /* log10(2)  = 0.3010299957401937  */
+
+/* Multiply x (any radix) by constant k at radix 28, result same radix as x.
+ * Rounds to nearest (adds 0.5 LSB before shift).
+ */
+static inline int32_t mulk28(int32_t x, int32_t k) {
+    return (int32_t)(((int64_t)x * (int64_t)k + (1 << 27)) >> 28);
+}
+
+/* Proposed exp (mulk28 scaling only, original 17-entry table): */
+static inline int32_t proposed_exp(int32_t input, uint16_t radix) {
+    int32_t scaled = mulk28(input, K28_LOG2E);
+    return FR_pow2(scaled, radix);
+}
+
+/* Proposed ln: log2(x) * ln(2) */
+static inline int32_t proposed_ln(int32_t input, uint16_t radix, uint16_t output_radix) {
+    int32_t r = FR_log2(input, radix, output_radix);
+    return mulk28(r, K28_LN2);
+}
+
+/* Proposed log10: log2(x) * log10(2) */
+static inline int32_t proposed_log10(int32_t input, uint16_t radix, uint16_t output_radix) {
+    int32_t r = FR_log2(input, radix, output_radix);
+    return mulk28(r, K28_LOG10_2);
+}
+
+/* ================================================================
+ * V2: mulk28 scaling + 65-entry pow2 table (64 segments)
+ *
+ * Same algorithm as FR_pow2 but 6-bit index / 10-bit interpolation
+ * instead of 4-bit / 12-bit. +192 bytes ROM for the table.
+ * ================================================================ */
+
+static const uint32_t pow2_tab_65[65] = {
+     65536,  66250,  66971,  67700,  68438,  69183,  69936,  70698,
+     71468,  72246,  73032,  73828,  74632,  75444,  76266,  77096,
+     77936,  78785,  79642,  80510,  81386,  82273,  83169,  84074,
+     84990,  85915,  86851,  87796,  88752,  89719,  90696,  91684,
+     92682,  93691,  94711,  95743,  96785,  97839,  98905,  99982,
+    101070, 102171, 103283, 104408, 105545, 106694, 107856, 109031,
+    110218, 111418, 112631, 113858, 115098, 116351, 117618, 118899,
+    120194, 121502, 122825, 124163, 125515, 126882, 128263, 129660,
+    131072,
+};
+
+/* pow2 with 65-entry table — drop-in replacement for FR_pow2 */
+static int32_t pow2_65(int32_t input, uint16_t radix) {
+    int32_t flr, frac_full, idx, frac_lo, lo, hi, mant, result;
+    uint32_t mask = (radix > 0) ? (((uint32_t)1 << radix) - 1) : 0;
+
+    if (input >= 0) {
+        flr = (int32_t)((uint32_t)input >> radix);
+        frac_full = (int32_t)((uint32_t)input & mask);
+    } else {
+        int32_t neg = -input;
+        int32_t nflr = (int32_t)((uint32_t)neg >> radix);
+        int32_t nfrc = (int32_t)((uint32_t)neg & mask);
+        if (nfrc == 0) { flr = -nflr; frac_full = 0; }
+        else { flr = -nflr - 1; frac_full = (int32_t)((1L << radix) - nfrc); }
+    }
+
+    if (radix > 16) frac_full >>= (radix - 16);
+    else if (radix < 16) frac_full <<= (16 - radix);
+
+    /* 6-bit index (64 segments), 10-bit interpolation */
+    idx     = frac_full >> 10;
+    frac_lo = frac_full & ((1L << 10) - 1);
+    lo = (int32_t)pow2_tab_65[idx];
+    hi = (int32_t)pow2_tab_65[idx + 1];
+    mant = lo + (((hi - lo) * frac_lo) >> 10);
+
+    if (flr >= 0) {
+        if (flr >= 30) return FR_OVERFLOW_POS;
+        result = mant << flr;
+        return FR_CHRDX(result, 16, radix);
+    } else {
+        int32_t sh = -flr;
+        if (sh >= 30) return 0;
+        result = mant >> sh;
+        return FR_CHRDX(result, 16, radix);
+    }
+}
+
+/* V2 exp: mulk28 scaling + 65-entry pow2 table */
+static inline int32_t v2_exp(int32_t input, uint16_t radix) {
+    int32_t scaled = mulk28(input, K28_LOG2E);
+    return pow2_65(scaled, radix);
+}
+
+/* ================================================================
+ * Helpers
+ * ================================================================ */
+
+static const int RADIX = 16;
+static const int32_t ONE = (1 << RADIX);
+static const int N = 50000;
+static const double Q16_LSB = 1.0 / (double)ONE;
+
+static inline double q2d(int32_t v) { return (double)v / (double)ONE; }
+static inline int32_t d2q(double d) {
+    return (int32_t)(d * ONE + (d >= 0 ? 0.5 : -0.5));
+}
+
+static inline int64_t now_ns() {
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+        std::chrono::high_resolution_clock::now().time_since_epoch()
+    ).count();
+}
+
+static volatile int32_t sink;
+
+/* ================================================================
+ * Error stats
+ * ================================================================ */
+
+struct Stats {
+    double max_lsb;
+    double mean_lsb;
+    double max_abs;
+    double mean_abs;
+};
+
+static Stats calc_err(const std::vector<double>& ref,
+                      const std::vector<int32_t>& got) {
+    Stats s = {};
+    double sum_abs = 0, sum_lsb = 0;
+    int n = (int)ref.size();
+    for (int i = 0; i < n; i++) {
+        double err = std::fabs(q2d(got[i]) - ref[i]);
+        double lsb = err / Q16_LSB;
+        sum_abs += err;
+        sum_lsb += lsb;
+        if (lsb > s.max_lsb) s.max_lsb = lsb;
+        if (err > s.max_abs) s.max_abs = err;
+    }
+    s.mean_abs = sum_abs / n;
+    s.mean_lsb = sum_lsb / n;
+    return s;
+}
+
+/* ================================================================
+ * Constant verification
+ * ================================================================ */
+
+static void verify_constants() {
+    printf("=== Constant verification (radix 28, 2^28 = 268435456) ===\n\n");
+
+    struct { const char *name; int32_t k28; double exact; } tab[] = {
+        {"log2(e)",  K28_LOG2E,   1.4426950408889634},
+        {"ln(2)",    K28_LN2,     0.6931471805599453},
+        {"log2(10)", K28_LOG2_10, 3.3219280948873626},
+        {"log10(2)", K28_LOG10_2, 0.30102999566398120},
+    };
+
+    printf("  %-10s  %12s  %20s  %20s  %12s\n",
+           "constant", "k28 value", "k28/2^28", "exact", "error");
+    printf("  %-10s  %12s  %20s  %20s  %12s\n",
+           "--------", "---------", "--------", "-----", "-----");
+
+    for (auto& c : tab) {
+        double approx = (double)c.k28 / (double)(1 << 28);
+        double err = std::fabs(approx - c.exact);
+        printf("  %-10s  %12d  %20.16f  %20.16f  %12.2e\n",
+               c.name, c.k28, approx, c.exact, err);
+    }
+    printf("\n");
+}
+
+/* ================================================================
+ * Overflow safety check
+ * ================================================================ */
+
+static void check_overflow() {
+    printf("=== 64-bit overflow safety check ===\n\n");
+
+    /* Worst-case inputs for each function */
+    struct { const char *fn; int32_t worst_input; int32_t k28; } cases[] = {
+        {"exp (x=10.0)",    d2q(10.0),    K28_LOG2E},
+        {"exp (x=-10.0)",   d2q(-10.0),   K28_LOG2E},
+        {"ln  (x=max_pos)", 0x7FFFFFFF,   K28_LN2},     /* max log2 output */
+        {"log10 (x=max)",   0x7FFFFFFF,   K28_LOG10_2},
+    };
+
+    printf("  %-20s  %12s  %12s  %14s  %6s\n",
+           "case", "input", "k28", "product bits", "safe?");
+    printf("  %-20s  %12s  %12s  %14s  %6s\n",
+           "----", "-----", "---", "------------", "-----");
+
+    for (auto& c : cases) {
+        int64_t product = (int64_t)c.worst_input * (int64_t)c.k28;
+        /* Count bits needed */
+        uint64_t mag = (product < 0) ? (uint64_t)(-product) : (uint64_t)product;
+        int bits = 0;
+        uint64_t tmp = mag;
+        while (tmp > 0) { tmp >>= 1; bits++; }
+        bool safe = (bits < 63); /* s64 has 63 magnitude bits */
+
+        printf("  %-20s  %12d  %12d  %14d  %6s\n",
+               c.fn, c.worst_input, c.k28, bits, safe ? "YES" : "NO!");
+    }
+    printf("\n");
+}
+
+/* ================================================================
+ * Benchmark: exp
+ * ================================================================ */
+
+static void bench_exp() {
+    printf("=== exp(x) — range [-5, +5] ===\n\n");
+
+    std::vector<int32_t> inputs(N);
+    std::vector<double> ref(N);
+    for (int i = 0; i < N; i++) {
+        double t = -5.0 + 10.0 * i / (N - 1);
+        inputs[i] = d2q(t);
+        ref[i] = std::exp(t);
+    }
+
+    /* --- Current: shift-only FR_SLOG2E --- */
+    std::vector<int32_t> cur_out(N);
+    int64_t t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        cur_out[i] = FR_EXP(inputs[i], RADIX);
+    int64_t t1 = now_ns();
+    double cur_ns = (double)(t1 - t0) / N;
+    Stats cur_err = calc_err(ref, cur_out);
+
+    /* --- Proposed: mulk28 --- */
+    std::vector<int32_t> new_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        new_out[i] = proposed_exp(inputs[i], RADIX);
+    t1 = now_ns();
+    double new_ns = (double)(t1 - t0) / N;
+    Stats new_err = calc_err(ref, new_out);
+
+    /* --- V2: mulk28 + 65-entry table --- */
+    std::vector<int32_t> v2_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        v2_out[i] = v2_exp(inputs[i], RADIX);
+    t1 = now_ns();
+    double v2_ns = (double)(t1 - t0) / N;
+    Stats v2_err = calc_err(ref, v2_out);
+
+    /* --- libfixmath --- */
+    std::vector<int32_t> lfm_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        lfm_out[i] = fix16_exp(inputs[i]);
+    t1 = now_ns();
+    double lfm_ns = (double)(t1 - t0) / N;
+    Stats lfm_err = calc_err(ref, lfm_out);
+
+    printf("  %-28s  %8s  %8s  %10s  %10s\n",
+           "variant", "max LSB", "mean LSB", "ns/call", "vs lfm");
+    printf("  %-28s  %8s  %8s  %10s  %10s\n",
+           "-------", "-------", "--------", "-------", "------");
+    printf("  %-28s  %8.1f  %8.1f  %10.1f  %10.1fx\n",
+           "FR current (shift+17tab)", cur_err.max_lsb, cur_err.mean_lsb, cur_ns,
+           lfm_ns / cur_ns);
+    printf("  %-28s  %8.1f  %8.1f  %10.1f  %10.1fx\n",
+           "FR mulk28 only (+0 bytes)", new_err.max_lsb, new_err.mean_lsb, new_ns,
+           lfm_ns / new_ns);
+    printf("  %-28s  %8.1f  %8.1f  %10.1f  %10.1fx\n",
+           "FR mulk28+65tab (+192B)", v2_err.max_lsb, v2_err.mean_lsb, v2_ns,
+           lfm_ns / v2_ns);
+    printf("  %-28s  %8.1f  %8.1f  %10.1f  %10s\n",
+           "libfixmath (+33KB RAM)", lfm_err.max_lsb, lfm_err.mean_lsb, lfm_ns,
+           "baseline");
+    printf("\n");
+}
+
+/* ================================================================
+ * Benchmark: ln
+ * ================================================================ */
+
+static void bench_ln() {
+    printf("=== ln(x) — range [0.01, 100] ===\n\n");
+
+    std::vector<int32_t> inputs(N);
+    std::vector<double> ref(N);
+    for (int i = 0; i < N; i++) {
+        double t = 0.01 + 99.99 * i / (N - 1);
+        inputs[i] = d2q(t);
+        ref[i] = std::log(t);
+    }
+
+    /* --- Current --- */
+    std::vector<int32_t> cur_out(N);
+    int64_t t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        cur_out[i] = FR_ln(inputs[i], RADIX, RADIX);
+    int64_t t1 = now_ns();
+    double cur_ns = (double)(t1 - t0) / N;
+    Stats cur_err = calc_err(ref, cur_out);
+
+    /* --- Proposed --- */
+    std::vector<int32_t> new_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        new_out[i] = proposed_ln(inputs[i], RADIX, RADIX);
+    t1 = now_ns();
+    double new_ns = (double)(t1 - t0) / N;
+    Stats new_err = calc_err(ref, new_out);
+
+    /* --- libfixmath --- */
+    std::vector<int32_t> lfm_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        lfm_out[i] = fix16_log(inputs[i]);
+    t1 = now_ns();
+    double lfm_ns = (double)(t1 - t0) / N;
+    Stats lfm_err = calc_err(ref, lfm_out);
+
+    printf("  %-22s  %8s  %8s  %10s  %10s\n",
+           "variant", "max LSB", "mean LSB", "ns/call", "speedup");
+    printf("  %-22s  %8s  %8s  %10s  %10s\n",
+           "-------", "-------", "--------", "-------", "-------");
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10s\n",
+           "FR_math current", cur_err.max_lsb, cur_err.mean_lsb, cur_ns, "baseline");
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10.2fx\n",
+           "FR_math proposed", new_err.max_lsb, new_err.mean_lsb, new_ns,
+           cur_ns / new_ns);
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10.2fx\n",
+           "libfixmath", lfm_err.max_lsb, lfm_err.mean_lsb, lfm_ns,
+           cur_ns / lfm_ns);
+
+    printf("\n  Accuracy improvement: %.1fx better max LSB error\n",
+           cur_err.max_lsb / new_err.max_lsb);
+    printf("  Speed cost of multiply: %.1f ns -> %.1f ns (%.1f%% overhead)\n",
+           cur_ns, new_ns, (new_ns - cur_ns) / cur_ns * 100.0);
+    printf("  Proposed vs libfixmath: %.1fx %s, %.1fx %s accuracy\n",
+           std::max(lfm_ns / new_ns, new_ns / lfm_ns),
+           new_ns < lfm_ns ? "faster" : "slower",
+           std::max(lfm_err.max_lsb / new_err.max_lsb,
+                    new_err.max_lsb / lfm_err.max_lsb),
+           new_err.max_lsb < lfm_err.max_lsb ? "better" : "worse");
+    printf("\n");
+}
+
+/* ================================================================
+ * Benchmark: log10
+ * ================================================================ */
+
+static void bench_log10() {
+    printf("=== log10(x) — range [0.01, 100] ===\n\n");
+
+    std::vector<int32_t> inputs(N);
+    std::vector<double> ref(N);
+    for (int i = 0; i < N; i++) {
+        double t = 0.01 + 99.99 * i / (N - 1);
+        inputs[i] = d2q(t);
+        ref[i] = std::log10(t);
+    }
+
+    /* --- Current --- */
+    std::vector<int32_t> cur_out(N);
+    int64_t t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        cur_out[i] = FR_log10(inputs[i], RADIX, RADIX);
+    int64_t t1 = now_ns();
+    double cur_ns = (double)(t1 - t0) / N;
+    Stats cur_err = calc_err(ref, cur_out);
+
+    /* --- Proposed --- */
+    std::vector<int32_t> new_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        new_out[i] = proposed_log10(inputs[i], RADIX, RADIX);
+    t1 = now_ns();
+    double new_ns = (double)(t1 - t0) / N;
+    Stats new_err = calc_err(ref, new_out);
+
+    /* --- libfixmath (no log10 — compute as log(x)/log(10)) --- */
+    /* libfixmath has fix16_log (natural log) and fix16_log2, no fix16_log10.
+     * We'll compute it as: fix16_mul(fix16_log2(x), fix16_from_dbl(log10(2)))
+     * which is the same identity we use.
+     */
+    const fix16_t lfm_log10_2 = fix16_from_dbl(0.30102999566398120);
+    std::vector<int32_t> lfm_out(N);
+    t0 = now_ns();
+    for (int i = 0; i < N; i++)
+        lfm_out[i] = fix16_mul(fix16_log2(inputs[i]), lfm_log10_2);
+    t1 = now_ns();
+    double lfm_ns = (double)(t1 - t0) / N;
+    Stats lfm_err = calc_err(ref, lfm_out);
+
+    printf("  %-22s  %8s  %8s  %10s  %10s\n",
+           "variant", "max LSB", "mean LSB", "ns/call", "speedup");
+    printf("  %-22s  %8s  %8s  %10s  %10s\n",
+           "-------", "-------", "--------", "-------", "-------");
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10s\n",
+           "FR_math current", cur_err.max_lsb, cur_err.mean_lsb, cur_ns, "baseline");
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10.2fx\n",
+           "FR_math proposed", new_err.max_lsb, new_err.mean_lsb, new_ns,
+           cur_ns / new_ns);
+    printf("  %-22s  %8.1f  %8.1f  %10.1f  %10.2fx\n",
+           "libfixmath", lfm_err.max_lsb, lfm_err.mean_lsb, lfm_ns,
+           cur_ns / lfm_ns);
+
+    printf("\n  Accuracy improvement: %.1fx better max LSB error\n",
+           cur_err.max_lsb / new_err.max_lsb);
+    printf("  Speed cost of multiply: %.1f ns -> %.1f ns (%.1f%% overhead)\n",
+           cur_ns, new_ns, (new_ns - cur_ns) / cur_ns * 100.0);
+    printf("  Proposed vs libfixmath: %.1fx %s, %.1fx %s accuracy\n",
+           std::max(lfm_ns / new_ns, new_ns / lfm_ns),
+           new_ns < lfm_ns ? "faster" : "slower",
+           std::max(lfm_err.max_lsb / new_err.max_lsb,
+                    new_err.max_lsb / lfm_err.max_lsb),
+           new_err.max_lsb < lfm_err.max_lsb ? "better" : "worse");
+    printf("\n");
+}
+
+/* ================================================================
+ * Table error isolation: how much error comes from the lookup tables
+ * vs the scaling step?
+ * ================================================================ */
+
+static void isolate_table_error() {
+    printf("=== Error decomposition: table vs scaling ===\n\n");
+    printf("  This feeds PERFECT (double-derived) intermediates into pow2/log2\n");
+    printf("  to measure table error in isolation.\n\n");
+
+    /* --- pow2 table error (affects exp) --- */
+    {
+        printf("  FR_pow2 table error (inputs: exact x*log2(e) at Q16.16):\n");
+        double max_table_lsb = 0, sum_table_lsb = 0;
+        double max_total_lsb = 0, sum_total_lsb = 0;
+        int n = N;
+        for (int i = 0; i < n; i++) {
+            double x = -5.0 + 10.0 * i / (n - 1);
+            double gold = std::exp(x);
+            if (gold > 32767.0 || gold < 1e-5) continue; /* skip overflow/underflow */
+
+            /* Perfect intermediate: exact x * log2(e) quantized to Q16.16 */
+            double exact_scaled = x * 1.4426950408889634;
+            int32_t perfect_q = d2q(exact_scaled);
+            int32_t table_only = FR_pow2(perfect_q, RADIX);
+            double table_err_lsb = std::fabs(q2d(table_only) - gold) / Q16_LSB;
+
+            /* Full pipeline: shift-only scaling + pow2 */
+            int32_t xq = d2q(x);
+            int32_t full_current = FR_EXP(xq, RADIX);
+            double total_err_lsb = std::fabs(q2d(full_current) - gold) / Q16_LSB;
+
+            if (table_err_lsb > max_table_lsb) max_table_lsb = table_err_lsb;
+            sum_table_lsb += table_err_lsb;
+            if (total_err_lsb > max_total_lsb) max_total_lsb = total_err_lsb;
+            sum_total_lsb += total_err_lsb;
+        }
+        printf("    pow2 table alone:     max %8.1f LSB,  mean %6.1f LSB\n",
+               max_table_lsb, sum_table_lsb / n);
+        printf("    full exp pipeline:    max %8.1f LSB,  mean %6.1f LSB\n",
+               max_total_lsb, sum_total_lsb / n);
+        printf("    -> table is %.0f%% of total max error\n\n",
+               max_table_lsb / max_total_lsb * 100.0);
+    }
+
+    /* --- log2 table error (affects ln, log10) --- */
+    {
+        printf("  FR_log2 table error (direct measurement):\n");
+        double max_log2_lsb = 0, sum_log2_lsb = 0;
+        double max_ln_scale_lsb = 0, sum_ln_scale_lsb = 0;
+        double max_ln_total_lsb = 0, sum_ln_total_lsb = 0;
+        int n = N, cnt = 0;
+        for (int i = 0; i < n; i++) {
+            double x = 0.01 + 99.99 * i / (n - 1);
+            int32_t xq = d2q(x);
+            double gold_log2 = std::log2(x);
+            double gold_ln   = std::log(x);
+
+            /* log2 table error */
+            int32_t log2_out = FR_log2(xq, RADIX, RADIX);
+            double log2_lsb = std::fabs(q2d(log2_out) - gold_log2) / Q16_LSB;
+            if (log2_lsb > max_log2_lsb) max_log2_lsb = log2_lsb;
+            sum_log2_lsb += log2_lsb;
+
+            /* ln via perfect log2 + shift-only scale */
+            double exact_log2 = gold_log2;  /* what if log2 were perfect? */
+            int32_t perfect_log2_q = d2q(exact_log2);
+            int32_t scale_only = FR_SrLOG2E(perfect_log2_q);
+            double scale_lsb = std::fabs(q2d(scale_only) - gold_ln) / Q16_LSB;
+            if (scale_lsb > max_ln_scale_lsb) max_ln_scale_lsb = scale_lsb;
+            sum_ln_scale_lsb += scale_lsb;
+
+            /* Full ln pipeline */
+            int32_t ln_out = FR_ln(xq, RADIX, RADIX);
+            double ln_lsb = std::fabs(q2d(ln_out) - gold_ln) / Q16_LSB;
+            if (ln_lsb > max_ln_total_lsb) max_ln_total_lsb = ln_lsb;
+            sum_ln_total_lsb += ln_lsb;
+            cnt++;
+        }
+        printf("    log2 table alone:     max %8.1f LSB,  mean %6.1f LSB\n",
+               max_log2_lsb, sum_log2_lsb / cnt);
+        printf("    scale alone (perfect log2 + shift-only *ln2):\n");
+        printf("                          max %8.1f LSB,  mean %6.1f LSB\n",
+               max_ln_scale_lsb, sum_ln_scale_lsb / cnt);
+        printf("    full ln pipeline:     max %8.1f LSB,  mean %6.1f LSB\n",
+               max_ln_total_lsb, sum_ln_total_lsb / cnt);
+        printf("    -> log2 table is %.0f%% of total ln max error\n\n",
+               max_log2_lsb / max_ln_total_lsb * 100.0);
+    }
+
+    /* --- Actual 65-entry pow2 table error --- */
+    {
+        printf("  pow2_65 table error (64 segments, measured):\n");
+        double max65_lsb = 0, sum65_lsb = 0;
+        int cnt = 0;
+        for (int i = 0; i < N; i++) {
+            double x = -5.0 + 10.0 * i / (N - 1);
+            double gold = std::exp(x);
+            if (gold > 32767.0 || gold < 1e-5) continue;
+            double exact_scaled = x * 1.4426950408889634;
+            int32_t perfect_q = d2q(exact_scaled);
+            int32_t tab65_out = pow2_65(perfect_q, RADIX);
+            double lsb = std::fabs(q2d(tab65_out) - gold) / Q16_LSB;
+            if (lsb > max65_lsb) max65_lsb = lsb;
+            sum65_lsb += lsb;
+            cnt++;
+        }
+        printf("    pow2_65 table alone:   max %8.1f LSB,  mean %6.1f LSB\n",
+               max65_lsb, sum65_lsb / cnt);
+        printf("    Table size: 65 * 4 = 260 bytes (u32), delta = +192 bytes\n\n");
+    }
+}
+
+/* ================================================================
+ * Spot-check: show a few sample values
+ * ================================================================ */
+
+static void spot_check() {
+    printf("=== Spot check: selected values ===\n\n");
+
+    double test_vals[] = {-3.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0, 5.0};
+    int nv = sizeof(test_vals) / sizeof(test_vals[0]);
+
+    printf("  exp(x):\n");
+    printf("  %8s  %12s  %12s  %12s  %12s\n",
+           "x", "double", "current", "mulk28+65tab", "libfixmath");
+    printf("  %8s  %12s  %12s  %12s  %12s\n",
+           "---", "------", "-------", "------------", "----------");
+    for (int i = 0; i < nv; i++) {
+        double x = test_vals[i];
+        int32_t xq = d2q(x);
+        double gold = std::exp(x);
+        double cur  = q2d(FR_EXP(xq, RADIX));
+        double v2   = q2d(v2_exp(xq, RADIX));
+        double lfm  = q2d(fix16_exp(xq));
+        printf("  %8.2f  %12.6f  %12.6f  %12.6f  %12.6f\n",
+               x, gold, cur, v2, lfm);
+    }
+
+    printf("\n  ln(x):\n");
+    double ln_vals[] = {0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 50.0, 100.0};
+    int nlv = sizeof(ln_vals) / sizeof(ln_vals[0]);
+    printf("  %8s  %12s  %12s  %12s  %12s\n",
+           "x", "double", "current", "proposed", "libfixmath");
+    printf("  %8s  %12s  %12s  %12s  %12s\n",
+           "---", "------", "-------", "--------", "----------");
+    for (int i = 0; i < nlv; i++) {
+        double x = ln_vals[i];
+        int32_t xq = d2q(x);
+        double gold = std::log(x);
+        double cur  = q2d(FR_ln(xq, RADIX, RADIX));
+        double prop = q2d(proposed_ln(xq, RADIX, RADIX));
+        double lfm  = q2d(fix16_log(xq));
+        printf("  %8.2f  %12.6f  %12.6f  %12.6f  %12.6f\n",
+               x, gold, cur, prop, lfm);
+    }
+    printf("\n");
+}
+
+/* ================================================================
+ * Shift-only macro error analysis
+ * ================================================================ */
+
+static void analyze_shift_macros() {
+    printf("=== Shift-only macro error (root cause analysis) ===\n\n");
+
+    /* Test FR_SLOG2E: should multiply by log2(e) = 1.44269504... */
+    printf("  FR_SLOG2E(x) vs exact x * log2(e):\n");
+    printf("  %12s  %12s  %12s  %12s  %12s\n",
+           "x (Q16.16)", "shift-only", "exact", "proposed", "shift err");
+    printf("  %12s  %12s  %12s  %12s  %12s\n",
+           "----------", "----------", "-----", "--------", "---------");
+
+    int32_t test_inputs[] = {65536, 327680, -327680, 655360, 6554};
+    for (auto xq : test_inputs) {
+        double x = q2d(xq);
+        int32_t shift_val = FR_SLOG2E(xq);
+        double exact = x * 1.4426950408889634;
+        int32_t prop_val = mulk28(xq, K28_LOG2E);
+        double shift_err_lsb = std::fabs(q2d(shift_val) - exact) / Q16_LSB;
+        double prop_err_lsb = std::fabs(q2d(prop_val) - exact) / Q16_LSB;
+        printf("  %12d  %12d  %12.2f  %12d  %8.1f / %.1f LSB\n",
+               xq, shift_val, exact * ONE, prop_val, shift_err_lsb, prop_err_lsb);
+    }
+
+    printf("\n  FR_SrLOG2E(x) vs exact x * ln(2):\n");
+    printf("  %12s  %12s  %12s  %12s  %12s\n",
+           "x (Q16.16)", "shift-only", "exact", "proposed", "shift err");
+    printf("  %12s  %12s  %12s  %12s  %12s\n",
+           "----------", "----------", "-----", "--------", "---------");
+
+    for (auto xq : test_inputs) {
+        double x = q2d(xq);
+        int32_t shift_val = FR_SrLOG2E(xq);
+        double exact = x * 0.6931471805599453;
+        int32_t prop_val = mulk28(xq, K28_LN2);
+        double shift_err_lsb = std::fabs(q2d(shift_val) - exact) / Q16_LSB;
+        double prop_err_lsb = std::fabs(q2d(prop_val) - exact) / Q16_LSB;
+        printf("  %12d  %12d  %12.2f  %12d  %8.1f / %.1f LSB\n",
+               xq, shift_val, exact * ONE, prop_val, shift_err_lsb, prop_err_lsb);
+    }
+    printf("\n");
+}
+
+/* ================================================================
+ * main
+ * ================================================================ */
+
+int main() {
+    printf("========================================================\n");
+    printf("  FR_math exp/log accuracy improvement validation\n");
+    printf("  Proposed: replace shift-only macros with FR_MULK28\n");
+    printf("  Gold standard: <cmath> IEEE 754 double precision\n");
+    printf("  Test points: %d per function, Q16.16 (s15.16)\n", N);
+    printf("========================================================\n\n");
+
+    verify_constants();
+    check_overflow();
+    analyze_shift_macros();
+    isolate_table_error();
+    spot_check();
+
+    printf("========================================================\n");
+    printf("  SPEED + ACCURACY BENCHMARKS\n");
+    printf("========================================================\n\n");
+
+    /* Warmup */
+    for (volatile int i = 0; i < 10000; i++) {
+        sink = FR_EXP(d2q(1.0), RADIX);
+        sink = fix16_exp(d2q(1.0));
+        sink = proposed_exp(d2q(1.0), RADIX);
+        sink = v2_exp(d2q(1.0), RADIX);
+    }
+
+    bench_exp();
+    bench_ln();
+    bench_log10();
+
+    printf("========================================================\n");
+    printf("  SIZE COMPARISON\n");
+    printf("========================================================\n\n");
+    printf("  FR_math exp/log (current):  1,024 bytes (824 code + 200 tables)\n");
+    printf("  FR_math exp/log (with 65):  1,216 bytes (824 code + 392 tables)\n");
+    printf("  libfixmath exp/log:        33,996 bytes (1228 code + 32768 RAM cache)\n\n");
+    printf("  Delta for 65-entry pow2:   +192 bytes ROM.  Still ~28x smaller.\n");
+
+    return 0;
+}
diff --git a/compare_lfm/benchmark.cpp b/compare_lfm/benchmark.cpp
new file mode 100644
index 0000000..19739c3
--- /dev/null
+++ b/compare_lfm/benchmark.cpp
@@ -0,0 +1,829 @@
+/*
+ * benchmark.cpp — FR_math vs libfixmath comparison
+ *
+ * WARNING: This file lives in .compare/ and must NOT touch any
+ *          files outside this directory.
+ *
+ * Both libraries use Q16.16 (s15.16) fixed-point: 1.0 = 65536.
+ * We compare: speed (wall-clock ns per call) and accuracy (vs double).
+ *
+ * Output:
+ *   stdout → JSON  (machine-readable)
+ *   stderr → markdown summary table (human-readable)
+ *
+ * Compile via the accompanying Makefile.
+ */
+
+#include <cstdio>
+#include <cstdint>
+#include <cmath>
+#include <cstring>
+#include <chrono>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <cfloat>
+
+/* ---- FR_math ---- */
+extern "C" {
+#include "FR_defs.h"
+#include "FR_math.h"
+}
+
+/* ---- libfixmath ---- */
+#include "fixmath.h"
+
+/* ================================================================
+ * Helpers
+ * ================================================================ */
+
+static const int RADIX = 16;                    /* s15.16 */
+static const int32_t ONE = (1 << RADIX);        /* 65536  */
+
+/* Separate counts for accuracy (many points) and timing (tight loop) */
+static const int N_ACCURACY = 65536;
+static const int N_TIMING   = 100000;
+
+/* Near-zero threshold for relative error — skip |ref| < 0.01 to avoid
+ * misleading spikes (1/0.01 = 100x max amplification). Matches the
+ * threshold used in the TDD characterization suite. */
+static const double REL_THRESH = 0.01;
+
+/* Convert Q16.16 to double */
+static inline double q16_to_dbl(int32_t v) {
+    return (double)v / (double)ONE;
+}
+
+/* Convert double to Q16.16 */
+static inline int32_t dbl_to_q16(double d) {
+    return (int32_t)(d * ONE + (d >= 0 ? 0.5 : -0.5));
+}
+
+/* High-resolution timer returning nanoseconds */
+static inline int64_t now_ns() {
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+        std::chrono::high_resolution_clock::now().time_since_epoch()
+    ).count();
+}
+
+/* ================================================================
+ * Test-point generators (in Q16.16)
+ * ================================================================ */
+
+/* Angle inputs in radians Q16.16: sweep -pi to +pi */
+static std::vector<int32_t> make_angle_inputs(int n) {
+    std::vector<int32_t> v(n);
+    double lo = -M_PI, hi = M_PI;
+    for (int i = 0; i < n; i++) {
+        double t = lo + (hi - lo) * i / (n - 1);
+        v[i] = dbl_to_q16(t);
+    }
+    return v;
+}
+
+/* Inputs in [-0.999, +0.999] Q16.16 for asin/acos */
+static std::vector<int32_t> make_unit_inputs(int n) {
+    std::vector<int32_t> v(n);
+    double lo = -0.999, hi = 0.999;
+    for (int i = 0; i < n; i++) {
+        double t = lo + (hi - lo) * i / (n - 1);
+        v[i] = dbl_to_q16(t);
+    }
+    return v;
+}
+
+/* Positive inputs (0.01 .. 100) for sqrt / log */
+static std::vector<int32_t> make_pos_inputs(int n) {
+    std::vector<int32_t> v(n);
+    double lo = 0.01, hi = 100.0;
+    for (int i = 0; i < n; i++) {
+        double t = lo + (hi - lo) * i / (n - 1);
+        v[i] = dbl_to_q16(t);
+    }
+    return v;
+}
+
+/* General multiply/divide inputs: moderate range to avoid overflow */
+static std::vector<int32_t> make_arith_inputs(int n) {
+    std::vector<int32_t> v(n);
+    double lo = -50.0, hi = 50.0;
+    for (int i = 0; i < n; i++) {
+        double t = lo + (hi - lo) * i / (n - 1);
+        v[i] = dbl_to_q16(t);
+    }
+    return v;
+}
+
+/* exp inputs: small range to avoid overflow (e^x grows fast) */
+static std::vector<int32_t> make_exp_inputs(int n) {
+    std::vector<int32_t> v(n);
+    double lo = -5.0, hi = 5.0;
+    for (int i = 0; i < n; i++) {
+        double t = lo + (hi - lo) * i / (n - 1);
+        v[i] = dbl_to_q16(t);
+    }
+    return v;
+}
+
+/* atan2 inputs: 360° sweep at multiple radii, all 4 quadrants.
+ * Returns parallel (x, y) vectors of length n. */
+static void make_atan2_inputs(int n, std::vector<int32_t>& x_out,
+                              std::vector<int32_t>& y_out) {
+    x_out.resize(n);
+    y_out.resize(n);
+    /* 5 radii, n/5 angles each */
+    double radii[] = {0.1, 1.0, 10.0, 100.0, 1000.0};
+    int nrad = 5;
+    int per_r = n / nrad;
+    int idx = 0;
+    for (int ri = 0; ri < nrad; ri++) {
+        double r = radii[ri];
+        for (int ai = 0; ai < per_r && idx < n; ai++, idx++) {
+            double angle = -M_PI + 2.0 * M_PI * ai / per_r;
+            x_out[idx] = dbl_to_q16(r * std::cos(angle));
+            y_out[idx] = dbl_to_q16(r * std::sin(angle));
+        }
+    }
+    /* fill remaining */
+    for (; idx < n; idx++) {
+        x_out[idx] = dbl_to_q16(1.0);
+        y_out[idx] = dbl_to_q16(1.0);
+    }
+}
+
+/* ================================================================
+ * Error measurement
+ * ================================================================ */
+
+static const double Q16_LSB = 1.0 / (double)ONE;  /* 1.52587890625e-5 */
+
+struct ErrorStats {
+    double max_abs_err;    /* max |fixed - double_ref| in real units */
+    double mean_abs_err;
+    double max_lsb_err;    /* max error expressed in Q16.16 LSBs */
+    double mean_lsb_err;
+    double max_rel_err;    /* max |fixed - double_ref| / |double_ref| (%) */
+    double mean_rel_err;   /* skips |ref| < REL_THRESH to avoid inf */
+    int    count;
+    int    rel_count;      /* how many points contributed to rel_err */
+};
+
+static ErrorStats compute_errors(const std::vector<double>& ref,
+                                 const std::vector<int32_t>& fixed) {
+    ErrorStats s = {};
+    double sum_abs = 0, sum_lsb = 0, sum_rel = 0;
+    int rel_count = 0;
+    int n = (int)ref.size();
+    for (int i = 0; i < n; i++) {
+        double got = q16_to_dbl(fixed[i]);
+        double err = std::fabs(got - ref[i]);
+        double lsb = err / Q16_LSB;
+        sum_abs += err;
+        sum_lsb += lsb;
+        if (err > s.max_abs_err) s.max_abs_err = err;
+        if (lsb > s.max_lsb_err) s.max_lsb_err = lsb;
+        if (std::fabs(ref[i]) > REL_THRESH) {
+            double rel = err / std::fabs(ref[i]) * 100.0;
+            sum_rel += rel;
+            if (rel > s.max_rel_err) s.max_rel_err = rel;
+            rel_count++;
+        }
+    }
+    s.count = n;
+    s.rel_count = rel_count;
+    s.mean_abs_err = sum_abs / n;
+    s.mean_lsb_err = sum_lsb / n;
+    s.mean_rel_err = rel_count > 0 ? sum_rel / rel_count : 0;
+    return s;
+}
+
+/* ================================================================
+ * Per-function benchmark runner
+ * ================================================================ */
+
+struct BenchResult {
+    std::string name;
+    std::string gold_ref;          /* math.h function used as gold standard */
+    double fr_ns_per_call;
+    double lfm_ns_per_call;        /* -1 if libfixmath has no equivalent */
+    ErrorStats fr_err;
+    ErrorStats lfm_err;
+    std::string note = {};         /* optional per-function note */
+    std::string sweep = {};        /* sweep description for table */
+};
+
+/* Prevent compiler from optimizing away the result */
+static volatile int32_t sink;
+
+/* ================================================================
+ * Timing helper — runs a function N_TIMING times, returns ns/call.
+ * Uses 3 passes and takes the minimum to reduce variance.
+ * ================================================================ */
+template<typename Fn>
+static double time_fn(Fn fn) {
+    /* warm-up */
+    for (int i = 0; i < 1000; i++) sink = fn();
+
+    double best = 1e18;
+    for (int pass = 0; pass < 3; pass++) {
+        int64_t t0 = now_ns();
+        for (int i = 0; i < N_TIMING; i++)
+            sink = fn();
+        int64_t t1 = now_ns();
+        double ns = (double)(t1 - t0) / N_TIMING;
+        if (ns < best) best = ns;
+    }
+    return best;
+}
+
+/* ================================================================
+ * Individual benchmarks
+ * ================================================================ */
+
+static BenchResult bench_sin() {
+    auto inputs = make_angle_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::sin(q16_to_dbl(inputs[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = fr_sin(inputs[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_sin(inputs[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return fr_sin(inputs[ti++ % n], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_sin(inputs[ti++ % n]); });
+
+    return {"sin", "std::sin", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-pi, +pi]"};
+}
+
+static BenchResult bench_cos() {
+    auto inputs = make_angle_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::cos(q16_to_dbl(inputs[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = fr_cos(inputs[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_cos(inputs[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return fr_cos(inputs[ti++ % n], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_cos(inputs[ti++ % n]); });
+
+    return {"cos", "std::cos", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-pi, +pi]"};
+}
+
+static BenchResult bench_tan() {
+    /* Avoid near ±pi/2 where tan explodes */
+    int n = N_ACCURACY;
+    std::vector<int32_t> inputs(n);
+    double lo = -1.2, hi = 1.2;
+    for (int i = 0; i < n; i++)
+        inputs[i] = dbl_to_q16(lo + (hi - lo) * i / (n - 1));
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::tan(q16_to_dbl(inputs[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = fr_tan(inputs[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_tan(inputs[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return fr_tan(inputs[ti++ % n], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_tan(inputs[ti++ % n]); });
+
+    return {"tan", "std::tan", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            "Skip near pi/2", "65536-pt, [-1.2, 1.2] rad"};
+}
+
+static BenchResult bench_asin() {
+    auto inputs = make_unit_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::asin(q16_to_dbl(inputs[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_asin(inputs[i], RADIX, RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_asin(inputs[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_asin(inputs[ti++ % n], RADIX, RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_asin(inputs[ti++ % n]); });
+
+    return {"asin", "std::asin", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-0.999, 0.999]"};
+}
+
+static BenchResult bench_acos() {
+    auto inputs = make_unit_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::acos(q16_to_dbl(inputs[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_acos(inputs[i], RADIX, RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_acos(inputs[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_acos(inputs[ti++ % n], RADIX, RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_acos(inputs[ti++ % n]); });
+
+    return {"acos", "std::acos", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-0.999, 0.999]"};
+}
+
+static BenchResult bench_atan() {
+    auto inputs = make_arith_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::atan(q16_to_dbl(inputs[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_atan(inputs[i], RADIX, RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_atan(inputs[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_atan(inputs[ti++ % n], RADIX, RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_atan(inputs[ti++ % n]); });
+
+    return {"atan", "std::atan", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-50, 50]"};
+}
+
+static BenchResult bench_atan2() {
+    int n = N_ACCURACY;
+    std::vector<int32_t> x_in, y_in;
+    make_atan2_inputs(n, x_in, y_in);
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::atan2(q16_to_dbl(y_in[i]), q16_to_dbl(x_in[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_atan2(y_in[i], x_in[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_atan2(y_in[i], x_in[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ int j = ti++ % n; return FR_atan2(y_in[j], x_in[j], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ int j = ti++ % n; return fix16_atan2(y_in[j], x_in[j]); });
+
+    return {"atan2", "std::atan2", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            "All 4 quadrants", "65536-pt, 5 radii x 360 deg"};
+}
+
+static BenchResult bench_sqrt() {
+    auto inputs = make_pos_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::sqrt(q16_to_dbl(inputs[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_sqrt(inputs[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_sqrt(inputs[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_sqrt(inputs[ti++ % n], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_sqrt(inputs[ti++ % n]); });
+
+    return {"sqrt", "std::sqrt", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [0.01, 100]"};
+}
+
+static BenchResult bench_exp() {
+    auto inputs = make_exp_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::exp(q16_to_dbl(inputs[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_EXP(inputs[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_exp(inputs[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_EXP(inputs[ti++ % n], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_exp(inputs[ti++ % n]); });
+
+    return {"exp", "std::exp", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [-5, 5]"};
+}
+
+static BenchResult bench_log() {
+    auto inputs = make_pos_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::log(q16_to_dbl(inputs[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_ln(inputs[i], RADIX, RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_log(inputs[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_ln(inputs[ti++ % n], RADIX, RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_log(inputs[ti++ % n]); });
+
+    return {"ln", "std::log", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [0.01, 100]"};
+}
+
+static BenchResult bench_log2() {
+    auto inputs = make_pos_inputs(N_ACCURACY);
+    int n = N_ACCURACY;
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::log2(q16_to_dbl(inputs[i]));
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_log2(inputs[i], RADIX, RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_log2(inputs[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ return FR_log2(inputs[ti++ % n], RADIX, RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ return fix16_log2(inputs[ti++ % n]); });
+
+    return {"log2", "std::log2", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, [0.01, 100]"};
+}
+
+static BenchResult bench_mul() {
+    int n = N_ACCURACY;
+    auto a_in = make_arith_inputs(n);
+    std::vector<int32_t> b_in(n);
+    for (int i = 0; i < n; i++)
+        b_in[i] = dbl_to_q16(-2.0 + 4.0 * i / (n - 1));
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = q16_to_dbl(a_in[i]) * q16_to_dbl(b_in[i]);
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_FixMuls(a_in[i], b_in[i]);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_mul(a_in[i], b_in[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ int j = ti++ % n; return FR_FixMuls(a_in[j], b_in[j]); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ int j = ti++ % n; return fix16_mul(a_in[j], b_in[j]); });
+
+    return {"mul", "double a*b", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            {}, "65536-pt, a in [-50,50], b in [-2,2]"};
+}
+
+static BenchResult bench_div() {
+    int n = N_ACCURACY;
+    std::vector<int32_t> a_in(n), b_in(n);
+    for (int i = 0; i < n; i++) {
+        a_in[i] = dbl_to_q16(-50.0 + 100.0 * i / (n - 1));
+        b_in[i] = dbl_to_q16(0.5 + 49.5 * i / (n - 1));
+    }
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = q16_to_dbl(a_in[i]) / q16_to_dbl(b_in[i]);
+
+    std::vector<int32_t> fr_out(n), lfm_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_DIV(a_in[i], RADIX, b_in[i], RADIX);
+    for (int i = 0; i < n; i++) lfm_out[i] = fix16_div(a_in[i], b_in[i]);
+
+    int ti = 0;
+    double fr_ns  = time_fn([&]{ int j = ti++ % n; return FR_DIV(a_in[j], RADIX, b_in[j], RADIX); });
+    ti = 0;
+    double lfm_ns = time_fn([&]{ int j = ti++ % n; return fix16_div(a_in[j], b_in[j]); });
+
+    return {"div", "double a/b", fr_ns, lfm_ns,
+            compute_errors(ref, fr_out), compute_errors(ref, lfm_out),
+            "Both use 64-bit intermediate", "65536-pt, a/b in [-50,50]/[0.5,50]"};
+}
+
+/* --- FR_math-only benchmarks (libfixmath has no equivalent) --- */
+
+static BenchResult bench_hypot() {
+    int n = N_ACCURACY;
+    std::vector<int32_t> x_in, y_in;
+    make_atan2_inputs(n, x_in, y_in);
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::hypot(q16_to_dbl(x_in[i]), q16_to_dbl(y_in[i]));
+
+    std::vector<int32_t> fr_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_hypot(x_in[i], y_in[i], RADIX);
+
+    int ti = 0;
+    double fr_ns = time_fn([&]{ int j = ti++ % n; return FR_hypot(x_in[j], y_in[j], RADIX); });
+
+    /* dummy lfm_err (all zeros) */
+    ErrorStats lfm_err = {};
+
+    return {"hypot", "std::hypot", fr_ns, -1,
+            compute_errors(ref, fr_out), lfm_err,
+            "FR_math only (libfixmath has no hypot)", "65536-pt, 5 radii x 360 deg"};
+}
+
+static BenchResult bench_hypot_fast8() {
+    int n = N_ACCURACY;
+    std::vector<int32_t> x_in, y_in;
+    make_atan2_inputs(n, x_in, y_in);
+
+    std::vector<double> ref(n);
+    for (int i = 0; i < n; i++)
+        ref[i] = std::hypot(q16_to_dbl(x_in[i]), q16_to_dbl(y_in[i]));
+
+    std::vector<int32_t> fr_out(n);
+    for (int i = 0; i < n; i++) fr_out[i] = FR_hypot_fast8(x_in[i], y_in[i]);
+
+    int ti = 0;
+    double fr_ns = time_fn([&]{ int j = ti++ % n; return FR_hypot_fast8(x_in[j], y_in[j]); });
+
+    ErrorStats lfm_err = {};
+
+    return {"hypot_fast8", "std::hypot", fr_ns, -1,
+            compute_errors(ref, fr_out), lfm_err,
+            "FR_math only; shift-only, no multiply", "65536-pt, 5 radii x 360 deg"};
+}
+
+/* ================================================================
+ * JSON output
+ * ================================================================ */
+
+static void emit_json(FILE *f, const std::vector<BenchResult>& results) {
+    fprintf(f, "{\n");
+    fprintf(f, "  \"description\": \"FR_math vs libfixmath benchmark — both measured against math.h double precision (IEEE 754)\",\n");
+    fprintf(f, "  \"gold_standard\": \"<cmath> IEEE 754 double precision (~15 significant digits)\",\n");
+    fprintf(f, "  \"fixed_point_format\": \"Q16.16 (s15.16), 1 LSB = %.14e\",\n", Q16_LSB);
+    fprintf(f, "  \"accuracy_points\": %d,\n", N_ACCURACY);
+    fprintf(f, "  \"timing_iterations\": %d,\n", N_TIMING);
+    fprintf(f, "  \"rel_error_threshold\": %.2f,\n", REL_THRESH);
+    fprintf(f, "  \"platform\": \"macOS ARM (Apple Silicon)\",\n");
+    fprintf(f, "  \"optimization\": \"-O2\",\n");
+    fprintf(f, "  \"results\": [\n");
+
+    for (size_t r = 0; r < results.size(); r++) {
+        const auto& b = results[r];
+        fprintf(f, "    {\n");
+        fprintf(f, "      \"function\": \"%s\",\n", b.name.c_str());
+        fprintf(f, "      \"double_reference\": \"%s\",\n", b.gold_ref.c_str());
+        if (!b.sweep.empty())
+            fprintf(f, "      \"sweep\": \"%s\",\n", b.sweep.c_str());
+
+        /* speed */
+        fprintf(f, "      \"speed\": {\n");
+        fprintf(f, "        \"fr_math_ns_per_call\": %.1f", b.fr_ns_per_call);
+        if (b.lfm_ns_per_call >= 0) {
+            fprintf(f, ",\n        \"libfixmath_ns_per_call\": %.1f", b.lfm_ns_per_call);
+            double speedup = b.lfm_ns_per_call / b.fr_ns_per_call;
+            fprintf(f, ",\n        \"fr_math_speedup\": %.2f", speedup);
+            const char *faster = (speedup > 1.0) ? "fr_math" : "libfixmath";
+            fprintf(f, ",\n        \"faster\": \"%s\"", faster);
+        }
+        fprintf(f, "\n      },\n");
+
+        /* accuracy — fr_math */
+        fprintf(f, "      \"accuracy_vs_double\": {\n");
+        fprintf(f, "        \"fr_math\": {\n");
+        fprintf(f, "          \"max_abs_error\": %.8e,\n", b.fr_err.max_abs_err);
+        fprintf(f, "          \"mean_abs_error\": %.8e,\n", b.fr_err.mean_abs_err);
+        fprintf(f, "          \"max_error_lsb\": %.1f,\n", b.fr_err.max_lsb_err);
+        fprintf(f, "          \"mean_error_lsb\": %.1f,\n", b.fr_err.mean_lsb_err);
+        fprintf(f, "          \"max_rel_error_pct\": %.4f,\n", b.fr_err.max_rel_err);
+        fprintf(f, "          \"mean_rel_error_pct\": %.4f\n", b.fr_err.mean_rel_err);
+        fprintf(f, "        }");
+
+        if (b.lfm_ns_per_call >= 0) {
+            /* accuracy — libfixmath */
+            fprintf(f, ",\n        \"libfixmath\": {\n");
+            fprintf(f, "          \"max_abs_error\": %.8e,\n", b.lfm_err.max_abs_err);
+            fprintf(f, "          \"mean_abs_error\": %.8e,\n", b.lfm_err.mean_abs_err);
+            fprintf(f, "          \"max_error_lsb\": %.1f,\n", b.lfm_err.max_lsb_err);
+            fprintf(f, "          \"mean_error_lsb\": %.1f,\n", b.lfm_err.mean_lsb_err);
+            fprintf(f, "          \"max_rel_error_pct\": %.4f,\n", b.lfm_err.max_rel_err);
+            fprintf(f, "          \"mean_rel_error_pct\": %.4f\n", b.lfm_err.mean_rel_err);
+            fprintf(f, "        }");
+
+            const char *closer =
+                (b.fr_err.max_abs_err < b.lfm_err.max_abs_err) ? "fr_math" :
+                (b.fr_err.max_abs_err > b.lfm_err.max_abs_err) ? "libfixmath" : "tie";
+            fprintf(f, ",\n        \"closer_to_double\": \"%s\"", closer);
+        }
+        fprintf(f, "\n      }");
+
+        if (!b.note.empty())
+            fprintf(f, ",\n      \"note\": \"%s\"", b.note.c_str());
+        fprintf(f, "\n    }%s\n", (r + 1 < results.size()) ? "," : "");
+    }
+
+    /* summary (head-to-head only, skip FR_math-only functions) */
+    fprintf(f, "  ],\n");
+    fprintf(f, "  \"summary\": {\n");
+
+    int fr_speed_wins = 0, lfm_speed_wins = 0;
+    int fr_closer = 0, lfm_closer = 0, ties = 0;
+    int h2h = 0;
+    for (auto& b : results) {
+        if (b.lfm_ns_per_call < 0) continue;
+        h2h++;
+        if (b.fr_ns_per_call < b.lfm_ns_per_call) fr_speed_wins++;
+        else lfm_speed_wins++;
+        if (b.fr_err.max_abs_err < b.lfm_err.max_abs_err) fr_closer++;
+        else if (b.fr_err.max_abs_err > b.lfm_err.max_abs_err) lfm_closer++;
+        else ties++;
+    }
+    fprintf(f, "    \"head_to_head_functions\": %d,\n", h2h);
+    fprintf(f, "    \"faster_wins\": { \"fr_math\": %d, \"libfixmath\": %d },\n",
+            fr_speed_wins, lfm_speed_wins);
+    fprintf(f, "    \"accuracy_wins\": { \"fr_math\": %d, \"libfixmath\": %d, \"tie\": %d },\n",
+            fr_closer, lfm_closer, ties);
+    fprintf(f, "    \"total_functions_tested\": %d\n", (int)results.size());
+    fprintf(f, "  },\n");
+    fprintf(f, "  \"notes\": [\n");
+    fprintf(f, "    \"All accuracy measured vs IEEE 754 double. Lower = closer to perfect.\",\n");
+    fprintf(f, "    \"LSB = Q16.16 least-significant-bit = 1.53e-5. Best possible = 0.5 LSB.\",\n");
+    fprintf(f, "    \"Percent errors skip |ref| < %.2f to avoid near-zero division spikes.\",\n", REL_THRESH);
+    fprintf(f, "    \"Both libraries use Q16.16 (s15.16): 1.0 = 65536.\",\n");
+    fprintf(f, "    \"FR_math trig: BAM + 129-entry LUT + linear interpolation.\",\n");
+    fprintf(f, "    \"libfixmath trig: parabolic approximation + 5th-order correction.\",\n");
+    fprintf(f, "    \"Timing: min of 3 passes x %d calls; cache-warm.\",\n", N_TIMING);
+    fprintf(f, "    \"Speedup > 1.0 means FR_math is faster by that factor.\"\n");
+    fprintf(f, "  ],\n");
+
+    /* size comparison — compiled from the Makefile in this directory */
+    fprintf(f, "  \"compiled_size_note\": \"Run 'make size' in .compare/ for live numbers. The values below are representative.\",\n");
+    fprintf(f, "  \"compiled_size\": {\n");
+    fprintf(f, "    \"compiler\": \"clang -O2 (macOS ARM)\",\n");
+    fprintf(f, "    \"fr_math\": {\n");
+    fprintf(f, "      \"files\": \"FR_math.c (single file)\",\n");
+    fprintf(f, "      \"functions\": \"trig(6), inv-trig(4), log/ln/log10, exp/pow2/pow10, exp_fast/pow10_fast, sqrt, hypot(2), waves(6), ADSR(4), print(4), format\",\n");
+    fprintf(f, "      \"rom_bytes\": 7470,\n");
+    fprintf(f, "      \"ram_bss_bytes\": 0,\n");
+    fprintf(f, "      \"note\": \"All tables in const ROM. Zero runtime allocation.\"\n");
+    fprintf(f, "    },\n");
+    fprintf(f, "    \"libfixmath\": {\n");
+    fprintf(f, "      \"files\": \"fix16.c, fix16_sqrt.c, fix16_exp.c, fix16_trig.c, fix16_str.c, uint32.c, fract32.c\",\n");
+    fprintf(f, "      \"functions\": \"trig(6), inv-trig(4), log/log2, exp, sqrt, mul/div, str\",\n");
+    fprintf(f, "      \"rom_bytes\": 4912,\n");
+    fprintf(f, "      \"ram_bss_bytes\": 114688,\n");
+    fprintf(f, "      \"rom_bytes_no_cache\": 5476,\n");
+    fprintf(f, "      \"ram_bss_bytes_no_cache\": 0,\n");
+    fprintf(f, "      \"note\": \"Default mode caches 112 KB of sin/exp LUTs in BSS. FIXMATH_NO_CACHE eliminates RAM but recomputes per call.\"\n");
+    fprintf(f, "    }\n");
+    fprintf(f, "  }\n");
+    fprintf(f, "}\n");
+}
+
+/* ================================================================
+ * Markdown summary table (to stderr for human reading)
+ * ================================================================ */
+
+static void emit_markdown(FILE *f, const std::vector<BenchResult>& results) {
+    fprintf(f, "\n## FR_math vs libfixmath — Q16.16 comparison\n\n");
+    fprintf(f, "All errors measured vs IEEE 754 double. Pct errors skip |ref| < 0.01.\n\n");
+
+    /* --- Accuracy table --- */
+    fprintf(f, "### Accuracy\n\n");
+    fprintf(f, "| Function | FR max LSB | FR max %%%% | FR avg %%%% | lfm max LSB | lfm max %%%% | lfm avg %%%% | Winner |\n");
+    fprintf(f, "|----------|----------:|---------:|---------:|----------:|---------:|---------:|--------|\n");
+
+    for (auto& b : results) {
+        const char *winner;
+        if (b.lfm_ns_per_call < 0) {
+            winner = "FR only";
+            fprintf(f, "| %-15s | %7.1f | %7.4f | %7.4f | %9s | %8s | %8s | %-8s |\n",
+                    b.name.c_str(),
+                    b.fr_err.max_lsb_err, b.fr_err.max_rel_err, b.fr_err.mean_rel_err,
+                    "---", "---", "---", winner);
+        } else {
+            bool fr_better_abs = b.fr_err.max_abs_err < b.lfm_err.max_abs_err;
+            bool lfm_better_abs = b.lfm_err.max_abs_err < b.fr_err.max_abs_err;
+            winner = fr_better_abs ? "FR" : lfm_better_abs ? "lfm" : "tie";
+            fprintf(f, "| %-15s | %7.1f | %7.4f | %7.4f | %9.1f | %7.4f | %7.4f | %-8s |\n",
+                    b.name.c_str(),
+                    b.fr_err.max_lsb_err, b.fr_err.max_rel_err, b.fr_err.mean_rel_err,
+                    b.lfm_err.max_lsb_err, b.lfm_err.max_rel_err, b.lfm_err.mean_rel_err,
+                    winner);
+        }
+    }
+
+    /* --- Speed table --- */
+    fprintf(f, "\n### Speed (ns/call, lower is better)\n\n");
+    fprintf(f, "| Function | FR_math | libfixmath | Speedup | Faster |\n");
+    fprintf(f, "|----------|--------:|-----------:|--------:|--------|\n");
+
+    for (auto& b : results) {
+        if (b.lfm_ns_per_call < 0) {
+            fprintf(f, "| %-15s | %6.1f | %10s | %7s | FR only |\n",
+                    b.name.c_str(), b.fr_ns_per_call, "---", "---");
+        } else {
+            double speedup = b.lfm_ns_per_call / b.fr_ns_per_call;
+            const char *faster = (speedup > 1.0) ? "FR" : "lfm";
+            fprintf(f, "| %-15s | %6.1f | %10.1f | %6.2fx | %-7s |\n",
+                    b.name.c_str(), b.fr_ns_per_call, b.lfm_ns_per_call,
+                    speedup, faster);
+        }
+    }
+
+    /* --- Summary --- */
+    int fr_speed = 0, lfm_speed = 0, fr_acc = 0, lfm_acc = 0, tie_acc = 0, h2h = 0;
+    for (auto& b : results) {
+        if (b.lfm_ns_per_call < 0) continue;
+        h2h++;
+        if (b.fr_ns_per_call < b.lfm_ns_per_call) fr_speed++; else lfm_speed++;
+        if (b.fr_err.max_abs_err < b.lfm_err.max_abs_err) fr_acc++;
+        else if (b.fr_err.max_abs_err > b.lfm_err.max_abs_err) lfm_acc++;
+        else tie_acc++;
+    }
+    fprintf(f, "\n### Summary (%d head-to-head functions)\n\n", h2h);
+    fprintf(f, "- **Speed**: FR_math %d / %d, libfixmath %d / %d\n",
+            fr_speed, h2h, lfm_speed, h2h);
+    fprintf(f, "- **Accuracy**: FR_math %d / %d, libfixmath %d / %d, tie %d / %d\n",
+            fr_acc, h2h, lfm_acc, h2h, tie_acc, h2h);
+    fprintf(f, "- Accuracy = %d-pt sweep at Q16.16; timing = min of 3 x %dk calls\n",
+            N_ACCURACY, N_TIMING / 1000);
+
+    /* --- Size table --- */
+    fprintf(f, "\n### Compiled size (clang -O2, macOS ARM)\n\n");
+    fprintf(f, "| | FR_math | libfixmath | lfm (no cache) |\n");
+    fprintf(f, "|---|---:|---:|---:|\n");
+    fprintf(f, "| Code (text) | 6,652 B | 4,880 B | 5,444 B |\n");
+    fprintf(f, "| Tables (ROM) | 818 B | 32 B | 32 B |\n");
+    fprintf(f, "| **ROM total** | **7,470 B** | **4,912 B** | **5,476 B** |\n");
+    fprintf(f, "| BSS / RAM | **0 B** | **112 KB** | **0 B** |\n");
+    fprintf(f, "\n");
+    fprintf(f, "FR_math packs trig, inv-trig, log/ln/log10, exp/pow2/pow10, sqrt, hypot(2),\n");
+    fprintf(f, "waves(6), ADSR, print into 7.3 KB ROM with zero RAM overhead.\n");
+    fprintf(f, "libfixmath (trig, inv-trig, log/log2, exp, sqrt, mul/div, str) is 4.8 KB ROM\n");
+    fprintf(f, "but caches 112 KB of sin/exp LUTs in BSS at runtime.\n");
+    fprintf(f, "\n");
+}
+
+/* ================================================================
+ * main
+ * ================================================================ */
+
+int main() {
+    fprintf(stderr, "Running benchmarks (%d accuracy pts, %d timing iters)...\n",
+            N_ACCURACY, N_TIMING);
+
+    std::vector<BenchResult> results;
+    results.push_back(bench_sin());   fprintf(stderr, "  sin done\n");
+    results.push_back(bench_cos());   fprintf(stderr, "  cos done\n");
+    results.push_back(bench_tan());   fprintf(stderr, "  tan done\n");
+    results.push_back(bench_asin());  fprintf(stderr, "  asin done\n");
+    results.push_back(bench_acos());  fprintf(stderr, "  acos done\n");
+    results.push_back(bench_atan());  fprintf(stderr, "  atan done\n");
+    results.push_back(bench_atan2()); fprintf(stderr, "  atan2 done\n");
+    results.push_back(bench_sqrt());  fprintf(stderr, "  sqrt done\n");
+    results.push_back(bench_exp());   fprintf(stderr, "  exp done\n");
+    results.push_back(bench_log());   fprintf(stderr, "  ln done\n");
+    results.push_back(bench_log2());  fprintf(stderr, "  log2 done\n");
+    results.push_back(bench_mul());   fprintf(stderr, "  mul done\n");
+    results.push_back(bench_div());   fprintf(stderr, "  div done\n");
+    results.push_back(bench_hypot()); fprintf(stderr, "  hypot done\n");
+    results.push_back(bench_hypot_fast8()); fprintf(stderr, "  hypot_fast8 done\n");
+
+    emit_json(stdout, results);
+    emit_markdown(stderr, results);
+    return 0;
+}
diff --git a/compare_lfm/comparison_results.json b/compare_lfm/comparison_results.json
new file mode 100644
index 0000000..adf0019
--- /dev/null
+++ b/compare_lfm/comparison_results.json
@@ -0,0 +1,479 @@
+{
+  "description": "FR_math vs libfixmath benchmark — both measured against math.h double precision (IEEE 754)",
+  "gold_standard": "<cmath> IEEE 754 double precision (~15 significant digits)",
+  "fixed_point_format": "Q16.16 (s15.16), 1 LSB = 1.52587890625000e-05",
+  "accuracy_points": 65536,
+  "timing_iterations": 100000,
+  "rel_error_threshold": 0.01,
+  "platform": "macOS ARM (Apple Silicon)",
+  "optimization": "-O2",
+  "results": [
+    {
+      "function": "sin",
+      "double_reference": "std::sin",
+      "sweep": "65536-pt, [-pi, +pi]",
+      "speed": {
+        "fr_math_ns_per_call": 2.6,
+        "libfixmath_ns_per_call": 20.7,
+        "fr_math_speedup": 7.94,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 1.34165039e-04,
+          "mean_abs_error": 4.23947344e-05,
+          "max_error_lsb": 8.8,
+          "mean_error_lsb": 2.8,
+          "max_rel_error_pct": 1.0615,
+          "mean_rel_error_pct": 0.0158
+        },
+        "libfixmath": {
+          "max_abs_error": 7.74511497e-03,
+          "mean_abs_error": 5.34549003e-04,
+          "max_error_lsb": 507.6,
+          "mean_error_lsb": 35.0,
+          "max_rel_error_pct": 74.5513,
+          "mean_rel_error_pct": 0.6105
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "cos",
+      "double_reference": "std::cos",
+      "sweep": "65536-pt, [-pi, +pi]",
+      "speed": {
+        "fr_math_ns_per_call": 4.8,
+        "libfixmath_ns_per_call": 18.4,
+        "fr_math_speedup": 3.86,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 1.25349009e-04,
+          "mean_abs_error": 4.65658208e-05,
+          "max_error_lsb": 8.2,
+          "mean_error_lsb": 3.1,
+          "max_rel_error_pct": 0.9018,
+          "mean_rel_error_pct": 0.0161
+        },
+        "libfixmath": {
+          "max_abs_error": 7.75591931e-03,
+          "mean_abs_error": 5.36939114e-04,
+          "max_error_lsb": 508.3,
+          "mean_error_lsb": 35.2,
+          "max_rel_error_pct": 74.4001,
+          "mean_rel_error_pct": 0.6121
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "tan",
+      "double_reference": "std::tan",
+      "sweep": "65536-pt, [-1.2, 1.2] rad",
+      "speed": {
+        "fr_math_ns_per_call": 6.0,
+        "libfixmath_ns_per_call": 41.4,
+        "fr_math_speedup": 6.89,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 8.49384425e-04,
+          "mean_abs_error": 1.04510886e-04,
+          "max_error_lsb": 55.7,
+          "mean_error_lsb": 6.8,
+          "max_rel_error_pct": 1.0080,
+          "mean_rel_error_pct": 0.0228
+        },
+        "libfixmath": {
+          "max_abs_error": 1.82495961e-02,
+          "mean_abs_error": 8.01092905e-04,
+          "max_error_lsb": 1196.0,
+          "mean_error_lsb": 52.5,
+          "max_rel_error_pct": 0.7099,
+          "mean_rel_error_pct": 0.0410
+        },
+        "closer_to_double": "fr_math"
+      },
+      "note": "Skip near pi/2"
+    },
+    {
+      "function": "asin",
+      "double_reference": "std::asin",
+      "sweep": "65536-pt, [-0.999, 0.999]",
+      "speed": {
+        "fr_math_ns_per_call": 11.5,
+        "libfixmath_ns_per_call": 53.7,
+        "fr_math_speedup": 4.67,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 4.76933520e-04,
+          "mean_abs_error": 4.37641042e-05,
+          "max_error_lsb": 31.3,
+          "mean_error_lsb": 2.9,
+          "max_rel_error_pct": 0.5795,
+          "mean_rel_error_pct": 0.0134
+        },
+        "libfixmath": {
+          "max_abs_error": 1.01788963e-02,
+          "mean_abs_error": 3.64421558e-03,
+          "max_error_lsb": 667.1,
+          "mean_error_lsb": 238.8,
+          "max_rel_error_pct": 20.1233,
+          "mean_rel_error_pct": 2.4452
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "acos",
+      "double_reference": "std::acos",
+      "sweep": "65536-pt, [-0.999, 0.999]",
+      "speed": {
+        "fr_math_ns_per_call": 8.4,
+        "libfixmath_ns_per_call": 50.4,
+        "fr_math_speedup": 5.97,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 4.72479065e-04,
+          "mean_abs_error": 4.33857475e-05,
+          "max_error_lsb": 31.0,
+          "mean_error_lsb": 2.8,
+          "max_rel_error_pct": 0.5194,
+          "mean_rel_error_pct": 0.0056
+        },
+        "libfixmath": {
+          "max_abs_error": 1.01897006e-02,
+          "mean_abs_error": 3.64422377e-03,
+          "max_error_lsb": 667.8,
+          "mean_error_lsb": 238.8,
+          "max_rel_error_pct": 15.3142,
+          "mean_rel_error_pct": 0.3475
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "atan",
+      "double_reference": "std::atan",
+      "sweep": "65536-pt, [-50, 50]",
+      "speed": {
+        "fr_math_ns_per_call": 8.0,
+        "libfixmath_ns_per_call": 11.2,
+        "fr_math_speedup": 1.41,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 9.57408985e-04,
+          "mean_abs_error": 7.37662492e-05,
+          "max_error_lsb": 62.7,
+          "mean_error_lsb": 4.8,
+          "max_rel_error_pct": 0.2149,
+          "mean_rel_error_pct": 0.0061
+        },
+        "libfixmath": {
+          "max_abs_error": 1.01676134e-02,
+          "mean_abs_error": 6.15802358e-03,
+          "max_error_lsb": 666.3,
+          "mean_error_lsb": 403.6,
+          "max_rel_error_pct": 19.8632,
+          "mean_rel_error_pct": 0.4571
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "atan2",
+      "double_reference": "std::atan2",
+      "sweep": "65536-pt, 5 radii x 360 deg",
+      "speed": {
+        "fr_math_ns_per_call": 15.9,
+        "libfixmath_ns_per_call": 10.5,
+        "fr_math_speedup": 0.66,
+        "faster": "libfixmath"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 9.70679332e-04,
+          "mean_abs_error": 2.15170870e-04,
+          "max_error_lsb": 63.6,
+          "mean_error_lsb": 14.1,
+          "max_rel_error_pct": 0.4122,
+          "mean_rel_error_pct": 0.0258
+        },
+        "libfixmath": {
+          "max_abs_error": 1.01728729e-02,
+          "mean_abs_error": 3.88005371e-03,
+          "max_error_lsb": 666.7,
+          "mean_error_lsb": 254.3,
+          "max_rel_error_pct": 20.0045,
+          "mean_rel_error_pct": 0.9267
+        },
+        "closer_to_double": "fr_math"
+      },
+      "note": "All 4 quadrants"
+    },
+    {
+      "function": "sqrt",
+      "double_reference": "std::sqrt",
+      "sweep": "65536-pt, [0.01, 100]",
+      "speed": {
+        "fr_math_ns_per_call": 18.6,
+        "libfixmath_ns_per_call": 19.8,
+        "fr_math_speedup": 1.06,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 7.62924903e-06,
+          "mean_abs_error": 3.80582266e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.2,
+          "max_rel_error_pct": 0.0062,
+          "mean_rel_error_pct": 0.0001
+        },
+        "libfixmath": {
+          "max_abs_error": 7.62924903e-06,
+          "mean_abs_error": 3.80582266e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.2,
+          "max_rel_error_pct": 0.0062,
+          "mean_rel_error_pct": 0.0001
+        },
+        "closer_to_double": "tie"
+      }
+    },
+    {
+      "function": "exp",
+      "double_reference": "std::exp",
+      "sweep": "65536-pt, [-5, 5]",
+      "speed": {
+        "fr_math_ns_per_call": 3.1,
+        "libfixmath_ns_per_call": 67.6,
+        "fr_math_speedup": 22.02,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 3.17909587e-03,
+          "mean_abs_error": 1.03218909e-04,
+          "max_error_lsb": 208.3,
+          "mean_error_lsb": 6.8,
+          "max_rel_error_pct": 0.1486,
+          "mean_rel_error_pct": 0.0078
+        },
+        "libfixmath": {
+          "max_abs_error": 3.30095957e-03,
+          "mean_abs_error": 9.38398029e-05,
+          "max_error_lsb": 216.3,
+          "mean_error_lsb": 6.1,
+          "max_rel_error_pct": 0.0756,
+          "mean_rel_error_pct": 0.0042
+        },
+        "closer_to_double": "fr_math"
+      }
+    },
+    {
+      "function": "ln",
+      "double_reference": "std::log",
+      "sweep": "65536-pt, [0.01, 100]",
+      "speed": {
+        "fr_math_ns_per_call": 8.8,
+        "libfixmath_ns_per_call": 479.3,
+        "fr_math_speedup": 54.70,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 4.93278555e-05,
+          "mean_abs_error": 1.61117669e-05,
+          "max_error_lsb": 3.2,
+          "mean_error_lsb": 1.1,
+          "max_rel_error_pct": 0.3012,
+          "mean_rel_error_pct": 0.0006
+        },
+        "libfixmath": {
+          "max_abs_error": 3.40447818e-05,
+          "mean_abs_error": 5.14211182e-06,
+          "max_error_lsb": 2.2,
+          "mean_error_lsb": 0.3,
+          "max_rel_error_pct": 0.0557,
+          "mean_rel_error_pct": 0.0002
+        },
+        "closer_to_double": "libfixmath"
+      }
+    },
+    {
+      "function": "log2",
+      "double_reference": "std::log2",
+      "sweep": "65536-pt, [0.01, 100]",
+      "speed": {
+        "fr_math_ns_per_call": 8.7,
+        "libfixmath_ns_per_call": 39.4,
+        "fr_math_speedup": 4.55,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 6.06739329e-05,
+          "mean_abs_error": 2.30368713e-05,
+          "max_error_lsb": 4.0,
+          "mean_error_lsb": 1.5,
+          "max_rel_error_pct": 0.4945,
+          "mean_rel_error_pct": 0.0006
+        },
+        "libfixmath": {
+          "max_abs_error": 3.56826644e-05,
+          "mean_abs_error": 9.96190621e-06,
+          "max_error_lsb": 2.3,
+          "mean_error_lsb": 0.7,
+          "max_rel_error_pct": 0.1758,
+          "mean_rel_error_pct": 0.0002
+        },
+        "closer_to_double": "libfixmath"
+      }
+    },
+    {
+      "function": "mul",
+      "double_reference": "double a*b",
+      "sweep": "65536-pt, a in [-50,50], b in [-2,2]",
+      "speed": {
+        "fr_math_ns_per_call": 0.9,
+        "libfixmath_ns_per_call": 1.2,
+        "fr_math_speedup": 1.33,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 7.62939453e-06,
+          "mean_abs_error": 3.81535541e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.3,
+          "max_rel_error_pct": 0.0692,
+          "mean_rel_error_pct": 0.0004
+        },
+        "libfixmath": {
+          "max_abs_error": 7.62939453e-06,
+          "mean_abs_error": 3.81535541e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.3,
+          "max_rel_error_pct": 0.0692,
+          "mean_rel_error_pct": 0.0004
+        },
+        "closer_to_double": "tie"
+      }
+    },
+    {
+      "function": "div",
+      "double_reference": "double a/b",
+      "sweep": "65536-pt, a/b in [-50,50]/[0.5,50]",
+      "speed": {
+        "fr_math_ns_per_call": 0.9,
+        "libfixmath_ns_per_call": 5.2,
+        "fr_math_speedup": 5.98,
+        "faster": "fr_math"
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 7.62927377e-06,
+          "mean_abs_error": 3.82182808e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.3,
+          "max_rel_error_pct": 0.0727,
+          "mean_rel_error_pct": 0.0010
+        },
+        "libfixmath": {
+          "max_abs_error": 8.37162948e-06,
+          "mean_abs_error": 3.82625614e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.3,
+          "max_rel_error_pct": 0.0727,
+          "mean_rel_error_pct": 0.0010
+        },
+        "closer_to_double": "fr_math"
+      },
+      "note": "Both use 64-bit intermediate"
+    },
+    {
+      "function": "hypot",
+      "double_reference": "std::hypot",
+      "sweep": "65536-pt, 5 radii x 360 deg",
+      "speed": {
+        "fr_math_ns_per_call": 20.0
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 7.62930188e-06,
+          "mean_abs_error": 3.67171926e-06,
+          "max_error_lsb": 0.5,
+          "mean_error_lsb": 0.2,
+          "max_rel_error_pct": 0.0076,
+          "mean_rel_error_pct": 0.0009
+        }
+      },
+      "note": "FR_math only (libfixmath has no hypot)"
+    },
+    {
+      "function": "hypot_fast8",
+      "double_reference": "std::hypot",
+      "sweep": "65536-pt, 5 radii x 360 deg",
+      "speed": {
+        "fr_math_ns_per_call": 2.4
+      },
+      "accuracy_vs_double": {
+        "fr_math": {
+          "max_abs_error": 1.37244198e+00,
+          "mean_abs_error": 1.13634634e-01,
+          "max_error_lsb": 89944.4,
+          "mean_error_lsb": 7447.2,
+          "max_rel_error_pct": 0.1372,
+          "mean_rel_error_pct": 0.0516
+        }
+      },
+      "note": "FR_math only; shift-only, no multiply"
+    }
+  ],
+  "summary": {
+    "head_to_head_functions": 13,
+    "faster_wins": { "fr_math": 12, "libfixmath": 1 },
+    "accuracy_wins": { "fr_math": 9, "libfixmath": 2, "tie": 2 },
+    "total_functions_tested": 15
+  },
+  "notes": [
+    "All accuracy measured vs IEEE 754 double. Lower = closer to perfect.",
+    "LSB = Q16.16 least-significant-bit = 1.53e-5. Best possible = 0.5 LSB.",
+    "Percent errors skip |ref| < 0.01 to avoid near-zero division spikes.",
+    "Both libraries use Q16.16 (s15.16): 1.0 = 65536.",
+    "FR_math trig: BAM + 129-entry LUT + linear interpolation.",
+    "libfixmath trig: parabolic approximation + 5th-order correction.",
+    "Timing: min of 3 passes x 100000 calls; cache-warm.",
+    "Speedup > 1.0 means FR_math is faster by that factor."
+  ],
+  "compiled_size_note": "Run 'make size' in .compare/ for live numbers. The values below are representative.",
+  "compiled_size": {
+    "compiler": "clang -O2 (macOS ARM)",
+    "fr_math": {
+      "files": "FR_math.c (single file)",
+      "functions": "trig(6), inv-trig(4), log/ln/log10, exp/pow2/pow10, exp_fast/pow10_fast, sqrt, hypot(2), waves(6), ADSR(4), print(4), format",
+      "rom_bytes": 7470,
+      "ram_bss_bytes": 0,
+      "note": "All tables in const ROM. Zero runtime allocation."
+    },
+    "libfixmath": {
+      "files": "fix16.c, fix16_sqrt.c, fix16_exp.c, fix16_trig.c, fix16_str.c, uint32.c, fract32.c",
+      "functions": "trig(6), inv-trig(4), log/log2, exp, sqrt, mul/div, str",
+      "rom_bytes": 4912,
+      "ram_bss_bytes": 114688,
+      "rom_bytes_no_cache": 5476,
+      "ram_bss_bytes_no_cache": 0,
+      "note": "Default mode caches 112 KB of sin/exp LUTs in BSS. FIXMATH_NO_CACHE eliminates RAM but recomputes per call."
+    }
+  }
+}
diff --git a/compare_lfm/comparison_summary.md b/compare_lfm/comparison_summary.md
new file mode 100644
index 0000000..9169c50
--- /dev/null
+++ b/compare_lfm/comparison_summary.md
@@ -0,0 +1,81 @@
+Running benchmarks (65536 accuracy pts, 100000 timing iters)...
+  sin done
+  cos done
+  tan done
+  asin done
+  acos done
+  atan done
+  atan2 done
+  sqrt done
+  exp done
+  ln done
+  log2 done
+  mul done
+  div done
+  hypot done
+  hypot_fast8 done
+
+## FR_math vs libfixmath — Q16.16 comparison
+
+All errors measured vs IEEE 754 double. Pct errors skip |ref| < 0.01.
+
+### Accuracy
+
+| Function | FR max LSB | FR max %% | FR avg %% | lfm max LSB | lfm max %% | lfm avg %% | Winner |
+|----------|----------:|---------:|---------:|----------:|---------:|---------:|--------|
+| sin             |     8.8 |  1.0615 |  0.0158 |     507.6 | 74.5513 |  0.6105 | FR       |
+| cos             |     8.2 |  0.9018 |  0.0161 |     508.3 | 74.4001 |  0.6121 | FR       |
+| tan             |    55.7 |  1.0080 |  0.0228 |    1196.0 |  0.7099 |  0.0410 | FR       |
+| asin            |    31.3 |  0.5795 |  0.0134 |     667.1 | 20.1233 |  2.4452 | FR       |
+| acos            |    31.0 |  0.5194 |  0.0056 |     667.8 | 15.3142 |  0.3475 | FR       |
+| atan            |    62.7 |  0.2149 |  0.0061 |     666.3 | 19.8632 |  0.4571 | FR       |
+| atan2           |    63.6 |  0.4122 |  0.0258 |     666.7 | 20.0045 |  0.9267 | FR       |
+| sqrt            |     0.5 |  0.0062 |  0.0001 |       0.5 |  0.0062 |  0.0001 | tie      |
+| exp             |   208.3 |  0.1486 |  0.0078 |     216.3 |  0.0756 |  0.0042 | FR       |
+| ln              |     3.2 |  0.3012 |  0.0006 |       2.2 |  0.0557 |  0.0002 | lfm      |
+| log2            |     4.0 |  0.4945 |  0.0006 |       2.3 |  0.1758 |  0.0002 | lfm      |
+| mul             |     0.5 |  0.0692 |  0.0004 |       0.5 |  0.0692 |  0.0004 | tie      |
+| div             |     0.5 |  0.0727 |  0.0010 |       0.5 |  0.0727 |  0.0010 | FR       |
+| hypot           |     0.5 |  0.0076 |  0.0009 |       --- |      --- |      --- | FR only  |
+| hypot_fast8     | 89944.4 |  0.1372 |  0.0516 |       --- |      --- |      --- | FR only  |
+
+### Speed (ns/call, lower is better)
+
+| Function | FR_math | libfixmath | Speedup | Faster |
+|----------|--------:|-----------:|--------:|--------|
+| sin             |    2.6 |       20.7 |   7.94x | FR      |
+| cos             |    4.8 |       18.4 |   3.86x | FR      |
+| tan             |    6.0 |       41.4 |   6.89x | FR      |
+| asin            |   11.5 |       53.7 |   4.67x | FR      |
+| acos            |    8.4 |       50.4 |   5.97x | FR      |
+| atan            |    8.0 |       11.2 |   1.41x | FR      |
+| atan2           |   15.9 |       10.5 |   0.66x | lfm     |
+| sqrt            |   18.6 |       19.8 |   1.06x | FR      |
+| exp             |    3.1 |       67.6 |  22.02x | FR      |
+| ln              |    8.8 |      479.3 |  54.70x | FR      |
+| log2            |    8.7 |       39.4 |   4.55x | FR      |
+| mul             |    0.9 |        1.2 |   1.33x | FR      |
+| div             |    0.9 |        5.2 |   5.98x | FR      |
+| hypot           |   20.0 |        --- |     --- | FR only |
+| hypot_fast8     |    2.4 |        --- |     --- | FR only |
+
+### Summary (13 head-to-head functions)
+
+- **Speed**: FR_math 12 / 13, libfixmath 1 / 13
+- **Accuracy**: FR_math 9 / 13, libfixmath 2 / 13, tie 2 / 13
+- Accuracy = 65536-pt sweep at Q16.16; timing = min of 3 x 100k calls
+
+### Compiled size (clang -O2, macOS ARM)
+
+| | FR_math | libfixmath | lfm (no cache) |
+|---|---:|---:|---:|
+| Code (text) | 6,652 B | 4,880 B | 5,444 B |
+| Tables (ROM) | 818 B | 32 B | 32 B |
+| **ROM total** | **7,470 B** | **4,912 B** | **5,476 B** |
+| BSS / RAM | **0 B** | **112 KB** | **0 B** |
+
+FR_math packs trig, inv-trig, log/ln/log10, exp/pow2/pow10, sqrt, hypot(2),
+waves(6), ADSR, print into 7.3 KB ROM with zero RAM overhead.
+libfixmath (trig, inv-trig, log/log2, exp, sqrt, mul/div, str) is 4.8 KB ROM
+but caches 112 KB of sin/exp LUTs in BSS at runtime.
+
diff --git a/compare_lfm/pow_log_improve.md b/compare_lfm/pow_log_improve.md
new file mode 100644
index 0000000..20d2f03
--- /dev/null
+++ b/compare_lfm/pow_log_improve.md
@@ -0,0 +1,504 @@
+# FR_math pow/log Accuracy Improvements
+
+Proposed changes to FR_pow2, FR_log2, FR_EXP, FR_ln, FR_log10, and FR_POW10.
+All changes are backward-compatible. No API changes. No new functions.
+
+## Current Error Budget
+
+Measured against IEEE 754 double as gold standard, Q16.16 (s15.16), 50k test
+points. 1 LSB = 1/65536 = 1.53e-5.
+
+| function | max error (LSB) | mean error (LSB) | bottleneck |
+|----------|----------------:|------------------:|------------|
+| exp      | 1979            | 114               | pow2 table (16 segments) |
+| ln       | 37.6            | 4.4               | log2 table (32 segments) + shift-only scaling |
+| log10    | 15.8            | 3.8               | log2 table + shift-only scaling |
+| log2     | 53.2            | 4.7               | log2 table (32 segments) |
+| pow2     | 2282            | 148               | pow2 table (16 segments) |
+
+The error decomposition test (bench_explog.cpp) proved that the lookup tables
+are the dominant error source, not the shift-only scaling macros. For exp,
+the pow2 table alone accounts for 115% of the total error (the shift-only
+scaling actually has a bias that partially cancels the table error).
+
+## The Five Changes
+
+### Change 1: Expand pow2 table from 17 to 65 entries
+
+This is the big win. The current gFR_POW2_FRAC_TAB has 17 entries (16 linear
+segments). Linear interpolation error on a convex function is O(h^2) where h is
+the segment width. Going from 16 to 64 segments gives 16x smaller error.
+
+Measured: pow2 table error drops from 2282 LSB to 210 LSB.
+
+**Cost**: +192 bytes ROM (68 -> 260 bytes).
+
+**Code change in FR_math.c**: Replace the table and change two constants in
+FR_pow2 (index bits and interpolation bits).
+
+Current:
+```c
+static const u32 gFR_POW2_FRAC_TAB[17] = {
+     65536,  68438,  71468,  74632,  77936,  81386,  84990,  88752,
+     92682,  96785, 101070, 105545, 110218, 115098, 120194, 125515,
+    131072
+};
+```
+
+New:
+```c
+static const u32 gFR_POW2_FRAC_TAB[65] = {
+     65536,  66250,  66971,  67700,  68438,  69183,  69936,  70698,
+     71468,  72246,  73032,  73828,  74632,  75444,  76266,  77096,
+     77936,  78785,  79642,  80510,  81386,  82273,  83169,  84074,
+     84990,  85915,  86851,  87796,  88752,  89719,  90696,  91684,
+     92682,  93691,  94711,  95743,  96785,  97839,  98905,  99982,
+    101070, 102171, 103283, 104408, 105545, 106694, 107856, 109031,
+    110218, 111418, 112631, 113858, 115098, 116351, 117618, 118899,
+    120194, 121502, 122825, 124163, 125515, 126882, 128263, 129660,
+    131072
+};
+```
+
+In FR_pow2, change the index/interpolation split from 4/12 to 6/10:
+
+Current (line 489-491):
+```c
+    /* Top 4 bits index the table; bottom 12 are the interpolation fraction. */
+    idx     = frac_full >> 12;
+    frac_lo = frac_full & ((1L << 12) - 1);
+```
+
+New:
+```c
+    /* Top 6 bits index the table; bottom 10 are the interpolation fraction. */
+    idx     = frac_full >> 10;
+    frac_lo = frac_full & ((1L << 10) - 1);
+```
+
+And the interpolation line (494):
+
+Current:
+```c
+    mant = lo + (((hi - lo) * frac_lo) >> 12);
+```
+
+New:
+```c
+    mant = lo + (((hi - lo) * frac_lo) >> 10);
+```
+
+That's it. Same algorithm, three numbers change (12->10, 12->10, [17]->[65]).
+Update the comment at the top of FR_pow2 accordingly.
+
+
+### Change 2: Replace shift-only scaling macros with single multiply
+
+The current FR_EXP, FR_ln, and FR_log10 use shift-only macros (FR_SLOG2E,
+FR_SrLOG2E, FR_SrLOG2_10) to convert between logarithmic bases. These have
+~5-10 LSB error at Q16.16. After fixing the table (Change 1), these macros
+become the new bottleneck for exp.
+
+The fix: one 32x32->64 multiply with a high-precision constant at radix 28.
+
+**Cost**: one multiply per call. Zero cost on ARM (single-cycle MUL + shift).
+On 16-bit targets this is one software multiply — still cheap since exp/ln
+are already expensive functions.
+
+**Add to FR_math.h** (near the existing FR_kLOG2E definitions):
+
+```c
+/* High-precision scaling constants at radix 28.
+ * Used by FR_EXP, FR_ln, FR_log10 for base conversion.
+ * At radix 28 these have ~9 decimal digits of precision, far exceeding
+ * the ~4.8 digits of Q16.16.
+ */
+#define FR_kLOG2E_28    (387270501)   /* log2(e)   = 1.4426950408889634  */
+#define FR_krLOG2E_28   (186065279)   /* ln(2)     = 0.6931471805599453  */
+#define FR_kLOG2_10_28  (891723283)   /* log2(10)  = 3.3219280948873622  */
+#define FR_krLOG2_10_28  (80807124)   /* log10(2)  = 0.3010299956639812  */
+
+/* Multiply fixed-point value x (any radix) by a radix-28 constant k.
+ * Result stays at x's radix. Uses 64-bit intermediate.
+ * Rounds to nearest (adds 0.5 LSB before shift).
+ */
+#define FR_MULK28(x, k) ((s32)((((int64_t)(x) * (int64_t)(k)) + (1 << 27)) >> 28))
+```
+
+**Overflow safety**: worst case is FR_MULK28(INT32_MAX, FR_kLOG2_10_28) =
+2^31 * 891723283 = 2^31 * ~2^30 = 2^61, which fits in s64 (63 magnitude
+bits) with 2 bits to spare. The multiply cannot overflow.
+
+**Radix independence**: if x is at radix R, then x * k28 is at radix R+28,
+and >>28 puts the result back at radix R. Works for any R.
+
+**Update FR_EXP and FR_POW10 macros in FR_math.h**:
+
+Current:
+```c
+#define FR_EXP(input, radix)   (FR_pow2(FR_SLOG2E(input), radix))
+#define FR_POW10(input, radix) (FR_pow2(FR_SLOG2_10(input), radix))
+```
+
+New:
+```c
+#define FR_EXP(input, radix)   (FR_pow2(FR_MULK28((input), FR_kLOG2E_28), (radix)))
+#define FR_POW10(input, radix) (FR_pow2(FR_MULK28((input), FR_kLOG2_10_28), (radix)))
+```
+
+**Update FR_ln and FR_log10 in FR_math.c**:
+
+Current:
+```c
+s32 FR_ln(s32 input, u16 radix, u16 output_radix)
+{
+    s32 r = FR_log2(input, radix, output_radix);
+    return FR_SrLOG2E(r);
+}
+
+s32 FR_log10(s32 input, u16 radix, u16 output_radix)
+{
+    s32 r = FR_log2(input, radix, output_radix);
+    return FR_SrLOG2_10(r);
+}
+```
+
+New:
+```c
+s32 FR_ln(s32 input, u16 radix, u16 output_radix)
+{
+    s32 r = FR_log2(input, radix, output_radix);
+    return FR_MULK28(r, FR_krLOG2E_28);
+}
+
+s32 FR_log10(s32 input, u16 radix, u16 output_radix)
+{
+    s32 r = FR_log2(input, radix, output_radix);
+    return FR_MULK28(r, FR_krLOG2_10_28);
+}
+```
+
+
+### Change 3: Rename old shift-only macros as _FAST variants
+
+The shift-only macros are still useful for targets where multiply is very
+expensive (MSP430, 8-bit AVR). Keep them available under _FAST names.
+
+**Add to FR_math.h** (keep existing macros, add aliases):
+
+```c
+/* Shift-only (multiply-free) base-conversion macros.
+ * Lower accuracy (~5-10 LSB at Q16.16) but no multiply instruction.
+ * Use these on targets where 32x32->64 multiply is expensive.
+ */
+#define FR_EXP_FAST(input, radix)   (FR_pow2(FR_SLOG2E(input), radix))
+#define FR_POW10_FAST(input, radix) (FR_pow2(FR_SLOG2_10(input), radix))
+```
+
+The underlying FR_SLOG2E, FR_SrLOG2E, FR_SrLOG2_10, FR_SLOG2_10 macros
+stay as-is. They are still used by the _FAST variants and may be useful
+independently.
+
+The existing FR_kLOG2E (94548) and FR_krLOG2E (45426) constants at radix 16
+stay as-is. They are used in other contexts and cost nothing if unused.
+
+
+### Change 4: Expand log2 table from 33 to 65 entries
+
+Same pattern as the pow2 table expansion. The current gFR_LOG2_MANT_TAB has
+33 entries (32 segments). Going to 65 entries (64 segments) gives ~4x smaller
+interpolation error.
+
+**Cost**: +128 bytes ROM (132 -> 260 bytes).
+
+**Code change in FR_math.c**: Replace the table and change three constants in
+FR_log2 (index bits and interpolation bits).
+
+Current:
+```c
+static const u32 gFR_LOG2_MANT_TAB[33] = {
+        0,  2909,  5732,  8473, 11136, 13727, 16248, 18704,
+    21098, 23433, 25711, 27936, 30109, 32234, 34312, 36346,
+    38336, 40286, 42196, 44068, 45904, 47705, 49472, 51207,
+    52911, 54584, 56229, 57845, 59434, 60997, 62534, 64047,
+    65536
+};
+```
+
+New:
+```c
+static const u32 gFR_LOG2_MANT_TAB[65] = {
+        0,  1466,  2909,  4331,  5732,  7112,  8473,  9814,
+    11136, 12440, 13727, 14996, 16248, 17484, 18704, 19909,
+    21098, 22272, 23433, 24579, 25711, 26830, 27936, 29029,
+    30109, 31178, 32234, 33279, 34312, 35334, 36346, 37346,
+    38336, 39316, 40286, 41246, 42196, 43137, 44068, 44990,
+    45904, 46809, 47705, 48593, 49472, 50344, 51207, 52063,
+    52911, 53751, 54584, 55410, 56229, 57040, 57845, 58643,
+    59434, 60219, 60997, 61769, 62534, 63294, 64047, 64794,
+    65536
+};
+```
+
+In FR_log2, change the index/interpolation split from 5/25 to 6/24:
+
+Current (line 586-587):
+```c
+    idx  = (s32)(m >> 25);                    /* 5 bits  */
+    frac = (s32)(m & ((1u << 25) - 1));       /* 25 bits */
+```
+
+New:
+```c
+    idx  = (s32)(m >> 24);                    /* 6 bits  */
+    frac = (s32)(m & ((1u << 24) - 1));       /* 24 bits */
+```
+
+And the interpolation line (590):
+
+Current:
+```c
+    mant_log2 = lo + (s32)(((int64_t)(hi - lo) * frac) >> 25);
+```
+
+New:
+```c
+    mant_log2 = lo + (s32)(((int64_t)(hi - lo) * frac) >> 24);
+```
+
+Same algorithm, three numbers change (25->24, 25->24, [33]->[65]).
+Update the comments at the top of FR_log2 accordingly. ln and log10
+improve automatically since they call FR_log2 internally.
+
+
+### Change 5: FR_DIV rounding
+
+FR_DIV currently truncates, giving up to 1.0 LSB error. Adding round-to-
+nearest brings this to 0.5 LSB, matching FR_FixMuls and FR_sqrt behavior.
+
+FR_DIV already uses an s64 intermediate (from the earlier 32->64 bit fix),
+so the rounding cost is one addition.
+
+**Simple version** (correct for positive quotients, within 0.5 LSB for
+negative quotients):
+
+Current:
+```c
+#define FR_DIV(x, xr, y, yr) ((s32)(((s64)(x) << (yr)) / (s32)(y)))
+```
+
+New:
+```c
+#define FR_DIV(x, xr, y, yr) \
+    ((s32)((((s64)(x) << (yr)) + ((s64)(y) >> 1)) / (s64)(y)))
+```
+
+This adds half the divisor before dividing. For positive quotients it rounds
+to nearest exactly. For negative quotients C's truncation-toward-zero means
+the bias is slightly off, but worst case remains 0.5 LSB.
+
+**Exact version** (correct for all sign combinations, as a static inline):
+
+```c
+static inline s32 FR_div_rnd(s64 num, s32 den) {
+    if ((num ^ den) >= 0)                   /* same sign: positive quotient */
+        return (s32)((num + den / 2) / den);
+    else                                     /* negative quotient */
+        return (s32)((num - den / 2) / den);
+}
+#define FR_DIV(x, xr, y, yr) FR_div_rnd((s64)(x) << (yr), (s32)(y))
+```
+
+Rename the old truncating version:
+```c
+#define FR_DIV_TRUNC(x, xr, y, yr) ((s32)(((s64)(x) << (yr)) / (s32)(y)))
+```
+
+**Cost**: zero additional ROM (one s64 addition). The inline function version
+adds a branch but the branch is perfectly predicted in practice.
+
+
+## Expected Results
+
+Combining all five changes, measured with bench_explog.cpp:
+
+| function | before (LSB) | after (LSB) | improvement |
+|----------|-------------:|------------:|-------------|
+| exp      | 1979         | ~207        | ~10x better (table + scaling) |
+| pow2     | 2282         | ~210        | ~11x better (table) |
+| ln       | 37.6         | ~3          | ~12x better (table + scaling) |
+| log10    | 15.8         | ~3          | ~5x better (table + scaling) |
+| log2     | 53.2         | ~13         | ~4x better (table) |
+| div      | 1.0          | ~0.5        | 2x better (rounding) |
+
+Speed: no measurable change (tested on Apple M-series, 50k iterations).
+
+Size delta: +192 bytes (pow2 table) + 128 bytes (log2 table) = +320 bytes ROM.
+Zero RAM.
+
+
+## Files Changed
+
+| file | change |
+|------|--------|
+| src/FR_math.h | Add FR_kLOG2E_28, FR_krLOG2E_28, FR_kLOG2_10_28, FR_krLOG2_10_28 |
+| src/FR_math.h | Add FR_MULK28 macro |
+| src/FR_math.h | Update FR_EXP and FR_POW10 to use FR_MULK28 |
+| src/FR_math.h | Add FR_EXP_FAST and FR_POW10_FAST (old behavior) |
+| src/FR_math.h | Update FR_DIV to round-to-nearest |
+| src/FR_math.h | Add FR_DIV_TRUNC (old truncating behavior) |
+| src/FR_math.c | Replace gFR_POW2_FRAC_TAB[17] with [65] |
+| src/FR_math.c | Change FR_pow2 index from 4-bit/12-bit to 6-bit/10-bit |
+| src/FR_math.c | Replace gFR_LOG2_MANT_TAB[33] with [65] |
+| src/FR_math.c | Change FR_log2 index from 5-bit/25-bit to 6-bit/24-bit |
+| src/FR_math.c | Update FR_ln to use FR_MULK28 |
+| src/FR_math.c | Update FR_log10 to use FR_MULK28 |
+
+
+## Helper Scripts
+
+### Generate the 65-entry pow2 table
+
+```python
+#!/usr/bin/env python3
+"""Generate gFR_POW2_FRAC_TAB[65] for FR_pow2.
+
+Output: 2^(i/64) at s.16 fixed point, for i = 0..64.
+Paste directly into FR_math.c.
+"""
+import math
+
+N = 64
+entries = [round(2.0 ** (i / N) * 65536) for i in range(N + 1)]
+
+print(f"static const u32 gFR_POW2_FRAC_TAB[{N+1}] = {{")
+for row in range(0, N + 1, 8):
+    chunk = entries[row:row+8]
+    vals = ", ".join(f"{v:6d}" for v in chunk)
+    comma = "," if row + 8 <= N else ""
+    print(f"    {vals}{comma}")
+print("};")
+print(f"\n/* Size: {(N+1)*4} bytes.  Entry i = round(2^(i/{N}) * 65536). */")
+
+# Verify
+assert entries[0]  == 65536,  f"first entry should be 65536, got {entries[0]}"
+assert entries[N]  == 131072, f"last entry should be 131072, got {entries[N]}"
+assert entries[32] == 92682,  f"midpoint (2^0.5) should be 92682, got {entries[32]}"
+print("Verification passed.")
+```
+
+
+### Generate the 65-entry log2 table
+
+```python
+#!/usr/bin/env python3
+"""Generate gFR_LOG2_MANT_TAB[65] for FR_log2.
+
+Output: log2(1 + i/64) at s.16 fixed point, for i = 0..64.
+Paste directly into FR_math.c.
+"""
+import math
+
+N = 64
+entries = [round(math.log2(1.0 + i / N) * 65536) for i in range(N + 1)]
+
+print(f"static const u32 gFR_LOG2_MANT_TAB[{N+1}] = {{")
+for row in range(0, N + 1, 8):
+    chunk = entries[row:row+8]
+    vals = ", ".join(f"{v:5d}" for v in chunk)
+    comma = "," if row + 8 <= N else ""
+    print(f"    {vals}{comma}")
+print("};")
+print(f"\n/* Size: {(N+1)*4} bytes.  Entry i = round(log2(1 + i/{N}) * 65536). */")
+
+# Verify
+assert entries[0]  == 0,     f"first should be 0, got {entries[0]}"
+assert entries[N]  == 65536, f"last should be 65536, got {entries[N]}"
+assert entries[32] == 38336, f"midpoint log2(1.5) should be 38336, got {entries[32]}"
+print("Verification passed.")
+```
+
+
+### Generate the radix-28 constants
+
+```python
+#!/usr/bin/env python3
+"""Generate high-precision scaling constants at radix 28 for FR_math.h.
+
+These are used by FR_MULK28 for base conversion in exp/ln/log10.
+"""
+import math
+
+R = 28
+scale = 2**R
+
+constants = [
+    ("FR_kLOG2E_28",    math.log2(math.e),  "log2(e)"),
+    ("FR_krLOG2E_28",   math.log(2),         "ln(2)"),
+    ("FR_kLOG2_10_28",  math.log2(10),       "log2(10)"),
+    ("FR_krLOG2_10_28", math.log10(2),       "log10(2)"),
+]
+
+for name, exact, desc in constants:
+    k28 = round(exact * scale)
+    approx = k28 / scale
+    err = abs(approx - exact)
+    print(f"#define {name:20s} ({k28:>12d})   /* {desc:10s} = {exact:.16f} */")
+    assert err < 1e-8, f"{name} error {err:.2e} exceeds 1e-8"
+
+print()
+
+# Overflow check: worst case is INT32_MAX * largest constant
+worst_k = max(c[1] for c in constants)
+worst_k28 = round(worst_k * scale)
+product_bits = (2**31 * worst_k28).bit_length()
+print(f"Overflow check: INT32_MAX * {worst_k28} needs {product_bits} bits (s64 has 63+sign)")
+assert product_bits <= 62, "OVERFLOW RISK"
+print("Overflow check passed.")
+```
+
+
+### Verify the changes end-to-end
+
+After making the code changes, run the existing test suite to confirm nothing
+broke, then run the comparison benchmark:
+
+```bash
+# From the repo root:
+make clean && make test
+
+# From .compare/ — rebuild against updated FR_math and run full comparison:
+make clean && make run
+# Output: comparison_results.json with all 13 functions vs libfixmath
+
+# From .compare/ — detailed exp/log analysis:
+make -f Makefile.explog clean
+make -f Makefile.explog run
+```
+
+
+## Optional: FR_log2 CLZ Optimization
+
+The current FR_log2 finds the leading bit position with a while loop:
+
+```c
+    u = (u32)input;
+    p = 0;
+    while (u > 1)
+    {
+        u >>= 1;
+        p++;
+    }
+```
+
+This could be replaced with __builtin_clz (GCC/Clang) or a manual binary
+search for a constant-time alternative:
+
+```c
+    p = 31 - __builtin_clz((unsigned)input);
+```
+
+This saves ~15 iterations on average but only matters if FR_log2 is called in
+a tight loop. Not part of the five changes above — can be done separately as
+a minor speed optimization. Note: __builtin_clz is not portable to all
+compilers, so a fallback would be needed for strict portability.
diff --git a/dev/fr_math_2.0.1.md b/dev/fr_math_2.0.1.md
index 98268cb..b001529 100644
--- a/dev/fr_math_2.0.1.md
+++ b/dev/fr_math_2.0.1.md
@@ -9,7 +9,7 @@ Branch: `update_wave_fns`
   - tan return type s15.16 → s16.15
   - FR_FR2I → FR2I, FR_MUL removed, FR_EXP/FR_POW10 casing
   - 2D examples: FR_Matrix2D_CPT, ID(), XFormPtI
-  - Summary tables: added FR_hypot_fast, FR_hypot_fast8, FR_numstr
+  - Summary tables: added FR_hypot_fast8, FR_numstr
 - [x] Version bump 2.0.0 → 2.0.1 (hex 0x020000 → 0x020001)
 - [x] sync_version.sh rewrite — FR_MATH_VERSION_HEX is single source of truth
 - [x] CI auto-release job — reads hex version, creates GitHub release + tag
diff --git a/docs/README.md b/docs/README.md
index 1d4a7c9..9401590 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -43,18 +43,27 @@ or any tooling. If you want the browser version, look in
 Errors below are measured at Q16.16 (s15.16). All functions accept any
 radix — Q16.16 is just the reference point for the table. See the
 [TDD report](../build/test_tdd_report.md) for sweeps at radixes 8, 12,
-16, and 24.
-
-| Function | Max error | Note |
-|---|---|---|
-| sin / cos | 5 LSB (~7.7e-5) | Exact at 0, 90, 180, 270 |
-| sqrt | ≤ 0.5 LSB | Round-to-nearest |
-| log2 | ≤ 4 LSB | 65-entry mantissa table |
-| pow2 | ≤ 1 LSB (integers exact) | 65-entry fraction table |
-| ln, log10 | ≤ 4 LSB | Via FR_MULK28 from log2 |
-| hypot (exact) | ≤ 0.5 LSB | 64-bit intermediate |
-| hypot_fast (4-seg) | 0.34% | Shift-only, no multiply |
-| hypot_fast8 (8-seg) | 0.10% | Shift-only, no multiply |
+16, and 24. Percent errors skip expected values near zero (|expected| < 0.01).
+
+<!-- ACCURACY_TABLE_START -->
+| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |
+|---|---:|---:|---:|---|
+| sin / cos | 7.5 | 0.7169 | 0.0100 | 65536-pt sweep + specials |
+| tan | 38020.4 | 0.7118 | 0.0162 | 65536-pt sweep (skip poles) |
+| asin / acos | 42.3 | 0.7025 | 0.0105 | 65536-pt; sqrt approx near boundary |
+| atan2 | 63.3 | 0.4953 | 0.0268 | 65536x5 radii; asin/acos+hypot_fast8 |
+| atan | 61.9 | 0.2985 | 0.0159 | 20001-pt sweep [-10,10]; via FR_atan2 |
+| sqrt | 28.4 | 0.0003 | 0.0000 | Round-to-nearest |
+| log2 | 10.5 | 0.2479 | 0.0045 | 65-entry mantissa table |
+| pow2 | 220.4 | 0.1373 | 0.0057 | 65-entry fraction table |
+| ln, log10 | 0.7 | 0.0015 | 0.0004 | Via FR_MULK28 from log2 |
+| exp | 65.7 | 0.0719 | 0.0051 | FR_MULK28 + FR_pow2 |
+| exp_fast | 195.5 | 0.0719 | 0.0064 | Shift-only scaling |
+| pow10 | 143.4 | 0.1163 | 0.0075 | FR_MULK28 + FR_pow2 |
+| pow10_fast | 581.9 | 0.1163 | 0.0100 | Shift-only scaling |
+| hypot (exact) | 0.2 | 0.0001 | 0.0000 | 64-bit intermediate |
+| hypot_fast8 (8-seg) | 59968.8 | 0.0977 | 0.0508 | Shift-only, no multiply |
+<!-- ACCURACY_TABLE_END -->
 
 ## What's in the box
 
@@ -66,7 +75,7 @@ radix — Q16.16 is just the reference point for the table. See the
 | Trig (radian/BAM) | `fr_sin`, `fr_cos`, `fr_tan`, `fr_sin_bam`, `fr_cos_bam`, `fr_sin_deg`, `fr_cos_deg` |
 | Inverse trig | `FR_atan`, `FR_atan2`, `FR_asin`, `FR_acos` |
 | Log / exp | `FR_log2`, `FR_ln`, `FR_log10`, `FR_pow2`, `FR_EXP`, `FR_POW10`, `FR_EXP_FAST`, `FR_POW10_FAST`, `FR_MULK28` |
-| Roots | `FR_sqrt`, `FR_hypot`, `FR_hypot_fast`, `FR_hypot_fast8` |
+| Roots | `FR_sqrt`, `FR_hypot`, `FR_hypot_fast8` |
 | Wave generators | `fr_wave_sqr`, `fr_wave_pwm`, `fr_wave_tri`, `fr_wave_saw`, `fr_wave_tri_morph`, `fr_wave_noise` |
 | Envelope | `fr_adsr_init`, `fr_adsr_trigger`, `fr_adsr_release`, `fr_adsr_step` |
 | 2D transforms | `FR_Matrix2D_CPT` (mul, add, sub, det, inv, setrotate, XFormPtI, XFormPtI16) |
@@ -75,6 +84,33 @@ radix — Q16.16 is just the reference point for the table. See the
 Every function is covered by the TDD characterization suite in
 [`tests/test_tdd.cpp`](../tests/test_tdd.cpp).
 
+## Lean build options
+
+Two compile-time `#define` guards let you strip optional subsystems
+for ROM-constrained targets. Define them before including `FR_math.h`
+(or pass `-D` on the compiler command line):
+
+| Define | What it removes | Typical savings |
+|---|---|---|
+| `FR_NO_PRINT` | `FR_printNumF`, `FR_printNumD`, `FR_printNumH`, `FR_numstr` | ~1.3 KB |
+| `FR_NO_WAVES` | `fr_wave_*` (6 shapes), `fr_adsr_*` (ADSR envelope), `FR_HZ2BAM_INC` | ~0.6 KB |
+
+With both guards enabled the core math library (trig, inverse trig, log/exp,
+sqrt, hypot) compiles to ~3.5 KB on x86-64 / clang -Os. On Thumb-2 this
+would be roughly 2.6 KB.
+
+```c
+/* Example: headless sensor node — math only, no print, no audio */
+#define FR_NO_PRINT
+#define FR_NO_WAVES
+#include "FR_math.h"
+```
+
+With `-ffunction-sections` and linker `--gc-sections`, the linker will
+also strip any unused functions automatically, so these guards are most
+useful when you include the library as a single `.c` file or static
+archive without section-level dead-code elimination.
+
 ## Why fixed-point, in 2026?
 
 Most application code today has an FPU and can use `float` freely.
@@ -102,11 +138,62 @@ generic `float` replacement.
 
 #define R 16  /* work at radix 16 (s15.16) throughout */
 
-s32 pi    = FR_NUM(3, 14159, 5, R);       /* pi at radix 16             */
-s32 c45   = FR_CosI(45);                  /* cos 45 deg = 0.7071 (s15.16) */
-s32 root2 = FR_sqrt(I2FR(2, R), R);       /* sqrt(2)    = 1.4142        */
-s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97          */
-s32 ex    = FR_EXP(I2FR(1, R), R);        /* e^1        ~ 2.7183        */
+/* ---- Creating fixed-point values ----
+ *
+ * FR_NUM(integer, frac_digits, num_digits, radix) encodes a decimal
+ * literal at compile time.  The fractional part is the digits AFTER
+ * the decimal point, and num_digits says how many digits that is.
+ * Think: FR_NUM(3, 14159, 5, 16) means "3.14159" at radix 16.
+ */
+s32 pi   = FR_NUM(3, 14159, 5, R);  /* 3.14159 → raw 205886 at r16  */
+s32 half = FR_NUM(0, 5, 1, R);      /* 0.5     → raw 32768           */
+s32 neg  = FR_NUM(-2, 75, 2, R);    /* -2.75   → raw -180224         */
+
+/* Or parse from a string at runtime (no floats, no strtod): */
+s32 pi2  = FR_numstr("3.14159", R); /* same result as FR_NUM above    */
+
+/* Integer-to-fixed: I2FR(n, radix) just shifts left */
+s32 two  = I2FR(2, R);              /* 2.0 → raw 131072              */
+
+/* ---- Naming convention: macros vs functions ----
+ *
+ * UPPERCASE FR_ names are macros — they expand inline with no call
+ * overhead, and the compiler can constant-fold them.  Use these for
+ * conversions and simple arithmetic:
+ *   I2FR, FR2I, FR_NUM, FR_ADD, FR_DIV, FR_ABS, FR_CHRDX, FR_EXP ...
+ *
+ * MixedCase FR_ names are functions — they contain loops, tables, or
+ * multi-step algorithms where inlining would waste ROM:
+ *   FR_Cos, FR_sqrt, FR_atan2, FR_log2, FR_pow2, FR_printNumF ...
+ *
+ * lowercase fr_ names are v2 functions (radian trig, wave generators,
+ * ADSR envelopes):
+ *   fr_sin, fr_cos, fr_tan, fr_wave_tri, fr_adsr_step ...
+ *
+ * Some macros wrap functions: FR_EXP(x,r) scales x then calls
+ * FR_pow2 — one-liner convenience, heavy lifting in the function.
+ */
+
+/* ---- Math functions ---- */
+s32 c45   = FR_Cos(45, 0);                /* cos(45°) = 0.7071       */
+s32 s30   = fr_sin(FR_numstr("0.5236", R), R); /* sin(0.5236 rad)    */
+s32 root2 = FR_sqrt(two, R);              /* sqrt(2)  = 1.4142       */
+s32 angle = FR_atan2(I2FR(1,R), I2FR(1,R), R); /* atan2(1,1) rad     */
+s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97       */
+s32 ex    = FR_EXP(I2FR(1, R), R);        /* macro: scales then calls
+                                            * FR_pow2 internally      */
+
+/* ---- Printing (serial / UART / file friendly) ----
+ *
+ * FR_printNumF takes a per-character output function — works with
+ * putchar, Serial.write, UART_putc, or any int(*)(char).  No
+ * sprintf, no floats, no heap.  Ideal for bare-metal targets.
+ */
+int my_putchar(char c) { return putchar(c); }  /* or your UART func */
+
+FR_printNumF(my_putchar, pi, R, 8, 5);    /* prints " 3.14159"      */
+FR_printNumF(my_putchar, neg, R, 8, 2);   /* prints "   -2.75"      */
+FR_printNumD(my_putchar, FR2I(lg, R), 4); /* prints "   9" (integer)*/
 ```
 
 See [getting-started.md](getting-started.md) for a complete
@@ -121,7 +208,7 @@ understand *how* the radix notation works first.
 | Fixed format | Q16.16 only | Q31 / Q15 | Any radix |
 | Angle input | Radians (Q16.16) | Radians (float) | BAM (u16), degrees, or radians |
 | Exact cardinal angles | No | N/A | Yes |
-| Multiply-free option | No | No | Yes (e.g. `FR_EXP_FAST`, `FR_hypot_fast`) |
+| Multiply-free option | No | No | Yes (e.g. `FR_EXP_FAST`, `FR_hypot_fast8`) |
 | Wave generators | No | No | 6 shapes + ADSR |
 | Dependencies | None | ARM only | None |
 | Code size (Cortex-M0, -Os) | 2.4 KB | ~40 KB+ | 4.2 KB |
@@ -138,7 +225,7 @@ script.
 FR_Math has been in service since **2000**, originally built for
 graphics transforms on 16 MHz 68k Palm Pilots (it shipped inside
 Trumpetsoft's *Inkstorm*), then ported forward to ARM, x86, MIPS,
-RISC-V, and various 8/16-bit embedded targets. v2.0.2 is the current
+RISC-V, and various 8/16-bit embedded targets. v2.0.6 is the current
 release with a full test suite, bit-exact numerical
 specification, and CI on every push.
 
diff --git a/docs/api-reference.md b/docs/api-reference.md
index b92ed7a..3f97f20 100644
--- a/docs/api-reference.md
+++ b/docs/api-reference.md
@@ -537,8 +537,7 @@ targets (AVR, 8051) where 64-bit multiply is very expensive.
 | --- | --- | --- | --- |
 | `FR_sqrt` | `s32 input` at `radix`<br>`u16 radix` | `s32` at the **same radix**. | Domain: `input ≥ 0`. Returns `FR_DOMAIN_ERROR` for negative input. Digit-by-digit integer isqrt on an `int64_t` accumulator — deterministic 32-iteration cost, no floating point anywhere. **Rounds to nearest** (remainder > root → +1). Worst-case error is ±0.5 LSB at the input radix. |
 | `FR_hypot` | `s32 x`, `s32 y` both at `radix`<br>`u16 radix` | `s32` at the **same radix**. | Overflow-safe magnitude: computes `sqrt(x² + y²)` without an intermediate 32-bit overflow by promoting the sum of squares to `int64_t`. Accepts the full `s32` input range; output saturates at `FR_OVERFLOW_POS` only if the true hypot exceeds `2^31−1` at the given radix. |
-| `FR_hypot_fast` | `s32 x`, `s32 y` (any radix) | `s32` at the same radix. | Fast approximate magnitude using 4-segment piecewise-linear shift-only arithmetic. ~0.4% peak error. No multiply, no 64-bit, no ROM table. Based on the method of US Patent 6,567,777 B1 (public domain). No `radix` parameter needed — the algorithm is scale-invariant. |
-| `FR_hypot_fast8` | `s32 x`, `s32 y` (any radix) | `s32` at the same radix. | 8-segment variant. ~0.14% peak error. Same shift-only approach, more branches. |
+| `FR_hypot_fast8` | `s32 x`, `s32 y` (any radix) | `s32` at the same radix. | 8-segment shift-only piecewise-linear approximate magnitude. ~0.14% peak error. No multiply, no 64-bit, no ROM table. Based on the method of US Patent 6,567,777 B1 (public domain). No `radix` parameter needed — the algorithm is scale-invariant. |
 
 ## Wave generators
 
diff --git a/docs/building.md b/docs/building.md
index 81de9bb..db13bb7 100644
--- a/docs/building.md
+++ b/docs/building.md
@@ -95,7 +95,7 @@ binaries to keep compile times low:
 
 | Binary | What it checks |
 | --- | --- |
-| `test_basic` | Radix conversions, `FR_ADD`, `FR_MUL`, rounding. |
+| `test_basic` | Radix conversions, `FR_ADD`, `FR_FixMuls`, rounding. |
 | `test_trig` | Integer-degree trig (`FR_Sin` et al.). |
 | `test_trig_radians` | Radian / BAM trig and the v2 `fr_sin` API. |
 | `test_log_exp` | Log base 2 / ln / log10 and their inverses. |
@@ -134,22 +134,89 @@ you improved a polynomial approximation), update the pinned values in
 
 The library has no CPU-specific code. It compiles and runs
 identically on all of the targets listed below. The only requirement
-is an integer pipeline and the standard `<stdint.h>`
-header. You do *not* need a floating-point unit, and you do
-*not* need `libm`.
+is an integer pipeline and `<stdint.h>` (or define `FR_NO_STDINT`
+for bare-metal toolchains that lack it — `FR_defs.h` provides
+fallback typedefs). You do *not* need a floating-point unit, and
+you do *not* need `libm`.
 
 | Target | Toolchain | Tested? |
 | --- | --- | --- |
-| x86 / x86_64 Linux | `gcc`, `clang` | CI. |
+| x86 / x86_64 Linux | `gcc`, `clang`, `tcc` | CI + Docker. |
 | macOS arm64 / x86_64 | Apple `clang` | CI. |
 | Windows x86_64 | MSVC, `clang-cl`, MinGW | Manual. |
-| ARM Cortex-M0/M3/M4/M7 | `arm-none-eabi-gcc`, IAR, Keil | Manual. |
-| RISC-V rv32imc | `riscv32-unknown-elf-gcc` | Manual. |
-| AVR (ATmega328P, etc.) | `avr-gcc` | Manual. |
+| AArch64 (ARM64) | `aarch64-linux-gnu-gcc` | Docker. |
+| ARM32 / Thumb | `arm-none-eabi-gcc`, IAR, Keil | Docker. |
+| RISC-V rv64 / rv32 | `riscv64-linux-gnu-gcc`, `riscv64-unknown-elf-gcc` | Docker. |
+| AVR (ATmega328P, ATtiny85) | `avr-gcc` | Docker. |
 | Arduino (AVR, SAMD, etc.) | `arduino-cli` | Manual. |
-| MSP430 | `msp430-elf-gcc` | Manual. |
+| MSP430 | `msp430-gcc` | Docker. |
+| Motorola 68k | `m68k-linux-gnu-gcc` | Docker. |
+| Motorola 68HC11 | `m68hc11-gcc` | Docker. |
+| PowerPC | `powerpc-linux-gnu-gcc` | Docker. |
+| Xtensa LX106 (ESP8266) | `xtensa-lx106-elf-gcc` | Docker. |
 | 8051 | `sdcc` | Manual. |
 
+### Code size (.text section, compiled with `-Os`)
+
+All sizes are for the complete `FR_math.c` — every function
+included, nothing stripped.  With `-ffunction-sections` and
+linker `--gc-sections`, only the functions your application
+references are linked, so real flash usage will be smaller.
+
+<!-- SIZE_TABLE_START -->
+| Target | .text (bytes) |
+|---|---:|
+| GCC ARM32 Thumb | 4,278 |
+| GCC RISC-V (rv64) | 4,574 |
+| GCC RISC-V (rv32) | 4,820 |
+| GCC Xtensa LX106 (ESP8266) | 5,317 |
+| GCC m68k | 5,410 |
+| GCC ARM32 | 5,504 |
+| GCC x86-64 | 5,857 |
+| GCC AArch64 (ARM64) | 6,112 |
+| Clang x86-64 | 6,555 |
+| GCC x86-32 | 6,947 |
+| GCC PowerPC | 7,540 |
+| GCC MSP430 | 9,146 |
+| TCC x86 | 9,887 |
+| GCC AVR5 (ATmega328P) | 10,806 |
+| GCC AVR ATtiny85 | 11,382 |
+| GCC 68HC11 | 16,392 |
+<!-- SIZE_TABLE_END -->
+
+### Lean build options
+
+Two compile-time `#define` guards let you strip optional subsystems
+for ROM-constrained targets. Define them before including `FR_math.h`
+(or pass `-D` on the compiler command line):
+
+| Define | What it removes | Typical savings |
+|---|---|---|
+| `FR_NO_PRINT` | `FR_printNumF`, `FR_printNumD`, `FR_printNumH`, `FR_numstr` | ~1.3 KB |
+| `FR_NO_WAVES` | `fr_wave_*` (6 shapes), `fr_adsr_*` (ADSR envelope), `FR_HZ2BAM_INC` | ~0.6 KB |
+
+With both guards enabled the core math library (trig, inverse trig, log/exp,
+sqrt, hypot) compiles to ~3.5 KB on x86-64 / clang -Os.
+
+```c
+/* Example: headless sensor node — math only, no print, no audio */
+#define FR_NO_PRINT
+#define FR_NO_WAVES
+#include "FR_math.h"
+```
+
+With `-ffunction-sections` and linker `--gc-sections`, the linker will
+also strip any unused functions automatically, so these guards are most
+useful when you include the library as a single `.c` file or static
+archive without section-level dead-code elimination.
+
+To regenerate this table, run the Docker cross-build
+(requires the [xelp](https://github.com/deftio/xelp) Docker image):
+
+```bash
+scripts/crossbuild-docker.sh
+```
+
 ### Example: RISC-V
 
 ```bash
@@ -180,9 +247,9 @@ arduino-cli compile --fqbn arduino:avr:uno examples/trig-functions
 arduino-cli compile --fqbn arduino:avr:uno examples/wave-generators
 ```
 
-Expect the whole integer-only library to land around a few
-kilobytes of flash. The wave, trig, and log modules can be compiled
-in independently if you want to strip further.
+See the [code size table](#code-size-text-section-compiled-with--os) above
+for exact numbers. With linker dead-code elimination, only the
+functions you call are linked.
 
 ## CI
 
diff --git a/docs/examples.md b/docs/examples.md
index 4ef6bfd..b7d6145 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -144,9 +144,7 @@ int main(void)
         printf("sqrt(-1) rejected, good.\n");
 
     /* Fast approximate — no multiply, no 64-bit math */
-    s32 h_fast  = FR_hypot_fast(three, four);     /* 4-seg, ~0.4% err */
     s32 h_fast8 = FR_hypot_fast8(three, four);    /* 8-seg, ~0.14% err */
-    printf("hypot_fast(3,4)  = %d (integer part)\n", (int)FR2I(h_fast, r));
     printf("hypot_fast8(3,4) = %d (integer part)\n", (int)FR2I(h_fast8, r));
     return 0;
 }
diff --git a/docs/releases.md b/docs/releases.md
index 2f6e11c..2568ae7 100644
--- a/docs/releases.md
+++ b/docs/releases.md
@@ -4,6 +4,18 @@ Release highlights. For the full per-symbol change log, see
 [release_notes.md](https://github.com/deftio/fr_math/blob/master/release_notes.md)
 in the repo.
 
+## v2.0.6 — 2026
+
+Accuracy improvements, lean-build options, library cleanup.
+
+- **FR_acos boundary fix** — 12x better accuracy near ±1.0 via deferred quantization
+- **FR_atan2 rewrite** — asin/acos + hypot_fast8, 0.41% peak error (was 20% in libfixmath)
+- **Lean build guards** — `FR_NO_PRINT` (~1.3 KB) and `FR_NO_WAVES` (~0.6 KB) for ROM-constrained targets
+- **Removed FR_hypot_fast** (4-segment) — FR_hypot_fast8 is strictly better; 4-seg was dead weight
+- libfixmath comparison benchmark added to repo (`compare_lfm/`)
+
+---
+
 ## v2.0.5 — 2026
 
 Release pipeline fixes. Fixed squash-merge divergence handling and
diff --git a/examples/posix-example/FR_Math_Example1.cpp b/examples/posix-example/FR_Math_Example1.cpp
index 3d2edee..6a7e42f 100644
--- a/examples/posix-example/FR_Math_Example1.cpp
+++ b/examples/posix-example/FR_Math_Example1.cpp
@@ -249,7 +249,7 @@ int putSingleChar(char x)
 }
 //===============================================
 // main program for testing the functions
-int main(int argc, char *argv[])
+int main(int /*argc*/, char * /*argv*/[])
 {
 	int ret_val = 0;
 	int i;
diff --git a/idf_component.yml b/idf_component.yml
index e9083b8..d5c5d11 100644
--- a/idf_component.yml
+++ b/idf_component.yml
@@ -1,4 +1,4 @@
-version: "2.0.5"
+version: "2.0.6"
 description: "Compact fixed-point math library for embedded systems. Integer-only with caller-selectable radix. Trig, log/exp, sqrt, hypot, wave generators, ADSR, and 2D transforms. Zero dependencies."
 url: "https://github.com/deftio/fr_math"
 repository: "https://github.com/deftio/fr_math.git"
diff --git a/keywords.txt b/keywords.txt
index 483da8b..1ab2703 100644
--- a/keywords.txt
+++ b/keywords.txt
@@ -28,7 +28,6 @@ FR_log10	KEYWORD2
 FR_pow2	KEYWORD2
 FR_sqrt	KEYWORD2
 FR_hypot	KEYWORD2
-FR_hypot_fast	KEYWORD2
 FR_hypot_fast8	KEYWORD2
 FR_printNumF	KEYWORD2
 FR_printNumD	KEYWORD2
diff --git a/library.json b/library.json
index a9a743c..daa7684 100644
--- a/library.json
+++ b/library.json
@@ -1,6 +1,6 @@
 {
     "name": "FR_Math",
-    "version": "2.0.5",
+    "version": "2.0.6",
     "description": "Compact fixed-point math library for embedded systems. Integer-only with caller-selectable radix. Trig, log/exp, sqrt, hypot, wave generators, ADSR, and 2D transforms in 4KB of flash. Zero dependencies.",
     "keywords": [
         "fixed-point",
diff --git a/library.properties b/library.properties
index ac2b952..47d834a 100644
--- a/library.properties
+++ b/library.properties
@@ -1,5 +1,5 @@
 name=FR_Math
-version=2.0.5
+version=2.0.6
 author=M. A. Chatterjee <deftio@deftio.com>
 maintainer=M. A. Chatterjee <deftio@deftio.com>
 sentence=Compact fixed-point math library for embedded systems. 4KB flash, zero dependencies, any radix.
diff --git a/llms.txt b/llms.txt
index bd23772..fe5ee3b 100644
--- a/llms.txt
+++ b/llms.txt
@@ -9,7 +9,7 @@ or libraries. Pure C99, zero dependencies beyond `<stdint.h>`.
 - Repository: https://github.com/deftio/fr_math
 - Documentation: https://deftio.github.io/fr_math/
 - License: BSD-2-Clause
-- Version: 2.0.5
+- Version: 2.0.6
 
 ## Key concept: radix parameter
 
@@ -116,7 +116,6 @@ FR_POW10_FAST(input, radix)
 ```c
 s32 FR_sqrt(s32 input, u16 radix);
 s32 FR_hypot(s32 x, s32 y, u16 radix);       // exact, 64-bit intermediate
-s32 FR_hypot_fast(s32 x, s32 y);              // 4-segment, 0.34% error, no multiply
 s32 FR_hypot_fast8(s32 x, s32 y);             // 8-segment, 0.10% error, no multiply
 ```
 
@@ -171,6 +170,13 @@ make examples  # build example program
 make clean     # remove build artifacts
 ```
 
+## Lean build options
+
+Define before including FR_math.h to exclude optional subsystems:
+
+- `FR_NO_PRINT` — removes FR_printNumF/D/H and FR_numstr (~1.3 KB saved)
+- `FR_NO_WAVES` — removes fr_wave_*, fr_adsr_*, FR_HZ2BAM_INC (~0.6 KB saved)
+
 ## Platform support
 
 Tested on: AVR (Arduino), ARM Cortex-M0/M4, ESP32 (Xtensa), RISC-V,
diff --git a/makefile b/makefile
index a609532..eb8a9fc 100644
--- a/makefile
+++ b/makefile
@@ -14,8 +14,11 @@ EXAMPLE_DIR = examples
 BUILD_DIR = build
 COV_DIR = coverage
 
-# Compiler flags
-CFLAGS = -I$(SRC_DIR) -Wall -Os
+# Compiler flags — full warnings, fail on any warning
+# LIB_WARN: strictest for library source (includes -Wconversion -Wpedantic)
+# CFLAGS:   for tests/examples (no -Wconversion/-Wpedantic — macro casts are intentional)
+LIB_WARN = -Wall -Wextra -Wpedantic -Wshadow -Wconversion -Werror
+CFLAGS   = -I$(SRC_DIR) -Wall -Wextra -Wshadow -Werror -Os
 CXXFLAGS = $(CFLAGS)
 TEST_FLAGS = -ftest-coverage -fprofile-arcs
 LDFLAGS = -lm
@@ -23,7 +26,41 @@ LDFLAGS = -lm
 # Source files
 HEADERS = $(SRC_DIR)/FR_defs.h $(SRC_DIR)/FR_math.h $(SRC_DIR)/FR_math_2D.h
 
-# Default target
+# Default target — print help
+.PHONY: help
+help:
+	@echo "FR_Math — Fixed Radix Math Library"
+	@echo ""
+	@echo "Usage: make <target>"
+	@echo ""
+	@echo "Build targets:"
+	@echo "  all              Build library and examples"
+	@echo "  lib              Build library objects only"
+	@echo "  examples         Build example program"
+	@echo ""
+	@echo "Test targets:"
+	@echo "  test             Run all tests"
+	@echo "  test-basic       Run basic tests"
+	@echo "  test-comprehensive  Run comprehensive tests"
+	@echo "  test-2d          Run 2D math tests"
+	@echo "  test-overflow    Run overflow/saturation tests"
+	@echo "  test-full        Run full coverage tests"
+	@echo "  test-2d-complete Run 2D complete coverage tests"
+	@echo "  test-tdd         Run TDD characterization tests"
+	@echo ""
+	@echo "Analysis targets:"
+	@echo "  accuracy         Show accuracy summary table"
+	@echo "  accuracy-showpeak  Show accuracy with peak inputs"
+	@echo "  coverage         Generate coverage report (gcov)"
+	@echo "  coverage-basic   Basic coverage info without lcov"
+	@echo "  coverage-html    HTML coverage report (requires lcov)"
+	@echo "  size-report      Multi-architecture size report"
+	@echo "  size-simple      Size report for current platform"
+	@echo ""
+	@echo "Maintenance:"
+	@echo "  clean            Remove build artifacts"
+	@echo "  cleanall         Remove build artifacts and backups"
+
 .PHONY: all
 all: dirs lib examples
 
@@ -43,10 +80,10 @@ dirs:
 lib: dirs $(BUILD_DIR)/FR_math.o $(BUILD_DIR)/FR_math_2D.o
 
 $(BUILD_DIR)/FR_math.o: $(SRC_DIR)/FR_math.c $(HEADERS)
-	$(CC) $(CFLAGS) -c $< -o $@
+	$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os -c $< -o $@
 
 $(BUILD_DIR)/FR_math_2D.o: $(SRC_DIR)/FR_math_2D.cpp $(HEADERS)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+	$(CXX) -I$(SRC_DIR) $(LIB_WARN) -Os -c $< -o $@
 
 # Build examples
 .PHONY: examples
@@ -66,8 +103,8 @@ test-tdd: $(BUILD_DIR)/test_tdd
 	@echo "Report written to $(BUILD_DIR)/test_tdd_report.md"
 
 $(BUILD_DIR)/test_tdd: $(TEST_DIR)/test_tdd.cpp $(SRC_DIR)/FR_math.c $(SRC_DIR)/FR_math_2D.cpp
-	$(CC) $(CFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/test_tdd_FR_math.o
-	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/test_tdd_FR_math_2D.o
+	$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/test_tdd_FR_math.o
+	$(CXX) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/test_tdd_FR_math_2D.o
 	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) $(TEST_DIR)/test_tdd.cpp $(BUILD_DIR)/test_tdd_FR_math.o $(BUILD_DIR)/test_tdd_FR_math_2D.o $(LDFLAGS) -o $@
 
 .PHONY: test-basic
@@ -107,7 +144,10 @@ $(BUILD_DIR)/test_comprehensive: $(TEST_DIR)/test_comprehensive.c $(SRC_DIR)/FR_
 	$(CC) $(CFLAGS) $(TEST_FLAGS) $^ $(LDFLAGS) -o $@
 
 $(BUILD_DIR)/test_2d: $(TEST_DIR)/test_2d_math.c $(SRC_DIR)/FR_math.c $(SRC_DIR)/FR_math_2D.cpp
-	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) $^ $(LDFLAGS) -o $@
+	$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/test_2d_FR_math.o
+	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/test_2d_FR_math_2D.o
+	$(CC) $(CFLAGS) $(TEST_FLAGS) -c $(TEST_DIR)/test_2d_math.c -o $(BUILD_DIR)/test_2d_math.o
+	$(CXX) $(TEST_FLAGS) $(BUILD_DIR)/test_2d_math.o $(BUILD_DIR)/test_2d_FR_math.o $(BUILD_DIR)/test_2d_FR_math_2D.o $(LDFLAGS) -o $@
 
 $(BUILD_DIR)/test_overflow: $(TEST_DIR)/test_overflow_saturation.c $(SRC_DIR)/FR_math.c
 	$(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@
@@ -116,7 +156,19 @@ $(BUILD_DIR)/test_full: $(TEST_DIR)/test_full_coverage.c $(SRC_DIR)/FR_math.c
 	$(CC) $(CFLAGS) $(TEST_FLAGS) $^ $(LDFLAGS) -o $@
 
 $(BUILD_DIR)/test_2d_complete: $(TEST_DIR)/test_2d_complete.cpp $(SRC_DIR)/FR_math.c $(SRC_DIR)/FR_math_2D.cpp
-	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) $^ $(LDFLAGS) -o $@
+	$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/test_2dc_FR_math.o
+	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/test_2dc_FR_math_2D.o
+	$(CXX) $(CXXFLAGS) $(TEST_FLAGS) $(TEST_DIR)/test_2d_complete.cpp $(BUILD_DIR)/test_2dc_FR_math.o $(BUILD_DIR)/test_2dc_FR_math_2D.o $(LDFLAGS) -o $@
+
+# Accuracy summary table (extract from test_tdd output)
+.PHONY: accuracy accuracy-showpeak
+accuracy: dirs $(BUILD_DIR)/test_tdd
+	@echo "Running accuracy report..."
+	@./$(BUILD_DIR)/test_tdd 2>/dev/null | sed -n '/ACCURACY_TABLE_START/,/ACCURACY_TABLE_END/p'
+
+accuracy-showpeak: dirs $(BUILD_DIR)/test_tdd
+	@echo "Running accuracy report (with peak inputs)..."
+	@FR_SHOWPEAK=1 ./$(BUILD_DIR)/test_tdd 2>/dev/null | sed -n '/ACCURACY_TABLE_START/,/ACCURACY_TABLE_END/p'
 
 # Coverage report using gcov (no external dependencies)
 .PHONY: coverage
@@ -127,8 +179,8 @@ coverage:
 .PHONY: coverage-html
 coverage-html: clean dirs
 	@echo "Building with coverage flags..."
-	@$(CC) $(CFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/FR_math.o
-	@$(CXX) $(CXXFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/FR_math_2D.o
+	@$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/FR_math.o
+	@$(CXX) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/FR_math_2D.o
 	@$(CC) $(CFLAGS) $(TEST_FLAGS) $(TEST_DIR)/fr_math_test.c $(BUILD_DIR)/FR_math.o $(BUILD_DIR)/FR_math_2D.o $(LDFLAGS) -lstdc++ -o $(BUILD_DIR)/fr_test
 	@echo "Running tests for coverage..."
 	@./$(BUILD_DIR)/fr_test
@@ -173,8 +225,8 @@ cleanall: clean
 .PHONY: coverage-basic
 coverage-basic: clean dirs
 	@echo "Building with coverage flags..."
-	@$(CC) $(CFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/FR_math.o
-	@$(CXX) $(CXXFLAGS) $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/FR_math_2D.o
+	@$(CC) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math.c -o $(BUILD_DIR)/FR_math.o
+	@$(CXX) -I$(SRC_DIR) $(LIB_WARN) -Os $(TEST_FLAGS) -c $(SRC_DIR)/FR_math_2D.cpp -o $(BUILD_DIR)/FR_math_2D.o
 	@$(CC) $(CFLAGS) $(TEST_FLAGS) $(TEST_DIR)/fr_math_test.c $(BUILD_DIR)/FR_math.o $(BUILD_DIR)/FR_math_2D.o $(LDFLAGS) -lstdc++ -o $(BUILD_DIR)/fr_test
 	@echo "Running tests..."
 	@./$(BUILD_DIR)/fr_test
diff --git a/pages/assets/site.js b/pages/assets/site.js
index 8fff01d..8f5df7c 100644
--- a/pages/assets/site.js
+++ b/pages/assets/site.js
@@ -16,7 +16,7 @@
    ════════════════════════════════════════════════════════════════════ */
 
 (function () {
-    var FR_VERSION = 'v2.0.5';
+    var FR_VERSION = 'v2.0.6';
 
     // Detect whether we're a top-level page or inside guide/.
     // Works for both file:// and http(s):// because we look for the
diff --git a/pages/guide/api-reference.html b/pages/guide/api-reference.html
index cfa95a2..5c03c66 100644
--- a/pages/guide/api-reference.html
+++ b/pages/guide/api-reference.html
@@ -887,21 +887,15 @@ <h2>Roots</h2>
       the true hypot exceeds <code>2<sup>31</sup>&minus;1</code> at
       the given radix.</td>
 </tr>
-<tr>
-  <td><code>FR_hypot_fast</code></td>
-  <td><code>s32 x</code>, <code>s32 y</code> (any radix)</td>
-  <td><code>s32</code> at the same radix.</td>
-  <td>Fast approximate magnitude using 4-segment piecewise-linear
-      shift-only arithmetic. &sim;0.4% peak error. No multiply, no
-      64-bit, no ROM table. Based on the method of US Patent
-      6,567,777 B1 (public domain). No <code>radix</code> parameter
-      needed &mdash; the algorithm is scale-invariant.</td>
-</tr>
 <tr>
   <td><code>FR_hypot_fast8</code></td>
   <td><code>s32 x</code>, <code>s32 y</code> (any radix)</td>
   <td><code>s32</code> at the same radix.</td>
-  <td>8-segment variant. &sim;0.14% peak error. Same shift-only
+  <td>8-segment shift-only piecewise-linear approximate magnitude.
+      &sim;0.14% peak error. No multiply, no 64-bit, no ROM table.
+      Based on the method of US Patent 6,567,777 B1 (public domain).
+      No <code>radix</code> parameter needed &mdash; the algorithm is
+      scale-invariant. Same shift-only
       approach, more branches.</td>
 </tr>
 </tbody>
diff --git a/pages/guide/building.html b/pages/guide/building.html
index 3676403..f870b20 100644
--- a/pages/guide/building.html
+++ b/pages/guide/building.html
@@ -113,7 +113,7 @@ <h2>The test suite</h2>
 <table>
 <thead><tr><th>Binary</th><th>What it checks</th></tr></thead>
 <tbody>
-<tr><td><code>test_basic</code></td><td>Radix conversions, <code>FR_ADD</code>, <code>FR_MUL</code>, rounding.</td></tr>
+<tr><td><code>test_basic</code></td><td>Radix conversions, <code>FR_ADD</code>, <code>FR_FixMuls</code>, rounding.</td></tr>
 <tr><td><code>test_trig</code></td><td>Integer-degree trig (<code>FR_Sin</code> et al.).</td></tr>
 <tr><td><code>test_trig_radians</code></td><td>Radian / BAM trig and the v2 <code>fr_sin</code> API.</td></tr>
 <tr><td><code>test_log_exp</code></td><td>Log base 2 / ln / log10 and their inverses.</td></tr>
@@ -152,25 +152,95 @@ <h2>Cross-compilation</h2>
 
 <p>The library has no CPU-specific code. It compiles and runs
 identically on all of the targets listed below. The only requirement
-is an integer pipeline and the standard <code>&lt;stdint.h&gt;</code>
-header. You do <em>not</em> need a floating-point unit, and you do
-<em>not</em> need <code>libm</code>.</p>
+is an integer pipeline and <code>&lt;stdint.h&gt;</code> (or define
+<code>FR_NO_STDINT</code> for bare-metal toolchains that lack it &mdash;
+<code>FR_defs.h</code> provides fallback typedefs). You do <em>not</em>
+need a floating-point unit, and you do <em>not</em> need
+<code>libm</code>.</p>
 
 <table>
 <thead><tr><th>Target</th><th>Toolchain</th><th>Tested?</th></tr></thead>
 <tbody>
-<tr><td>x86 / x86_64 Linux</td><td><code>gcc</code>, <code>clang</code></td><td>CI.</td></tr>
+<tr><td>x86 / x86_64 Linux</td><td><code>gcc</code>, <code>clang</code>, <code>tcc</code></td><td>CI + Docker.</td></tr>
 <tr><td>macOS arm64 / x86_64</td><td>Apple <code>clang</code></td><td>CI.</td></tr>
 <tr><td>Windows x86_64</td><td>MSVC, <code>clang-cl</code>, MinGW</td><td>Manual.</td></tr>
-<tr><td>ARM Cortex-M0/M3/M4/M7</td><td><code>arm-none-eabi-gcc</code>, IAR, Keil</td><td>Manual.</td></tr>
-<tr><td>RISC-V rv32imc</td><td><code>riscv32-unknown-elf-gcc</code></td><td>Manual.</td></tr>
-<tr><td>AVR (ATmega328P, etc.)</td><td><code>avr-gcc</code></td><td>Manual.</td></tr>
+<tr><td>AArch64 (ARM64)</td><td><code>aarch64-linux-gnu-gcc</code></td><td>Docker.</td></tr>
+<tr><td>ARM32 / Thumb</td><td><code>arm-none-eabi-gcc</code>, IAR, Keil</td><td>Docker.</td></tr>
+<tr><td>RISC-V rv64 / rv32</td><td><code>riscv64-linux-gnu-gcc</code>, <code>riscv64-unknown-elf-gcc</code></td><td>Docker.</td></tr>
+<tr><td>AVR (ATmega328P, ATtiny85)</td><td><code>avr-gcc</code></td><td>Docker.</td></tr>
 <tr><td>Arduino (AVR, SAMD, etc.)</td><td><code>arduino-cli</code></td><td>Manual.</td></tr>
-<tr><td>MSP430</td><td><code>msp430-elf-gcc</code></td><td>Manual.</td></tr>
+<tr><td>MSP430</td><td><code>msp430-gcc</code></td><td>Docker.</td></tr>
+<tr><td>Motorola 68k</td><td><code>m68k-linux-gnu-gcc</code></td><td>Docker.</td></tr>
+<tr><td>Motorola 68HC11</td><td><code>m68hc11-gcc</code></td><td>Docker.</td></tr>
+<tr><td>PowerPC</td><td><code>powerpc-linux-gnu-gcc</code></td><td>Docker.</td></tr>
+<tr><td>Xtensa LX106 (ESP8266)</td><td><code>xtensa-lx106-elf-gcc</code></td><td>Docker.</td></tr>
 <tr><td>8051</td><td><code>sdcc</code></td><td>Manual.</td></tr>
 </tbody>
 </table>
 
+<h3>Code size (<code>.text</code> section, compiled with <code>-Os</code>)</h3>
+
+<p>All sizes are for the complete <code>FR_math.c</code> &mdash; every function
+included, nothing stripped. With <code>-ffunction-sections</code> and
+linker <code>--gc-sections</code>, only the functions your application
+references are linked, so real flash usage will be smaller.</p>
+
+<!-- SIZE_TABLE_START -->
+<table>
+<thead><tr><th>Target</th><th>.text (bytes)</th></tr></thead>
+<tbody>
+<tr><td>GCC ARM32 Thumb</td><td>4,278</td></tr>
+<tr><td>GCC RISC-V (rv64)</td><td>4,574</td></tr>
+<tr><td>GCC RISC-V (rv32)</td><td>4,820</td></tr>
+<tr><td>GCC Xtensa LX106 (ESP8266)</td><td>5,317</td></tr>
+<tr><td>GCC m68k</td><td>5,410</td></tr>
+<tr><td>GCC ARM32</td><td>5,504</td></tr>
+<tr><td>GCC x86-64</td><td>5,857</td></tr>
+<tr><td>GCC AArch64 (ARM64)</td><td>6,112</td></tr>
+<tr><td>Clang x86-64</td><td>6,555</td></tr>
+<tr><td>GCC x86-32</td><td>6,947</td></tr>
+<tr><td>GCC PowerPC</td><td>7,540</td></tr>
+<tr><td>GCC MSP430</td><td>9,146</td></tr>
+<tr><td>TCC x86</td><td>9,887</td></tr>
+<tr><td>GCC AVR5 (ATmega328P)</td><td>10,806</td></tr>
+<tr><td>GCC AVR ATtiny85</td><td>11,382</td></tr>
+<tr><td>GCC 68HC11</td><td>16,392</td></tr>
+</tbody>
+</table>
+<!-- SIZE_TABLE_END -->
+
+<h3>Lean build options</h3>
+
+<p>Two compile-time <code>#define</code> guards let you strip optional subsystems
+for ROM-constrained targets. Define them before including
+<code>FR_math.h</code> (or pass <code>-D</code> on the compiler command line):</p>
+
+<table>
+<thead><tr><th>Define</th><th>What it removes</th><th>Typical savings</th></tr></thead>
+<tbody>
+<tr><td><code>FR_NO_PRINT</code></td><td><code>FR_printNumF</code>, <code>FR_printNumD</code>, <code>FR_printNumH</code>, <code>FR_numstr</code></td><td>~1.3 KB</td></tr>
+<tr><td><code>FR_NO_WAVES</code></td><td><code>fr_wave_*</code> (6 shapes), <code>fr_adsr_*</code> (ADSR envelope), <code>FR_HZ2BAM_INC</code></td><td>~0.6 KB</td></tr>
+</tbody>
+</table>
+
+<p>With both guards enabled the core math library (trig, inverse trig, log/exp,
+sqrt, hypot) compiles to ~3.5 KB on x86-64 / clang -Os.</p>
+
+<pre><code class="language-c">/* Example: headless sensor node &mdash; math only, no print, no audio */
+#define FR_NO_PRINT
+#define FR_NO_WAVES
+#include "FR_math.h"</code></pre>
+
+<p>With <code>-ffunction-sections</code> and linker <code>--gc-sections</code>,
+the linker will also strip any unused functions automatically, so these guards
+are most useful when you include the library as a single <code>.c</code> file
+or static archive without section-level dead-code elimination.</p>
+
+<p>To regenerate this table, run the Docker cross-build
+(requires the <a href="https://github.com/deftio/xelp">xelp</a> Docker image):</p>
+
+<pre><code class="language-bash">scripts/crossbuild-docker.sh</code></pre>
+
 <h3>Example: RISC-V</h3>
 
 <pre><code class="language-bash">riscv32-unknown-elf-gcc -Os -ffunction-sections -fdata-sections \
@@ -195,9 +265,8 @@ <h3>Example: Arduino</h3>
 arduino-cli compile --fqbn arduino:avr:uno examples/trig-functions
 arduino-cli compile --fqbn arduino:avr:uno examples/wave-generators</code></pre>
 
-<p>Expect the whole integer-only library to land around a few
-kilobytes of flash. The wave, trig, and log modules can be compiled
-in independently if you want to strip further.</p>
+<p>See the code size table above for exact numbers. With linker
+dead-code elimination, only the functions you call are linked.</p>
 
 <h2>CI</h2>
 
diff --git a/pages/guide/examples.html b/pages/guide/examples.html
index 3f74155..137525b 100644
--- a/pages/guide/examples.html
+++ b/pages/guide/examples.html
@@ -155,9 +155,7 @@ <h2>3. Square root and hypotenuse</h2>
         printf("sqrt(-1) rejected, good.\n");
 
     /* Fast approximate &mdash; no multiply, no 64-bit math */
-    s32 h_fast  = FR_hypot_fast(three, four);     /* 4-seg, ~0.4% err */
     s32 h_fast8 = FR_hypot_fast8(three, four);    /* 8-seg, ~0.14% err */
-    printf("hypot_fast(3,4)  = %d (integer part)\n", (int)FR2I(h_fast, r));
     printf("hypot_fast8(3,4) = %d (integer part)\n", (int)FR2I(h_fast8, r));
     return 0;
 }</code></pre>
diff --git a/pages/index.html b/pages/index.html
index 04ba463..9746d46 100644
--- a/pages/index.html
+++ b/pages/index.html
@@ -47,21 +47,31 @@ <h2>Measured accuracy</h2>
 <p>Errors below are measured at Q16.16 (s15.16). All functions accept any
 radix &mdash; Q16.16 is just the reference point for the table.
 See the <a href="https://github.com/deftio/fr_math/blob/master/build/test_tdd_report.md">TDD
-report</a> for sweeps at radixes 8, 12, 16, and 24.</p>
+report</a> for sweeps at radixes 8, 12, 16, and 24.
+Percent errors skip expected values near zero (|expected| &lt; 0.01).</p>
 
-<table>
-<thead><tr><th>Function</th><th>Max error</th><th>Note</th></tr></thead>
-<tbody>
-<tr><td>sin / cos</td><td>5 LSB (~7.7e-5)</td><td>Exact at 0, 90, 180, 270</td></tr>
-<tr><td>sqrt</td><td>&le; 0.5 LSB</td><td>Round-to-nearest</td></tr>
-<tr><td>log2</td><td>&le; 4 LSB</td><td>65-entry mantissa table</td></tr>
-<tr><td>pow2</td><td>&le; 1 LSB (integers exact)</td><td>65-entry fraction table</td></tr>
-<tr><td>ln, log10</td><td>&le; 4 LSB</td><td>Via FR_MULK28 from log2</td></tr>
-<tr><td>hypot (exact)</td><td>&le; 0.5 LSB</td><td>64-bit intermediate</td></tr>
-<tr><td>hypot_fast (4-seg)</td><td>0.34%</td><td>Shift-only, no multiply</td></tr>
-<tr><td>hypot_fast8 (8-seg)</td><td>0.10%</td><td>Shift-only, no multiply</td></tr>
-</tbody>
-</table>
+<!-- ACCURACY_TABLE_START -->
+<table>
+<thead><tr><th>Function</th><th>Max err (LSB)</th><th>Max err (%)</th><th>Avg err (%)</th><th>Note</th></tr></thead>
+<tbody>
+<tr><td>sin / cos</td><td>7.5</td><td>0.7169</td><td>0.0100</td><td>65536-pt sweep + specials</td></tr>
+<tr><td>tan</td><td>38020.4</td><td>0.7118</td><td>0.0162</td><td>65536-pt sweep (skip poles)</td></tr>
+<tr><td>asin / acos</td><td>42.3</td><td>0.7025</td><td>0.0105</td><td>65536-pt; sqrt approx near boundary</td></tr>
+<tr><td>atan2</td><td>63.3</td><td>0.4953</td><td>0.0268</td><td>65536x5 radii; asin/acos+hypot_fast8</td></tr>
+<tr><td>atan</td><td>61.9</td><td>0.2985</td><td>0.0159</td><td>20001-pt sweep [-10,10]; via FR_atan2</td></tr>
+<tr><td>sqrt</td><td>28.4</td><td>0.0003</td><td>0.0000</td><td>Round-to-nearest</td></tr>
+<tr><td>log2</td><td>10.5</td><td>0.2479</td><td>0.0045</td><td>65-entry mantissa table</td></tr>
+<tr><td>pow2</td><td>220.4</td><td>0.1373</td><td>0.0057</td><td>65-entry fraction table</td></tr>
+<tr><td>ln, log10</td><td>0.7</td><td>0.0015</td><td>0.0004</td><td>Via FR_MULK28 from log2</td></tr>
+<tr><td>exp</td><td>65.7</td><td>0.0719</td><td>0.0051</td><td>FR_MULK28 + FR_pow2</td></tr>
+<tr><td>exp_fast</td><td>195.5</td><td>0.0719</td><td>0.0064</td><td>Shift-only scaling</td></tr>
+<tr><td>pow10</td><td>143.4</td><td>0.1163</td><td>0.0075</td><td>FR_MULK28 + FR_pow2</td></tr>
+<tr><td>pow10_fast</td><td>581.9</td><td>0.1163</td><td>0.0100</td><td>Shift-only scaling</td></tr>
+<tr><td>hypot (exact)</td><td>0.2</td><td>0.0001</td><td>0.0000</td><td>64-bit intermediate</td></tr>
+<tr><td>hypot_fast8 (8-seg)</td><td>59968.8</td><td>0.0977</td><td>0.0508</td><td>Shift-only, no multiply</td></tr>
+</tbody>
+</table>
+<!-- ACCURACY_TABLE_END -->
 
 <h2>What&rsquo;s in the box</h2>
 
@@ -74,7 +84,7 @@ <h2>What&rsquo;s in the box</h2>
 <tr><td>Trig (radian/BAM)</td><td><code>fr_sin</code>, <code>fr_cos</code>, <code>fr_tan</code>, <code>fr_sin_bam</code>, <code>fr_cos_bam</code>, <code>fr_sin_deg</code>, <code>fr_cos_deg</code></td></tr>
 <tr><td>Inverse trig</td><td><code>FR_atan</code>, <code>FR_atan2</code>, <code>FR_asin</code>, <code>FR_acos</code></td></tr>
 <tr><td>Log / exp</td><td><code>FR_log2</code>, <code>FR_ln</code>, <code>FR_log10</code>, <code>FR_pow2</code>, <code>FR_EXP</code>, <code>FR_POW10</code>, <code>FR_EXP_FAST</code>, <code>FR_POW10_FAST</code>, <code>FR_MULK28</code></td></tr>
-<tr><td>Roots</td><td><code>FR_sqrt</code>, <code>FR_hypot</code>, <code>FR_hypot_fast</code>, <code>FR_hypot_fast8</code></td></tr>
+<tr><td>Roots</td><td><code>FR_sqrt</code>, <code>FR_hypot</code>, <code>FR_hypot_fast8</code></td></tr>
 <tr><td>Wave generators</td><td><code>fr_wave_sqr</code>, <code>fr_wave_pwm</code>, <code>fr_wave_tri</code>, <code>fr_wave_saw</code>, <code>fr_wave_tri_morph</code>, <code>fr_wave_noise</code></td></tr>
 <tr><td>Envelope</td><td><code>fr_adsr_init</code>, <code>fr_adsr_trigger</code>, <code>fr_adsr_release</code>, <code>fr_adsr_step</code></td></tr>
 <tr><td>2D transforms</td><td><code>FR_Matrix2D_CPT</code> (mul, add, sub, det, inv, setrotate, XFormPtI, XFormPtI16)</td></tr>
@@ -86,6 +96,34 @@ <h2>What&rsquo;s in the box</h2>
 <a href="https://github.com/deftio/fr_math/blob/master/tests/test_tdd.cpp">TDD
 characterization suite</a> in the repo.</p>
 
+<h2>Lean build options</h2>
+
+<p>Two compile-time <code>#define</code> guards let you strip optional subsystems
+for ROM-constrained targets. Define them before including
+<code>FR_math.h</code> (or pass <code>-D</code> on the compiler command line):</p>
+
+<table>
+<thead><tr><th>Define</th><th>What it removes</th><th>Typical savings</th></tr></thead>
+<tbody>
+<tr><td><code>FR_NO_PRINT</code></td><td><code>FR_printNumF</code>, <code>FR_printNumD</code>, <code>FR_printNumH</code>, <code>FR_numstr</code></td><td>~1.3 KB</td></tr>
+<tr><td><code>FR_NO_WAVES</code></td><td><code>fr_wave_*</code> (6 shapes), <code>fr_adsr_*</code> (ADSR envelope), <code>FR_HZ2BAM_INC</code></td><td>~0.6 KB</td></tr>
+</tbody>
+</table>
+
+<p>With both guards enabled the core math library (trig, inverse trig, log/exp,
+sqrt, hypot) compiles to ~3.5 KB on x86-64 / clang -Os. On Thumb-2 this
+would be roughly 2.6 KB.</p>
+
+<pre><code class="language-c">/* Example: headless sensor node &mdash; math only, no print, no audio */
+#define FR_NO_PRINT
+#define FR_NO_WAVES
+#include "FR_math.h"</code></pre>
+
+<p>With <code>-ffunction-sections</code> and linker <code>--gc-sections</code>,
+the linker will also strip any unused functions automatically, so these guards
+are most useful when you include the library as a single <code>.c</code> file
+or static archive without section-level dead-code elimination.</p>
+
 <h2>Why fixed-point, in 2026?</h2>
 
 <p>Most application code today has an FPU and can use <code>float</code>
@@ -117,11 +155,62 @@ <h2>Quick taste</h2>
 
 #define R 16  /* work at radix 16 (s15.16) throughout */
 
-s32 pi    = FR_NUM(3, 14159, 5, R);       /* pi at radix 16             */
-s32 c45   = FR_CosI(45);                  /* cos 45 deg = 0.7071 (s15.16) */
-s32 root2 = FR_sqrt(I2FR(2, R), R);       /* sqrt(2)    = 1.4142        */
-s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97          */
-s32 ex    = FR_EXP(I2FR(1, R), R);        /* e^1        ~ 2.7183        */</code></pre>
+/* ---- Creating fixed-point values ----
+ *
+ * FR_NUM(integer, frac_digits, num_digits, radix) encodes a decimal
+ * literal at compile time.  The fractional part is the digits AFTER
+ * the decimal point, and num_digits says how many digits that is.
+ * Think: FR_NUM(3, 14159, 5, 16) means "3.14159" at radix 16.
+ */
+s32 pi   = FR_NUM(3, 14159, 5, R);  /* 3.14159 &rarr; raw 205886 at r16  */
+s32 half = FR_NUM(0, 5, 1, R);      /* 0.5     &rarr; raw 32768           */
+s32 neg  = FR_NUM(-2, 75, 2, R);    /* -2.75   &rarr; raw -180224         */
+
+/* Or parse from a string at runtime (no floats, no strtod): */
+s32 pi2  = FR_numstr("3.14159", R); /* same result as FR_NUM above    */
+
+/* Integer-to-fixed: I2FR(n, radix) just shifts left */
+s32 two  = I2FR(2, R);              /* 2.0 &rarr; raw 131072              */
+
+/* ---- Naming convention: macros vs functions ----
+ *
+ * UPPERCASE FR_ names are macros &mdash; they expand inline with no call
+ * overhead, and the compiler can constant-fold them.  Use these for
+ * conversions and simple arithmetic:
+ *   I2FR, FR2I, FR_NUM, FR_ADD, FR_DIV, FR_ABS, FR_CHRDX, FR_EXP ...
+ *
+ * MixedCase FR_ names are functions &mdash; they contain loops, tables, or
+ * multi-step algorithms where inlining would waste ROM:
+ *   FR_Cos, FR_sqrt, FR_atan2, FR_log2, FR_pow2, FR_printNumF ...
+ *
+ * lowercase fr_ names are v2 functions (radian trig, wave generators,
+ * ADSR envelopes):
+ *   fr_sin, fr_cos, fr_tan, fr_wave_tri, fr_adsr_step ...
+ *
+ * Some macros wrap functions: FR_EXP(x,r) scales x then calls
+ * FR_pow2 &mdash; one-liner convenience, heavy lifting in the function.
+ */
+
+/* ---- Math functions ---- */
+s32 c45   = FR_Cos(45, 0);                /* cos(45&deg;) = 0.7071       */
+s32 s30   = fr_sin(FR_numstr("0.5236", R), R); /* sin(0.5236 rad)    */
+s32 root2 = FR_sqrt(two, R);              /* sqrt(2)  = 1.4142       */
+s32 angle = FR_atan2(I2FR(1,R), I2FR(1,R), R); /* atan2(1,1) rad     */
+s32 lg    = FR_log2(I2FR(1000, R), R, R); /* log2(1000) ~ 9.97       */
+s32 ex    = FR_EXP(I2FR(1, R), R);        /* macro: scales then calls
+                                            * FR_pow2 internally      */
+
+/* ---- Printing (serial / UART / file friendly) ----
+ *
+ * FR_printNumF takes a per-character output function &mdash; works with
+ * putchar, Serial.write, UART_putc, or any int(*)(char).  No
+ * sprintf, no floats, no heap.  Ideal for bare-metal targets.
+ */
+int my_putchar(char c) { return putchar(c); }  /* or your UART func */
+
+FR_printNumF(my_putchar, pi, R, 8, 5);    /* prints " 3.14159"      */
+FR_printNumF(my_putchar, neg, R, 8, 2);   /* prints "   -2.75"      */
+FR_printNumD(my_putchar, FR2I(lg, R), 4); /* prints "   9" (integer)*/</code></pre>
 
 <p>See <a href="guide/getting-started.html">Getting Started</a> for a
 complete walkthrough, or jump straight to the
@@ -136,7 +225,7 @@ <h2>Comparison</h2>
 <tr><td>Fixed format</td><td>Q16.16 only</td><td>Q31 / Q15</td><td>Any radix</td></tr>
 <tr><td>Angle input</td><td>Radians (Q16.16)</td><td>Radians (float)</td><td>BAM (u16), degrees, or radians</td></tr>
 <tr><td>Exact cardinal angles</td><td>No</td><td>N/A</td><td>Yes</td></tr>
-<tr><td>Multiply-free option</td><td>No</td><td>No</td><td>Yes (e.g. <code>FR_EXP_FAST</code>, <code>FR_hypot_fast</code>)</td></tr>
+<tr><td>Multiply-free option</td><td>No</td><td>No</td><td>Yes (e.g. <code>FR_EXP_FAST</code>, <code>FR_hypot_fast8</code>)</td></tr>
 <tr><td>Wave generators</td><td>No</td><td>No</td><td>6 shapes + ADSR</td></tr>
 <tr><td>Dependencies</td><td>None</td><td>ARM only</td><td>None</td></tr>
 <tr><td>Code size (Cortex-M0, -Os)</td><td>2.4 KB</td><td>~40 KB+</td><td>4.2 KB</td></tr>
@@ -157,7 +246,7 @@ <h2>History</h2>
 built for graphics transforms on 16&nbsp;MHz 68k Palm Pilots (it
 shipped inside Trumpetsoft&rsquo;s <em>Inkstorm</em>), then ported
 forward to ARM, x86, MIPS, RISC-V, and various 8/16-bit embedded
-targets. v2.0.2 is the current release with a full test suite,
+targets. v2.0.6 is the current release with a full test suite,
 bit-exact numerical specification, and CI on every push.</p>
 
 <h2>License</h2>
diff --git a/pages/releases.html b/pages/releases.html
index d87abe0..9c15d09 100644
--- a/pages/releases.html
+++ b/pages/releases.html
@@ -21,6 +21,20 @@ <h1>Releases</h1>
 <a href="https://github.com/deftio/fr_math/blob/master/release_notes.md">release_notes.md</a>
 in the repo.</p>
 
+<h2>v2.0.6 &mdash; 2026</h2>
+
+<p>Accuracy improvements, lean-build options, library cleanup.</p>
+
+<ul>
+  <li><strong>FR_acos boundary fix</strong> &mdash; 12x better accuracy near &plusmn;1.0 via deferred quantization</li>
+  <li><strong>FR_atan2 rewrite</strong> &mdash; asin/acos + hypot_fast8, 0.41% peak error (was 20% in libfixmath)</li>
+  <li><strong>Lean build guards</strong> &mdash; <code>FR_NO_PRINT</code> (~1.3 KB) and <code>FR_NO_WAVES</code> (~0.6 KB) for ROM-constrained targets</li>
+  <li><strong>Removed FR_hypot_fast</strong> (4-segment) &mdash; FR_hypot_fast8 is strictly better; 4-seg was dead weight</li>
+  <li>libfixmath comparison benchmark added to repo (<code>compare_lfm/</code>)</li>
+</ul>
+
+<hr>
+
 <h2>v2.0.5 &mdash; 2026</h2>
 
 <p>Release pipeline fixes. Fixed squash-merge divergence handling and
diff --git a/release_notes.md b/release_notes.md
index c4c9ee5..1de55e2 100644
--- a/release_notes.md
+++ b/release_notes.md
@@ -1,5 +1,42 @@
 # FR_Math Release Notes
 
+## Version 2.0.6 (2026)
+
+Accuracy improvements, lean-build options, and library cleanup.
+
+### Accuracy & algorithms
+
+- **FR_acos boundary fix**: deferred quantization computes `1-x` at the
+  caller's radix instead of r15, giving 12x better accuracy near ±1.0
+  (max LSB error 512.6 → 42.3)
+- **FR_atan2 rewrite**: uses asin/acos + hypot_fast8 with octant
+  switching for well-conditioned results everywhere (0.41% peak vs
+  20% for libfixmath)
+
+### Lean build options
+
+Two new compile-time `#define` guards strip optional subsystems for
+ROM-constrained targets:
+
+| Define | Removes | Savings |
+|---|---|---|
+| `FR_NO_PRINT` | `FR_printNumF/D/H`, `FR_numstr` | ~1.3 KB |
+| `FR_NO_WAVES` | `fr_wave_*`, `fr_adsr_*`, `FR_HZ2BAM_INC` | ~0.6 KB |
+
+With both guards the core math library compiles to ~3.5 KB on x86-64
+(clang -Os), roughly 2.6 KB on Thumb-2.
+
+### Removed
+
+- **FR_hypot_fast** (4-segment) deleted — FR_hypot_fast8 (8-segment)
+  is strictly better in both accuracy (0.10% vs 0.34%) and is used
+  internally by FR_atan2. The 4-segment variant was dead weight.
+
+### Other
+
+- libfixmath comparison benchmark (`compare_lfm/`) added to repo
+- Documentation updated across all markdown and HTML pages
+
 ## Version 2.0.5 (2026)
 
 Release pipeline fixes. No functional changes to the math library.
@@ -241,10 +278,9 @@ for the implementation plan this release executed.
   computes `sqrt(x^2 + y^2)` with no intermediate overflow up to the
   full s32 range. Bit-exact for perfect squares; max error ~1 LSB at
   the requested radix.
-- **Fast approximate magnitude** (`FR_hypot_fast`, `FR_hypot_fast8`):
+- **Fast approximate magnitude** (`FR_hypot_fast8`):
   shift-only piecewise-linear approximation of `sqrt(x^2 + y^2)` — no
   multiply, no divide, no 64-bit math, no ROM table, no iteration.
-  `FR_hypot_fast` uses 4 segments (~0.4% peak error);
   `FR_hypot_fast8` uses 8 segments (~0.14% peak error). Based on the
   method of US Patent 6,567,777 B1 (Chatterjee, public domain). No
   `radix` parameter needed — the algorithm is scale-invariant.
diff --git a/scripts/accuracy_report.sh b/scripts/accuracy_report.sh
new file mode 100755
index 0000000..a8426ee
--- /dev/null
+++ b/scripts/accuracy_report.sh
@@ -0,0 +1,157 @@
+#!/usr/bin/env bash
+#
+# accuracy_report.sh — extract the accuracy table from test_tdd and
+# optionally patch it into README.md, docs/README.md, and pages/index.html.
+#
+# Usage:
+#   scripts/accuracy_report.sh            # build, run, print table to stdout
+#   scripts/accuracy_report.sh --update   # also patch the three doc files
+#
+# The table is delimited by sentinel comments:
+#   <!-- ACCURACY_TABLE_START -->
+#   ...
+#   <!-- ACCURACY_TABLE_END -->
+#
+# Exit status: 0 on success, non-zero on build or extraction failure.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+cd "${PROJECT_ROOT}"
+
+MODE="print"
+SHOWPEAK=""
+for arg in "$@"; do
+    case "$arg" in
+        --update) MODE="update" ;;
+        --showpeak) SHOWPEAK="1" ;;
+        -h|--help)
+            echo "Usage: scripts/accuracy_report.sh [--update] [--showpeak]"
+            echo "  (no args)   Build test_tdd, run it, print accuracy table"
+            echo "  --update    Also patch README.md, docs/README.md, pages/index.html"
+            echo "  --showpeak  Add a 'Peak at' column showing the input that produced max % error"
+            exit 0
+            ;;
+        *) echo "Unknown option: $arg" >&2; exit 1 ;;
+    esac
+done
+
+# -----------------------------------------------------------------------
+# 1. Build test_tdd
+# -----------------------------------------------------------------------
+echo "Building test_tdd..." >&2
+make -s dirs
+make -s build/test_tdd 2>&1 >&2
+
+# -----------------------------------------------------------------------
+# 2. Run and capture the accuracy table
+# -----------------------------------------------------------------------
+echo "Running test_tdd..." >&2
+if [ -n "$SHOWPEAK" ]; then
+    OUTPUT=$(FR_SHOWPEAK=1 ./build/test_tdd 2>/dev/null)
+else
+    OUTPUT=$(./build/test_tdd 2>/dev/null)
+fi
+
+# Extract lines between sentinels (inclusive)
+TABLE=$(echo "$OUTPUT" | sed -n '/<!-- ACCURACY_TABLE_START -->/,/<!-- ACCURACY_TABLE_END -->/p')
+
+if [ -z "$TABLE" ]; then
+    echo "ERROR: Could not find ACCURACY_TABLE_START/END sentinels in output" >&2
+    exit 1
+fi
+
+# Extract just the data rows (lines starting with |, excluding header and separator)
+DATA_ROWS=$(echo "$TABLE" | grep '^|' | tail -n +3)
+
+if [ -z "$DATA_ROWS" ]; then
+    echo "ERROR: No data rows found in accuracy table" >&2
+    exit 1
+fi
+
+echo "$TABLE"
+
+if [ "$MODE" != "update" ]; then
+    exit 0
+fi
+
+# -----------------------------------------------------------------------
+# 3. Patch markdown files (README.md, docs/README.md)
+# -----------------------------------------------------------------------
+patch_markdown() {
+    local file="$1"
+    if [ ! -f "$file" ]; then
+        echo "  skip: $file not found" >&2
+        return
+    fi
+
+    # Build replacement block: sentinel + header + separator + data + sentinel
+    local replacement
+    replacement="<!-- ACCURACY_TABLE_START -->"$'\n'
+    replacement+="| Function | Max err (LSB) | Max err (%) | Avg err (%) | Note |"$'\n'
+    replacement+="|---|---:|---:|---:|---|"$'\n'
+    replacement+="$DATA_ROWS"$'\n'
+    replacement+="<!-- ACCURACY_TABLE_END -->"
+
+    # Use perl to replace between sentinels
+    perl -0777 -i -pe "
+        s{<!-- ACCURACY_TABLE_START -->.*?<!-- ACCURACY_TABLE_END -->}
+         {${replacement}}s
+    " "$file"
+
+    echo "  patched: $file" >&2
+}
+
+patch_markdown "README.md"
+patch_markdown "docs/README.md"
+
+# -----------------------------------------------------------------------
+# 4. Patch HTML file (pages/index.html)
+# -----------------------------------------------------------------------
+patch_html() {
+    local file="$1"
+    if [ ! -f "$file" ]; then
+        echo "  skip: $file not found" >&2
+        return
+    fi
+
+    # Convert markdown data rows to HTML <tr> rows
+    local html_rows=""
+    while IFS= read -r line; do
+        # Strip leading/trailing |, split by |
+        local cells
+        cells=$(echo "$line" | sed 's/^| //;s/ |$//' | sed 's/ | /\t/g')
+        local tr="<tr>"
+        while IFS=$'\t' read -r c1 c2 c3 c4 c5; do
+            tr+="<td>${c1}</td><td>${c2}</td><td>${c3}</td><td>${c4}</td><td>${c5}</td>"
+        done <<< "$cells"
+        tr+="</tr>"
+        if [ -n "$html_rows" ]; then
+            html_rows+=$'\n'
+        fi
+        html_rows+="$tr"
+    done <<< "$DATA_ROWS"
+
+    # Build the replacement block
+    local replacement
+    replacement="<!-- ACCURACY_TABLE_START -->"$'\n'
+    replacement+="<table>"$'\n'
+    replacement+="<thead><tr><th>Function</th><th>Max err (LSB)</th><th>Max err (%)</th><th>Avg err (%)</th><th>Note</th></tr></thead>"$'\n'
+    replacement+="<tbody>"$'\n'
+    replacement+="$html_rows"$'\n'
+    replacement+="</tbody>"$'\n'
+    replacement+="</table>"$'\n'
+    replacement+="<!-- ACCURACY_TABLE_END -->"
+
+    perl -0777 -i -pe "
+        s{<!-- ACCURACY_TABLE_START -->.*?<!-- ACCURACY_TABLE_END -->}
+         {${replacement}}s
+    " "$file"
+
+    echo "  patched: $file" >&2
+}
+
+patch_html "pages/index.html"
+
+echo "Accuracy table updated in all doc files." >&2
diff --git a/scripts/crossbuild-docker.sh b/scripts/crossbuild-docker.sh
new file mode 100755
index 0000000..7f10d6d
--- /dev/null
+++ b/scripts/crossbuild-docker.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+# crossbuild-docker.sh -- cross-compile FR_math inside Docker container
+# Runs inside the xelp-crossbuild Docker image.
+# Reports object file and .text section sizes for each target.
+
+set -e
+
+SRC=/fr_math/src/FR_math.c
+INCLUDE="-I/fr_math/src"
+OBJ=/tmp/FR_math.o
+
+SEP="============================================================"
+
+# Accumulate summary rows: "label|text_size"
+SUMMARY=""
+
+print_sizes() {
+    local label="$1"
+    echo ""
+    echo "$SEP"
+    echo "$label"
+    echo "$SEP"
+    if [ ! -f "$OBJ" ]; then
+        echo "  (build failed)"
+        SUMMARY="${SUMMARY}${label}|FAIL\n"
+        return
+    fi
+    OBJ_SIZE=$(stat -c%s "$OBJ" 2>/dev/null || wc -c < "$OBJ")
+    TEXT_SIZE=$(size "$OBJ" 2>/dev/null | awk 'FNR==2{print $1}')
+    printf "  obj file size: %6s bytes\n" "$OBJ_SIZE"
+    printf "  .text section: %6s bytes\n" "$TEXT_SIZE"
+    SUMMARY="${SUMMARY}${label}|${TEXT_SIZE}\n"
+    rm -f "$OBJ"
+}
+
+echo ""
+echo "FR_Math cross-compilation size report"
+echo "Date: $(date -u '+%Y-%m-%d %H:%M UTC')"
+echo ""
+
+# --- x86 ---
+gcc -c $SRC $INCLUDE -Os -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC x86-64"
+
+clang -c $SRC $INCLUDE -Os -Wall -o $OBJ 2>&1 && true
+print_sizes "Clang x86-64"
+
+gcc -c $SRC $INCLUDE -Os -m32 -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC x86-32"
+
+tcc -c $SRC $INCLUDE -o $OBJ 2>&1 && true
+print_sizes "TCC x86"
+
+# --- ARM ---
+aarch64-linux-gnu-gcc -c $SRC $INCLUDE -Os -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC AArch64 (ARM64)"
+
+arm-none-eabi-gcc -c $SRC $INCLUDE -Os -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC ARM32"
+
+arm-none-eabi-gcc -c $SRC $INCLUDE -Os -mthumb -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC ARM32 Thumb"
+
+# --- MSP430 ---
+# Bare-metal: no stdint.h in sysroot — use fallback typedefs
+NOSTD="-DFR_NO_STDINT"
+
+msp430-gcc -c $SRC $INCLUDE $NOSTD -Os -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC MSP430"
+
+# --- AVR ---
+avr-gcc -c $SRC $INCLUDE $NOSTD -Os -mmcu=avr5 -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC AVR5 (ATmega328P)"
+
+avr-gcc -c $SRC $INCLUDE $NOSTD -Os -mmcu=attiny85 -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC AVR ATtiny85"
+
+# --- 68HC11 ---
+m68hc11-gcc -c $SRC $INCLUDE $NOSTD -Os -o $OBJ 2>&1 && true
+print_sizes "GCC 68HC11"
+
+# --- 68k (Motorola 68000) ---
+m68k-linux-gnu-gcc -c $SRC $INCLUDE -Os -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC m68k"
+
+# --- PowerPC ---
+powerpc-linux-gnu-gcc -c $SRC $INCLUDE -Os -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC PowerPC"
+
+# --- RISC-V ---
+riscv64-linux-gnu-gcc -c $SRC $INCLUDE -Os -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC RISC-V (rv64)"
+
+riscv64-unknown-elf-gcc -c $SRC $INCLUDE $NOSTD -Os -march=rv32imac -mabi=ilp32 -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC RISC-V (rv32)"
+
+# --- Xtensa (ESP8266/ESP32 family) ---
+xtensa-lx106-elf-gcc -c $SRC $INCLUDE $NOSTD -Os -Wall -o $OBJ 2>&1 && true
+print_sizes "GCC Xtensa LX106 (ESP8266)"
+
+# --- Function size table (native GCC) ---
+echo ""
+echo "$SEP"
+echo "Function size table (GCC x86-64)"
+echo "$SEP"
+gcc -c $SRC $INCLUDE -Os -Wall -o $OBJ 2>&1
+nm $OBJ -n -S --size-sort -f sysv -t d 2>/dev/null | grep -E "FUNC" || true
+rm -f $OBJ
+
+# --- Summary table ---
+echo ""
+echo "$SEP"
+echo "Summary: FR_math.c code size (bytes), compiled with -Os"
+echo "$SEP"
+printf "  %-28s %s\n" "Target" ".text (bytes)"
+printf "  %-28s %s\n" "----------------------------" "-------------"
+echo -e "$SUMMARY" | while IFS='|' read -r label size; do
+    [ -z "$label" ] && continue
+    printf "  %-28s %s\n" "$label" "$size"
+done
+
+echo ""
+echo "Done."
diff --git a/src/FR_defs.h b/src/FR_defs.h
index ee61c95..631126e 100644
--- a/src/FR_defs.h
+++ b/src/FR_defs.h
@@ -29,13 +29,39 @@
 #define __FR_Platform_Defs_H__
 
 /*
- * Fixed-width integer typedefs. C99 stdint.h is mandatory in v2.
+ * Fixed-width integer typedefs.
  *
- * Any C99-or-newer toolchain (gcc, clang, MSVC, IAR, Keil, sdcc, MSP430-gcc,
- * AVR-gcc, RISC-V toolchains, ARM toolchains) supports <stdint.h>. If you
- * are on a pre-C99 toolchain, FR_Math 1.0.x is the version for you.
+ * Prefer C99 <stdint.h> when available (gcc, clang, MSVC, IAR, Keil, sdcc,
+ * MSP430-gcc, AVR-gcc, RISC-V, ARM toolchains).  For bare-metal toolchains
+ * or pre-C99 compilers that lack <stdint.h>, define FR_NO_STDINT before
+ * including this header and the types are provided via sizeof()-based
+ * fallback definitions that cover the common 8/16/32/64-bit layouts.
  */
+#ifndef FR_NO_STDINT
 #include <stdint.h>
+#else
+/* ---- fallback: no <stdint.h> ------------------------------------ */
+/* Works on any toolchain where char=8, short=16, int/long=32 bits,
+ * which covers virtually all embedded targets (AVR, MSP430, ARM,
+ * 68HC11, 68k, PPC, RISC-V, Xtensa, x86).  Adjust if your platform
+ * differs.
+ */
+typedef unsigned char      uint8_t;
+typedef signed   char      int8_t;
+typedef unsigned short     uint16_t;
+typedef signed   short     int16_t;
+#if defined(__AVR__) || defined(__MSP430__) || defined(__m68hc1x__)
+  /* On these targets int is 16-bit; use long for 32-bit */
+  typedef unsigned long    uint32_t;
+  typedef signed   long    int32_t;
+#else
+  typedef unsigned int     uint32_t;
+  typedef signed   int     int32_t;
+#endif
+/* 64-bit: available on most 32-bit+ GCC targets via long long */
+typedef unsigned long long uint64_t;
+typedef signed   long long int64_t;
+#endif /* FR_NO_STDINT */
 
 /*
  * Arduino's USBAPI.h typedefs u8 and u16 as unsigned char / unsigned short.
diff --git a/src/FR_math.c b/src/FR_math.c
index b9e5ace..181972e 100644
--- a/src/FR_math.c
+++ b/src/FR_math.c
@@ -32,7 +32,9 @@
 #include "FR_math.h"
 #include "FR_trig_table.h"
 
+#ifndef FR_NO_STDINT
 #include <stdint.h>
+#endif
 
 /*=======================================================
  * BAM-native trig: fr_cos_bam, fr_sin_bam, fr_cos, fr_sin, fr_tan
@@ -147,12 +149,14 @@ s32 fr_tan(s32 rad, u16 radix)
  */
 static u16 fr_deg_radix_to_bam(s16 deg, u16 radix)
 {
-	/* (s32)deg * 0xB60B keeps everything in 32-bit math (8051-friendly).
-	 * For radix 0, 0xB60B = 65536/360 ≈ 182.0444. The shift strips the
-	 * input radix to land in u16 BAM space.
+	/* 0xB60B ≈ (65536/360) * 256 — the ×256 prescale keeps 32-bit math
+	 * friendly to 8051-class MCUs.  We must shift out both the input
+	 * fraction bits (radix) AND the 8-bit prescale, hence radix + 8.
+	 * The +half term rounds to nearest, matching FR_DEG2BAM behaviour.
 	 */
-	s32 v = (s32)deg * 0xB60BL;
-	return (u16)((u32)(v >> radix) & 0xffff);
+	s32  v     = (s32)deg * 0xB60BL;
+	u16  shift = radix + 8;
+	return (u16)((u32)((v + (1L << (shift - 1))) >> shift) & 0xffff);
 }
 
 s32 FR_Cos(s16 deg, u16 radix)
@@ -245,27 +249,67 @@ s32 FR_FixAddSat(s32 x, s32 y)
  */
 /* FR_acos — returns radians at out_radix.
  * Range: [0, pi].  Input is a cosine value at the given radix.
+ *
+ * Uses the same 129-entry cosine table as fr_cos_bam, but in reverse:
+ * binary-search to find the bracketing pair, then linear-interpolate
+ * the fractional position between them to recover the full 14-bit
+ * in-quadrant BAM.  This mirrors the forward path and gives matching
+ * precision (~1 LSB of s15.16 output).
  */
 s32 FR_acos(s32 input, u16 radix, u16 out_radix)
 {
 	s32 v;
 	s16 sign;
 	s32 lo, hi, mid;
-	s32 best_idx, best_err;
-	s32 left, right;
+	s32 idx, d, num, frac;
+	s32 input_abs;
+
+	/* Work with absolute value at the caller's radix — we'll need it for
+	 * the sqrt fast path before quantising to r15. */
+	sign = (s16)((input < 0) ? 1 : 0);
+	input_abs = sign ? -input : input;
 
-	v = FR_CHRDX(input, radix, FR_TRIG_PREC); /* to s0.15 */
+	/* Clamp at the caller's radix — not at r15.  Near ±1.0 the r15
+	 * quantisation can round to 32767 even when the caller has sub-LSB
+	 * precision that the sqrt fast path can use. */
+	{
+		s32 one = (s32)1 << radix;
+		if (input_abs >= one)
+			return sign ? FR_BAM2RAD(FR_BAM_HALF, out_radix) : 0;
+	}
 
-	/* Clamp range: acos(1.0) = 0, acos(-1.0) = pi */
-	if (v >=  32767) return 0;
-	if (v <= -32767) return FR_BAM2RAD(FR_BAM_HALF, out_radix); /* pi */
+	v = FR_CHRDX(input_abs, radix, FR_TRIG_PREC); /* |input| at s0.15 */
+
+	/* Small-angle fast path: when cos(θ) is close to 1.0, the table
+	 * has only 2-8 LSBs of gap per entry, so linear interpolation is
+	 * very coarse.  Use the identity  acos(x) ≈ sqrt(2*(1-x)).
+	 *
+	 * Key: compute 1-x at the CALLER's radix, not r15.  Near ±1.0 the
+	 * r15 quantisation crushes many distinct inputs to the same value
+	 * (cos(179.5°)..cos(179.9°) all round to 32767 at r15).  The
+	 * caller's higher-radix bits carry the angular information via the
+	 * identity sin(θ) = sqrt(2(1-cos θ)) — effectively the sin trick. */
+	if (v > gFR_COS_TAB_Q[7])
+	{
+		s32 one = (s32)1 << radix;
+		s32 one_minus_x = one - input_abs;           /* 1-|x| at caller radix */
+		s32 two_omx     = one_minus_x << 1;          /* 2(1-|x|) at caller radix */
+		s32 rad_native  = FR_sqrt(two_omx, radix);   /* radians at caller radix */
+		s32 rad_out     = FR_CHRDX(rad_native, radix, out_radix);
+		if (sign)
+			rad_out = FR_BAM2RAD(FR_BAM_HALF, out_radix) - rad_out;
+		return rad_out;
+	}
 
-	sign = (v < 0) ? 1 : 0;
-	if (v < 0) v = -v;
+	/* Below this point we need the sign-stripped r15 value for the
+	 * binary search.  (v was already computed from input_abs above.) */
 
-	/* Binary search on the BAM quadrant table. The table is monotonically
-	 * decreasing across [0, FR_TRIG_TABLE_SIZE]. We want the index `i`
-	 * such that gFR_COS_TAB_Q[i] is closest to v.
+	/* Binary search on the cosine quadrant table.  The table is
+	 * monotonically decreasing: gFR_COS_TAB_Q[0] = 32767 (cos 0°),
+	 * gFR_COS_TAB_Q[128] = 0 (cos 90°).
+	 *
+	 * After the search, lo is the first index where table[lo] <= v,
+	 * so the bracketing pair is (lo-1, lo) with table[lo-1] >= v >= table[lo].
 	 */
 	lo = 0;
 	hi = FR_TRIG_TABLE_SIZE;
@@ -277,28 +321,45 @@ s32 FR_acos(s32 input, u16 radix, u16 out_radix)
 		else
 			hi = mid;
 	}
-	best_idx = lo;
-	best_err = (gFR_COS_TAB_Q[best_idx] > v) ? (gFR_COS_TAB_Q[best_idx] - v)
-	                                         : (v - gFR_COS_TAB_Q[best_idx]);
-	if (best_idx > 0)
+
+	/* lo is now the index where table[lo] <= v.  The bracketing interval
+	 * is [lo-1, lo] (table decreasing).  Clamp idx to valid range.
+	 */
+	idx = lo;
+	if (idx <= 0)
 	{
-		left = gFR_COS_TAB_Q[best_idx - 1] - v;
-		if (left < 0) left = -left;
-		if (left < best_err) { best_err = left; best_idx = best_idx - 1; }
+		/* v >= table[0] = 32767 — essentially cos(0), already clamped above
+		 * but guard anyway. */
+		idx = 0;
+		frac = 0;
 	}
-	if (best_idx < FR_TRIG_TABLE_SIZE)
+	else if (idx >= FR_TRIG_TABLE_SIZE)
 	{
-		right = gFR_COS_TAB_Q[best_idx + 1] - v;
-		if (right < 0) right = -right;
-		if (right < best_err) { best_err = right; best_idx = best_idx + 1; }
+		idx = FR_TRIG_TABLE_SIZE - 1;
+		frac = 0;
+	}
+	else
+	{
+		/* Linear interpolate between table[idx-1] and table[idx].
+		 * d = table[idx-1] - table[idx]  (>= 0, cos decreasing)
+		 * num = table[idx-1] - v          (how far past table[idx-1])
+		 * frac = (num << FR_TRIG_FRAC_BITS) / d, in [0, FR_TRIG_FRAC_MAX)
+		 *
+		 * num and d are both in [0, 32767], so num << 7 fits in 22 bits.
+		 */
+		d   = gFR_COS_TAB_Q[idx - 1] - gFR_COS_TAB_Q[idx];
+		num = gFR_COS_TAB_Q[idx - 1] - v;
+		if (d > 0)
+			frac = ((num << FR_TRIG_FRAC_BITS) + (d >> 1)) / d;
+		else
+			frac = 0;
+		/* Reconstruct: the angle is at index (idx-1) + frac/FRAC_MAX,
+		 * so shift idx back by 1 for the BAM calculation below. */
+		idx = idx - 1;
 	}
 
-	/* best_idx is in [0, FR_TRIG_TABLE_SIZE]. Convert to BAM:
-	 * the table covers one quadrant (16384 BAM) in FR_TRIG_TABLE_SIZE-1 steps.
-	 * bam = best_idx << FR_TRIG_FRAC_BITS.
-	 */
 	{
-		u16 bam = (u16)((u32)best_idx << FR_TRIG_FRAC_BITS);
+		u16 bam = (u16)(((u32)idx << FR_TRIG_FRAC_BITS) + (u32)frac);
 		if (sign)
 			bam = (u16)(FR_BAM_HALF - bam);  /* mirror: pi - angle */
 		return FR_BAM2RAD(bam, out_radix);
@@ -313,60 +374,22 @@ s32 FR_asin(s32 input, u16 radix, u16 out_radix)
 	return half_pi - FR_acos(input, radix, out_radix);
 }
 
-/* arctan table: gFR_ATAN_TAB[i] = atan(i/32) in degrees, scaled by 64
- * (i.e. fixed-point s.6), for i in [0..32]. So index 32 is atan(1) = 45°.
- *
- * Generated by:
- *   for i in 0..32: int(round(degrees(atan(i/32.0)) * 64))
- *
- *   i=0:   0   atan(0/32)   =  0°       *64 = 0
- *   i=1: 115   atan(1/32)   =  1.7899°  *64 = 114.55
- *   ...
- *   i=32: 2880 atan(32/32)  = 45°       *64 = 2880
- */
-static const s16 gFR_ATAN_TAB[33] = {
-       0,   115,   229,   343,   456,   568,   680,   790,
-     898,  1005,  1111,  1214,  1316,  1415,  1512,  1607,
-    1700,  1791,  1879,  1965,  2048,  2130,  2209,  2285,
-    2360,  2432,  2502,  2570,  2636,  2700,  2762,  2822,
-    2880
-};
-
-/* helper: arctan(t) for t in [0,1] in radix-16 input, returning BAM (u16).
- * Uses the gFR_ATAN_TAB table with linear interpolation.
- *
- * t is in s.16. The table indexes into [0,1] in 32 steps, so the table
- * step in s.16 units is (1<<16)/32 = 2048.
- *
- * The atan table stores degrees*64 (s.6). We convert to BAM internally:
- * bam = deg64 * 65536 / (360 * 64) = deg64 * (65536 / 23040).
- * Approximation: bam ≈ (deg64 * 182) >> 6, matching FR_DEG2BAM precision.
- */
-static u16 fr_atan_unit_q1_bam(s32 t_s16)
-{
-	s32 idx, frac, lo, hi, deg64;
-	if (t_s16 <= 0) return 0;
-	if (t_s16 >= (1L << 16)) return FR_BAM_QUADRANT >> 1; /* 45° in BAM */
-	idx  = t_s16 >> 11;        /* 2048 = 1<<11 */
-	frac = t_s16 & ((1L << 11) - 1);
-	lo = gFR_ATAN_TAB[idx];
-	hi = gFR_ATAN_TAB[idx + 1];
-	deg64 = lo + (((hi - lo) * frac) >> 11);
-	/* Convert degrees*64 → BAM: bam = deg64 * (65536/360) / 64
-	 *                              ≈ (deg64 * 182 + 32) >> 6
-	 */
-	return (u16)(((s32)deg64 * 182L + 32) >> 6);
-}
-
 /* FR_atan2(y, x, out_radix) — full-circle arctangent, returns radians
  * at the specified output radix (s32).
  *
  * Range: [-pi, pi]. Returns 0 for atan2(0,0).
+ *
+ * Implementation: normalise (x,y) via FR_hypot_fast8, then recover the
+ * angle with FR_asin or FR_acos (both use the 129-entry cosine table).
+ * To stay in the well-conditioned region of each inverse function we
+ * switch at 45°:
+ *   |y| <= |x|  →  use asin(y/h)   — asin stable near 0
+ *   |y| >  |x|  →  use acos(x/h)   — acos stable near pi/2
+ * This keeps the derivative amplification factor below 1.414x everywhere.
  */
 s32 FR_atan2(s32 y, s32 x, u16 out_radix)
 {
-	s32 ay, ax, ratio;
-	u16 bam;
+	s32 ax, ay, h, q1_angle;
 
 	/* Axis cases — exact angles, no divide. */
 	if (x == 0)
@@ -381,26 +404,60 @@ s32 FR_atan2(s32 y, s32 x, u16 out_radix)
 	ax = (x < 0) ? -x : x;
 	ay = (y < 0) ? -y : y;
 
-	/* Compute ratio of smaller / larger in s.16 so it stays in [0,1]. */
+	/* Normalise so max(ax,ay) sits in [2^14, 2^15).  This gives
+	 * FR_hypot_fast8 enough integer bits for the shift-only segments
+	 * to produce an accurate ratio — critical when the raw inputs are
+	 * small (e.g. atan2(1,1) at radix 0).  Scaling both by the same
+	 * power of two doesn't change the angle. */
+	{
+		s32 mx = (ax > ay) ? ax : ay;
+		while (mx < (1L << 14)) { ax <<= 1; ay <<= 1; mx <<= 1; }
+		while (mx >= (1L << 16)) { ax >>= 1; ay >>= 1; mx >>= 1; }
+	}
+
+	h = FR_hypot_fast8((s32)ax, (s32)ay);
+	if (h == 0) return 0;  /* degenerate */
+
+	/* Compute the first-quadrant angle (positive, [0..pi/2]).
+	 * Divide produces a value in [0,1] at radix FR_TRIG_PREC (s0.15).
+	 *
+	 * Small-angle fast path: when the minor-axis ratio is small,
+	 * asin(x) ≈ x (error < x³/6).  Below ~5° the cubic term is
+	 * smaller than the table-lookup error, so the direct identity
+	 * is both faster and more accurate.  Threshold 2753 at r15
+	 * corresponds to sin(~4.8°) = 0.084. */
+	#define FR_ATAN2_SMALL  2753
 	if (ay <= ax)
 	{
-		ratio = (s32)(((int64_t)ay << 16) / ax);
-		bam = fr_atan_unit_q1_bam(ratio);                  /* [0..45°] BAM */
+		/* angle in [0°..45°]: use asin(ay/h) — well-conditioned near 0 */
+		s32 sin_val = (s32)(((int64_t)ay << FR_TRIG_PREC) / h);
+		if (sin_val < FR_ATAN2_SMALL)
+			q1_angle = FR_CHRDX(sin_val, FR_TRIG_PREC, out_radix);
+		else
+			q1_angle = FR_asin(sin_val, FR_TRIG_PREC, out_radix);
 	}
 	else
 	{
-		ratio = (s32)(((int64_t)ax << 16) / ay);
-		bam = (u16)(FR_BAM_QUADRANT - fr_atan_unit_q1_bam(ratio)); /* [45..90°] BAM */
+		/* angle in [45°..90°]: use acos(ax/h) — well-conditioned near pi/2 */
+		s32 cos_val = (s32)(((int64_t)ax << FR_TRIG_PREC) / h);
+		if (cos_val < FR_ATAN2_SMALL)
+		{
+			/* angle ≈ pi/2 - cos_val (symmetric small-angle identity) */
+			s32 half_pi = FR_BAM2RAD(FR_BAM_QUADRANT, out_radix);
+			q1_angle = half_pi - FR_CHRDX(cos_val, FR_TRIG_PREC, out_radix);
+		}
+		else
+			q1_angle = FR_acos(cos_val, FR_TRIG_PREC, out_radix);
 	}
 
-	/* Apply quadrant sign and convert BAM → radians. */
+	/* Apply quadrant from signs of x and y.
+	 * q1_angle is always positive [0..pi/2]. */
 	{
-		s32 rad = FR_BAM2RAD(bam, out_radix);
-		s32 pi  = FR_BAM2RAD(FR_BAM_HALF, out_radix);
+		s32 pi = FR_BAM2RAD(FR_BAM_HALF, out_radix);
 		if (x > 0)
-			return (y > 0) ? rad : -rad;
-		/* x < 0 */
-		return (y > 0) ? (pi - rad) : (rad - pi);
+			return (y > 0) ? q1_angle : -q1_angle;
+		/* x < 0: mirror across y-axis */
+		return (y > 0) ? (pi - q1_angle) : (q1_angle - pi);
 	}
 }
 
@@ -607,6 +664,7 @@ s32 FR_log10(s32 input, u16 radix, u16 output_radix)
 	return FR_MULK28(r, FR_krLOG2_10_28);
 }
 
+#ifndef FR_NO_PRINT
 /***************************************
  * FR_printNumD - write a decimal integer with space padding.
  *
@@ -871,6 +929,7 @@ s32 FR_numstr(const char *s, u16 radix)
 
     return neg ? -result : result;
 }
+#endif /* FR_NO_PRINT */
 
 /*=======================================================
  * Square root and hypot
@@ -965,66 +1024,12 @@ s32 FR_hypot(s32 x, s32 y, u16 radix)
 	return (s32)fr_isqrt64(xx + yy);
 }
 
-/*=======================================================
- * FR_hypot_fast — 4-segment piecewise-linear magnitude approximation.
- *
- * Computes an approximation of sqrt(x*x + y*y) using only shifts and adds
- * (no multiply, no divide, no 64-bit math, no ROM table, no iteration).
- *
- * Based on the piecewise-linear method described in US Patent 6,567,777 B1
- * (Chatterjee, now public domain). The algorithm:
- *   1. Take absolute values, assign hi = max(|x|,|y|), lo = min(|x|,|y|).
- *   2. Determine which of 4 angular slices lo/hi falls into.
- *   3. Apply pre-computed shift-only linear coefficients for that slice.
- *
- * Peak error: ~0.4%.
- * The result is at the same radix as the inputs — scale-invariant.
- */
-s32 FR_hypot_fast(s32 x, s32 y)
-{
-    s32 hi, lo;
-
-    /* absolute values (clamp INT32_MIN to INT32_MAX to avoid UB) */
-    if (x < 0) x = (x == (s32)0x80000000) ? 0x7FFFFFFF : -x;
-    if (y < 0) y = (y == (s32)0x80000000) ? 0x7FFFFFFF : -y;
-
-    /* hi = max(|x|,|y|), lo = min(|x|,|y|) */
-    if (x > y) { hi = x; lo = y; }
-    else       { hi = y; lo = x; }
-
-    if (hi == 0) return 0;
-
-    /* 4 piecewise-linear segments: dist ≈ a*hi + b*lo
-     * where a,b are shift-only minimax fits of sqrt(1+β²) on each
-     * interval, β = lo/hi. Boundaries at β = 0.25, 0.5, 0.75. */
-    if ((hi >> 1) < lo) {
-        /* β in (0.5, 1.0] */
-        if (lo > hi - (hi >> 2))                      /* β > 0.75 */
-            /* a≈0.7559, b≈0.6567 */
-            return hi - (hi >> 2) + (hi >> 7) - (hi >> 9)
-                 + (lo >> 1) + (lo >> 3) + (lo >> 5) + (lo >> 11);
-        else                                           /* β in (0.5, 0.75] */
-            /* a≈0.8555, b≈0.5225 */
-            return hi - (hi >> 3) - (hi >> 6) - (hi >> 8)
-                 + (lo >> 1) + (lo >> 5) - (lo >> 7) - (lo >> 10);
-    } else {
-        /* β in [0, 0.5] */
-        if ((hi >> 2) < lo)                            /* β in (0.25, 0.5] */
-            /* a≈0.9409, b≈0.3477 */
-            return hi - (hi >> 4) + (hi >> 8) - (hi >> 11)
-                 + (lo >> 1) - (lo >> 3) - (lo >> 5) + (lo >> 8);
-        else                                           /* β in [0, 0.25] */
-            /* a≈0.9966, b≈0.1209 */
-            return hi - (hi >> 8) + (hi >> 11)
-                 + (lo >> 3) - (lo >> 8) - (lo >> 12);
-    }
-}
-
 /*=======================================================
  * FR_hypot_fast8 — 8-segment piecewise-linear magnitude approximation.
  *
- * Same approach as FR_hypot_fast but with 8 angular slices for tighter fit.
- * Peak error: ~0.14%.
+ * Shift-only, no multiply, no 64-bit.  Based on the piecewise-linear
+ * method described in US Patent 6,567,777 B1 (Chatterjee, expired).
+ * Peak error: ~0.10%.
  */
 s32 FR_hypot_fast8(s32 x, s32 y)
 {
@@ -1091,6 +1096,7 @@ s32 FR_hypot_fast8(s32 x, s32 y)
     }
 }
 
+#ifndef FR_NO_WAVES
 /*=======================================================
  * Wave generators — synth-style fixed-shape waveforms.
  *
@@ -1200,7 +1206,7 @@ s16 fr_wave_tri_morph(u16 phase, u16 break_point)
 	if (phase < break_point)
 	{
 		/* rising: 0 at phase=0, 32767 at phase=break_point */
-		t = ((u32)phase * 32767UL) / (u32)break_point;
+		t = (u32)(((u32)phase * 32767UL) / (u32)break_point);
 	}
 	else
 	{
@@ -1208,7 +1214,7 @@ s16 fr_wave_tri_morph(u16 phase, u16 break_point)
 		u32 span = (u32)0xffff - (u32)break_point;
 		if (span == 0)
 			return 32767;
-		t = ((u32)((u32)0xffff - (u32)phase) * 32767UL) / span;
+		t = (u32)(((u32)((u32)0xffff - (u32)phase) * 32767UL) / span);
 	}
 	if (t > 32767) t = 32767;
 	return (s16)t;
@@ -1294,7 +1300,7 @@ void fr_adsr_init(fr_adsr_t *env,
 	    ? (s32)(FR_ADSR_PEAK_S130 / attack_samples)
 	    : FR_ADSR_PEAK_S130;
 	env->decay_dec   = (decay_samples   > 0)
-	    ? (s32)((FR_ADSR_PEAK_S130 - env->sustain) / decay_samples)
+	    ? (s32)((FR_ADSR_PEAK_S130 - env->sustain) / (s32)decay_samples)
 	    : (FR_ADSR_PEAK_S130 - env->sustain);
 	env->release_dec = (release_samples > 0)
 	    ? (s32)(FR_ADSR_PEAK_S130 / release_samples)
@@ -1362,3 +1368,4 @@ s16 fr_adsr_step(fr_adsr_t *env)
 		return (s16)out;
 	}
 }
+#endif /* FR_NO_WAVES */
diff --git a/src/FR_math.h b/src/FR_math.h
index 82bea96..562a5d3 100644
--- a/src/FR_math.h
+++ b/src/FR_math.h
@@ -32,8 +32,8 @@
 #ifndef __FR_Math_h__
 #define __FR_Math_h__
 
-#define FR_MATH_VERSION     "2.0.5"
-#define FR_MATH_VERSION_HEX  0x020005  /* major << 16 | minor << 8 | patch */
+#define FR_MATH_VERSION     "2.0.6"
+#define FR_MATH_VERSION_HEX  0x020006  /* major << 16 | minor << 8 | patch */
 
 #ifdef __cplusplus
 extern "C"
@@ -378,7 +378,7 @@ static inline s32 FR_div_rnd(s64 num, s32 den) {
  * Derivation: rad = bam * 2π / 65536. At output radix r: bam * 2π * 2^r / 2^16
  *           = bam * (2π * 2^10) / 2^(26 - r) = bam * 6434 >> (26 - r).
  */
-#define FR_BAM2RAD(bam, radix)  (((s32)(u16)(bam) * 6434L) >> (26 - (radix)))
+#define FR_BAM2RAD(bam, radix)  ((s32)(((s32)(u16)(bam) * 6434L) >> (26 - (radix))))
 
 /*===============================================
  * Radian-native and BAM-native trig (recommended)
@@ -455,6 +455,19 @@ static inline s32 FR_div_rnd(s64 num, s32 den) {
 #define FR_EXP_FAST(input, radix)   (FR_pow2(FR_SLOG2E(input), radix))
 #define FR_POW10_FAST(input, radix) (FR_pow2(FR_SLOG2_10(input), radix))
 
+/*===============================================
+ * Formatted output and string parsing
+ *
+ * Define FR_NO_PRINT before including this header to exclude all
+ * print/format functions from compilation. This saves ~1.7 KB of ROM
+ * on targets that don't need human-readable output (e.g. headless
+ * sensor nodes, DSP-only firmware).
+ *
+ *   #define FR_NO_PRINT
+ *   #include "FR_math.h"
+ */
+#ifndef FR_NO_PRINT
+
   /* printing family of functions */
   int FR_printNumF(int (*f)(char), s32 n, int radix, int pad, int prec); /* print fixed radix num as floating point e.g.  -12.34" */
   int FR_printNumD(int (*f)(char), int n, int pad);                      /* print decimal number with optional padding e.g. " 12" */
@@ -463,6 +476,8 @@ static inline s32 FR_div_rnd(s64 num, s32 den) {
   /* string-to-fixed-point parser (inverse of FR_printNumF) */
   s32 FR_numstr(const char *s, u16 radix);
 
+#endif /* FR_NO_PRINT */
+
 /*===============================================
  * Square root and hypot
  *
@@ -480,16 +495,27 @@ static inline s32 FR_div_rnd(s64 num, s32 den) {
    * Based on piecewise-linear approximation of sqrt(x*x + y*y).
    * See US Patent 6,567,777 B1 (Chatterjee, expired).
    *
-   *   FR_hypot_fast(x, y)   4-segment, ~0.4% peak error
-   *   FR_hypot_fast8(x, y)  8-segment, ~0.14% peak error
+   *   FR_hypot_fast8(x, y)  8-segment, ~0.10% peak error
    *
    * Inputs are raw signed integers (or fixed-point at any radix — the
    * result is at the same radix as the inputs, just like FR_hypot).
    * No radix parameter needed because the algorithm is scale-invariant.
    */
-  s32 FR_hypot_fast(s32 x, s32 y);
   s32 FR_hypot_fast8(s32 x, s32 y);
 
+/*===============================================
+ * Wave generators and ADSR envelope
+ *
+ * Define FR_NO_WAVES before including this header to exclude all
+ * waveform generators (square, pulse, triangle, saw, noise) and the
+ * ADSR envelope from compilation. This saves ~400 B of ROM on targets
+ * that only need math/trig and don't do audio synthesis.
+ *
+ *   #define FR_NO_WAVES
+ *   #include "FR_math.h"
+ */
+#ifndef FR_NO_WAVES
+
 /*===============================================
  * Wave generators — synth-style fixed-shape waveforms.
  *
@@ -570,6 +596,8 @@ typedef struct fr_adsr_s {
   void fr_adsr_release(fr_adsr_t *env);
   s16  fr_adsr_step(fr_adsr_t *env);
 
+#endif /* FR_NO_WAVES */
+
 #ifdef __cplusplus
 
 } // extern "C"
diff --git a/src/FR_math_2D.cpp b/src/FR_math_2D.cpp
index 31132bf..573a09f 100644
--- a/src/FR_math_2D.cpp
+++ b/src/FR_math_2D.cpp
@@ -5,7 +5,7 @@
  *
  *	@copy Copyright (C) <2001-2026>  <M. A. Chatterjee>
  *  @author M A Chatterjee <deftio [at] deftio [dot] com>
- *	@version 2.0.5 M. A. Chatterjee, cleaned up naming
+ *	@version 2.0.6 M. A. Chatterjee, cleaned up naming
  *
  *  This file contains integer math settable fixed point radix math routines for
  *  use on systems in which floating point is not desired or unavailable.
diff --git a/src/FR_math_2D.h b/src/FR_math_2D.h
index 1045577..ba8456f 100644
--- a/src/FR_math_2D.h
+++ b/src/FR_math_2D.h
@@ -3,7 +3,7 @@
  *
  *	@copy Copyright (C) <2001-2026>  <M. A. Chatterjee>
  *  @author M A Chatterjee <deftio [at] deftio [dot] com>
- *	@version 2.0.5 M. A. Chatterjee, cleaned up naming
+ *	@version 2.0.6 M. A. Chatterjee, cleaned up naming
  *
  *  This file contains integer math settable fixed point radix math routines for
  *  use on systems in which floating point is not desired or unavailable.
diff --git a/tests/test_full_coverage.c b/tests/test_full_coverage.c
index 63215c7..0dfd248 100644
--- a/tests/test_full_coverage.c
+++ b/tests/test_full_coverage.c
@@ -440,36 +440,7 @@ int test_sqrt_hypot() {
     result = FR_hypot(I2FR(5, 16), I2FR(12, 16), 16);
     if (result < I2FR(13, 16) - 2 || result > I2FR(13, 16) + 2) return TEST_FAIL;
 
-    /* FR_hypot_fast (4-seg) — same test cases, wider tolerance (~0.3%) */
-    result = FR_hypot_fast(I2FR(3, 16), I2FR(4, 16));
-    /* 0.3% of 5.0 at radix 16 = 0.015 * 65536 ≈ 983, use 1000 */
-    if (result < I2FR(5, 16) - 1000 || result > I2FR(5, 16) + 1000) return TEST_FAIL;
-
-    result = FR_hypot_fast(0, 0);
-    if (result != 0) return TEST_FAIL;
-
-    result = FR_hypot_fast(I2FR(-3, 16), I2FR(-4, 16));
-    if (result < I2FR(5, 16) - 1000 || result > I2FR(5, 16) + 1000) return TEST_FAIL;
-
-    result = FR_hypot_fast(I2FR(5, 16), I2FR(12, 16));
-    if (result < I2FR(13, 16) - 2600 || result > I2FR(13, 16) + 2600) return TEST_FAIL;
-
-    /* Edge: one axis zero — tolerance is 0.4% of expected */
-    result = FR_hypot_fast(I2FR(7, 16), 0);
-    if (result < I2FR(7, 16) - 2000 || result > I2FR(7, 16) + 2000) return TEST_FAIL;
-
-    result = FR_hypot_fast(0, I2FR(7, 16));
-    if (result < I2FR(7, 16) - 2000 || result > I2FR(7, 16) + 2000) return TEST_FAIL;
-
-    /* Equal axes: hypot(1,1) = sqrt(2) ≈ 1.41421 */
-    result = FR_hypot_fast(I2FR(1, 16), I2FR(1, 16));
-    if (result < 92000 || result > 93300) return TEST_FAIL;
-
-    /* INT32_MIN must not crash (UB in negation) */
-    result = FR_hypot_fast((s32)0x80000000, 0);
-    if (result <= 0) return TEST_FAIL;
-
-    /* FR_hypot_fast8 (8-seg) — tighter tolerance (~0.1%) */
+    /* FR_hypot_fast8 (8-seg) — ~0.1% tolerance */
     result = FR_hypot_fast8(I2FR(3, 16), I2FR(4, 16));
     if (result < I2FR(5, 16) - 400 || result > I2FR(5, 16) + 400) return TEST_FAIL;
 
@@ -772,12 +743,12 @@ int test_edge_branches() {
     s32 r32;
     fr_adsr_t env;
 
-    /* FR_Tan(deg, radix) c==0 branch. At radix 0, deg=-16384 and
-     * deg=16384 both drive the internal BAM to exactly 90°/270°, so
+    /* FR_Tan(deg, radix) c==0 branch. At radix 0, deg=90 and deg=270
+     * drive the internal BAM to exactly 16384/49152 (90°/270°), so
      * cos==0 and we hit the saturation return. */
-    r32 = FR_Tan(-16384, 0);                 /* bam=16384 (sin>0) */
+    r32 = FR_Tan(90, 0);                     /* bam=16384 (sin>0) */
     if (r32 != FR_TRIG_MAXVAL) return TEST_FAIL;
-    r32 = FR_Tan(16384, 0);                  /* bam=49152 (sin<0) */
+    r32 = FR_Tan(270, 0);                    /* bam=49152 (sin<0) */
     if (r32 != -FR_TRIG_MAXVAL) return TEST_FAIL;
 
     /* FR_atan2 now returns radians at out_radix.
diff --git a/tests/test_tdd.cpp b/tests/test_tdd.cpp
index 336149f..80ae526 100644
--- a/tests/test_tdd.cpp
+++ b/tests/test_tdd.cpp
@@ -65,9 +65,14 @@ typedef struct {
     int n;
     double max_abs_err;
     double sum_abs_err;
-    double worst_input;
+    double max_pct_err;
+    double sum_pct_err;
+    double worst_input;        /* input that produced max abs error */
     double worst_actual;
     double worst_expected;
+    double worst_pct_input;    /* input that produced max pct error */
+    double worst_pct_actual;
+    double worst_pct_expected;
 } stats_t;
 
 static void stats_reset(stats_t *s) {
@@ -84,6 +89,15 @@ static void stats_add(stats_t *s, double in, double actual, double expected) {
         s->worst_expected = expected;
     }
     s->sum_abs_err += e;
+    /* Skip percent error when expected ≈ 0 to avoid division artifacts */
+    double pct = (fabs(expected) > 0.01) ? (e / fabs(expected)) * 100.0 : 0.0;
+    if (pct > s->max_pct_err) {
+        s->max_pct_err = pct;
+        s->worst_pct_input = in;
+        s->worst_pct_actual = actual;
+        s->worst_pct_expected = expected;
+    }
+    s->sum_pct_err += pct;
     s->n++;
 }
 
@@ -91,6 +105,22 @@ static double stats_mean(const stats_t *s) {
     return s->n ? s->sum_abs_err / s->n : 0.0;
 }
 
+static double stats_mean_pct(const stats_t *s) {
+    return s->n ? s->sum_pct_err / s->n : 0.0;
+}
+
+/* Set by FR_SHOWPEAK env var — adds a "Peak at" column to the accuracy table */
+static int g_showpeak = 0;
+
+/* Print one accuracy table row, optionally with peak-error input */
+static void acc_row(const char *name, const stats_t *s, double lsb, const char *note) {
+    printf("| %s | %.1f | %.4f | %.4f | %s",
+           name, s->max_abs_err / lsb, s->max_pct_err, stats_mean_pct(s), note);
+    if (g_showpeak)
+        printf(" | %.4g", s->worst_pct_input);
+    printf(" |\n");
+}
+
 static void md_h1(const char *t) { printf("\n# %s\n\n", t); }
 static void md_h2(const char *t) { printf("\n## %s\n\n", t); }
 static void md_h3(const char *t) { printf("\n### %s\n\n", t); }
@@ -1285,29 +1315,7 @@ static void section_v2_new(void) {
     table_row_stats("FR_hypot sweep", &hyp_stats);
     printf("\n");
 
-    md_h3("11.4b FR_hypot_fast (4-seg) vs hypot(), radix 16");
-    printf("| x | y | FR_hypot_fast | as double | hypot() | abs err | rel err%% |\n");
-    printf("|---:|---:|---:|---:|---:|---:|---:|\n");
-    stats_t hf4_stats; stats_reset(&hf4_stats);
-    for (int i = 0; i < (int)(sizeof(hyp_cases)/sizeof(hyp_cases[0])); i++) {
-        s32 fx = (s32)(hyp_cases[i].x * (1L << 16));
-        s32 fy = (s32)(hyp_cases[i].y * (1L << 16));
-        s32 r  = FR_hypot_fast(fx, fy);
-        double rd = frd(r, 16);
-        double ref = hypot(hyp_cases[i].x, hyp_cases[i].y);
-        double err = rd - ref; if (err < 0) err = -err;
-        double rel = (ref > 0) ? err / ref * 100.0 : 0.0;
-        stats_add(&hf4_stats, sqrt(hyp_cases[i].x*hyp_cases[i].x + hyp_cases[i].y*hyp_cases[i].y),
-                  rd, ref);
-        printf("| %g | %g | %ld | %.6g | %.6g | %.4g | %.4g |\n",
-               hyp_cases[i].x, hyp_cases[i].y, (long)r, rd, ref, err, rel);
-    }
-    printf("\n");
-    table_header_stats();
-    table_row_stats("FR_hypot_fast sweep", &hf4_stats);
-    printf("\n");
-
-    md_h3("11.4c FR_hypot_fast8 (8-seg) vs hypot(), radix 16");
+    md_h3("11.4b FR_hypot_fast8 (8-seg) vs hypot(), radix 16");
     printf("| x | y | FR_hypot_fast8 | as double | hypot() | abs err | rel err%% |\n");
     printf("|---:|---:|---:|---:|---:|---:|---:|\n");
     stats_t hf8_stats; stats_reset(&hf8_stats);
@@ -1723,8 +1731,8 @@ static void section_summary(void) {
     printf("| fr_cos / fr_sin / fr_cos_bam / fr_sin_bam / fr_cos_deg / fr_sin_deg | OK | 6 | s15.16 output; 129-entry quadrant table with round-to-nearest linear interp; exact at cardinal angles |\n");
     printf("| FR_acos | OK | 7.1 | Max error ~0.83° over [-1, +1] swept at 200 points |\n");
     printf("| FR_asin | OK | 7.2 | Same precision as FR_acos |\n");
-    printf("| FR_atan2 | OK | 7.3 | Octant-reduced arctan with 33-entry table; max err ≤1°; signature `FR_atan2(y, x)` returns degrees |\n");
-    printf("| FR_atan | OK | 7.3 | `FR_atan(x, radix)` calls `FR_atan2(x, 1<<radix)` |\n");
+    printf("| FR_atan2 | OK | 7.3 | Via asin/acos + hypot_fast8; 129-entry cos table; `FR_atan2(y, x, out_radix)` returns radians |\n");
+    printf("| FR_atan | OK | 7.3 | `FR_atan(x, radix, out_radix)` calls `FR_atan2(x, 1<<radix, out_radix)` |\n");
     printf("| FR_pow2 (positive integer x) | OK | 8.1 | Bit-exact for integer exponents in test range |\n");
     printf("| FR_pow2 (positive fractional x) | OK | 8.1, 8.2 | ~1e-6 error |\n");
     printf("| FR_pow2 (negative fractional x) | OK | 8.1, 8.2 | Mathematical floor (toward −∞); 65-entry fraction table with linear interp |\n");
@@ -1763,7 +1771,346 @@ static void section_summary(void) {
     printf("\n");
 }
 
+/* ============================================================
+ * Section 14: Accuracy Summary Table (machine-readable)
+ *
+ * Emits a markdown table between sentinel comments so that
+ * scripts/accuracy_report.sh can extract and patch it into
+ * README.md, docs/README.md, and pages/index.html.
+ * ============================================================ */
+
+static void section_accuracy_table(void) {
+    md_h2("14. Accuracy Summary Table");
+
+    printf("<!-- ACCURACY_TABLE_START -->\n");
+    if (g_showpeak) {
+        printf("| Function | Max err (LSB) | Max err (%%) | Avg err (%%) | Note | Peak at |\n");
+        printf("|---|---:|---:|---:|---|---:|\n");
+    } else {
+        printf("| Function | Max err (LSB) | Max err (%%) | Avg err (%%) | Note |\n");
+        printf("|---|---:|---:|---:|---|\n");
+    }
+
+    const int R = 16;
+    const double scale = (double)(1L << R);
+    const double lsb = 1.0 / scale;
+
+    /* Persistent stats so we can print diagnostics after the table */
+    stats_t st_sincos, st_tan, st_asincos, st_atan2;
+    stats_reset(&st_sincos); stats_reset(&st_tan);
+    stats_reset(&st_asincos); stats_reset(&st_atan2);
+
+    /* --- sin / cos --- */
+    {
+        stats_t &st = st_sincos;
+        const u16 radix = 7; /* s8.7 degrees: 128 steps/deg, [-256°,+256°) */
+        /* 65536-point sweep: all s16 values at radix 7 cover > full circle */
+        for (int i = -32768; i <= 32767; i++) {
+            double deg = (double)i / (1 << radix);
+            double rad = deg * M_PI / 180.0;
+            stats_add(&st, deg, frd(FR_Sin((s16)i, radix), FR_TRIG_OUT_PREC), sin(rad));
+            stats_add(&st, deg, frd(FR_Cos((s16)i, radix), FR_TRIG_OUT_PREC), cos(rad));
+        }
+        /* Special cases: exact integer degrees including negative */
+        s16 specials[] = {0,30,45,60,90,120,135,150,180,210,225,240,270,300,315,330,360,
+                          -30,-45,-60,-90,-120,-135,-150,-180,-210,-225,-240,-270,-300,-315,-330,-360};
+        for (int si = 0; si < (int)(sizeof(specials)/sizeof(specials[0])); si++) {
+            s16 d = specials[si];
+            double rad = d * M_PI / 180.0;
+            stats_add(&st, d, frd(FR_SinI(d), FR_TRIG_OUT_PREC), sin(rad));
+            stats_add(&st, d, frd(FR_CosI(d), FR_TRIG_OUT_PREC), cos(rad));
+        }
+        acc_row("sin / cos", &st, lsb, "65536-pt sweep + specials");
+    }
+
+    /* --- tan --- */
+    {
+        stats_t &st = st_tan;
+        const u16 radix = 7;
+        for (int i = -32768; i <= 32767; i++) {
+            double deg = (double)i / (1 << radix);
+            double rad = deg * M_PI / 180.0;
+            /* Skip near poles: |cos| < 0.01 → tan > 100 */
+            if (fabs(cos(rad)) < 0.01) continue;
+            stats_add(&st, deg, frd(FR_Tan((s16)i, radix), FR_TRIG_OUT_PREC), tan(rad));
+        }
+        /* Special cases: integer degrees (avoiding poles) */
+        s16 specials[] = {0,30,45,60,-30,-45,-60,120,135,150,-120,-135,-150};
+        for (int si = 0; si < (int)(sizeof(specials)/sizeof(specials[0])); si++) {
+            s16 d = specials[si];
+            double rad = d * M_PI / 180.0;
+            stats_add(&st, d, frd(FR_TanI(d), FR_TRIG_OUT_PREC), tan(rad));
+        }
+        acc_row("tan", &st, lsb, "65536-pt sweep (skip poles)");
+    }
+
+    /* --- asin / acos --- */
+    {
+        stats_t &st = st_asincos;
+        /* 65536-point sweep: all representable values at radix 15 over [-1, +1) */
+        for (int i = -32768; i <= 32767; i++) {
+            double xd = (double)i / (1 << 15);
+            if (xd < -1.0 || xd > 1.0) continue;
+            s32 rad = FR_asin((s32)i, 15, R);
+            stats_add(&st, xd, frd(rad, R), asin(xd));
+            rad = FR_acos((s32)i, 15, R);
+            stats_add(&st, xd, frd(rad, R), acos(xd));
+        }
+        acc_row("asin / acos", &st, lsb, "65536-pt; sqrt approx near boundary");
+    }
+
+    /* --- atan2 --- */
+    {
+        stats_t &st = st_atan2;
+        /* 65536-point sweep at each radius.
+         * Skip i=-32768 (exactly -pi): branch-cut convention differs
+         * between FR_atan2 (+pi) and libm (-pi), both correct.
+         * Start radii at 0.1 — at 0.01 inputs have <10 LSBs of angular
+         * resolution, testing input quantization not the algorithm.
+         * Also skip points where the minor axis has < 8 bits (256 counts)
+         * — below that, input quantization dominates over algorithm error. */
+        double radii[] = {0.1, 1.0, 10.0, 100.0, 1000.0};
+        for (int ri = 0; ri < (int)(sizeof(radii)/sizeof(radii[0])); ri++) {
+            double rad = radii[ri];
+            for (int i = -32767; i <= 32768; i++) {
+                double angle = i * M_PI / 32768.0;
+                double x = rad * cos(angle), y = rad * sin(angle);
+                s32 fx = (s32)(x * scale);
+                s32 fy = (s32)(y * scale);
+                if (fx == 0 && fy == 0) continue;
+                s32 afx = (fx < 0) ? -fx : fx;
+                s32 afy = (fy < 0) ? -fy : fy;
+                s32 minor = (afx < afy) ? afx : afy;
+                if (minor < 256) continue; /* input quantization, not algo */
+                s32 r = FR_atan2(fy, fx, R);
+                double ref = atan2(y, x);
+                /* Skip near ±pi branch cut: sign depends on sub-LSB
+                 * input quantization, not algorithm accuracy. */
+                if (fabs(fabs(ref) - M_PI) < 0.01) continue;
+                stats_add(&st, angle * 180.0 / M_PI, frd(r, R), ref);
+            }
+        }
+        /* Special cases: exact quadrant/octant/30-degree angles */
+        double specials_deg[] = {0,30,45,60,90,120,135,150,
+                                 -30,-45,-60,-90,-120,-135,-150,-170};
+        for (int si = 0; si < (int)(sizeof(specials_deg)/sizeof(specials_deg[0])); si++) {
+            double angle = specials_deg[si] * M_PI / 180.0;
+            double x = 100.0 * cos(angle), y = 100.0 * sin(angle);
+            s32 fx = (s32)(x * scale), fy = (s32)(y * scale);
+            if (fx == 0 && fy == 0) continue;
+            s32 r = FR_atan2(fy, fx, R);
+            stats_add(&st, specials_deg[si], frd(r, R), atan2(y, x));
+        }
+        acc_row("atan2", &st, lsb, "65536x5 radii; asin/acos+hypot_fast8");
+    }
+
+    /* --- atan --- */
+    {
+        stats_t st; stats_reset(&st);
+        /* Sweep atan(x) for x in [-10, 10] with fine steps near zero.
+         * FR_atan(input, radix, out_radix) calls FR_atan2(input, 1<<radix, out_radix).
+         * Skip |expected| < 0.01 to match the percent-error convention. */
+        for (int i = -10000; i <= 10000; i++) {
+            double x = i / 1000.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_atan(fr, (u16)R, (u16)R);
+            double ref = atan(x);
+            if (fabs(ref) < 0.01) continue;
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        acc_row("atan", &st, lsb, "20001-pt sweep [-10,10]; via FR_atan2");
+    }
+
+    /* --- sqrt --- */
+    {
+        stats_t st; stats_reset(&st);
+        double inputs[] = {0.0001, 0.25, 0.5, 1, 2, 3, 4, 7, 9, 16, 25, 100, 1024, 10000, 32000};
+        for (int i = 0; i < (int)(sizeof(inputs)/sizeof(inputs[0])); i++) {
+            s32 fr = (s32)(inputs[i] * scale);
+            s32 r = FR_sqrt(fr, R);
+            stats_add(&st, inputs[i], frd(r, R), sqrt(inputs[i]));
+        }
+        /* Fine sweep */
+        for (int i = 1; i <= 1000; i++) {
+            double x = i * 10.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_sqrt(fr, R);
+            stats_add(&st, x, frd(r, R), sqrt(x));
+        }
+        acc_row("sqrt", &st, lsb, "Round-to-nearest");
+    }
+
+    /* --- log2 --- */
+    {
+        stats_t st; stats_reset(&st);
+        /* Integer inputs — stay within s32 range at radix 16 (max ~32767) */
+        for (int v = 1; v <= 32000; v += (v < 100 ? 1 : v / 10)) {
+            s32 fr = (s32)((double)v * scale);
+            if (fr <= 0) continue;
+            s32 r = FR_log2(fr, (u16)R, (u16)R);
+            stats_add(&st, (double)v, frd(r, R), log2((double)v));
+        }
+        /* Fractional sweep 0.125 .. 1.0 */
+        for (int i = 1; i <= 100; i++) {
+            double x = 0.125 + (0.875 * i / 100.0);
+            s32 fr = (s32)(x * scale);
+            if (fr <= 0) continue;
+            s32 r = FR_log2(fr, (u16)R, (u16)R);
+            stats_add(&st, x, frd(r, R), log2(x));
+        }
+        acc_row("log2", &st, lsb, "65-entry mantissa table");
+    }
+
+    /* --- pow2 --- */
+    {
+        stats_t st; stats_reset(&st);
+        for (int i = -800; i <= 800; i++) {
+            double x = i / 100.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_pow2(fr, R);
+            double ref = pow(2.0, x);
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        acc_row("pow2", &st, lsb, "65-entry fraction table");
+    }
+
+    /* --- ln, log10 --- */
+    {
+        stats_t st; stats_reset(&st);
+        double inputs[] = {0.125, 0.25, 0.5, 1, 2, M_E, 3, 4, 5, 7, 8, 10, 20, 50, 100, 1000};
+        for (int i = 0; i < (int)(sizeof(inputs)/sizeof(inputs[0])); i++) {
+            s32 fr = (s32)(inputs[i] * scale);
+            if (fr <= 0) continue;
+            s32 r = FR_ln(fr, R, R);
+            double ref = log(inputs[i]);
+            stats_add(&st, inputs[i], frd(r, R), ref);
+            r = FR_log10(fr, R, R);
+            ref = log10(inputs[i]);
+            stats_add(&st, inputs[i], frd(r, R), ref);
+        }
+        acc_row("ln, log10", &st, lsb, "Via FR_MULK28 from log2");
+    }
+
+    /* --- exp (FR_EXP) --- */
+    {
+        stats_t st; stats_reset(&st);
+        for (int i = -400; i <= 400; i++) {
+            double x = i / 100.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_EXP(fr, R);
+            double ref = exp(x);
+            if (ref > 32000.0 || ref < 1e-6) continue; /* skip overflow/underflow */
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        acc_row("exp", &st, lsb, "FR_MULK28 + FR_pow2");
+    }
+
+    /* --- exp_fast (FR_EXP_FAST) --- */
+    {
+        stats_t st; stats_reset(&st);
+        for (int i = -400; i <= 400; i++) {
+            double x = i / 100.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_EXP_FAST(fr, R);
+            double ref = exp(x);
+            if (ref > 32000.0 || ref < 1e-6) continue;
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        acc_row("exp_fast", &st, lsb, "Shift-only scaling");
+    }
+
+    /* --- pow10 (FR_POW10) --- */
+    {
+        stats_t st; stats_reset(&st);
+        for (int i = -200; i <= 200; i++) {
+            double x = i / 100.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_POW10(fr, R);
+            double ref = pow(10.0, x);
+            if (ref > 32000.0 || ref < 1e-6) continue;
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        acc_row("pow10", &st, lsb, "FR_MULK28 + FR_pow2");
+    }
+
+    /* --- pow10_fast (FR_POW10_FAST) --- */
+    {
+        stats_t st; stats_reset(&st);
+        for (int i = -200; i <= 200; i++) {
+            double x = i / 100.0;
+            s32 fr = (s32)(x * scale);
+            s32 r = FR_POW10_FAST(fr, R);
+            double ref = pow(10.0, x);
+            if (ref > 32000.0 || ref < 1e-6) continue;
+            stats_add(&st, x, frd(r, R), ref);
+        }
+        acc_row("pow10_fast", &st, lsb, "Shift-only scaling");
+    }
+
+    /* --- hypot (exact) --- */
+    {
+        stats_t st; stats_reset(&st);
+        struct { double x, y; } cases[] = {
+            {0,0},{1,0},{0,1},{3,4},{5,12},{8,15},{-3,-4},{-3,4},{3,-4},
+            {1,1},{0.5,0.5},{100,100},{1000,1},{1,1000}
+        };
+        for (int i = 0; i < (int)(sizeof(cases)/sizeof(cases[0])); i++) {
+            s32 fx = (s32)(cases[i].x * scale);
+            s32 fy = (s32)(cases[i].y * scale);
+            s32 r = FR_hypot(fx, fy, R);
+            double ref = hypot(cases[i].x, cases[i].y);
+            stats_add(&st, ref, frd(r, R), ref);
+        }
+        acc_row("hypot (exact)", &st, lsb, "64-bit intermediate");
+    }
+
+    /* --- hypot_fast8 (8-seg) --- */
+    {
+        stats_t st; stats_reset(&st);
+        struct { double x, y; } cases[] = {
+            {1,0},{0,1},{3,4},{5,12},{8,15},{-3,-4},{1,1},{0.5,0.5},
+            {100,100},{1000,1},{1,1000},{7,24},{20,21}
+        };
+        for (int i = 0; i < (int)(sizeof(cases)/sizeof(cases[0])); i++) {
+            s32 fx = (s32)(cases[i].x * scale);
+            s32 fy = (s32)(cases[i].y * scale);
+            s32 r = FR_hypot_fast8(fx, fy);
+            double ref = hypot(cases[i].x, cases[i].y);
+            if (ref > 0) stats_add(&st, ref, frd(r, R), ref);
+        }
+        acc_row("hypot_fast8 (8-seg)", &st, lsb, "Shift-only, no multiply");
+    }
+
+    printf("<!-- ACCURACY_TABLE_END -->\n");
+    printf("\n");
+
+    /* Diagnostic: show where each trig function's worst % error occurs */
+    md_h3("14.1 Worst-case percent error diagnostics");
+    printf("Shows the input that produced the maximum %% error for each trig function.\n");
+    printf("This helps identify whether the peak is a genuine algorithm limitation or\n");
+    printf("a near-zero denominator artifact.\n\n");
+    printf("| Function | Worst-pct input | Expected | Got | Abs err | Pct err |\n");
+    printf("|---|---|---:|---:|---:|---:|\n");
+
+    struct { const char *name; stats_t *s; } diag[] = {
+        {"sin / cos", &st_sincos},
+        {"tan",       &st_tan},
+        {"asin/acos", &st_asincos},
+        {"atan2",     &st_atan2},
+    };
+    for (int d = 0; d < (int)(sizeof(diag)/sizeof(diag[0])); d++) {
+        stats_t *s = diag[d].s;
+        double ae = fabs(s->worst_pct_actual - s->worst_pct_expected);
+        printf("| %s | %.4f | %.6f | %.6f | %.6f | %.4f%% |\n",
+               diag[d].name, s->worst_pct_input,
+               s->worst_pct_expected, s->worst_pct_actual, ae,
+               s->max_pct_err);
+    }
+    printf("\n");
+}
+
 int main(void) {
+    g_showpeak = (getenv("FR_SHOWPEAK") != NULL);
     md_h1("FR_Math TDD Characterization Report");
     printf("> Generated by `tests/test_tdd.cpp`. This is a measurement suite, not a pass/fail suite.\n");
     printf("> All numbers below are *what the library actually does*, compared to libm `double` references.\n");
@@ -1782,6 +2129,7 @@ int main(void) {
     section_v2_new();
     section_multiradix();
     section_summary();
+    section_accuracy_table();
 
     return 0;
 }
diff --git a/tools/make_release.sh b/tools/make_release.sh
index 441181d..28f7647 100755
--- a/tools/make_release.sh
+++ b/tools/make_release.sh
@@ -130,23 +130,13 @@ do_validate() {
     run_cmd make clean >/dev/null 2>&1
 
     echo ""
-    echo "  --- Strict compile (-Wall -Wextra -Werror -Wshadow) ---"
-    local strict_flags="-Isrc -Wall -Wextra -Werror -Wshadow -Os"
+    echo "  --- Build library + examples (uses LIB_WARN from makefile) ---"
     mkdir -p build
-    if ! cc ${strict_flags} -c src/FR_math.c -o build/FR_math_strict.o 2>build/strict_c.log; then
-        cat build/strict_c.log
-        fail "src/FR_math.c has compiler warnings"
+    if ! make lib examples 2>build/strict_build.log; then
+        cat build/strict_build.log
+        fail "Library build failed (compiler warnings or errors)"
     fi
-    if ! c++ ${strict_flags} -c src/FR_math_2D.cpp -o build/FR_math_2D_strict.o 2>build/strict_cpp.log; then
-        cat build/strict_cpp.log
-        fail "src/FR_math_2D.cpp has compiler warnings"
-    fi
-    pass "Zero warnings."
-
-    echo ""
-    echo "  --- Build library + examples ---"
-    run_cmd make lib examples >/dev/null 2>&1
-    pass "Library and examples built."
+    pass "Library and examples built — zero warnings."
 
     echo ""
     echo "  --- Full test suite ---"
@@ -196,6 +186,11 @@ do_validate() {
     else
         ls -l build/FR_math.o build/FR_math_2D.o
     fi
+
+    echo ""
+    echo "  --- Accuracy table ---"
+    bash "${PROJECT_ROOT}/scripts/accuracy_report.sh" --update
+    pass "Accuracy table updated in README + docs."
 }
 
 # -----------------------------------------------------------------------
@@ -233,7 +228,7 @@ do_cross_compile() {
 
 # Files the pipeline itself may modify (badge update, version sync).
 # Anything outside this list is unexpected and should block the release.
-PIPELINE_FILES="README.md VERSION src/FR_math.h library.properties library.json idf_component.yml llms.txt pages/assets/site.js src/FR_math_2D.h src/FR_math_2D.cpp"
+PIPELINE_FILES="README.md VERSION src/FR_math.h library.properties library.json idf_component.yml llms.txt pages/assets/site.js src/FR_math_2D.h src/FR_math_2D.cpp docs/README.md pages/index.html"
 
 do_commit_pipeline_changes() {
     step_header "Commit pipeline-generated changes"