comp-physics · sbryngelson · Jun 17, 2026 · Jun 17, 2026
diff --git a/aneforge/_compile.py b/aneforge/_compile.py
@@ -690,24 +690,29 @@ def _e_affine(em, t, n, s):
 
 @op("rms_norm")
 def _e_rms_norm(em, t, n, s):
-    M, D = t.shape[0], t.shape[-1]
-    g4 = em.weight(f"{n}_g", t.attrs["gamma"].reshape(1, D, 1, 1), allow_int8=False)
-    invd, eps = float(np.float16(1.0 / D)).hex(), float(np.float16(t.attrs["eps"])).hex()
-    em.line(f'tensor<int32,[4]> {n}_rs = const()[name=string("{n}_rs"), val=tensor<int32,[4]>([{M},{D},1,1])];')
-    em.line(f'tensor<int32,[{len(t.shape)}]> {n}_ro = const()[name=string("{n}_ro"), val=tensor<int32,[{len(t.shape)}]>({list(t.shape)})];')
-    em.line(f'tensor<int32,[1]> {n}_ax = const()[name=string("{n}_ax"), val=tensor<int32,[1]>([1])];')
+    # RMS-norm via the fused `reduce_l2_norm` over the last axis - the same fast
+    # reduction `l2_norm` uses. The previous lowering reshaped to [M,D,1,1] and ran
+    # `reduce_sum` over the *channel* axis, which falls off the ANE's fast reduction
+    # tile past ~256 rows and ran ~48x slower than this path at [1024,1024]
+    # (e.g. 6882us -> 142us), and 16-27x slower than layer_norm despite RMS being
+    # structurally cheaper. Mathematically identical:
+    #     rms(x) = x / sqrt(mean(x^2)+eps) * g
+    #            = x * sqrt(D) / sqrt(sum(x^2)) * g        (reduce_l2_norm = sqrt(sum x^2))
+    # eps enters as a safe-divide floor on the norm (as in l2_norm); the sqrt(D)
+    # rescale is folded into the gamma weight so no extra op is emitted.
+    D = t.shape[-1]
+    ax = len(t.shape) - 1
+    gw = (t.attrs["gamma"].reshape(1, D).astype(np.float32) * float(np.sqrt(D)))
+    g2 = em.weight(f"{n}_g", gw, allow_int8=False)
+    flo = float(np.float16((D * float(t.attrs["eps"])) ** 0.5)).hex()
+    red_shape = tuple(1 if i == ax else d for i, d in enumerate(t.shape))
+    em.line(f'tensor<int32,[1]> {n}_ax = const()[name=string("{n}_ax"), val=tensor<int32,[1]>([{ax}])];')
     em.line(f'bool {n}_kd = const()[name=string("{n}_kd"), val=bool(true)];')
-    em.line(f'tensor<fp16,[{M},{D},1,1]> {n}_x4 = reshape(shape={n}_rs, x={s[0]})[name=string("{n}_x4")];')
-    em.line(f'tensor<fp16,[{M},{D},1,1]> {n}_sq = mul(x={n}_x4, y={n}_x4)[name=string("{n}_sq")];')
-    em.line(f'tensor<fp16,[{M},1,1,1]> {n}_ss = reduce_sum(axes={n}_ax, keep_dims={n}_kd, x={n}_sq)[name=string("{n}_ss")];')
-    em.line(f'fp16 {n}_id = const()[name=string("{n}_id"), val=fp16({invd})];')
-    em.line(f'tensor<fp16,[{M},1,1,1]> {n}_ms = mul(x={n}_ss, y={n}_id)[name=string("{n}_ms")];')
-    em.line(f'fp16 {n}_ep = const()[name=string("{n}_ep"), val=fp16({eps})];')
-    em.line(f'tensor<fp16,[{M},1,1,1]> {n}_me = add(x={n}_ms, y={n}_ep)[name=string("{n}_me")];')
-    em.line(f'tensor<fp16,[{M},1,1,1]> {n}_rr = rsqrt(epsilon=fp16(0.0), x={n}_me)[name=string("{n}_rr")];')
-    em.line(f'tensor<fp16,[{M},{D},1,1]> {n}_xn = mul(x={n}_x4, y={n}_rr)[name=string("{n}_xn")];')
-    em.line(f'tensor<fp16,[{M},{D},1,1]> {n}_g4 = mul(x={n}_xn, y={g4})[name=string("{n}_g4")];')
-    em.line(f'{em.ty(t.shape)} {n} = reshape(shape={n}_ro, x={n}_g4)[name=string("{n}")];')
+    em.line(f'{em.ty(red_shape)} {n}_nrm = reduce_l2_norm(axes={n}_ax, keep_dims={n}_kd, x={s[0]})[name=string("{n}_nrm")];')
+    em.line(f'fp16 {n}_ep = const()[name=string("{n}_ep"), val=fp16({flo})];')
+    em.line(f'{em.ty(red_shape)} {n}_sn = maximum(x={n}_nrm, y={n}_ep)[name=string("{n}_sn")];')
+    em.line(f'{em.ty(t.shape)} {n}_dv = real_div(x={s[0]}, y={n}_sn)[name=string("{n}_dv")];')
+    em.line(f'{em.ty(t.shape)} {n} = mul(x={n}_dv, y={g2})[name=string("{n}")];')
 
 
 @op("layer_norm")