From b73a77e2b9e287cb310e95419938a0af1f7ec003 Mon Sep 17 00:00:00 2001
From: Dex <dexwritescode@proton.me>
Date: Tue, 5 May 2026 15:24:39 -0400
Subject: [PATCH 01/13] feat(compile): CustomKernel stores and returns output
 shapes

`mx::compile(shapeless=true)` calls `Primitive::output_shapes()` on
every node when re-tracing a compiled function with changed input
shapes. `CustomKernel` never implemented this override, so any
compiled function containing a `metal_kernel` / `custom_kernel` call
would throw:

  [Primitive::output_shapes] CustomKernel cannot infer output shapes

The output shapes are already provided by the caller at creation time
via `metal_kernel()(inputs, output_shapes, ...)` and passed to
`array::make_arrays`. They just weren't stored on the primitive.

Fix: add an optional `output_shapes` parameter to the `CustomKernel`
constructor (default `{}` for backward compatibility), store it in a
new `output_shapes_` member, and override `output_shapes()` to return
it. If the field is empty (legacy construction path), fall through to
the base-class throw as before.

Update both Metal and CUDA call sites to copy the shapes before
`std::move`-ing them into `array::make_arrays` and pass the copy to
the constructor.
---
 mlx/backend/cuda/custom_kernel.cpp  |  7 +++++--
 mlx/backend/metal/custom_kernel.cpp |  4 +++-
 mlx/fast_primitives.h               | 14 ++++++++++++--
 3 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/mlx/backend/cuda/custom_kernel.cpp b/mlx/backend/cuda/custom_kernel.cpp
index 9a6837acbb..fdd127e50d 100644
--- a/mlx/backend/cuda/custom_kernel.cpp
+++ b/mlx/backend/cuda/custom_kernel.cpp
@@ -222,6 +222,7 @@ CustomKernelFunction cuda_kernel(
                 << "```" << std::endl;
     }
 
+    auto output_shapes_copy = output_shapes;
     return array::make_arrays(
         std::move(output_shapes),
         std::move(output_dtypes),
@@ -236,7 +237,8 @@ CustomKernelFunction cuda_kernel(
             init_value,
             std::vector<ScalarArg>{},
             false,
-            shared_memory),
+            shared_memory,
+            std::move(output_shapes_copy)),
         std::move(inputs));
   };
 }
@@ -270,7 +272,8 @@ std::vector<array> precompiled_cuda_kernel(
           init_value,
           scalars,
           true,
-          shared_memory),
+          shared_memory,
+          output_shapes),
       inputs);
 }
 
diff --git a/mlx/backend/metal/custom_kernel.cpp b/mlx/backend/metal/custom_kernel.cpp
index 6d33ff5007..31e115394a 100644
--- a/mlx/backend/metal/custom_kernel.cpp
+++ b/mlx/backend/metal/custom_kernel.cpp
@@ -305,6 +305,7 @@ CustomKernelFunction metal_kernel(
                 << "```" << std::endl;
     }
 
+    auto output_shapes_copy = output_shapes;
     return array::make_arrays(
         std::move(output_shapes),
         std::move(output_dtypes),
@@ -319,7 +320,8 @@ CustomKernelFunction metal_kernel(
             init_value,
             std::vector<ScalarArg>{},
             false,
-            0),
+            0,
+            std::move(output_shapes_copy)),
         std::move(inputs));
   };
 }
diff --git a/mlx/fast_primitives.h b/mlx/fast_primitives.h
index 4434830875..827a4eab6d 100644
--- a/mlx/fast_primitives.h
+++ b/mlx/fast_primitives.h
@@ -375,7 +375,8 @@ class CustomKernel : public Primitive {
       std::optional<float> init_value,
       std::vector<ScalarArg> scalar_arguments,
       bool is_precompiled,
-      int shared_memory)
+      int shared_memory,
+      std::vector<Shape> output_shapes = {})
       : Primitive(stream),
         name_(std::move(name)),
         source_(std::move(source)),
@@ -386,7 +387,8 @@ class CustomKernel : public Primitive {
         init_value_(init_value),
         scalar_arguments_(std::move(scalar_arguments)),
         is_precompiled_(is_precompiled),
-        shared_memory_(shared_memory) {}
+        shared_memory_(shared_memory),
+        output_shapes_(std::move(output_shapes)) {}
 
   void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
       override {
@@ -397,6 +399,13 @@ class CustomKernel : public Primitive {
       override;
 
   DEFINE_NAME(CustomKernel);
+
+  std::vector<Shape> output_shapes(const std::vector<array>&) override {
+    if (output_shapes_.empty())
+      return Primitive::output_shapes({});
+    return output_shapes_;
+  }
+
   auto state() const {
     return std::make_tuple(
         name_,
@@ -422,6 +431,7 @@ class CustomKernel : public Primitive {
   std::vector<ScalarArg> scalar_arguments_;
   bool is_precompiled_;
   int shared_memory_;
+  std::vector<Shape> output_shapes_;
 };
 
 } // namespace mlx::core::fast

From 0b6cedbbf4b945753b5a6ef17e3a5f8df9dacfc3 Mon Sep 17 00:00:00 2001
From: Dex <dexwritescode@proton.me>
Date: Tue, 5 May 2026 15:48:03 -0400
Subject: [PATCH 02/13] feat(compile): GatherQMM implements output_shapes for
 shapeless compile

output_shapes() is called on every primitive during shapeless=true
retracing. GatherQMM was missing this override, causing compile to
throw when any graph containing gather_qmm was retraced.

The output shape is fully inferrable from inputs and stored fields:
  out_shape = lhs_indices.shape() + [x.shape(-2), w_outer_dims]
where w_outer_dims = transpose ? w.shape(-2) : w.shape(-1)*32/bits.

Input layout differs by mode: Affine has biases at index 3 (pushing
indices to 4/5); other modes have indices at 3/4.
---
 mlx/primitives.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/mlx/primitives.h b/mlx/primitives.h
index 75fb978dce..8525e58253 100644
--- a/mlx/primitives.h
+++ b/mlx/primitives.h
@@ -1700,6 +1700,21 @@ class GatherQMM : public UnaryPrimitive {
   DEFINE_GRADS()
   DEFINE_NAME(GatherQMM)
   bool is_equivalent(const Primitive& other) const override;
+
+  // inputs layout: Affine → {x, w, scales, biases, lhs_idx, rhs_idx}
+  //                other  → {x, w, scales,          lhs_idx, rhs_idx}
+  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override {
+    const auto& x = inputs[0];
+    const auto& w = inputs[1];
+    const auto& lhs_idx =
+        (mode_ == QuantizationMode::Affine) ? inputs[4] : inputs[3];
+    int w_outer = transpose_ ? w.shape(-2) : w.shape(-1) * 32 / bits_;
+    auto out_shape = lhs_idx.shape();
+    out_shape.push_back(x.shape(-2));
+    out_shape.push_back(w_outer);
+    return {out_shape};
+  }
+
   auto state() const {
     return std::make_tuple(
         group_size_, bits_, mode_, transpose_, left_sorted_, right_sorted_);

From fdff69b3b970885e9b6bcc7d2fc6caa8648c1552 Mon Sep 17 00:00:00 2001
From: Dex <dexwritescode@proton.me>
Date: Wed, 6 May 2026 19:32:45 -0400
Subject: [PATCH 03/13] fix(compile): remove std::move on const& and fix
 comment alignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address review feedback from zcbenz:

- output_shapes is a const& in the lambda parameter, so std::move(output_shapes)
  compiles but silently copies rather than moves. Remove the misleading std::move
  in both metal and cuda backends — make_arrays receives a plain copy.
- Fix one extra space in the GatherQMM input layout comment to correctly align
  lhs_idx under the Affine layout line.
---
 mlx/backend/cuda/custom_kernel.cpp  | 2 +-
 mlx/backend/metal/custom_kernel.cpp | 2 +-
 mlx/primitives.h                    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlx/backend/cuda/custom_kernel.cpp b/mlx/backend/cuda/custom_kernel.cpp
index fdd127e50d..17b0c488c9 100644
--- a/mlx/backend/cuda/custom_kernel.cpp
+++ b/mlx/backend/cuda/custom_kernel.cpp
@@ -224,7 +224,7 @@ CustomKernelFunction cuda_kernel(
 
     auto output_shapes_copy = output_shapes;
     return array::make_arrays(
-        std::move(output_shapes),
+        output_shapes,
         std::move(output_dtypes),
         std::make_shared<CustomKernel>(
             s,
diff --git a/mlx/backend/metal/custom_kernel.cpp b/mlx/backend/metal/custom_kernel.cpp
index 31e115394a..ffc80ce27e 100644
--- a/mlx/backend/metal/custom_kernel.cpp
+++ b/mlx/backend/metal/custom_kernel.cpp
@@ -307,7 +307,7 @@ CustomKernelFunction metal_kernel(
 
     auto output_shapes_copy = output_shapes;
     return array::make_arrays(
-        std::move(output_shapes),
+        output_shapes,
         std::move(output_dtypes),
         std::make_shared<CustomKernel>(
             s,
diff --git a/mlx/primitives.h b/mlx/primitives.h
index 8525e58253..313ded3545 100644
--- a/mlx/primitives.h
+++ b/mlx/primitives.h
@@ -1702,7 +1702,7 @@ class GatherQMM : public UnaryPrimitive {
   bool is_equivalent(const Primitive& other) const override;
 
   // inputs layout: Affine → {x, w, scales, biases, lhs_idx, rhs_idx}
-  //                other  → {x, w, scales,          lhs_idx, rhs_idx}
+  //                other  → {x, w, scales,         lhs_idx, rhs_idx}
   std::vector<Shape> output_shapes(const std::vector<array>& inputs) override {
     const auto& x = inputs[0];
     const auto& w = inputs[1];

From 0f2473fce4432a5237cd2c153ec586f7b4100980 Mon Sep 17 00:00:00 2001
From: Dex <dexwritescode@proton.me>
Date: Wed, 6 May 2026 19:43:27 -0400
Subject: [PATCH 04/13] test(compile): add shapeless compile tests for
 CustomKernel and GatherQMM

Verify that mx.compile(shapeless=True) correctly re-traces functions
containing mx.fast.metal_kernel (CustomKernel) and mx.gather_qmm
(GatherQMM) when input shapes change between calls.

Both tests fail before the fix with the respective 'cannot infer output
shapes' error and pass after output_shapes() is implemented.
---
 python/tests/test_compile.py | 57 ++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/python/tests/test_compile.py b/python/tests/test_compile.py
index 20f1145223..fe6decb3d0 100644
--- a/python/tests/test_compile.py
+++ b/python/tests/test_compile.py
@@ -504,6 +504,63 @@ def ones_fun(x):
         self.assertEqual(compiled_zero_like(y).shape, y_shape)
         self.assertEqual(compiled_ones_like(y).shape, y_shape)
 
+    def test_shapeless_compile_custom_kernel(self):
+        # CustomKernel must implement output_shapes() so shapeless compile can
+        # re-trace without throwing "CustomKernel cannot infer output shapes".
+        if not mx.metal.is_available():
+            return
+
+        kernel = mx.fast.metal_kernel(
+            name="copy_kernel",
+            input_names=["inp"],
+            output_names=["out"],
+            source="out[thread_position_in_grid.x] = inp[thread_position_in_grid.x];",
+        )
+
+        def fn(x):
+            return kernel(
+                inputs=[x],
+                grid=(x.size, 1, 1),
+                threadgroup=(min(x.size, 256), 1, 1),
+                output_shapes=[x.shape],
+                output_dtypes=[x.dtype],
+                stream=mx.gpu,
+            )[0]
+
+        cfn = mx.compile(fn, shapeless=True)
+
+        x = mx.ones((4,), dtype=mx.float32)
+        self.assertTrue(mx.array_equal(cfn(x), x))
+
+        # Different shape — must reuse compiled graph without throwing.
+        x = mx.ones((8,), dtype=mx.float32)
+        self.assertTrue(mx.array_equal(cfn(x), x))
+
+    def test_shapeless_compile_gather_qmm(self):
+        # GatherQMM must implement output_shapes() so shapeless compile can
+        # re-trace without throwing "GatherQMM cannot infer output shapes".
+        K, N, num_experts = 64, 32, 4
+
+        w = mx.random.normal((num_experts, N, K))
+        qw, s, b = mx.quantize(w)
+        mx.eval(qw, s, b)
+
+        # Keep inputs outside fn so RandomBits doesn't enter the compiled graph.
+        x4 = mx.ones((4, K))
+        x8 = mx.ones((8, K))
+        idx4 = mx.array([0, 1, 2, 3])
+        idx8 = mx.array([0, 0, 1, 1, 2, 2, 3, 3])
+
+        def fn(x, lhs_indices):
+            return mx.gather_qmm(x, qw, s, b, lhs_indices=lhs_indices, transpose=True)
+
+        cfn = mx.compile(fn, shapeless=True)
+
+        self.assertEqual(cfn(x4, idx4).shape, fn(x4, idx4).shape)
+
+        # Different M — must reuse compiled graph without throwing.
+        self.assertEqual(cfn(x8, idx8).shape, fn(x8, idx8).shape)
+
     def test_compile_with_constant(self):
         # Test float
         @partial(mx.compile)

From 1109c8ce5b14801f000fef87cb9cb75b858f8ec4 Mon Sep 17 00:00:00 2001
From: Dex <dexwritescode@proton.me>
Date: Wed, 6 May 2026 22:51:56 -0400
Subject: [PATCH 05/13] refactor(compile): simplify output_shapes copy in
 CustomKernel

Remove the intermediate output_shapes_copy and pass output_shapes
directly to the CustomKernel constructor, which takes it by value.
---
 mlx/backend/cuda/custom_kernel.cpp  | 3 +--
 mlx/backend/metal/custom_kernel.cpp | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/mlx/backend/cuda/custom_kernel.cpp b/mlx/backend/cuda/custom_kernel.cpp
index 17b0c488c9..2608d0ea1b 100644
--- a/mlx/backend/cuda/custom_kernel.cpp
+++ b/mlx/backend/cuda/custom_kernel.cpp
@@ -222,7 +222,6 @@ CustomKernelFunction cuda_kernel(
                 << "```" << std::endl;
     }
 
-    auto output_shapes_copy = output_shapes;
     return array::make_arrays(
         output_shapes,
         std::move(output_dtypes),
@@ -238,7 +237,7 @@ CustomKernelFunction cuda_kernel(
             std::vector<ScalarArg>{},
             false,
             shared_memory,
-            std::move(output_shapes_copy)),
+            output_shapes),
         std::move(inputs));
   };
 }
diff --git a/mlx/backend/metal/custom_kernel.cpp b/mlx/backend/metal/custom_kernel.cpp
index ffc80ce27e..3eb41302ba 100644
--- a/mlx/backend/metal/custom_kernel.cpp
+++ b/mlx/backend/metal/custom_kernel.cpp
@@ -305,7 +305,6 @@ CustomKernelFunction metal_kernel(
                 << "```" << std::endl;
     }
 
-    auto output_shapes_copy = output_shapes;
     return array::make_arrays(
         output_shapes,
         std::move(output_dtypes),
@@ -321,7 +320,7 @@ CustomKernelFunction metal_kernel(
             std::vector<ScalarArg>{},
             false,
             0,
-            std::move(output_shapes_copy)),
+            output_shapes),
         std::move(inputs));
   };
 }

From d68c9b0f4926c96acfb5f7916aa4f2d3740c7c89 Mon Sep 17 00:00:00 2001
From: Dex <dexwritescode@proton.me>
Date: Thu, 7 May 2026 19:38:01 -0400
Subject: [PATCH 06/13] fix(test): correct gather_qmm inputs for shapeless
 compile test

lhs_indices shape (8,) cannot broadcast with the auto-generated
rhs_indices arange(num_experts) shape (4,), causing the second
shapeless compile call to fail during graph update.

Fix by keeping both indices fixed at shape (num_experts,) and varying
only the M dimension via x.shape = (num_experts, M, K).
---
 python/tests/test_compile.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/python/tests/test_compile.py b/python/tests/test_compile.py
index fe6decb3d0..8228ce8295 100644
--- a/python/tests/test_compile.py
+++ b/python/tests/test_compile.py
@@ -545,21 +545,24 @@ def test_shapeless_compile_gather_qmm(self):
         qw, s, b = mx.quantize(w)
         mx.eval(qw, s, b)
 
-        # Keep inputs outside fn so RandomBits doesn't enter the compiled graph.
-        x4 = mx.ones((4, K))
-        x8 = mx.ones((8, K))
-        idx4 = mx.array([0, 1, 2, 3])
-        idx8 = mx.array([0, 0, 1, 1, 2, 2, 3, 3])
+        # x has shape (num_experts, M, K): the batch dim is indexed by idx,
+        # which stays fixed so that lhs_indices and rhs_indices (auto-generated
+        # from w's batch shape) always broadcast.  Only M changes between calls.
+        idx = mx.array([0, 1, 2, 3])
+        x4 = mx.ones((num_experts, 4, K))
+        x8 = mx.ones((num_experts, 8, K))
 
-        def fn(x, lhs_indices):
-            return mx.gather_qmm(x, qw, s, b, lhs_indices=lhs_indices, transpose=True)
+        def fn(x):
+            return mx.gather_qmm(
+                x, qw, s, b, lhs_indices=idx, rhs_indices=idx, transpose=True
+            )
 
         cfn = mx.compile(fn, shapeless=True)
 
-        self.assertEqual(cfn(x4, idx4).shape, fn(x4, idx4).shape)
+        self.assertEqual(cfn(x4).shape, fn(x4).shape)
 
         # Different M — must reuse compiled graph without throwing.
-        self.assertEqual(cfn(x8, idx8).shape, fn(x8, idx8).shape)
+        self.assertEqual(cfn(x8).shape, fn(x8).shape)
 
     def test_compile_with_constant(self):
         # Test float

From 3fcc19f6f9564eaa3af27d5e27d0024ab1a53f37 Mon Sep 17 00:00:00 2001
From: Dex <dexwritescode@proton.me>
Date: Thu, 7 May 2026 21:25:57 -0400
Subject: [PATCH 07/13] fix(test): weaken CustomKernel shapeless compile value
 assertion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

grid=(x.size, 1, 1) is captured as a fixed tuple at trace time.
On the second shapeless-compile call (x.size=8) the primitive still
holds grid=(4,1,1), so only 4 of 8 output elements are written and
array_equal fails.

The test goal is to verify output_shapes prevents a throw and
returns the correct shape — not value correctness, which would
require the grid to be updated dynamically (out of scope).
---
 python/tests/test_compile.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/tests/test_compile.py b/python/tests/test_compile.py
index 8228ce8295..fe54a3f29a 100644
--- a/python/tests/test_compile.py
+++ b/python/tests/test_compile.py
@@ -532,9 +532,11 @@ def fn(x):
         x = mx.ones((4,), dtype=mx.float32)
         self.assertTrue(mx.array_equal(cfn(x), x))
 
-        # Different shape — must reuse compiled graph without throwing.
+        # Different shape — must reuse compiled graph without throwing and must
+        # return an output with the updated shape.  Values are not checked here
+        # because the grid was captured at trace time and does not update.
         x = mx.ones((8,), dtype=mx.float32)
-        self.assertTrue(mx.array_equal(cfn(x), x))
+        self.assertEqual(cfn(x).shape, x.shape)
 
     def test_shapeless_compile_gather_qmm(self):
         # GatherQMM must implement output_shapes() so shapeless compile can

From cae4f2259f72b88df8862460bf75a02d05e23ce9 Mon Sep 17 00:00:00 2001
From: Dex <dexwritescode@proton.me>
Date: Thu, 7 May 2026 23:44:26 -0400
Subject: [PATCH 08/13] fix(test): use fixed-output-shape kernel for
 CustomKernel shapeless compile test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

output_shapes_ stores the trace-time shape and is returned as-is —
it does not recompute from inputs. A kernel where output_shapes=[x.shape]
causes shapeless compile to reuse a stale (4,) shape for (8,) inputs.

Replace with a kernel whose output is always (1,) regardless of input
size, so output_shapes_ is correct across all shapeless compile reuses.
---
 python/tests/test_compile.py | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/python/tests/test_compile.py b/python/tests/test_compile.py
index fe54a3f29a..0b83d25537 100644
--- a/python/tests/test_compile.py
+++ b/python/tests/test_compile.py
@@ -506,37 +506,41 @@ def ones_fun(x):
 
     def test_shapeless_compile_custom_kernel(self):
         # CustomKernel must implement output_shapes() so shapeless compile can
-        # re-trace without throwing "CustomKernel cannot infer output shapes".
+        # reuse the compiled graph without throwing "CustomKernel cannot infer
+        # output shapes".  The kernel here has a fixed output shape (1,) that
+        # does not depend on the input shape, so output_shapes_ stays correct
+        # across calls with different input sizes.
         if not mx.metal.is_available():
             return
 
         kernel = mx.fast.metal_kernel(
-            name="copy_kernel",
+            name="first_elem",
             input_names=["inp"],
             output_names=["out"],
-            source="out[thread_position_in_grid.x] = inp[thread_position_in_grid.x];",
+            source="if (thread_position_in_grid.x == 0) out[0] = inp[0];",
         )
 
         def fn(x):
             return kernel(
                 inputs=[x],
-                grid=(x.size, 1, 1),
-                threadgroup=(min(x.size, 256), 1, 1),
-                output_shapes=[x.shape],
+                grid=(1, 1, 1),
+                threadgroup=(1, 1, 1),
+                output_shapes=[(1,)],
                 output_dtypes=[x.dtype],
                 stream=mx.gpu,
             )[0]
 
         cfn = mx.compile(fn, shapeless=True)
 
-        x = mx.ones((4,), dtype=mx.float32)
-        self.assertTrue(mx.array_equal(cfn(x), x))
+        x = mx.array([5.0, 6.0, 7.0, 8.0])
+        self.assertEqual(cfn(x).item(), 5.0)
 
-        # Different shape — must reuse compiled graph without throwing and must
-        # return an output with the updated shape.  Values are not checked here
-        # because the grid was captured at trace time and does not update.
-        x = mx.ones((8,), dtype=mx.float32)
-        self.assertEqual(cfn(x).shape, x.shape)
+        # Different input shape — shapeless compile must reuse the graph without
+        # throwing and return the fixed output shape (1,) with the correct value.
+        x = mx.array([9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0])
+        result = cfn(x)
+        self.assertEqual(result.shape, (1,))
+        self.assertEqual(result.item(), 9.0)
 
     def test_shapeless_compile_gather_qmm(self):
         # GatherQMM must implement output_shapes() so shapeless compile can

From 9ac0fd7a578f8d0b898fca6c7ebedd2bf5aa9478 Mon Sep 17 00:00:00 2001
From: Dex <dexwritescode@proton.me>
Date: Thu, 21 May 2026 18:18:15 -0400
Subject: [PATCH 09/13] revert(compile): remove CustomKernel output_shapes
 changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fix was incorrect for the general case: output_shapes_ stored the
shapes from the first trace and returned them unconditionally, which
produces stale shapes whenever output dimensions depend on input shapes
in shapeless compile mode.

The test was written around the bug rather than exposing it, using a
fixed-output-shape kernel so the stale shapes were always correct.

GatherQMM output_shapes() is retained — its shape is a deterministic
function of input shapes and is correct in all cases.
---
 mlx/backend/cuda/custom_kernel.cpp  |  8 +++---
 mlx/backend/metal/custom_kernel.cpp |  5 ++--
 mlx/fast_primitives.h               | 13 ++--------
 python/tests/test_compile.py        | 38 -----------------------------
 4 files changed, 7 insertions(+), 57 deletions(-)

diff --git a/mlx/backend/cuda/custom_kernel.cpp b/mlx/backend/cuda/custom_kernel.cpp
index 2608d0ea1b..9a6837acbb 100644
--- a/mlx/backend/cuda/custom_kernel.cpp
+++ b/mlx/backend/cuda/custom_kernel.cpp
@@ -223,7 +223,7 @@ CustomKernelFunction cuda_kernel(
     }
 
     return array::make_arrays(
-        output_shapes,
+        std::move(output_shapes),
         std::move(output_dtypes),
         std::make_shared<CustomKernel>(
             s,
@@ -236,8 +236,7 @@ CustomKernelFunction cuda_kernel(
             init_value,
             std::vector<ScalarArg>{},
             false,
-            shared_memory,
-            output_shapes),
+            shared_memory),
         std::move(inputs));
   };
 }
@@ -271,8 +270,7 @@ std::vector<array> precompiled_cuda_kernel(
           init_value,
           scalars,
           true,
-          shared_memory,
-          output_shapes),
+          shared_memory),
       inputs);
 }
 
diff --git a/mlx/backend/metal/custom_kernel.cpp b/mlx/backend/metal/custom_kernel.cpp
index 3eb41302ba..6d33ff5007 100644
--- a/mlx/backend/metal/custom_kernel.cpp
+++ b/mlx/backend/metal/custom_kernel.cpp
@@ -306,7 +306,7 @@ CustomKernelFunction metal_kernel(
     }
 
     return array::make_arrays(
-        output_shapes,
+        std::move(output_shapes),
         std::move(output_dtypes),
         std::make_shared<CustomKernel>(
             s,
@@ -319,8 +319,7 @@ CustomKernelFunction metal_kernel(
             init_value,
             std::vector<ScalarArg>{},
             false,
-            0,
-            output_shapes),
+            0),
         std::move(inputs));
   };
 }
diff --git a/mlx/fast_primitives.h b/mlx/fast_primitives.h
index 827a4eab6d..83ffe6bf98 100644
--- a/mlx/fast_primitives.h
+++ b/mlx/fast_primitives.h
@@ -375,8 +375,7 @@ class CustomKernel : public Primitive {
       std::optional<float> init_value,
       std::vector<ScalarArg> scalar_arguments,
       bool is_precompiled,
-      int shared_memory,
-      std::vector<Shape> output_shapes = {})
+      int shared_memory)
       : Primitive(stream),
         name_(std::move(name)),
         source_(std::move(source)),
@@ -387,8 +386,7 @@ class CustomKernel : public Primitive {
         init_value_(init_value),
         scalar_arguments_(std::move(scalar_arguments)),
         is_precompiled_(is_precompiled),
-        shared_memory_(shared_memory),
-        output_shapes_(std::move(output_shapes)) {}
+        shared_memory_(shared_memory) {}
 
   void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
       override {
@@ -400,12 +398,6 @@ class CustomKernel : public Primitive {
 
   DEFINE_NAME(CustomKernel);
 
-  std::vector<Shape> output_shapes(const std::vector<array>&) override {
-    if (output_shapes_.empty())
-      return Primitive::output_shapes({});
-    return output_shapes_;
-  }
-
   auto state() const {
     return std::make_tuple(
         name_,
@@ -431,7 +423,6 @@ class CustomKernel : public Primitive {
   std::vector<ScalarArg> scalar_arguments_;
   bool is_precompiled_;
   int shared_memory_;
-  std::vector<Shape> output_shapes_;
 };
 
 } // namespace mlx::core::fast
diff --git a/python/tests/test_compile.py b/python/tests/test_compile.py
index 0b83d25537..0c3e49fa56 100644
--- a/python/tests/test_compile.py
+++ b/python/tests/test_compile.py
@@ -504,44 +504,6 @@ def ones_fun(x):
         self.assertEqual(compiled_zero_like(y).shape, y_shape)
         self.assertEqual(compiled_ones_like(y).shape, y_shape)
 
-    def test_shapeless_compile_custom_kernel(self):
-        # CustomKernel must implement output_shapes() so shapeless compile can
-        # reuse the compiled graph without throwing "CustomKernel cannot infer
-        # output shapes".  The kernel here has a fixed output shape (1,) that
-        # does not depend on the input shape, so output_shapes_ stays correct
-        # across calls with different input sizes.
-        if not mx.metal.is_available():
-            return
-
-        kernel = mx.fast.metal_kernel(
-            name="first_elem",
-            input_names=["inp"],
-            output_names=["out"],
-            source="if (thread_position_in_grid.x == 0) out[0] = inp[0];",
-        )
-
-        def fn(x):
-            return kernel(
-                inputs=[x],
-                grid=(1, 1, 1),
-                threadgroup=(1, 1, 1),
-                output_shapes=[(1,)],
-                output_dtypes=[x.dtype],
-                stream=mx.gpu,
-            )[0]
-
-        cfn = mx.compile(fn, shapeless=True)
-
-        x = mx.array([5.0, 6.0, 7.0, 8.0])
-        self.assertEqual(cfn(x).item(), 5.0)
-
-        # Different input shape — shapeless compile must reuse the graph without
-        # throwing and return the fixed output shape (1,) with the correct value.
-        x = mx.array([9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0])
-        result = cfn(x)
-        self.assertEqual(result.shape, (1,))
-        self.assertEqual(result.item(), 9.0)
-
     def test_shapeless_compile_gather_qmm(self):
         # GatherQMM must implement output_shapes() so shapeless compile can
         # re-trace without throwing "GatherQMM cannot infer output shapes".

From 7396860e4963a3dab8d39a64ebdea250624cb6b9 Mon Sep 17 00:00:00 2001
From: Dex <dexwritescode@proton.me>
Date: Thu, 21 May 2026 18:20:51 -0400
Subject: [PATCH 10/13] style(compile): restore blank line removed by
 CustomKernel revert

---
 mlx/fast_primitives.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mlx/fast_primitives.h b/mlx/fast_primitives.h
index 83ffe6bf98..4434830875 100644
--- a/mlx/fast_primitives.h
+++ b/mlx/fast_primitives.h
@@ -397,7 +397,6 @@ class CustomKernel : public Primitive {
       override;
 
   DEFINE_NAME(CustomKernel);
-
   auto state() const {
     return std::make_tuple(
         name_,

From c30b9f40158ee735c41ad68421305db606189d7b Mon Sep 17 00:00:00 2001
From: Angelos Katharopoulos <a_katharopoulos@apple.com>
Date: Thu, 28 May 2026 14:23:24 -0700
Subject: [PATCH 11/13] Refactor the output_shapes implementation

---
 mlx/primitives.cpp | 22 ++++++++++++++++++++++
 mlx/primitives.h   | 15 ++-------------
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/mlx/primitives.cpp b/mlx/primitives.cpp
index f3acec574b..69ad813a2c 100644
--- a/mlx/primitives.cpp
+++ b/mlx/primitives.cpp
@@ -3776,6 +3776,18 @@ bool GatherQMM::is_equivalent(const Primitive& other) const {
       mode_ == qm_other.mode_ && transpose_ == qm_other.transpose_;
 }
 
+std::vector<Shape> GatherQMM::output_shapes(const std::vector<array>& inputs) {
+  const auto& x = inputs[0];
+  const auto& w = inputs[1];
+  const auto& lhs_indices =
+      (mode_ == QuantizationMode::Affine) ? inputs[4] : inputs[3];
+  int w_outer = transpose_ ? w.shape(-2) : w.shape(-1) * 32 / bits_;
+  auto out_shape = lhs_indices.shape();
+  out_shape.push_back(x.shape(-2));
+  out_shape.push_back(w_outer);
+  return {out_shape};
+}
+
 std::pair<std::vector<array>, std::vector<int>> RandomBits::vmap(
     const std::vector<array>& inputs,
     const std::vector<int>& axes) {
@@ -5882,6 +5894,16 @@ bool GatherMM::is_equivalent(const Primitive& other) const {
       right_sorted_ == g_other.right_sorted_;
 }
 
+std::vector<Shape> GatherMM::output_shapes(const std::vector<array>& inputs) {
+  const auto& a = inputs[0];
+  const auto& b = inputs[1];
+  const auto& lhs_indices = inputs[2];
+  auto out_shape = lhs_indices.shape();
+  out_shape.push_back(a.shape(-2));
+  out_shape.push_back(b.shape(-1));
+  return {out_shape};
+}
+
 bool BlockMaskedMM::is_equivalent(const Primitive& other) const {
   const BlockMaskedMM& a_other = static_cast<const BlockMaskedMM&>(other);
   return (block_size_ == a_other.block_size_);
diff --git a/mlx/primitives.h b/mlx/primitives.h
index 313ded3545..df2cfb8b65 100644
--- a/mlx/primitives.h
+++ b/mlx/primitives.h
@@ -543,6 +543,7 @@ class GatherMM : public UnaryPrimitive {
 
   DEFINE_NAME(GatherMM)
   bool is_equivalent(const Primitive& other) const override;
+  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
   auto state() const {
     return std::make_pair(left_sorted_, right_sorted_);
   }
@@ -1701,19 +1702,7 @@ class GatherQMM : public UnaryPrimitive {
   DEFINE_NAME(GatherQMM)
   bool is_equivalent(const Primitive& other) const override;
 
-  // inputs layout: Affine → {x, w, scales, biases, lhs_idx, rhs_idx}
-  //                other  → {x, w, scales,         lhs_idx, rhs_idx}
-  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override {
-    const auto& x = inputs[0];
-    const auto& w = inputs[1];
-    const auto& lhs_idx =
-        (mode_ == QuantizationMode::Affine) ? inputs[4] : inputs[3];
-    int w_outer = transpose_ ? w.shape(-2) : w.shape(-1) * 32 / bits_;
-    auto out_shape = lhs_idx.shape();
-    out_shape.push_back(x.shape(-2));
-    out_shape.push_back(w_outer);
-    return {out_shape};
-  }
+  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
 
   auto state() const {
     return std::make_tuple(

From 64c88026fe248ca033d516fd607997cee8112dc8 Mon Sep 17 00:00:00 2001
From: Angelos Katharopoulos <a_katharopoulos@apple.com>
Date: Thu, 28 May 2026 14:38:00 -0700
Subject: [PATCH 12/13] Add a test for GatherMM

---
 python/tests/test_compile.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/python/tests/test_compile.py b/python/tests/test_compile.py
index 0c3e49fa56..d39ddf3f18 100644
--- a/python/tests/test_compile.py
+++ b/python/tests/test_compile.py
@@ -505,17 +505,12 @@ def ones_fun(x):
         self.assertEqual(compiled_ones_like(y).shape, y_shape)
 
     def test_shapeless_compile_gather_qmm(self):
-        # GatherQMM must implement output_shapes() so shapeless compile can
-        # re-trace without throwing "GatherQMM cannot infer output shapes".
         K, N, num_experts = 64, 32, 4
 
         w = mx.random.normal((num_experts, N, K))
         qw, s, b = mx.quantize(w)
         mx.eval(qw, s, b)
 
-        # x has shape (num_experts, M, K): the batch dim is indexed by idx,
-        # which stays fixed so that lhs_indices and rhs_indices (auto-generated
-        # from w's batch shape) always broadcast.  Only M changes between calls.
         idx = mx.array([0, 1, 2, 3])
         x4 = mx.ones((num_experts, 4, K))
         x8 = mx.ones((num_experts, 8, K))
@@ -528,8 +523,24 @@ def fn(x):
         cfn = mx.compile(fn, shapeless=True)
 
         self.assertEqual(cfn(x4).shape, fn(x4).shape)
+        self.assertEqual(cfn(x8).shape, fn(x8).shape)
+
+    def test_shapeless_compile_gather_mm(self):
+        K, N, num_experts = 64, 32, 4
 
-        # Different M — must reuse compiled graph without throwing.
+        idx = mx.array([0, 1, 2, 3])
+        b = mx.random.normal((num_experts, K, N))
+        mx.eval(b)
+
+        x4 = mx.ones((num_experts, 4, K))
+        x8 = mx.ones((num_experts, 8, K))
+
+        def fn(x):
+            return mx.gather_mm(x, b, lhs_indices=idx, rhs_indices=idx)
+
+        cfn = mx.compile(fn, shapeless=True)
+
+        self.assertEqual(cfn(x4).shape, fn(x4).shape)
         self.assertEqual(cfn(x8).shape, fn(x8).shape)
 
     def test_compile_with_constant(self):

From 7af002f8d91b31d47cf666294c336374c0c28ec8 Mon Sep 17 00:00:00 2001
From: Angelos Katharopoulos <a_katharopoulos@apple.com>
Date: Thu, 28 May 2026 14:44:22 -0700
Subject: [PATCH 13/13] Remove blank lines

---
 mlx/primitives.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mlx/primitives.h b/mlx/primitives.h
index df2cfb8b65..5b8517c56d 100644
--- a/mlx/primitives.h
+++ b/mlx/primitives.h
@@ -1701,9 +1701,7 @@ class GatherQMM : public UnaryPrimitive {
   DEFINE_GRADS()
   DEFINE_NAME(GatherQMM)
   bool is_equivalent(const Primitive& other) const override;
-
   std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
-
   auto state() const {
     return std::make_tuple(
         group_size_, bits_, mode_, transpose_, left_sorted_, right_sorted_);