diff --git a/.gitignore b/.gitignore
index cb1fcae67d..9453cbc3e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,4 +22,5 @@ stubs/
 
 # Symlinks to compiled extensions
 deep_gemm/*.so
-deep_gemm/_C_build
\ No newline at end of file
+deep_gemm/_C_buildsgl_deep_gemm/_C.so
+sgl_deep_gemm/_C_build/
diff --git a/csrc/apis/mega.hpp b/csrc/apis/mega.hpp
index 8d5b9bd09b..37135a5244 100644
--- a/csrc/apis/mega.hpp
+++ b/csrc/apis/mega.hpp
@@ -8,6 +8,9 @@
 #endif
 #include "../jit/device_runtime.hpp"
 #include "../jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp"
+#include "../jit_kernels/impls/sm100_mega_moe_pre_dispatch.hpp"
+#include "../utils/math.hpp"
+#include "../utils/system.hpp"
 
 namespace deep_gemm::mega {
 
@@ -26,9 +29,33 @@ get_symm_buffer_size_for_mega_moe(
     // Workspace bytes
     const auto workspace = layout::Workspace(nullptr, num_ranks, num_experts, num_max_tokens_per_rank, num_topk);
 
+    // Stream A0.0b: when `DG_USE_FP4_ACTS=1`, the symmetric `x` slot and the
+    // L1 token pool both hold packed E2M1 (FP4) instead of dense E4M3 (FP8).
+    // The per-token byte footprint halves; the SF slot is unchanged
+    // (`hidden/32` UE8M0 bytes — same `gran_k=32` for FP4 and FP8 acts under
+    // `kind::mxf8f6f4`). The host-side flag is read from the env so the
+    // existing `use_fp8_dispatch` API surface (which is hardcoded `true`
+    // throughout) doesn't need to change to opt in.
+    const bool host_use_fp4_acts = get_env<int>("DG_USE_FP4_ACTS") != 0;
+    const int input_token_bytes = host_use_fp4_acts ? (hidden / 2) : hidden;
+
+    // Stream B (combine path): when `DG_USE_FP8_COMBINE=1`, the combine slot
+    // holds FP8 E4M3 (kHidden bytes/token) + a separate combine_sf slot
+    // holding UE8M0 SF bytes (kHidden/128 bytes/token, gran_k=128). When off,
+    // the combine slot holds BF16 (kHidden*2 bytes/token) and combine_sf is
+    // unused (zero-sized).
+    const bool host_use_fp8_combine = get_env<int>("DG_USE_FP8_COMBINE") != 0;
+    constexpr int kCombineGranK = 128;
+    const int combine_token_bytes = host_use_fp8_combine ? hidden : (hidden * 2);
+    const int combine_sf_bytes_per_token = host_use_fp8_combine ? (hidden / kCombineGranK) : 0;
+
     // Layouts
-    const auto fp8_token_layout = layout::Data(hidden);
-    const auto bf16_token_layout = layout::Data(hidden * 2);
+    const auto fp8_token_layout = layout::Data(input_token_bytes);
+    const auto combine_token_layout = layout::Data(combine_token_bytes);
+    // SF layout: bytes/token may not be a multiple of 16 (e.g. hidden=7168 →
+    // 7168/128=56 bytes), so disable TMA alignment requirement (the writes
+    // are 1-byte stores via `sym_buffer.map`, not TMA).
+    const auto combine_sf_layout = layout::Data(combine_sf_bytes_per_token, false);
     const auto fp8_intermediate_token_layout = layout::Data(intermediate_hidden);
     const auto fp8_sf_layout = layout::Data(hidden / 32);
     const auto fp8_intermediate_sf_layout = layout::Data(intermediate_hidden / 32);
@@ -79,10 +106,17 @@ get_symm_buffer_size_for_mega_moe(
         fp8_intermediate_sf_layout, 1, num_max_padded_sf_pool_tokens,
         l2_token_buffer.get_end_ptr());
 
-    // Combine input buffer: BF16 tokens for cross-rank combine
+    // Combine input buffer: BF16 tokens (default) OR FP8 (when host_use_fp8_combine)
+    // for cross-rank combine.
     const auto combine_token_buffer = layout::Buffer(
-        bf16_token_layout, num_topk, num_max_tokens_per_rank,
+        combine_token_layout, num_topk, num_max_tokens_per_rank,
         l2_sf_buffer.get_end_ptr());
+    // Combine SF buffer: only sized when host_use_fp8_combine (otherwise zero).
+    // Layout matches combine_token_buffer's [num_topk][num_max_tokens_per_rank]
+    // outer shape, with kHidden/128 SF bytes per token.
+    const auto combine_sf_buffer = layout::Buffer(
+        combine_sf_layout, num_topk, num_max_tokens_per_rank,
+        combine_token_buffer.get_end_ptr());
 
     // Check SF buffer requirements
     DG_HOST_ASSERT(hidden % 128 == 0 and intermediate_hidden % 128 == 0);
@@ -90,11 +124,17 @@ get_symm_buffer_size_for_mega_moe(
 
     // Slice function: creates `(x, x_sf, topk_weights, topk_idx, l1_acts, l1_acts_sf, l2_acts, l2_acts_sf)` tensor views from the raw buffer
     // NOTES: `x_sf` is K-major, while `l1_acts_sf` and `l2_acts_sf` are M-major
+    // Stream A0.0b: under `host_use_fp4_acts`, the `x` and `l1_acts` views
+    // expose packed E2M1 (`kPackedFP4` = `torch::kInt8`, 2 elements/byte) of
+    // shape `[..., hidden / 2]`. Underlying buffer bytes are the same as the
+    // sized `fp8_token_layout` slot, just half the row width.
+    const auto x_dtype = host_use_fp4_acts ? kPackedFP4 : torch::kFloat8_e4m3fn;
+    const int x_inner_cols = host_use_fp4_acts ? (hidden / 2) : hidden;
     auto slice_input_buffers = [=](const torch::Tensor& buffer) {
         auto x = torch::from_blob(
             math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(input_token_buffer.base)),
-            {num_max_tokens_per_rank, hidden},
-            torch::TensorOptions().dtype(torch::kFloat8_e4m3fn).device(buffer.device()));
+            {num_max_tokens_per_rank, x_inner_cols},
+            torch::TensorOptions().dtype(x_dtype).device(buffer.device()));
         auto x_sf = torch::from_blob(
             math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(input_sf_buffer.base)),
             {num_max_tokens_per_rank, hidden / 128},
@@ -109,8 +149,8 @@ get_symm_buffer_size_for_mega_moe(
             torch::TensorOptions().dtype(torch::kFloat32).device(buffer.device()));
         auto l1_acts = torch::from_blob(
             math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(l1_token_buffer.base)),
-            {num_max_pool_tokens, hidden},
-            torch::TensorOptions().dtype(torch::kFloat8_e4m3fn).device(buffer.device()));
+            {num_max_pool_tokens, x_inner_cols},
+            torch::TensorOptions().dtype(x_dtype).device(buffer.device()));
         auto l1_acts_sf = torch::from_blob(
             math::advance_ptr(buffer.data_ptr(), reinterpret_cast<int64_t>(l1_sf_buffer.base)),
             {num_max_padded_sf_pool_tokens, hidden / 128},
@@ -127,7 +167,7 @@ get_symm_buffer_size_for_mega_moe(
             torch::TensorOptions().dtype(torch::kInt).device(buffer.device()));
         return std::make_tuple(x, x_sf, topk_idx, topk_weights, l1_acts, l1_acts_sf, l2_acts, l2_acts_sf);
     };
-    return {reinterpret_cast<int64_t>(combine_token_buffer.get_end_ptr()), slice_input_buffers};
+    return {reinterpret_cast<int64_t>(combine_sf_buffer.get_end_ptr()), slice_input_buffers};
 }
 
 static void fp8_fp4_mega_moe(
@@ -200,6 +240,26 @@ static void fp8_fp4_mega_moe(
     // Already registered tensors
     const auto [x, x_sf, topk_idx, topk_weights, l1_acts, l1_acts_sf, l2_acts, l2_acts_sf] = slice(sym_buffer);
 
+    // Stream A0.1: pick up FP4-acts flag from `DG_USE_FP4_ACTS` env var.
+    // Default off — preserves byte-identical FP8-acts behavior. Setting
+    // `DG_USE_FP4_ACTS=1` flips L1's epilogue quant to E2M1 + UE8M0 SF.
+    const bool use_fp4_acts = get_env<int>("DG_USE_FP4_ACTS") != 0;
+    // Stream A0.5: when also `DG_USE_MXF4_KIND=1`, the L1 and L2 mainloops
+    // run `tcgen05.mma.kind::mxf4.block_scale.block32` instead of
+    // `kind::mxf8f6f4` — K=64 dense per call (vs K=32 with-padding), dense
+    // FP4 smem (`_ALIGN8B`, half the byte footprint), scale_vec::2X SF
+    // protocol with HALF-WORD address bits. Only honored when
+    // `DG_USE_FP4_ACTS=1` (kind::mxf4 is FP4-only). See A6 capstone /
+    // B2 standalone GEMM for the +20-22% headline.
+    const bool use_mxf4_kind = use_fp4_acts and get_env<int>("DG_USE_MXF4_KIND") != 0;
+    // Stream B (combine path): when `DG_USE_FP8_COMBINE=1`, the L2 epilogue
+    // ships FP8 E4M3 + per-(token, N=128) UE8M0 SF over NVLink instead of
+    // BF16. The combine reduce dequantizes on the fly. NVLink bytes/token
+    // halve (from kHidden*2 → kHidden + kHidden/128). Independent of the
+    // FP4-acts / MXF4-kind flags above (those control the dispatch a2a +
+    // mainloops; this controls the combine a2a only).
+    const bool use_fp8_combine = get_env<int>("DG_USE_FP8_COMBINE") != 0;
+
     // Dispatch into different architectures
     if (arch_major == 10) {
         sm100_fp8_fp4_mega_moe(y,
@@ -213,7 +273,8 @@ static void fp8_fp4_mega_moe(
                                num_experts_per_rank,
                                num_tokens, num_topk,
                                hidden, intermediate_hidden,
-                               activation_clamp, fast_math);
+                               activation_clamp, fast_math,
+                               use_fp4_acts, use_mxf4_kind, use_fp8_combine);
     } else {
         DG_HOST_UNREACHABLE("Unsupported architecture");
     }
@@ -230,6 +291,17 @@ static void register_apis(pybind11::module_& m) {
     m.def("get_token_alignment_for_mega_moe", &get_token_alignment_for_mega_moe);
     m.def("get_symm_buffer_size_for_mega_moe", &get_symm_buffer_size_for_mega_moe);
     m.def("fp8_fp4_mega_moe", &fp8_fp4_mega_moe);
+    m.def("mega_moe_pre_dispatch", &mega_moe_pre_dispatch,
+          pybind11::arg("x"),
+          pybind11::arg("topk_idx"),
+          pybind11::arg("topk_weights"),
+          pybind11::arg("buf_x"),
+          pybind11::arg("buf_x_sf"),
+          pybind11::arg("buf_topk_idx"),
+          pybind11::arg("buf_topk_weights"),
+          pybind11::arg("num_tokens"),
+          pybind11::arg("group_size") = 32,
+          pybind11::arg("use_fp4_acts") = false);
 #endif
 }
 
diff --git a/csrc/jit_kernels/heuristics/mega_moe.hpp b/csrc/jit_kernels/heuristics/mega_moe.hpp
index b1ba6bd70c..8ddf58c3a1 100644
--- a/csrc/jit_kernels/heuristics/mega_moe.hpp
+++ b/csrc/jit_kernels/heuristics/mega_moe.hpp
@@ -58,12 +58,18 @@ struct MegaMoEConfig {
 static std::tuple<int, int, int, int> get_block_config_for_mega_moe(
     const int& num_ranks, const int& num_experts,
     const int& num_max_tokens_per_rank, const int& num_topk,
-    const int& num_tokens) {
+    const int& num_tokens,
+    const bool& use_mxf4_kind = false) {
     const auto& [cluster_size, block_m, store_block_m, num_epilogue_warpgroups] = [&]() -> std::tuple<int, int, int, int> {
         float num_expected_tokens_per_expert = static_cast<float>(num_tokens) * num_ranks * num_topk / num_experts;
         if (num_expected_tokens_per_expert <= 8.5) {
-            // Really small token-per-expert (e.g. RL long-tail rollout), use the smallest block_m
-            return {2, 16, 8, 2};
+            // Really small token-per-expert (e.g. RL long-tail rollout), use the smallest block_m.
+            // Under kind::mxf4, smem_a_per_stage = load_block_m * block_k / 2 must be a
+            // multiple of the 1024-byte smem alignment; load_block_m=8 (= block_m/2 for
+            // block_m=16) gives 512B which fails the static assert. Bump to block_m=32
+            // (load_block_m=16 → smem_a_per_stage=1024B) for the MXF4 path only.
+            return use_mxf4_kind ? std::tuple<int, int, int, int>{2, 32, 16, 2}
+                                 : std::tuple<int, int, int, int>{2, 16, 8, 2};
         } else if (num_expected_tokens_per_expert <= 16.5) {
             // Small batch size, small EP, decoding, e.g. 6/384 experts, EP8, bsz 128
             return {2, 32, 16, 2};
@@ -127,7 +133,11 @@ static std::pair<int, int> get_pipeline_config_for_mega_moe(
     const int& num_experts, const int& hidden,
     const int& block_m, const int& block_n, const int& block_k, const int& store_block_m,
     const int& sf_block_m, const int& sf_block_n,
-    const int& num_dispatch_warps, const int& num_epilogue_warps) {
+    const int& num_dispatch_warps, const int& num_epilogue_warps,
+    // Stream A0.5: under `use_mxf4_kind`, A and B smem use the dense FP4
+    // layout (`_ALIGN8B`, 2 nibbles/byte). Per-stage byte footprint halves
+    // for both A and B → num_stages doubles for the same smem budget.
+    const bool& use_mxf4_kind = false) {
     constexpr int kSmemAlignment = 1024;
     constexpr int kNumEpilogueStages = 2;
     constexpr int kNumTMAStoreStages = 2;
@@ -162,8 +172,13 @@ static std::pair<int, int> get_pipeline_config_for_mega_moe(
     const int smem_sfa_per_stage = sf_block_m * 4;
     const int smem_sfb_per_stage = sf_block_n * 4;
 
-    // Per-stage: A tile + B tile + SFA tile + SFB tile + full/empty barriers
-    const int smem_per_stage = load_block_m * block_k + block_n * block_k + smem_sfa_per_stage + smem_sfb_per_stage + 2 * 8;
+    // Per-stage: A tile + B tile + SFA tile + SFB tile + full/empty barriers.
+    // Stream A0.5: dense FP4 (mxf4) halves both A and B byte footprints.
+    const int smem_a_per_stage = use_mxf4_kind ? (load_block_m * block_k / 2)
+                                               : (load_block_m * block_k);
+    const int smem_b_per_stage = use_mxf4_kind ? (block_n * block_k / 2)
+                                               : (block_n * block_k);
+    const int smem_per_stage = smem_a_per_stage + smem_b_per_stage + smem_sfa_per_stage + smem_sfb_per_stage + 2 * 8;
 
     // Fixed total
     const int smem_fixed = smem_dispatch_size + smem_cd + smem_amax_reduction + smem_barriers + smem_tmem_ptr;
@@ -179,10 +194,11 @@ static MegaMoEConfig get_mega_moe_config(
     const int& num_ranks, const int& num_experts, const int& num_experts_per_rank,
     const int& num_max_tokens_per_rank, const int& num_tokens, const int& num_topk,
     const int& hidden, const int& intermediate_hidden,
-    const int& num_padded_sf_pool_tokens) {
+    const int& num_padded_sf_pool_tokens,
+    const bool& use_mxf4_kind = false) {
     // Block config
     const auto [cluster_size, block_m, store_block_m, num_epilogue_threads] =
-        get_block_config_for_mega_moe(num_ranks, num_experts, num_max_tokens_per_rank, num_topk, num_tokens);
+        get_block_config_for_mega_moe(num_ranks, num_experts, num_max_tokens_per_rank, num_topk, num_tokens, use_mxf4_kind);
     const int block_n = 128;
     const int block_k = 128;
     const int load_block_m = block_m / 2;
@@ -210,7 +226,8 @@ static MegaMoEConfig get_mega_moe_config(
         num_experts, hidden,
         block_m, block_n, block_k, store_block_m,
         sf_block_m, sf_block_n,
-        num_dispatch_threads / 32, num_epilogue_threads / 32);
+        num_dispatch_threads / 32, num_epilogue_threads / 32,
+        use_mxf4_kind);
 
     const auto config = MegaMoEConfig {
         block_m, block_n, block_k,
diff --git a/csrc/jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp b/csrc/jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp
index 1f5a413f91..1db1b621fd 100644
--- a/csrc/jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp
+++ b/csrc/jit_kernels/impls/sm100_fp8_fp4_mega_moe.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/torch.h>
+#include <torch/python.h>
 
 #include "../../jit/compiler.hpp"
 #include "../../jit/kernel_runtime.hpp"
@@ -25,6 +25,18 @@ class SM100FP8FP4MegaMoERuntime final : public LaunchRuntime<SM100FP8FP4MegaMoER
         int num_ranks;
         float activation_clamp;
         bool fast_math;
+        // Stream A0.1: enable FP4 (E2M1) activations from L1 epilogue.
+        // Default false — keeps the FP8-acts baseline byte-identical.
+        bool use_fp4_acts;
+        // Stream A0.5: when set, run kind::mxf4 (K=64 dense) instead of
+        // kind::mxf8f6f4 (K=32 with-padding) for both L1 and L2 mainloops.
+        // Only honored when `use_fp4_acts` is also set.
+        bool use_mxf4_kind;
+        // Stream B (combine path): when set, the L2 epilogue writes FP8 E4M3
+        // + per-(token,N=128) UE8M0 SF over NVLink instead of BF16. The
+        // combine reduce dequantizes on the fly. Halves NVLink bytes/token
+        // on the second a2a (kHidden + kHidden/128 vs kHidden*2).
+        bool use_fp8_combine;
         MegaMoEConfig config;
 
         // Runtime arguments
@@ -69,6 +81,9 @@ static void __instantiate_kernel() {{
         {}, {}, {},
         {}, {},
         {},
+        {},
+        {},
+        {},
         {}
     >);
 }};
@@ -85,7 +100,10 @@ static void __instantiate_kernel() {{
     args.config.num_dispatch_threads, args.config.num_non_epilogue_threads, args.config.num_epilogue_threads,
     args.launch_args.grid_dim.first, args.num_ranks,
     to_string(args.activation_clamp),
-    args.fast_math ? "true" : "false");
+    args.fast_math ? "true" : "false",
+    args.use_fp4_acts ? "true" : "false",
+    args.use_mxf4_kind ? "true" : "false",
+    args.use_fp8_combine ? "true" : "false");
     }
 
     static void launch_impl(const KernelHandle& kernel, const LaunchConfigHandle& config, Args args) {
@@ -121,24 +139,57 @@ static void sm100_fp8_fp4_mega_moe(
     const int& num_tokens, const int& num_topk,
     const int& hidden, const int& intermediate_hidden,
     const float& activation_clamp,
-    const bool& fast_math
+    const bool& fast_math,
+    const bool& use_fp4_acts = false,
+    const bool& use_mxf4_kind = false,
+    const bool& use_fp8_combine = false
 ) {
     const auto num_ranks = static_cast<int>(sym_buffer_ptrs.size());
     const auto num_experts = num_experts_per_rank * num_ranks;
     const auto num_padded_sf_pool_tokens = static_cast<int>(l1_acts_sf.size(0));
+    // Stream A0.5 sanity: kind::mxf4 only accepts FP4 inputs.
+    DG_HOST_ASSERT(not use_mxf4_kind or use_fp4_acts);
 
     // Heuristics
     const auto config = get_mega_moe_config(
         num_ranks, num_experts, num_experts_per_rank,
-        num_max_tokens_per_rank, num_tokens, num_topk, hidden, intermediate_hidden, num_padded_sf_pool_tokens);
+        num_max_tokens_per_rank, num_tokens, num_topk, hidden, intermediate_hidden, num_padded_sf_pool_tokens,
+        use_mxf4_kind);
 
     // Make tensormap
     constexpr int kGranK = 32;
+    // Stream A0.5: when `use_mxf4_kind` is on, BOTH L1 and L2 acts AND
+    // weights TMA descriptors switch from `_ALIGN16B` (FP4 with-padding,
+    // 8 data + 8 pad bytes per 16-byte atom) to `_ALIGN8B` (dense FP4,
+    // 2 nibbles/byte). The smem byte stride per K-row halves accordingly,
+    // and swizzle mode halves to match (128B → 64B). The gmem layout is
+    // unchanged — the underlying `l1_acts` / `l1_weights` storage is still
+    // packed FP4 nibbles; only how TMA expands them into smem changes.
+    const bool fp4_unpacked = not use_mxf4_kind;
+    const int swizzle_acts = use_mxf4_kind ? config.swizzle_acts_mode / 2
+                                           : config.swizzle_acts_mode;
+    const int swizzle_weights = use_mxf4_kind ? config.swizzle_weights_mode / 2
+                                              : config.swizzle_weights_mode;
+    // Stream A0.0b: when `use_fp4_acts` is on, the L1 token pool buffer
+    // (`l1_acts`) is already viewed as `kPackedFP4` (int8) by the symm-buffer
+    // slice (see `csrc/apis/mega.hpp`), with shape `[num_pool_tokens, hidden/2]`
+    // of packed E2M1 (low nibble = even col, high nibble = odd col).
+    // `make_tma_2d_desc` then auto-selects `CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`
+    // via `aten_dtype_to_tensor_map_dtype` (runtime_utils.hpp:84-87) — or
+    // `_ALIGN8B` under `use_mxf4_kind` (Stream A0.5).
+    //
+    // TMA descriptor: `gmem_inner_dim = hidden` U4 elements (the descriptor
+    // reads `hidden/2` storage bytes per row); smem inner box `BLOCK_K = 128`
+    // elements expands to 128 smem bytes after `_ALIGN16B`. 128 B swizzle
+    // matches the production swizzle_acts_mode (same as B weights, which
+    // have used `_ALIGN16B` from day one).
     const auto tensor_map_l1_acts = make_tma_2d_desc(l1_acts,
                                                      hidden, config.num_max_pool_tokens,
                                                      config.block_k, config.load_block_m,
                                                      static_cast<int>(l1_acts.stride(-2)),
-                                                     config.swizzle_acts_mode);
+                                                     swizzle_acts, /*swizzle_base=*/0,
+                                                     /*allow_tf32=*/false,
+                                                     /*fp4_unpacked_smem=*/fp4_unpacked);
     const auto tensor_map_l1_acts_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l1_acts_sf,
                                                         config.num_padded_sf_pool_tokens, hidden,
                                                         config.sf_block_m, kGranK,
@@ -147,7 +198,9 @@ static void sm100_fp8_fp4_mega_moe(
                                                         hidden, num_experts_per_rank * intermediate_hidden * 2,
                                                         config.block_k, config.load_block_n,
                                                         static_cast<int>(l1_weights.stride(-2)),
-                                                        config.swizzle_weights_mode);
+                                                        swizzle_weights, /*swizzle_base=*/0,
+                                                        /*allow_tf32=*/false,
+                                                        /*fp4_unpacked_smem=*/fp4_unpacked);
     const auto tensor_map_l1_weights_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l1_weights_sf,
                                                            intermediate_hidden * 2, hidden,
                                                            config.block_n, kGranK,
@@ -155,16 +208,64 @@ static void sm100_fp8_fp4_mega_moe(
     // NOTES: L1 output and L2 activations are essentially the same tensor.
     // Post-SwiGLU output has half the N width (`BLOCK_N / 2` per input tile),
     // so the swizzle mode is also halved (128 -> 64).
-    const auto tensor_map_l1_output = make_tma_2d_desc(l2_acts,
-                                                       intermediate_hidden, config.num_max_pool_tokens,
-                                                       config.block_n / 2, config.store_block_m,
-                                                       static_cast<int>(l2_acts.stride(-2)),
-                                                       config.swizzle_acts_mode / 2);
-    const auto tensor_map_l2_acts = make_tma_2d_desc(l2_acts,
-                                                     intermediate_hidden, config.num_max_pool_tokens,
-                                                     config.block_k, config.load_block_m,
-                                                     static_cast<int>(l2_acts.stride(-2)),
-                                                     config.swizzle_acts_mode);
+    //
+    // Stream A0.2: when `use_fp4_acts` is on, the L1 epilogue emits packed
+    // E2M1 (FP4) where each byte holds 2 elements. The kernel writes a
+    // **dense canonical** smem layout (no swizzle XOR) — see the FP4 store
+    // branch in `sm100_fp8_fp4_mega_moe.cuh`. To match, we build the L1
+    // output TMA descriptor with `swizzle = 0`. The gmem result is the
+    // canonical `[M, intermediate_hidden / 2]` packed FP4 layout, byte-
+    // identical to what `kernels/fused_gemm_swiglu_fp4_quant_1cta` produces
+    // (Stream A2). The L2 reader (built below) consumes this same canonical
+    // layout via `_ALIGN16B`. The per-row gmem byte footprint halves
+    // (`intermediate_hidden / 2` bytes vs `intermediate_hidden` for FP8);
+    // outer stride in the underlying buffer is unchanged.
+    const auto tensor_map_l1_output = use_fp4_acts
+        ? make_tma_2d_desc(l2_acts,
+                           intermediate_hidden / 2, config.num_max_pool_tokens,
+                           config.block_n / 4, config.store_block_m,
+                           static_cast<int>(l2_acts.stride(-2)),
+                           /*swizzle_mode=*/0)
+        : make_tma_2d_desc(l2_acts,
+                           intermediate_hidden, config.num_max_pool_tokens,
+                           config.block_n / 2, config.store_block_m,
+                           static_cast<int>(l2_acts.stride(-2)),
+                           config.swizzle_acts_mode / 2);
+    // Stream A0.2: when FP4 acts on, L2 reads packed E2M1 via `_ALIGN16B`.
+    // `make_tma_2d_desc` selects the descriptor dtype from the source
+    // tensor's `scalar_type`; `l2_acts` is allocated as FP8 (1 byte/elem).
+    // For the FP4 path we re-view the same byte buffer as `kPackedFP4` so
+    // the descriptor dtype is `CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`.
+    //
+    // gmem layout (FP4 path, set up by L1 epilogue):
+    //   - per row: first `intermediate_hidden / 2` bytes are packed E2M1
+    //     (low nibble = even col, high nibble = odd col — canonical MXFP4),
+    //     remaining bytes in the row are stale FP8 from prior runs.
+    //   - row stride: `l2_acts.stride(-2)` source bytes (= same as FP8
+    //     because the buffer view's underlying allocation hasn't changed).
+    //
+    // TMA descriptor tells the hardware:
+    //   - `gmem_inner_dim = intermediate_hidden` U4 elements (=
+    //     `intermediate_hidden / 2` source bytes are read per row).
+    //   - `gmem_outer_stride = stride(-2)` source bytes (the actual storage
+    //     row pitch — leaves the unused tail of each FP8-sized row alone).
+    //   - smem inner box = `BLOCK_K = 128` elements (= 64 source bytes per
+    //     row, expands to 128 smem bytes after `_ALIGN16B` doubling); 128B
+    //     swizzle aligns with the per-stage atom (same as B-side, which has
+    //     used this layout for FP4 weights from day one).
+    const auto tensor_map_l2_acts = use_fp4_acts
+        ? make_tma_2d_desc(l2_acts.view(kPackedFP4),
+                           intermediate_hidden, config.num_max_pool_tokens,
+                           config.block_k, config.load_block_m,
+                           static_cast<int>(l2_acts.stride(-2)),
+                           swizzle_acts, /*swizzle_base=*/0,
+                           /*allow_tf32=*/false,
+                           /*fp4_unpacked_smem=*/fp4_unpacked)
+        : make_tma_2d_desc(l2_acts,
+                           intermediate_hidden, config.num_max_pool_tokens,
+                           config.block_k, config.load_block_m,
+                           static_cast<int>(l2_acts.stride(-2)),
+                           config.swizzle_acts_mode);
     const auto tensor_map_l2_acts_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l2_acts_sf,
                                                         config.num_padded_sf_pool_tokens, intermediate_hidden,
                                                         config.sf_block_m, kGranK,
@@ -173,7 +274,9 @@ static void sm100_fp8_fp4_mega_moe(
                                                         intermediate_hidden, num_experts_per_rank * hidden,
                                                         config.block_k, config.load_block_n,
                                                         static_cast<int>(l2_weights.stride(-2)),
-                                                        config.swizzle_weights_mode);
+                                                        swizzle_weights, /*swizzle_base=*/0,
+                                                        /*allow_tf32=*/false,
+                                                        /*fp4_unpacked_smem=*/fp4_unpacked);
     const auto tensor_map_l2_weights_sf = make_tma_sf_desc(cute::UMMA::Major::MN, l2_weights_sf,
                                                            hidden, intermediate_hidden,
                                                            config.block_n, kGranK,
@@ -193,6 +296,9 @@ static void sm100_fp8_fp4_mega_moe(
         .num_ranks = num_ranks,
         .activation_clamp = activation_clamp,
         .fast_math = fast_math,
+        .use_fp4_acts = use_fp4_acts,
+        .use_mxf4_kind = use_mxf4_kind,
+        .use_fp8_combine = use_fp8_combine,
         .config = config,
         .y = y.data_ptr(),
         .cumulative_local_expert_recv_stats = cumulative_local_expert_recv_stats_ptr,
diff --git a/csrc/jit_kernels/impls/sm100_mega_moe_pre_dispatch.hpp b/csrc/jit_kernels/impls/sm100_mega_moe_pre_dispatch.hpp
new file mode 100644
index 0000000000..9d6c347401
--- /dev/null
+++ b/csrc/jit_kernels/impls/sm100_mega_moe_pre_dispatch.hpp
@@ -0,0 +1,175 @@
+#pragma once
+
+#include <torch/python.h>
+
+#include "../../jit/compiler.hpp"
+#include "../../jit/device_runtime.hpp"
+#include "../../jit/kernel_runtime.hpp"
+#include "../../utils/exception.hpp"
+#include "../../utils/format.hpp"
+#include "../../utils/math.hpp"
+
+namespace deep_gemm {
+
+// JIT runtime for `sm100_mega_moe_pre_dispatch` (see
+// `deep_gemm/include/deep_gemm/impls/sm100_mega_moe_pre_dispatch.cuh`).
+// Templated on (kGroupSize, kUseFp4Acts, kUsePDL); host fn picks the
+// instantiation from explicit args.
+class SM100MegaMoEPreDispatchRuntime final : public LaunchRuntime<SM100MegaMoEPreDispatchRuntime> {
+public:
+    struct Args {
+        int group_size;
+        bool use_fp4_acts;
+        bool use_pdl;
+
+        // Runtime args (passed to the kernel via the params struct).
+        const void* x;
+        const void* topk_idx;
+        const void* topk_weights;
+        void*       buf_x;
+        void*       buf_x_sf;
+        void*       buf_topk_idx;
+        void*       buf_topk_weights;
+        uint32_t    num_tokens;
+        uint32_t    padded_max;
+        uint32_t    hidden;
+        uint32_t    num_groups;
+        uint32_t    top_k;
+
+        LaunchArgs launch_args;
+    };
+
+    static std::string generate_impl(const Args& args) {
+        return fmt::format(R"(
+#include <deep_gemm/impls/sm100_mega_moe_pre_dispatch.cuh>
+
+using namespace deep_gemm;
+
+static void __instantiate_kernel() {{
+    auto ptr = reinterpret_cast<void*>(&mega_moe_pre_dispatch_kernel<
+        {}, {}, {}
+    >);
+}};
+)", args.group_size,
+    args.use_fp4_acts ? "true" : "false",
+    args.use_pdl ? "true" : "false");
+    }
+
+    static void launch_impl(const KernelHandle& kernel, const LaunchConfigHandle& config, Args args) {
+        DG_CUDA_UNIFIED_CHECK(launch_kernel(kernel, config,
+            args.x, args.topk_idx, args.topk_weights,
+            args.buf_x, args.buf_x_sf, args.buf_topk_idx, args.buf_topk_weights,
+            args.num_tokens, args.padded_max, args.hidden, args.num_groups, args.top_k));
+    }
+};
+
+// Host entry point. Layout contract (matches DeepGEMM's mega symm buffer):
+//   - x:            (M, H) bf16, contiguous.
+//   - topk_idx:     (M, K) int32, contiguous.
+//   - topk_weights: (M, K) float, contiguous.
+//   - buf_x:        (P, H) fp8_e4m3 if !use_fp4_acts, else (P, H/2) int8 (packed FP4).
+//   - buf_x_sf:     (P, G/4) int32, contiguous; G = H / group_size; each int32
+//                   stores 4 UE8M0 bytes row-major.
+//   - buf_topk_idx: (P, K) int64.
+//   - buf_topk_weights: (P, K) float.
+//
+// Pad-fill: rows in [num_tokens, padded_max) of buf_topk_idx / buf_topk_weights
+// are filled with (-1, 0). buf_x and buf_x_sf rows in that range are NOT
+// touched (the kernel only writes valid-token rows; pad rows must have been
+// pre-zeroed by the caller if they need defined values).
+static void mega_moe_pre_dispatch(
+    const torch::Tensor& x,
+    const torch::Tensor& topk_idx,
+    const torch::Tensor& topk_weights,
+    const torch::Tensor& buf_x,
+    const torch::Tensor& buf_x_sf,
+    const torch::Tensor& buf_topk_idx,
+    const torch::Tensor& buf_topk_weights,
+    const int& num_tokens,
+    const int& group_size,
+    const bool& use_fp4_acts) {
+    DG_HOST_ASSERT(group_size == 32 || group_size == 64 || group_size == 128);
+    DG_HOST_ASSERT(x.scalar_type() == torch::kBFloat16);
+    DG_HOST_ASSERT(x.is_contiguous());
+    DG_HOST_ASSERT(topk_idx.scalar_type() == torch::kInt32);
+    DG_HOST_ASSERT(topk_weights.scalar_type() == torch::kFloat);
+    DG_HOST_ASSERT(topk_idx.is_contiguous() && topk_weights.is_contiguous());
+    DG_HOST_ASSERT(x.dim() == 2 && topk_idx.dim() == 2 && topk_weights.dim() == 2);
+    DG_HOST_ASSERT(buf_x.dim() == 2 && buf_x_sf.dim() == 2);
+    DG_HOST_ASSERT(buf_topk_idx.dim() == 2 && buf_topk_weights.dim() == 2);
+    DG_HOST_ASSERT(buf_topk_idx.scalar_type() == torch::kInt64);
+    DG_HOST_ASSERT(buf_topk_weights.scalar_type() == torch::kFloat);
+    DG_HOST_ASSERT(buf_x_sf.scalar_type() == torch::kInt);
+    DG_HOST_ASSERT(buf_x_sf.is_contiguous());
+
+    const auto m = static_cast<int>(x.size(0));
+    const auto hidden = static_cast<int>(x.size(1));
+    const auto top_k = static_cast<int>(topk_idx.size(1));
+    const auto padded_max = static_cast<int>(buf_x.size(0));
+
+    DG_HOST_ASSERT(num_tokens == m);
+    DG_HOST_ASSERT(num_tokens <= padded_max);
+    DG_HOST_ASSERT(static_cast<int>(topk_idx.size(0)) == m);
+    DG_HOST_ASSERT(static_cast<int>(topk_weights.size(0)) == m);
+    DG_HOST_ASSERT(static_cast<int>(topk_weights.size(1)) == top_k);
+    DG_HOST_ASSERT(static_cast<int>(buf_topk_idx.size(0)) == padded_max);
+    DG_HOST_ASSERT(static_cast<int>(buf_topk_idx.size(1)) == top_k);
+    DG_HOST_ASSERT(static_cast<int>(buf_topk_weights.size(0)) == padded_max);
+    DG_HOST_ASSERT(static_cast<int>(buf_topk_weights.size(1)) == top_k);
+
+    DG_HOST_ASSERT(hidden % group_size == 0);
+    const auto num_groups = hidden / group_size;
+    DG_HOST_ASSERT(num_groups % 4 == 0);
+    DG_HOST_ASSERT(static_cast<int>(buf_x_sf.size(0)) == padded_max);
+    DG_HOST_ASSERT(static_cast<int>(buf_x_sf.size(1)) == num_groups / 4);
+
+    if (use_fp4_acts) {
+        // Packed FP4: (P, hidden/2) bytes. The symm-buffer slice views this
+        // as kPackedFP4 (int8); accept either int8 / uint8 / float8_e4m3fn
+        // re-views since callers may bind the slot differently.
+        DG_HOST_ASSERT(static_cast<int>(buf_x.size(1)) == hidden / 2);
+        DG_HOST_ASSERT(buf_x.element_size() == 1);
+    } else {
+        DG_HOST_ASSERT(buf_x.scalar_type() == torch::kFloat8_e4m3fn);
+        DG_HOST_ASSERT(static_cast<int>(buf_x.size(1)) == hidden);
+    }
+
+    DG_HOST_ASSERT(hidden % 8 == 0);
+    const auto num_threads = hidden / 8;
+    DG_HOST_ASSERT(num_threads <= 1024);
+    DG_HOST_ASSERT(num_threads >= top_k);
+
+    const auto pad_slots = (padded_max - num_tokens) * top_k;
+    const auto num_pad_blocks = pad_slots == 0 ? 0
+        : math::ceil_div(pad_slots, num_threads);
+    const auto num_total_blocks = num_tokens + num_pad_blocks;
+    if (num_total_blocks == 0) return;
+
+    const bool use_pdl = device_runtime->get_pdl();
+
+    SM100MegaMoEPreDispatchRuntime::Args args = {
+        .group_size = group_size,
+        .use_fp4_acts = use_fp4_acts,
+        .use_pdl = use_pdl,
+        .x = x.const_data_ptr(),
+        .topk_idx = topk_idx.const_data_ptr(),
+        .topk_weights = topk_weights.const_data_ptr(),
+        .buf_x = buf_x.data_ptr(),
+        .buf_x_sf = buf_x_sf.data_ptr(),
+        .buf_topk_idx = buf_topk_idx.data_ptr(),
+        .buf_topk_weights = buf_topk_weights.data_ptr(),
+        .num_tokens = static_cast<uint32_t>(num_tokens),
+        .padded_max = static_cast<uint32_t>(padded_max),
+        .hidden = static_cast<uint32_t>(hidden),
+        .num_groups = static_cast<uint32_t>(num_groups),
+        .top_k = static_cast<uint32_t>(top_k),
+        .launch_args = LaunchArgs(num_total_blocks, num_threads, /*smem_size=*/0,
+                                  /*cluster_dim=*/1, /*enable_pdl=*/use_pdl)
+    };
+
+    const auto code = SM100MegaMoEPreDispatchRuntime::generate(args);
+    const auto runtime = compiler->build("sm100_mega_moe_pre_dispatch", code);
+    SM100MegaMoEPreDispatchRuntime::launch(runtime, args);
+}
+
+} // namespace deep_gemm
diff --git a/csrc/tvm_ffi_api.cpp b/csrc/tvm_ffi_api.cpp
index b992c05b8d..a029070792 100644
--- a/csrc/tvm_ffi_api.cpp
+++ b/csrc/tvm_ffi_api.cpp
@@ -579,9 +579,30 @@ void dg_fp8_fp4_mega_moe(TensorView y, TensorView l1_weights, TensorView l1_weig
     );
 }
 
+
+void dg_mega_moe_pre_dispatch(
+    TensorView x, TensorView topk_idx, TensorView topk_weights,
+    TensorView buf_x, TensorView buf_x_sf,
+    TensorView buf_topk_idx, TensorView buf_topk_weights,
+    int64_t num_tokens, int64_t group_size, bool use_fp4_acts) {
+    mega_moe_pre_dispatch(
+        convert_to_torch_tensor(x),
+        convert_to_torch_tensor(topk_idx),
+        convert_to_torch_tensor(topk_weights),
+        convert_to_torch_tensor(buf_x),
+        convert_to_torch_tensor(buf_x_sf),
+        convert_to_torch_tensor(buf_topk_idx),
+        convert_to_torch_tensor(buf_topk_weights),
+        static_cast<int>(num_tokens),
+        static_cast<int>(group_size),
+        use_fp4_acts
+    );
+}
+
 TVM_FFI_DLL_EXPORT_TYPED_FUNC(get_token_alignment_for_mega_moe, dg_get_token_alignment_for_mega_moe);
 TVM_FFI_DLL_EXPORT_TYPED_FUNC(get_symm_buffer_size_for_mega_moe, dg_get_symm_buffer_size_for_mega_moe);
 TVM_FFI_DLL_EXPORT_TYPED_FUNC(fp8_fp4_mega_moe, dg_fp8_fp4_mega_moe);
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(mega_moe_pre_dispatch, dg_mega_moe_pre_dispatch);
 
 
 #endif  // DG_FP8_COMPATIBLE and DG_TENSORMAP_COMPATIBLE
diff --git a/csrc/utils/torch_compat.hpp b/csrc/utils/torch_compat.hpp
index 14fd399d17..c9199ede95 100644
--- a/csrc/utils/torch_compat.hpp
+++ b/csrc/utils/torch_compat.hpp
@@ -131,6 +131,20 @@ inline torch::Tensor convert_to_torch_tensor(tvm::ffi::TensorView tensor) {
                     .device(torch::kCUDA, device_id)
                     .requires_grad(false);
 
+    // Zero-numel tensors may carry a nullptr data_ptr, which trips
+    // torch::from_blob() getDeviceFromPtr() host-vs-device check.
+    // Allocate a fresh empty CUDA tensor in that case: any kernel reading
+    // through it must already guard on the zero-size dimension.
+    int64_t numel = 1;
+    for (auto s : sizes) numel *= s;
+    if (numel == 0 || data == nullptr) {
+        if (tensor.strides().data()) {
+            auto strides = std::vector<int64_t>(tensor.strides().begin(), tensor.strides().end());
+            return torch::empty_strided(sizes, strides, opts);
+        }
+        return torch::empty(sizes, opts);
+    }
+
     if (tensor.strides().data()) {
         auto strides = std::vector<int64_t>(tensor.strides().begin(), tensor.strides().end());
         return torch::from_blob(data, sizes, strides, opts);
diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py
index 2508501a1a..6c0f6a769c 100644
--- a/deep_gemm/__init__.py
+++ b/deep_gemm/__init__.py
@@ -263,6 +263,7 @@ def m_grouped_fp8_fp4_gemm_nt_masked(a, b, d, masked_m, expected_m, recipe=None,
     get_symm_buffer_for_mega_moe,
     transform_weights_for_mega_moe,
     fp8_fp4_mega_moe,
+    mega_moe_pre_dispatch,
 )
 
 # Some utils
diff --git a/deep_gemm/include/deep_gemm/common/math.cuh b/deep_gemm/include/deep_gemm/common/math.cuh
index 0f0d250481..6d5ece847e 100644
--- a/deep_gemm/include/deep_gemm/common/math.cuh
+++ b/deep_gemm/include/deep_gemm/common/math.cuh
@@ -98,6 +98,54 @@ CUTLASS_DEVICE void get_e4m3_sf_and_sf_inv(const float2& amax, float2& sf, float
     sf.y = fast_pow2(exp_y), sf_inv.y = fast_pow2(-exp_y);
 }
 
+// E2M1 (FP4) variant: divisor is finfo_max=6 instead of 448. Same UE8M0
+// SF protocol; only the per-element clipping range and dtype differ.
+// 1/6 = 0x3E2AAAAB exactly in FP32 RN.
+template <bool kUseUE8M0 = true>
+CUTLASS_DEVICE void get_e2m1_sf_and_sf_inv(const float2& amax, float2& sf, float2& sf_inv) {
+    DG_STATIC_ASSERT(kUseUE8M0, "Must use UE8M0");
+    const float2 finfo_factor = {1.0f / 6.0f, 1.0f / 6.0f};
+    const auto scaled = __fmul2_rn(amax, finfo_factor);
+    const auto exp_x = fast_log2_ceil(scaled.x);
+    const auto exp_y = fast_log2_ceil(scaled.y);
+    sf.x = fast_pow2(exp_x), sf_inv.x = fast_pow2(-exp_x);
+    sf.y = fast_pow2(exp_y), sf_inv.y = fast_pow2(-exp_y);
+}
+
+// Pack two FP32 values into one FP4 (E2M1) byte: lower nibble = a, upper = b.
+// Matches PTX `cvt.rn.satfinite.e2m1x2.f32 d, b, a` (b → upper, a → lower).
+CUTLASS_DEVICE uint32_t cvt_pack_f32_to_e2m1x2(const float& a, const float& b) {
+    uint32_t out;
+    asm volatile(
+        "{\n"
+        ".reg .b8 byte0;\n"
+        "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
+        "cvt.u32.u8 %0, byte0;\n"
+        "}"
+        : "=r"(out) : "f"(a), "f"(b));
+    return out;
+}
+
+// Pack four FP32 values into one uint16 (FP4 nibbles, 4 elements / 2 bytes).
+// Layout: bits[0:4]=a, [4:8]=b, [8:12]=c, [12:16]=d. Compatible with
+// `cvt.rn.satfinite.e2m1x2.f32` whose output is "low nibble = first arg".
+CUTLASS_DEVICE uint32_t cvt_pack_f32x4_to_e2m1x4(
+        const float& a, const float& b, const float& c, const float& d) {
+    uint32_t out;
+    asm volatile(
+        "{\n"
+        ".reg .b8 byte0;\n"
+        ".reg .b8 byte1;\n"
+        "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
+        "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
+        ".reg .b16 hword;\n"
+        "mov.b16 hword, {byte0, byte1};\n"
+        "cvt.u32.u16 %0, hword;\n"
+        "}"
+        : "=r"(out) : "f"(a), "f"(b), "f"(c), "f"(d));
+    return out;
+}
+
 /// Reduction
 CUTLASS_DEVICE uint32_t warp_inclusive_sum(uint32_t value, const uint32_t& lane_idx) {
     #pragma unroll
diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh
index b2adc6c7ad..c9b1cdce38 100644
--- a/deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh
+++ b/deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh
@@ -34,6 +34,37 @@ template <
     uint32_t kNumSMs, uint32_t kNumRanks,
     float kActivationClamp,
     bool kFastMath,
+    // ====== Stream A0.1 — DG_USE_FP4_ACTS ======
+    // When true, the L1 epilogue quantizes its SwiGLU outputs to E2M1 (FP4) +
+    // UE8M0 SF instead of E4M3 (FP8) + UE8M0 SF. The per-row gmem footprint
+    // halves (intermediate_hidden / 2 packed bytes vs intermediate_hidden FP8
+    // bytes) and the smem CD staging is sized accordingly. The L2 phase still
+    // reads its activations as FP8 in this step (separate flag for A0.2), so
+    // end-to-end output is intentionally not bit-equivalent to the FP8 path —
+    // the accuracy harness compares L1's quantized output decoded back to BF16.
+    bool kUseFp4Acts = false,
+    // ====== Stream A0.5 — DG_USE_MXF4_KIND ======
+    // When true (and `kUseFp4Acts` also true), L1 + L2 mainloops swap from
+    // `kind::mxf8f6f4.block_scale.block32` (K=32 with-padding FP4 smem) to
+    // `kind::mxf4.block_scale.block32` (K=64 dense FP4 smem). Per the
+    // `recipes/mxf4_vs_mxf8f6f4` microbench, `kind::mxf4` delivers 2× FLOPS/
+    // cycle in isolation; the standalone GEMM (`kernels/fused_gemm_mxf4_native_1cta`)
+    // realizes +22%, the fused capstone (`kernels/fused_swiglu_mxf4_native_two_gemm`)
+    // realizes +20.6%. This kernel ports the same swap into the production
+    // mega_moe path. `kind::mxf4` is K-major-only (PTX ISA Table 53) and
+    // accepts only E2M1 inputs — see the host-side `DG_HOST_ASSERT(not
+    // use_mxf4_kind or use_fp4_acts)` in `mega.hpp`.
+    bool kUseMxf4Kind = false,
+    // ====== Stream B (combine path) — DG_USE_FP8_COMBINE ======
+    // When true, the L2 epilogue ships FP8 E4M3 + per-(token, N=128) UE8M0
+    // SF over NVLink instead of BF16. Byte footprint per token per slot:
+    //   off: kHidden * 2  (BF16)
+    //   on:  kHidden + kHidden / kCombineGranK  (FP8 + SF, kCombineGranK=128)
+    // Halves NVLink bytes/token on the second a2a. Independent of
+    // `kUseFp4Acts` / `kUseMxf4Kind` (which control the dispatch a2a +
+    // mainloops); this flag only changes the combine slot's layout +
+    // L2 epilogue write-back + combine-reduce read.
+    bool kUseFp8Combine = false,
     uint32_t L1_SHAPE_N = kIntermediateHidden * 2,
     uint32_t L1_SHAPE_K = kHidden,
     uint32_t L2_SHAPE_N = kHidden,
@@ -95,7 +126,14 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
         sym_buffer.get_base_ptr(), kNumRanks, kNumExperts, kNumMaxTokensPerRank, kNumTopk);
 
     // Token and buffer layouts
-    constexpr auto fp8_token_layout = layout::Data(kHidden);
+    // ====== Stream A0.0b — DG_USE_FP4_ACTS L1 input path ======
+    // When `kUseFp4Acts`, the symmetric `x` slot (and the L1 token pool that
+    // mirrors it) holds packed E2M1 (FP4) instead of dense E4M3 (FP8). The
+    // packed footprint is `kHidden / 2` bytes per token. The SF slot is
+    // unchanged (`kHidden / 32` bytes — `gran_k=32` for both FP4 and FP8 acts
+    // under `kind::mxf8f6f4`).
+    constexpr uint32_t kInputTokenBytes = kUseFp4Acts ? (kHidden / 2) : kHidden;
+    constexpr auto fp8_token_layout = layout::Data(kInputTokenBytes);
     constexpr auto bf16_token_layout = layout::Data(kHidden * sizeof(nv_bfloat16));
     constexpr auto fp8_intermediate_token_layout = layout::Data(kIntermediateHidden);
     constexpr auto fp8_sf_layout = layout::Data(kHidden / 32);
@@ -152,23 +190,53 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
         l2_token_buffer.get_end_ptr()
     );
 
-    // Combine inputs
+    // Combine inputs.
+    // Stream B: under `kUseFp8Combine`, the slot holds FP8 E4M3 (kHidden
+    // bytes/token) + a separate SF slot holding UE8M0 bytes
+    // (kHidden / kCombineGranK bytes/token, kCombineGranK = 128). Off → BF16
+    // (kHidden*2 bytes/token), zero-sized SF slot.
+    constexpr uint32_t kCombineGranK = 128;
+    DG_STATIC_ASSERT(kHidden % kCombineGranK == 0, "kHidden must be a multiple of 128 for FP8 combine SF");
+    constexpr auto combine_token_layout = layout::Data(
+        kUseFp8Combine ? kHidden : (kHidden * 2));
+    constexpr auto combine_sf_layout = layout::Data(
+        kUseFp8Combine ? (kHidden / kCombineGranK) : 0,
+        /*require_tma_alignment=*/false);
     const auto combine_token_buffer = layout::Buffer(
-        bf16_token_layout, kNumTopk, kNumMaxTokensPerRank,
+        combine_token_layout, kNumTopk, kNumMaxTokensPerRank,
         l2_sf_buffer.get_end_ptr()
     );
+    const auto combine_sf_buffer = layout::Buffer(
+        combine_sf_layout, kNumTopk, kNumMaxTokensPerRank,
+        combine_token_buffer.get_end_ptr()
+    );
 
     // Data types
     // NOTES: activations are FP8 (e4m3), weights are FP4 (e2m1)
     using a_dtype_t = cutlass::float_e4m3_t;
     using b_dtype_t = cutlass::detail::float_e2m1_unpacksmem_t;
+    // Stream A0.2: when `kUseFp4Acts` is on, the L2 phase reads acts as
+    // E2M1 instead of E4M3. Both share the same byte footprint in smem
+    // (FP8 = 1 B, FP4 unpacksmem = 1 B with `_ALIGN16B` padding), so the
+    // smem A allocation, swizzle mode (128 B), and umma_desc stride math
+    // are identical. Only the *MMA instruction descriptor*'s A-dtype field
+    // and the source-side TMA `expect_tx` differ between phases.
+    using l2_a_dtype_t = cute::conditional_t<kUseFp4Acts, b_dtype_t, a_dtype_t>;
+    // Stream A0.0b: same deal for L1 — when `kUseFp4Acts` is on, the L1
+    // phase reads its A operand from the L1 token pool as packed E2M1.
+    // Same `_ALIGN16B` padded smem layout as L2; same MMA instruction
+    // descriptor flip from E4M3 to E2M1.
+    using l1_a_dtype_t = cute::conditional_t<kUseFp4Acts, b_dtype_t, a_dtype_t>;
 
     // MMA configs
     // NOTES: always swap A/B, 2-CTA MMA, and matrices are K-major
     constexpr uint32_t LAYOUT_AD_M = 128;
     constexpr uint32_t UMMA_M = LAYOUT_AD_M * 2;
     constexpr uint32_t UMMA_N = BLOCK_M;  // Swap AB
-    constexpr uint32_t UMMA_K = 32;
+    // Stream A0.5: kind::mxf4 runs K=64 dense per call (vs K=32 for
+    // kind::mxf8f6f4). BLOCK_K stays 128 elements; the # of MMA calls per
+    // K-tile (`BLOCK_K / UMMA_K`) halves from 4 to 2.
+    constexpr uint32_t UMMA_K = kUseMxf4Kind ? 64 : 32;
     constexpr uint32_t LOAD_BLOCK_M = BLOCK_M / 2;  // Multicast on A
     constexpr uint32_t LOAD_BLOCK_N = BLOCK_N;
     DG_STATIC_ASSERT(BLOCK_M % 16 == 0, "Invalid block M");
@@ -176,8 +244,23 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
     DG_STATIC_ASSERT(BLOCK_K == 128, "Invalid block K");
 
     // Swizzle configs
-    constexpr uint32_t kSwizzleAMode = BLOCK_K * sizeof(a_dtype_t);
-    constexpr uint32_t kSwizzleBMode = BLOCK_K * sizeof(b_dtype_t);
+    // Stream A0.5: under `kUseMxf4Kind`, A and B smem use the dense FP4
+    // layout (`_ALIGN8B`, 2 nibbles/byte) instead of the with-padding
+    // layout (`_ALIGN16B`, 1 byte per element). Per-K-row byte stride
+    // halves: BLOCK_K elements × 0.5 B/elem = BLOCK_K / 2 bytes. Swizzle
+    // mode tracks the row-byte width.
+    constexpr uint32_t kSwizzleAMode = kUseMxf4Kind
+        ? (BLOCK_K / 2)
+        : (BLOCK_K * static_cast<uint32_t>(sizeof(a_dtype_t)));
+    constexpr uint32_t kSwizzleBMode = kUseMxf4Kind
+        ? (BLOCK_K / 2)
+        : (BLOCK_K * static_cast<uint32_t>(sizeof(b_dtype_t)));
+    // Stream A0.2: l2_a_dtype must keep the same smem footprint as
+    // a_dtype so SMEM_A_SIZE_PER_STAGE / kSwizzleAMode are unchanged.
+    DG_STATIC_ASSERT(sizeof(l2_a_dtype_t) == sizeof(a_dtype_t),
+                     "L2 A dtype must match A in smem footprint");
+    DG_STATIC_ASSERT(sizeof(l1_a_dtype_t) == sizeof(a_dtype_t),
+                     "L1 A dtype must match A in smem footprint");
     constexpr uint32_t kSwizzleCDMode = 128;
     DG_STATIC_ASSERT(BLOCK_N % kSwizzleCDMode == 0, "Invalid block N");
 
@@ -192,16 +275,30 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
     // Shared memory sizes
     // NOTES: FP8 CD output for L1 (2 TMA stages, BLOCK_N/2 post-SwiGLU), BF16 output for L2 (no TMA, a single stage)
     constexpr uint32_t L1_OUT_BLOCK_N = BLOCK_N / 2;
+    // ====== Stream A0.1 ======
+    // FP4 path packs 2 elements per byte → row footprint halves. We keep
+    // `L1_OUT_BLOCK_N` in *elements* and introduce a row-byte-stride that
+    // depends on the flag, so the existing offset arithmetic (`row *
+    // L1_OUT_BLOCK_N_BYTES`) still works for both paths.
+    constexpr uint32_t L1_OUT_ROW_BYTES = kUseFp4Acts ? (L1_OUT_BLOCK_N / 2) : L1_OUT_BLOCK_N;
     constexpr uint32_t SMEM_EXPERT_COUNT_SIZE =
         math::constexpr_align<uint32_t>(kNumExperts * sizeof(uint32_t), kSharedMemoryAlignment);
     constexpr uint32_t SMEM_SEND_BUFFER_SIZE =
         math::constexpr_align(fp8_token_layout.get_num_bytes() * kNumDispatchWarps, kSharedMemoryAlignment);
-    constexpr uint32_t SMEM_A_SIZE_PER_STAGE = LOAD_BLOCK_M * BLOCK_K * sizeof(a_dtype_t);
-    constexpr uint32_t SMEM_B_SIZE_PER_STAGE = LOAD_BLOCK_N * BLOCK_K * sizeof(b_dtype_t);
+    // Stream A0.5: under `kUseMxf4Kind`, dense FP4 smem (2 nibbles/byte)
+    // halves the per-stage byte footprint vs the with-padding layout.
+    constexpr uint32_t SMEM_A_SIZE_PER_STAGE = kUseMxf4Kind
+        ? (LOAD_BLOCK_M * BLOCK_K / 2)
+        : (LOAD_BLOCK_M * BLOCK_K * static_cast<uint32_t>(sizeof(a_dtype_t)));
+    constexpr uint32_t SMEM_B_SIZE_PER_STAGE = kUseMxf4Kind
+        ? (LOAD_BLOCK_N * BLOCK_K / 2)
+        : (LOAD_BLOCK_N * BLOCK_K * static_cast<uint32_t>(sizeof(b_dtype_t)));
     constexpr uint32_t SMEM_SFA_SIZE_PER_STAGE = SF_BLOCK_M * sizeof(uint32_t);
     constexpr uint32_t SMEM_SFB_SIZE_PER_STAGE = SF_BLOCK_N * sizeof(uint32_t);
+    // L1 CD smem: FP8 path = STORE_BLOCK_M * L1_OUT_BLOCK_N bytes/stage,
+    // FP4 path = STORE_BLOCK_M * L1_OUT_BLOCK_N / 2 bytes/stage.
     constexpr uint32_t SMEM_CD_L1_SIZE =
-        kNumEpilogueWarpgroups * STORE_BLOCK_M * L1_OUT_BLOCK_N * sizeof(cutlass::float_e4m3_t) * kNumTMAStoreStages;
+        kNumEpilogueWarpgroups * STORE_BLOCK_M * L1_OUT_ROW_BYTES * kNumTMAStoreStages;
     constexpr uint32_t SMEM_CD_L2_SIZE =
         kNumEpilogueWarpgroups * STORE_BLOCK_M * BLOCK_N * sizeof(nv_bfloat16);
     constexpr uint32_t SMEM_CD_SIZE = SMEM_CD_L1_SIZE > SMEM_CD_L2_SIZE ? SMEM_CD_L1_SIZE : SMEM_CD_L2_SIZE;
@@ -545,12 +642,17 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
             const uint32_t src_topk_idx = src_token_topk_idx % kNumTopk;
 
             // TMA load token from remote rank into shared memory
+            // Stream A0.0b: under `kUseFp4Acts`, the source slot in the
+            // remote rank's symmetric `x` buffer is packed E2M1 (kHidden/2
+            // bytes), so the per-token NVLink pull halves. The local pull
+            // buffer / l1 token buffer is sized off `fp8_token_layout` which
+            // already reflects the FP4 footprint (see `kInputTokenBytes`).
             if (cute::elect_one_sync()) {
                 ptx::tma_load_1d(
                     pull_buffer.get_base_ptr(),
                     sym_buffer.map(input_token_buffer.get_data_buffer(src_token_idx).get_base_ptr(),
                                    current_rank_in_expert_idx),
-                    pull_mbarrier, kHidden);
+                    pull_mbarrier, kInputTokenBytes);
             }
             __syncwarp();
 
@@ -581,7 +683,8 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                 *l1_topk_weights_buffer.get_data_buffer(pool_token_idx).get_base_ptr<float>() = weight;
 
                 // Wait for TMA token load to complete
-                ptx::mbarrier_arrive_and_set_tx(pull_mbarrier, kHidden);
+                // Stream A0.0b: expect_tx halves with the FP4 packed footprint.
+                ptx::mbarrier_arrive_and_set_tx(pull_mbarrier, kInputTokenBytes);
                 ptx::mbarrier_wait_and_flip_phase(pull_mbarrier, pull_mbarrier_phase);
 
                 // Store token to local L1 buffer via TMA
@@ -712,14 +815,50 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                 if (not is_leader_cta)
                     m_idx += scheduler.template get_valid_m<true>() / 2;
 
-                // TMA copy tokens and SFA, then arrive at full barrier
+                // TMA copy tokens and SFA, then arrive at full barrier.
+                // Stream A0.2 + A0.0b: under FP4 acts, BOTH L1 and L2 phases
+                // load A as packed E2M1 (`l1_a_dtype_t == l2_a_dtype_t == b_dtype_t`).
+                // Same per-byte smem layout as FP8 A (1 B/elem under `_ALIGN16B`),
+                // but source-side packed bytes are halved → expect_tx halved.
                 if (cute::elect_one_sync()) {
-                    tma::copy<BLOCK_K, LOAD_BLOCK_M, kSwizzleAMode, a_dtype_t>(
-                        tensor_map_a_ptr, full_barriers[stage_idx], smem_a[stage_idx], k_idx, m_idx, 2);
+                    if constexpr (kUseMxf4Kind) {
+                        // Stream A0.5: dense FP4 smem (`_ALIGN8B`). The TMA
+                        // descriptor's inner box covers BLOCK_K elements in
+                        // BLOCK_K/2 bytes per row; one cluster-multicast TMA
+                        // call fills the full A stage. Bypass `tma::copy`
+                        // because its `BLOCK_INNER_ATOM = kSwizzleMode /
+                        // sizeof(dtype_t)` math assumes ≥1-byte elements
+                        // and would mis-stride sub-byte FP4 destinations.
+                        cute::SM100_TMA_2SM_LOAD_2D::copy(
+                            tensor_map_a_ptr,
+                            reinterpret_cast<uint64_t*>(full_barriers[stage_idx]),
+                            static_cast<uint64_t>(cute::TMA::CacheHintSm100::EVICT_NORMAL),
+                            reinterpret_cast<uint8_t*>(smem_a[stage_idx]),
+                            k_idx, m_idx);
+                    } else if constexpr (kUseFp4Acts) {
+                        // Both Linear1 (L1) and Linear2 (L2) take the FP4 path.
+                        tma::copy<BLOCK_K, LOAD_BLOCK_M, kSwizzleAMode, l1_a_dtype_t>(
+                            tensor_map_a_ptr, full_barriers[stage_idx],
+                            reinterpret_cast<l1_a_dtype_t*>(smem_a[stage_idx]),
+                            k_idx, m_idx, 2);
+                    } else {
+                        tma::copy<BLOCK_K, LOAD_BLOCK_M, kSwizzleAMode, a_dtype_t>(
+                            tensor_map_a_ptr, full_barriers[stage_idx], smem_a[stage_idx],
+                            k_idx, m_idx, 2);
+                    }
                     tma::copy<SF_BLOCK_M, 1, 0>(
                         tensor_map_sfa_ptr, full_barriers[stage_idx], smem_sfa[stage_idx], sfa_m_idx, sfa_k_idx, 2);
                     if (is_leader_cta) {
-                        full_barriers[stage_idx]->arrive_and_expect_tx(SMEM_A_SIZE_PER_STAGE * 2 + SF_BLOCK_M * sizeof(uint32_t) * 2);
+                        // Stream A0.5: under `kUseMxf4Kind`, smem A is dense
+                        // FP4 (LOAD_BLOCK_M * BLOCK_K / 2 bytes per CTA, equal
+                        // to source-side packed bytes — no `_ALIGN16B` doubling).
+                        // For 2 CTAs (cluster multicast), tx-count is
+                        // `2 * SMEM_A_SIZE_PER_STAGE` — same multiplier as the
+                        // FP8 dense path.
+                        const uint32_t expect_a_bytes = (kUseFp4Acts and not kUseMxf4Kind)
+                            ? SMEM_A_SIZE_PER_STAGE       // FP4 _ALIGN16B: source = LOAD_BLOCK_M * BLOCK_K / 2 per CTA × 2 CTAs (smem 2× larger)
+                            : SMEM_A_SIZE_PER_STAGE * 2;  // FP8 dense or FP4 dense (mxf4): source = smem footprint × 2 CTAs
+                        full_barriers[stage_idx]->arrive_and_expect_tx(expect_a_bytes + SF_BLOCK_M * sizeof(uint32_t) * 2);
                     } else {
                         full_barriers[stage_idx]->arrive(0u);
                     }
@@ -757,12 +896,35 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
 
                 // TMA copy weights with SF
                 if (cute::elect_one_sync()) {
-                    tma::copy<BLOCK_K, LOAD_BLOCK_N, kSwizzleBMode, b_dtype_t>(
-                        tensor_map_b_ptr, full_barriers[stage_idx], smem_b[stage_idx], k_idx, n_idx, 2);
+                    if constexpr (kUseMxf4Kind) {
+                        // Stream A0.5: dense FP4 smem; one cluster-multicast
+                        // TMA call covers the full B stage. See A-side comment.
+                        cute::SM100_TMA_2SM_LOAD_2D::copy(
+                            tensor_map_b_ptr,
+                            reinterpret_cast<uint64_t*>(full_barriers[stage_idx]),
+                            static_cast<uint64_t>(cute::TMA::CacheHintSm100::EVICT_NORMAL),
+                            reinterpret_cast<uint8_t*>(smem_b[stage_idx]),
+                            k_idx, n_idx);
+                    } else {
+                        tma::copy<BLOCK_K, LOAD_BLOCK_N, kSwizzleBMode, b_dtype_t>(
+                            tensor_map_b_ptr, full_barriers[stage_idx], smem_b[stage_idx], k_idx, n_idx, 2);
+                    }
                     tma::copy<BLOCK_N, 1, 0>(
                         tensor_map_sfb_ptr, full_barriers[stage_idx], smem_sfb[stage_idx], sfb_n_idx, sfb_k_idx, 2);
                     if (is_leader_cta) {
-                        full_barriers[stage_idx]->arrive_and_expect_tx(SMEM_B_SIZE_PER_STAGE + BLOCK_N * sizeof(uint32_t) * 2);
+                        // Stream A0.5: B-side tx-count for cluster-multicast
+                        // counts SOURCE BYTES PER PEER × 2 PEERS (broadcast: both
+                        // peers receive a copy of the same source bytes). For the
+                        // existing FP4 unpacksmem path, that happens to equal
+                        // `LOAD_BLOCK_N * BLOCK_K * 1B = SMEM_B_SIZE_PER_STAGE`
+                        // (sizeof(b_dtype_t)=1 makes "smem footprint" a coincidental
+                        // alias for source-bytes-summed). Under mxf4 dense FP4,
+                        // SMEM_B_SIZE_PER_STAGE halves to `LOAD_BLOCK_N * BLOCK_K / 2`,
+                        // so we need `* 2` to get the same source-bytes-summed value.
+                        const uint32_t expect_b_bytes = kUseMxf4Kind
+                            ? SMEM_B_SIZE_PER_STAGE * 2
+                            : SMEM_B_SIZE_PER_STAGE;
+                        full_barriers[stage_idx]->arrive_and_expect_tx(expect_b_bytes + BLOCK_N * sizeof(uint32_t) * 2);
                     } else {
                         full_barriers[stage_idx]->arrive(0u);
                     }
@@ -783,11 +945,53 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                 UMMA_M, UMMA_N,
                 cute::UMMA::Major::K, cute::UMMA::Major::K
             >();
+            // Stream A0.2 + A0.0b: when both L1 and L2 read FP4 acts under
+            // `kUseFp4Acts`, we need a separate instruction descriptor whose
+            // A-dtype field is E2M1 (not E4M3). All other fields (block-scale
+            // shape, UMMA M/N/K, K-major) are unchanged. The smem layout
+            // descriptors don't change because both dtypes have `sizeof = 1`
+            // (FP4 has the `_ALIGN16B` 1-byte-per-element padded smem layout).
+            // Single shared idesc — both `l1_a_dtype_t` and `l2_a_dtype_t`
+            // resolve to `b_dtype_t` (E2M1 unpacksmem) under the flag.
+            //
+            // Stream A0.5: under `kUseMxf4Kind`, the descriptor's a/b_format
+            // fields encode E2M1 as `MXF4Format::E2M1 = 1`, NOT
+            // `MXF8F6F4Format::E2M1 = 5`. CUTLASS picks the right enum via
+            // `to_UMMAFormat<T>()`: passing `cute::float_e2m1_t` (dense) yields
+            // `MXF4Format::E2M1=1`; passing `cutlass::detail::float_e2m1_unpacksmem_t`
+            // yields `MXF8F6F4Format::E2M1=5`. Wrong encoding → the kernel
+            // launches but throws `cudaErrorIllegalInstruction` on first MMA.
+            using mxf4_e2m1_t = cute::float_e2m1_t;
+            using fp4_a_dtype_for_idesc = cute::conditional_t<
+                kUseMxf4Kind, mxf4_e2m1_t, b_dtype_t>;
+            using fp4_b_dtype_for_idesc = cute::conditional_t<
+                kUseMxf4Kind, mxf4_e2m1_t, l1_a_dtype_t>;
+            auto instr_desc_fp4 = cute::UMMA::make_instr_desc_block_scaled<
+                fp4_a_dtype_for_idesc, fp4_b_dtype_for_idesc,
+                float, cutlass::float_ue8m0_t,
+                UMMA_M, UMMA_N,
+                cute::UMMA::Major::K, cute::UMMA::Major::K
+            >();
             auto sf_desc = mma::sm100::make_sf_desc(nullptr);
 
             DG_STATIC_ASSERT(kNumStages <= 32, "Too many stages");
-            auto a_desc = mma::sm100::make_umma_desc<cute::UMMA::Major::K, LOAD_BLOCK_M, BLOCK_K, kSwizzleAMode>(smem_a[0], 0, 0);
-            auto b_desc = mma::sm100::make_umma_desc<cute::UMMA::Major::K, LOAD_BLOCK_N, BLOCK_K, kSwizzleBMode>(smem_b[0], 0, 0);
+            // Stream A0.5: under `kUseMxf4Kind`, smem A and B carry dense
+            // FP4 (2 nibbles/byte). The `make_umma_desc` helper asserts
+            // `kSwizzleMode == BLOCK_K * sizeof(dtype_t)`, so we pass a
+            // BLOCK_K of `BLOCK_K / 2` (the byte count) and `dtype_t =
+            // uint8_t` to get the right byte-stride math. The smem ptrs
+            // are reinterpreted to `uint8_t*` since the underlying buffer
+            // is just bytes.
+            cute::UMMA::SmemDescriptor a_desc, b_desc;
+            if constexpr (kUseMxf4Kind) {
+                a_desc = mma::sm100::make_umma_desc<cute::UMMA::Major::K, LOAD_BLOCK_M, BLOCK_K / 2, kSwizzleAMode>(
+                    reinterpret_cast<uint8_t*>(smem_a[0]), 0, 0);
+                b_desc = mma::sm100::make_umma_desc<cute::UMMA::Major::K, LOAD_BLOCK_N, BLOCK_K / 2, kSwizzleBMode>(
+                    reinterpret_cast<uint8_t*>(smem_b[0]), 0, 0);
+            } else {
+                a_desc = mma::sm100::make_umma_desc<cute::UMMA::Major::K, LOAD_BLOCK_M, BLOCK_K, kSwizzleAMode>(smem_a[0], 0, 0);
+                b_desc = mma::sm100::make_umma_desc<cute::UMMA::Major::K, LOAD_BLOCK_N, BLOCK_K, kSwizzleBMode>(smem_b[0], 0, 0);
+            }
             uint32_t a_desc_lo = lane_idx < kNumStages ? a_desc.lo + lane_idx * SMEM_A_SIZE_PER_STAGE / 16 : 0u;
             uint32_t b_desc_lo = lane_idx < kNumStages ? b_desc.lo + lane_idx * SMEM_B_SIZE_PER_STAGE / 16 : 0u;
 
@@ -805,6 +1009,8 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                                          const uint32_t& m_block_idx, const uint32_t& n_block_idx) {
                 // Dynamic update of UMMA N based on effective M
                 mma::sm100::update_instr_desc_with_umma_n(instr_desc, scheduler.template get_valid_m<true>());
+                if constexpr (kUseFp4Acts)
+                    mma::sm100::update_instr_desc_with_umma_n(instr_desc_fp4, scheduler.template get_valid_m<true>());
 
                 // Wait tensor memory empty barrier arrival
                 const auto accum_stage_idx = current_iter_idx % kNumEpilogueStages;
@@ -851,19 +1057,51 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                             cute_utccp_t::copy(sf_desc, kTmemStartColOfSFB + i * 4);
                         }
 
-                        // Issue UMMA
+                        // Issue UMMA. Stream A0.2: L2 phase under FP4 acts
+                        // uses `instr_desc_l2` (A=E2M1) instead of `instr_desc`
+                        // (A=E4M3). The smem K-stride for A is the same
+                        // (sizeof(l2_a_dtype_t) == sizeof(a_dtype_t) == 1) so
+                        // `advance_umma_desc_lo` on `a_dtype_t` is correct
+                        // for both phases.
+                        // Stream A0.5: under `kUseMxf4Kind`, swap the MMA to
+                        // `kind::mxf4` (cta_group::2). UMMA_K=64 (vs 32),
+                        // so K_PER_TILE=2 (vs 4). The SF address top-2 bits
+                        // are HALF-WORD offsets {0, 2} for scale_vec::2X
+                        // (NOT byte offsets {0..3}); encode as `k * 2`, not `k`.
+                        // Smem K-stride for the dense FP4 layout is `BLOCK_K/2`
+                        // bytes/row, so `advance_umma_desc_lo` is templated on
+                        // `uint8_t` and `BLOCK_K / 2` to match.
                         #pragma unroll
                         for (uint32_t k = 0; k < BLOCK_K / UMMA_K; ++ k) {
-                            const auto runtime_instr_desc =
-                                mma::sm100::make_runtime_instr_desc_with_sf_id(instr_desc, k, k);
-                            a_desc.lo = mma::sm100::advance_umma_desc_lo<
-                                cute::UMMA::Major::K, LOAD_BLOCK_M, kSwizzleAMode, a_dtype_t>(a_desc_base_lo, 0, k * UMMA_K);
-                            b_desc.lo = mma::sm100::advance_umma_desc_lo<
-                                cute::UMMA::Major::K, LOAD_BLOCK_N, kSwizzleBMode, b_dtype_t>(b_desc_base_lo, 0, k * UMMA_K);
-                            ptx::SM100_MMA_MXF8F6F4_2x1SM_SS::fma(
-                                b_desc, a_desc, accum_stage_idx * UMMA_N,
-                                k_block_idx > 0 or k > 0, runtime_instr_desc,
-                                kTmemStartColOfSFB, kTmemStartColOfSFA);
+                            if constexpr (kUseMxf4Kind) {
+                                const auto sf_id = k * 2u;  // half-word offset for scale_vec::2X
+                                const auto runtime_instr_desc =
+                                    mma::sm100::make_runtime_instr_desc_with_sf_id(instr_desc_fp4, sf_id, sf_id);
+                                a_desc.lo = mma::sm100::advance_umma_desc_lo<
+                                    cute::UMMA::Major::K, LOAD_BLOCK_M, kSwizzleAMode, uint8_t>(
+                                        a_desc_base_lo, 0, k * UMMA_K / 2);
+                                b_desc.lo = mma::sm100::advance_umma_desc_lo<
+                                    cute::UMMA::Major::K, LOAD_BLOCK_N, kSwizzleBMode, uint8_t>(
+                                        b_desc_base_lo, 0, k * UMMA_K / 2);
+                                ptx::SM100_MMA_MXF4_2x1SM_SS::fma(
+                                    b_desc, a_desc, accum_stage_idx * UMMA_N,
+                                    k_block_idx > 0 or k > 0, runtime_instr_desc,
+                                    kTmemStartColOfSFB, kTmemStartColOfSFA);
+                            } else {
+                                // Stream A0.0b: under `kUseFp4Acts`, both L1 and L2 read
+                                // A as E2M1. Pick the FP4 idesc unconditionally when the flag is on.
+                                const auto runtime_instr_desc = kUseFp4Acts
+                                    ? mma::sm100::make_runtime_instr_desc_with_sf_id(instr_desc_fp4, k, k)
+                                    : mma::sm100::make_runtime_instr_desc_with_sf_id(instr_desc, k, k);
+                                a_desc.lo = mma::sm100::advance_umma_desc_lo<
+                                    cute::UMMA::Major::K, LOAD_BLOCK_M, kSwizzleAMode, a_dtype_t>(a_desc_base_lo, 0, k * UMMA_K);
+                                b_desc.lo = mma::sm100::advance_umma_desc_lo<
+                                    cute::UMMA::Major::K, LOAD_BLOCK_N, kSwizzleBMode, b_dtype_t>(b_desc_base_lo, 0, k * UMMA_K);
+                                ptx::SM100_MMA_MXF8F6F4_2x1SM_SS::fma(
+                                    b_desc, a_desc, accum_stage_idx * UMMA_N,
+                                    k_block_idx > 0 or k > 0, runtime_instr_desc,
+                                    kTmemStartColOfSFB, kTmemStartColOfSFA);
+                            }
                         }
                     }
                     __syncwarp();
@@ -1038,7 +1276,8 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                     ptx::tma_store_wait<kNumTMAStoreStages - 1>();
                     ptx::sync_aligned(128, kEpilogueWGBarrierStartIdx + epilogue_wg_idx);
 
-                    // Cast to FP8 E4M3 and store into shared memory
+                    // Cast to FP8 E4M3 (or FP4 E2M1 under `kUseFp4Acts`) and
+                    // store into shared memory.
                     #pragma unroll
                     for (uint32_t i = 0; i < kNumAtomsPerStore; ++ i) {
                         // Reduce amax
@@ -1047,23 +1286,101 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                         amax_values[i].x = cute::max(amax_values[i].x, wp_amax.x);
                         amax_values[i].y = cute::max(amax_values[i].y, wp_amax.y);
 
-                        // Calculate SF
+                        // Calculate SF (UE8M0 byte; only the finfo divisor differs:
+                        // 1/448 for FP8 E4M3, 1/6 for FP4 E2M1).
                         float2 sf, sf_inv;
-                        math::get_e4m3_sf_and_sf_inv(amax_values[i], sf, sf_inv);
+                        if constexpr (kUseFp4Acts) {
+                            math::get_e2m1_sf_and_sf_inv(amax_values[i], sf, sf_inv);
+                        } else {
+                            math::get_e4m3_sf_and_sf_inv(amax_values[i], sf, sf_inv);
+                        }
 
-                        // Cast
+                        // Apply scale, cast, store into shared memory.
                         const float2 upper = __fmul2_rn(swiglu_values[i * 2 + 0], sf_inv);
                         const float2 lower = __fmul2_rn(swiglu_values[i * 2 + 1], sf_inv);
-                        const auto fp8x4_values = __nv_fp8x4_e4m3(make_float4(upper.x, upper.y, lower.x, lower.y));
-
-                        // STSM
-                        uint32_t row = lane_idx;
-                        uint32_t col = warp_idx_in_wg;
-                        const auto smem_ptr = smem_cd[tma_stage_idx] + epilogue_wg_idx * STORE_BLOCK_M * L1_OUT_BLOCK_N
-                                                                     + i * ATOM_M * L1_OUT_BLOCK_N
-                                                                     + row * L1_OUT_BLOCK_N
-                                                                     + (col ^ (row / 2)) * kNumBankGroupBytes;
-                        ptx::SM100_U8x4_STSM_T<__nv_fp8x4_e4m3>::copy(fp8x4_values, smem_ptr);
+                        if constexpr (kUseFp4Acts) {
+                            // FP4 epilogue: write packed E2M1 nibbles to canonical
+                            // dense smem (TMA descriptor built with swizzle=0 →
+                            // byte-exact smem→gmem copy → canonical packed FP4
+                            // layout `[M, intermediate_hidden/2]` in gmem).
+                            //
+                            // Layout under SwapAB: `tcgen05.ld.16x256b.x1` puts
+                            // lane T's accumulator values (upper.x, upper.y,
+                            // lower.x, lower.y) at smem positions:
+                            //   upper.x → row 2*(T%4),   col_in_stripe T/4
+                            //   upper.y → row 2*(T%4)+1, col_in_stripe T/4
+                            //   lower.x → row 2*(T%4),   col_in_stripe T/4 + 8
+                            //   lower.y → row 2*(T%4)+1, col_in_stripe T/4 + 8
+                            // (16-byte stripe per warp_idx_in_wg ∈ 0..3, 64 B row.)
+                            // Adjacent N-cols therefore sit on lanes T and T XOR 4,
+                            // so packing two values into one FP4 byte requires a
+                            // `__shfl_xor 4` to pull the buddy. Half-warp gate
+                            // (group = lane/4, group%2==0) means each "active"
+                            // lane writes 4 bytes (upper.x, upper.y, lower.x,
+                            // lower.y) and the inactive half is a donor.
+                            //
+                            // The cross-quad shuffle and half-warp gate are
+                            // structural: they're a consequence of SwapAB's
+                            // datapoint=N orientation. Replacing with
+                            // `tcgen05.ld.32x32b.x8` would require dropping
+                            // SwapAB at the mainloop level. See
+                            // DeepGEMM/FP4_EPILOGUE_STORE_MICROBENCH.md for the
+                            // full microbench analysis (P-A through P-D) and
+                            // the negative results from bank-conflict
+                            // elimination + atom-interleaving.
+                            const float buddy_ux = __shfl_xor_sync(0xffffffffu, upper.x, 4);
+                            const float buddy_uy = __shfl_xor_sync(0xffffffffu, upper.y, 4);
+                            const float buddy_lx = __shfl_xor_sync(0xffffffffu, lower.x, 4);
+                            const float buddy_ly = __shfl_xor_sync(0xffffffffu, lower.y, 4);
+
+                            const uint32_t frag  = lane_idx % 4;          // row-pair index 0..3
+                            const uint32_t group = lane_idx / 4;          // col-group index 0..7
+                            const bool     is_active = (group % 2u) == 0u;
+
+                            // Active lanes pack (own_val, buddy_val) into a byte
+                            // (own=low nibble, buddy=high) and write 4 bytes per
+                            // atom. `cvt_pack_f32_to_e2m1x2(a, b)` → {low=a, high=b}.
+                            if (is_active) {
+                                const uint8_t byte_ux = static_cast<uint8_t>(
+                                    math::cvt_pack_f32_to_e2m1x2(upper.x, buddy_ux));
+                                const uint8_t byte_uy = static_cast<uint8_t>(
+                                    math::cvt_pack_f32_to_e2m1x2(upper.y, buddy_uy));
+                                const uint8_t byte_lx = static_cast<uint8_t>(
+                                    math::cvt_pack_f32_to_e2m1x2(lower.x, buddy_lx));
+                                const uint8_t byte_ly = static_cast<uint8_t>(
+                                    math::cvt_pack_f32_to_e2m1x2(lower.y, buddy_ly));
+
+                                constexpr uint32_t kFp4WarpStripeBytes = 8;  // 16 elements / 2
+                                const uint32_t byte_pos_upper = group / 2u;             // 0..3
+                                const uint32_t byte_pos_lower = 4u + group / 2u;        // 4..7
+                                const uint32_t row_even = i * ATOM_M + 2u * frag;
+                                const uint32_t row_odd  = row_even + 1u;
+                                const auto base = smem_cd[tma_stage_idx]
+                                                + epilogue_wg_idx * STORE_BLOCK_M * L1_OUT_ROW_BYTES
+                                                + warp_idx_in_wg * kFp4WarpStripeBytes;
+                                auto write_byte = [&](uint32_t row, uint32_t bp, uint8_t v) {
+                                    auto p = base + row * L1_OUT_ROW_BYTES + bp;
+                                    asm volatile("st.shared.u8 [%0], %1;\n"
+                                                 :: "l"(__cvta_generic_to_shared(p)),
+                                                    "r"(static_cast<uint32_t>(v)));
+                                };
+                                write_byte(row_even, byte_pos_upper, byte_ux);
+                                write_byte(row_odd,  byte_pos_upper, byte_uy);
+                                write_byte(row_even, byte_pos_lower, byte_lx);
+                                write_byte(row_odd,  byte_pos_lower, byte_ly);
+                            }
+                        } else {
+                            const auto fp8x4_values = __nv_fp8x4_e4m3(make_float4(upper.x, upper.y, lower.x, lower.y));
+
+                            // STSM
+                            uint32_t row = lane_idx;
+                            uint32_t col = warp_idx_in_wg;
+                            const auto smem_ptr = smem_cd[tma_stage_idx] + epilogue_wg_idx * STORE_BLOCK_M * L1_OUT_BLOCK_N
+                                                                         + i * ATOM_M * L1_OUT_BLOCK_N
+                                                                         + row * L1_OUT_BLOCK_N
+                                                                         + (col ^ (row / 2)) * kNumBankGroupBytes;
+                            ptx::SM100_U8x4_STSM_T<__nv_fp8x4_e4m3>::copy(fp8x4_values, smem_ptr);
+                        }
 
                         // Store SF to `l2_sf_buffer` as UE8M0 (MN-major layout)
                         // Only one warp per pair writes (both hold the same SF after cross-warp reduce)
@@ -1095,13 +1412,21 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                     }
                     ptx::sync_aligned(128, kEpilogueWGBarrierStartIdx + epilogue_wg_idx);
 
-                    // Issue TMA store after all atoms in this store block
+                    // Issue TMA store after all atoms in this store block.
+                    // FP8 path: out_n in elements-of-FP8 (= bytes), smem
+                    // base offset by FP8 row width (L1_OUT_BLOCK_N).
+                    // FP4 path: TMA descriptor's element type is uint8 with
+                    // half the inner dim → out_n in packed bytes (=
+                    // L1_OUT_BLOCK_N / 2), smem base offset by
+                    // L1_OUT_ROW_BYTES = L1_OUT_BLOCK_N / 2 bytes.
                     if (warp_idx_in_wg == 0 and cute::elect_one_sync()) {
-                        uint32_t out_n_idx = n_block_idx * L1_OUT_BLOCK_N;
+                        const uint32_t out_n_idx = kUseFp4Acts
+                            ? (n_block_idx * (L1_OUT_BLOCK_N / 2))
+                            : (n_block_idx * L1_OUT_BLOCK_N);
                         cute::tma_store_fence();
                         cute::SM90_TMA_STORE_2D::copy(
                             &tensor_map_l1_output,
-                            smem_cd[tma_stage_idx] + epilogue_wg_idx * STORE_BLOCK_M * L1_OUT_BLOCK_N,
+                            smem_cd[tma_stage_idx] + epilogue_wg_idx * STORE_BLOCK_M * L1_OUT_ROW_BYTES,
                             out_n_idx,
                             m_idx + epilogue_wg_idx * WG_BLOCK_M + s * STORE_BLOCK_M);
                         cute::tma_store_arrive();
@@ -1209,13 +1534,86 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                             (bank_group_idx ^ row_in_atom) * kNumBankGroupBytes;
                         const auto packed = ptx::ld_shared(reinterpret_cast<float4*>(smem_ptr));
 
-                        // Write into remote
-                        const auto dst_token = combine_token_buffer.get_rank_buffer(dst_topk_idx)
-                                               .get_data_buffer(dst_token_idx);
-                        const auto dst_ptr = math::advance_ptr<float4>(
-                            dst_token.get_base_ptr(),
-                            n_idx * static_cast<uint32_t>(sizeof(nv_bfloat16)) + (lane_idx % 16) * static_cast<uint32_t>(sizeof(float4)));
-                        *sym_buffer.map(dst_ptr, dst_rank_idx) = packed;
+                        if constexpr (kUseFp8Combine) {
+                            // Stream B: BF16 (in `packed`) → FP8 E4M3 + per-row UE8M0 SF.
+                            //
+                            // 16 lanes (lane_idx & ~15u) cover one row's
+                            // BLOCK_N=128 elements (= 8 BF16 each). Compute
+                            // per-row amax via warp_reduce over those 16
+                            // lanes, then quantize.
+                            const auto bf_pairs = reinterpret_cast<const nv_bfloat162*>(&packed);
+                            float local_amax = 0.0f;
+                            #pragma unroll
+                            for (int q = 0; q < 4; ++q) {
+                                const float2 vf = __bfloat1622float2(bf_pairs[q]);
+                                local_amax = cute::max(local_amax, cute::abs(vf.x));
+                                local_amax = cute::max(local_amax, cute::abs(vf.y));
+                            }
+                            // Reduce within the 16-lane group sharing this row.
+                            // Use a 16-lane mask (NOT 0xffffffff) because the
+                            // outer `if (m_idx_in_block >= valid_m) break` may
+                            // cause the OTHER half-warp's 16 lanes to exit
+                            // early on padding rows. A full-warp shfl would
+                            // deadlock waiting on those exited lanes.
+                            const uint32_t row_mask = 0x0000FFFFu << (16u * (lane_idx / 16));
+                            local_amax = cute::max(local_amax, __shfl_xor_sync(row_mask, local_amax, 1));
+                            local_amax = cute::max(local_amax, __shfl_xor_sync(row_mask, local_amax, 2));
+                            local_amax = cute::max(local_amax, __shfl_xor_sync(row_mask, local_amax, 4));
+                            local_amax = cute::max(local_amax, __shfl_xor_sync(row_mask, local_amax, 8));
+
+                            // UE8M0 SF (E4M3, finfo_max = 448).
+                            const int log2_ceil = math::fast_log2_ceil(local_amax * (1.0f / 448.0f));
+                            const float sf_inv = math::fast_pow2(-log2_ceil);
+                            const uint8_t sf_byte = static_cast<uint8_t>(log2_ceil + 127);
+
+                            // Scale, cast 4 BF16 pairs → 8 FP8 (= 2 fp8x4 = uint64).
+                            float4 lo, hi;
+                            const auto lo_pair = __bfloat1622float2(bf_pairs[0]);
+                            const auto lo_pair_b = __bfloat1622float2(bf_pairs[1]);
+                            const auto hi_pair = __bfloat1622float2(bf_pairs[2]);
+                            const auto hi_pair_b = __bfloat1622float2(bf_pairs[3]);
+                            lo.x = lo_pair.x * sf_inv;
+                            lo.y = lo_pair.y * sf_inv;
+                            lo.z = lo_pair_b.x * sf_inv;
+                            lo.w = lo_pair_b.y * sf_inv;
+                            hi.x = hi_pair.x * sf_inv;
+                            hi.y = hi_pair.y * sf_inv;
+                            hi.z = hi_pair_b.x * sf_inv;
+                            hi.w = hi_pair_b.y * sf_inv;
+                            const __nv_fp8x4_e4m3 fp8_lo(lo);
+                            const __nv_fp8x4_e4m3 fp8_hi(hi);
+                            const uint64_t fp8_uint64 =
+                                (uint64_t(fp8_lo.__x)) |
+                                (uint64_t(fp8_hi.__x) << 32);
+
+                            // Write 8 FP8 bytes (uint64) to remote, replacing
+                            // the BF16 16-byte write.
+                            const auto dst_token = combine_token_buffer.get_rank_buffer(dst_topk_idx)
+                                                                       .get_data_buffer(dst_token_idx);
+                            const auto dst_ptr = math::advance_ptr<uint64_t>(
+                                dst_token.get_base_ptr(),
+                                n_idx * static_cast<uint32_t>(sizeof(uint8_t)) +
+                                (lane_idx % 16) * static_cast<uint32_t>(sizeof(uint64_t)));
+                            *sym_buffer.map(dst_ptr, dst_rank_idx) = fp8_uint64;
+
+                            // 1 SF byte per row tile, written by lane 0 of the 16-lane group.
+                            if ((lane_idx & 15u) == 0) {
+                                const auto sf_token = combine_sf_buffer.get_rank_buffer(dst_topk_idx)
+                                                                         .get_data_buffer(dst_token_idx);
+                                const auto sf_ptr = math::advance_ptr<uint8_t>(
+                                    sf_token.get_base_ptr(),
+                                    n_block_idx * static_cast<uint32_t>(sizeof(uint8_t)));
+                                *sym_buffer.map(sf_ptr, dst_rank_idx) = sf_byte;
+                            }
+                        } else {
+                            // Default BF16 path (16 bytes/lane = 8 BF16).
+                            const auto dst_token = combine_token_buffer.get_rank_buffer(dst_topk_idx)
+                                                                       .get_data_buffer(dst_token_idx);
+                            const auto dst_ptr = math::advance_ptr<float4>(
+                                dst_token.get_base_ptr(),
+                                n_idx * static_cast<uint32_t>(sizeof(nv_bfloat16)) + (lane_idx % 16) * static_cast<uint32_t>(sizeof(float4)));
+                            *sym_buffer.map(dst_ptr, dst_rank_idx) = packed;
+                        }
                     }
                 }
 
@@ -1290,80 +1688,166 @@ sm100_fp8_fp4_mega_moe_impl(void* y,
                 static_cast<int>(__ldg(input_topk_idx_buffer.get_base_ptr<int64_t>() + token_idx * kNumTopk + lane_idx)) : -1;
             const uint32_t total_mask = __ballot_sync(0xffffffff, stored_topk_slot_idx >= 0);
 
+            // Stream B: FP8 path loads kNumChunkBytes / 2 per slot (FP8 = 1 byte/elem)
+            // and reads a per-(slot, token, n_block) UE8M0 SF byte to dequant
+            // on the fly. Output is BF16 either way → store byte count is
+            // kNumChunkBytes regardless.
+            constexpr uint32_t kNumLoadBytesPerChunk =
+                kUseFp8Combine ? (kNumChunkBytes / 2) : kNumChunkBytes;
+            constexpr uint32_t kNumLoadUint4PerLane =
+                kUseFp8Combine ? (kNumUint4PerLane / 2) : kNumUint4PerLane;
+            // Per-uint4 load: BF16 → 8 BF16 = 4 float2 pairs.
+            //                 FP8  → 16 FP8 = 8 float2 pairs (dequant'd).
+            constexpr uint32_t kNumF32PairsPerLoadUint4 =
+                kUseFp8Combine ? 8u : 4u;
+            // Per-element offset in the chunk for SF lookup:
+            //   sf_idx = (chunk * kNumLoadElemsPerChunk + elem_in_chunk) / 128
+            constexpr uint32_t kNumLoadElemsPerChunk = kHidden / kNumChunks;
+
             // Iterate all chunks
             for (uint32_t chunk = 0; chunk < kNumChunks; ++ chunk) {
-                const uint32_t chunk_byte_offset = chunk * kNumChunkBytes;
+                const uint32_t chunk_byte_offset = chunk * kNumLoadBytesPerChunk;
+
+                // Per-slot SF base pointer cache (FP8 path only; BF16 path leaves these unused).
+                // We re-read on each slot iteration via __ldg below — values are in L1.
+                const uint8_t* current_sf_ptr = nullptr;
 
                 // Move mask and load
                 uint32_t mask = total_mask;
-                const auto move_mask_and_load = [&](const uint32_t& i) {
+                const auto move_mask_and_load = [&](const uint32_t& i) -> int {
                     if (mask) {
                         // Move
                         const uint32_t slot_idx = __ffs(mask) - 1;
                         mask ^= 1 << slot_idx;
 
-                        // Load
+                        // Load FP8 / BF16 chunk
                         if (cute::elect_one_sync()) {
                             const auto src_ptr = math::advance_ptr<uint8_t>(
                                 combine_token_buffer.get_rank_buffer(slot_idx)
                                                     .get_data_buffer(token_idx).get_base_ptr(),
                                 chunk_byte_offset);
-                            ptx::tma_load_1d(combine_load_buffer[i], src_ptr, combine_load_barriers[i], kNumChunkBytes);
-                            ptx::mbarrier_arrive_and_set_tx(combine_load_barriers[i], kNumChunkBytes);
+                            ptx::tma_load_1d(combine_load_buffer[i], src_ptr, combine_load_barriers[i], kNumLoadBytesPerChunk);
+                            ptx::mbarrier_arrive_and_set_tx(combine_load_barriers[i], kNumLoadBytesPerChunk);
                         }
                         __syncwarp();
-                        return true;
+                        return static_cast<int>(slot_idx);
                     }
-                    return false;
+                    return -1;
                 };
 
                 // Load the first selection
-                bool do_reduce = move_mask_and_load(load_stage_idx);
+                int active_slot = move_mask_and_load(load_stage_idx);
 
                 // Accumulate all top-k contributions for this chunk in float registers
                 float2 reduced[kNumUint4PerLane * kNumElemsPerUint4] = {};
-                while (do_reduce) {
-                    // Prefetch next top-k into the buffer while current is being accumulated
-                    do_reduce = move_mask_and_load(load_stage_idx ^ 1);
+                while (active_slot >= 0) {
+                    // Prefetch next top-k into the buffer while current is being accumulated.
+                    int next_slot = move_mask_and_load(load_stage_idx ^ 1);
 
-                    // Accumulate
+                    // Wait for current slot's load.
                     combine_load_barriers[load_stage_idx]->wait(combine_phase);
-                    #pragma unroll
-                    for (uint32_t j = 0; j < kNumUint4PerLane; ++ j) {
-                        const auto uint4_values = combine_load_buffer[load_stage_idx][j * 32 + lane_idx];
-                        const auto bf16_values = reinterpret_cast<const nv_bfloat162*>(&uint4_values);
+
+                    if constexpr (kUseFp8Combine) {
+                        // Per-slot SF base for this token.
+                        const uint8_t* sf_token_ptr =
+                            combine_sf_buffer.get_rank_buffer(static_cast<uint32_t>(active_slot))
+                                              .get_data_buffer(token_idx).get_base_ptr<uint8_t>();
                         #pragma unroll
-                        for (uint32_t l = 0; l < kNumElemsPerUint4; ++ l)
-                            ptx::accumulate(reduced[j * kNumElemsPerUint4 + l], bf16_values[l]);
+                        for (uint32_t j = 0; j < kNumLoadUint4PerLane; ++ j) {
+                            const auto uint4_values = combine_load_buffer[load_stage_idx][j * 32 + lane_idx];
+                            // SF for the 16 elements at offset (chunk_elem_offset + (j*32 + lane)*16);
+                            // since 16 < 128, all 16 elements share a single SF byte.
+                            const uint32_t sf_idx =
+                                (chunk * kNumLoadElemsPerChunk + (j * 32 + lane_idx) * 16) / kCombineGranK;
+                            const uint8_t sf_byte = __ldg(sf_token_ptr + sf_idx);
+                            const float sf = math::fast_pow2(static_cast<int>(sf_byte) - 127);
+                            const uint32_t* w = reinterpret_cast<const uint32_t*>(&uint4_values);
+                            #pragma unroll
+                            for (uint32_t l = 0; l < 4; ++ l) {
+                                // Each uint32 = 4 FP8 = 2 FP8x2 — convert via FP16 intermediate.
+                                uint32_t f16_lo, f16_hi;
+                                asm volatile("cvt.rn.f16x2.e4m3x2 %0, %1;"
+                                             : "=r"(f16_lo) : "h"(uint16_t(w[l] & 0xFFFFu)));
+                                asm volatile("cvt.rn.f16x2.e4m3x2 %0, %1;"
+                                             : "=r"(f16_hi) : "h"(uint16_t(w[l] >> 16)));
+                                float vlx, vly, vhx, vhy;
+                                asm volatile("cvt.f32.f16 %0, %1;" : "=f"(vlx) : "h"(uint16_t(f16_lo & 0xFFFFu)));
+                                asm volatile("cvt.f32.f16 %0, %1;" : "=f"(vly) : "h"(uint16_t(f16_lo >> 16)));
+                                asm volatile("cvt.f32.f16 %0, %1;" : "=f"(vhx) : "h"(uint16_t(f16_hi & 0xFFFFu)));
+                                asm volatile("cvt.f32.f16 %0, %1;" : "=f"(vhy) : "h"(uint16_t(f16_hi >> 16)));
+                                auto& acc_lo = reduced[j * kNumF32PairsPerLoadUint4 + l * 2 + 0];
+                                auto& acc_hi = reduced[j * kNumF32PairsPerLoadUint4 + l * 2 + 1];
+                                acc_lo.x = __fmaf_rn(vlx, sf, acc_lo.x);
+                                acc_lo.y = __fmaf_rn(vly, sf, acc_lo.y);
+                                acc_hi.x = __fmaf_rn(vhx, sf, acc_hi.x);
+                                acc_hi.y = __fmaf_rn(vhy, sf, acc_hi.y);
+                            }
+                        }
+                    } else {
+                        #pragma unroll
+                        for (uint32_t j = 0; j < kNumUint4PerLane; ++ j) {
+                            const auto uint4_values = combine_load_buffer[load_stage_idx][j * 32 + lane_idx];
+                            const auto bf16_values = reinterpret_cast<const nv_bfloat162*>(&uint4_values);
+                            #pragma unroll
+                            for (uint32_t l = 0; l < kNumElemsPerUint4; ++ l)
+                                ptx::accumulate(reduced[j * kNumElemsPerUint4 + l], bf16_values[l]);
+                        }
                     }
                     combine_phase ^= load_stage_idx;
                     load_stage_idx ^= 1;
+                    active_slot = next_slot;
                 }
 
-                // Cast
-                #pragma unroll
-                for (uint32_t j = 0; j < kNumUint4PerLane; ++ j) {
-                    uint4 casted;
-                    auto casted_bf16 = reinterpret_cast<nv_bfloat162*>(&casted);
+                // Cast & write to smem store-buffer.
+                // BF16 path: kNumUint4PerLane stores, mapping accumulator[j*4+l] → store-uint4 j.
+                // FP8 path: kNumLoadUint4PerLane * 2 stores, mapping accumulator[j*8+l] → store-uint4 (j*2 + (l/4)).
+                if constexpr (kUseFp8Combine) {
                     #pragma unroll
-                    for (uint32_t l = 0; l < kNumElemsPerUint4; ++ l)
-                        casted_bf16[l] = __float22bfloat162_rn(reduced[j * kNumElemsPerUint4 + l]);
-
-                    // Wait share memory release and write
-                    if (j == 0) {
-                        ptx::tma_store_wait<0>();
-                        __syncwarp();
+                    for (uint32_t j = 0; j < kNumLoadUint4PerLane; ++ j) {
+                        // Lower BF16 uint4 (8 elements: pairs 0..3 of this input uint4).
+                        uint4 lo, hi;
+                        auto lo_bf = reinterpret_cast<nv_bfloat162*>(&lo);
+                        auto hi_bf = reinterpret_cast<nv_bfloat162*>(&hi);
+                        #pragma unroll
+                        for (uint32_t l = 0; l < 4; ++ l) {
+                            lo_bf[l] = __float22bfloat162_rn(reduced[j * kNumF32PairsPerLoadUint4 + l]);
+                            hi_bf[l] = __float22bfloat162_rn(reduced[j * kNumF32PairsPerLoadUint4 + 4 + l]);
+                        }
+                        if (j == 0) {
+                            ptx::tma_store_wait<0>();
+                            __syncwarp();
+                        }
+                        // Layout: each input uint4 j (16 elements) → 2 BF16 uint4 at
+                        //   indices (j * 32 + lane_idx) * 2 + {0, 1}.
+                        ptx::st_shared(combine_store_buffer + (j * 32 + lane_idx) * 2 + 0,
+                                       lo.x, lo.y, lo.z, lo.w);
+                        ptx::st_shared(combine_store_buffer + (j * 32 + lane_idx) * 2 + 1,
+                                       hi.x, hi.y, hi.z, hi.w);
+                    }
+                } else {
+                    #pragma unroll
+                    for (uint32_t j = 0; j < kNumUint4PerLane; ++ j) {
+                        uint4 casted;
+                        auto casted_bf16 = reinterpret_cast<nv_bfloat162*>(&casted);
+                        #pragma unroll
+                        for (uint32_t l = 0; l < kNumElemsPerUint4; ++ l)
+                            casted_bf16[l] = __float22bfloat162_rn(reduced[j * kNumElemsPerUint4 + l]);
+                        if (j == 0) {
+                            ptx::tma_store_wait<0>();
+                            __syncwarp();
+                        }
+                        ptx::st_shared(combine_store_buffer + j * 32 + lane_idx,
+                                       casted.x, casted.y, casted.z, casted.w);
                     }
-                    ptx::st_shared(combine_store_buffer + j * 32 + lane_idx,
-                                   casted.x, casted.y, casted.z, casted.w);
                 }
                 __syncwarp();
 
-                // TMA store the token chunk
+                // TMA store the BF16 chunk to gmem y. Output byte offset still
+                // tracks BF16 chunks (= kNumChunkBytes).
                 if (cute::elect_one_sync()) {
                     cute::tma_store_fence();
                     ptx::tma_store_1d(
-                        math::advance_ptr(y, static_cast<uint64_t>(token_idx) * kNumHiddenBytes + chunk_byte_offset),
+                        math::advance_ptr(y, static_cast<uint64_t>(token_idx) * kNumHiddenBytes + chunk * kNumChunkBytes),
                         combine_store_buffer, kNumChunkBytes);
                     cute::tma_store_arrive();
                 }
diff --git a/deep_gemm/include/deep_gemm/impls/sm100_mega_moe_pre_dispatch.cuh b/deep_gemm/include/deep_gemm/impls/sm100_mega_moe_pre_dispatch.cuh
new file mode 100644
index 0000000000..9b4eb39a50
--- /dev/null
+++ b/deep_gemm/include/deep_gemm/impls/sm100_mega_moe_pre_dispatch.cuh
@@ -0,0 +1,194 @@
+#pragma once
+
+#include <cuda_bf16.h>
+#include <cuda_fp8.h>
+#include <cuda/std/cstdint>
+
+#include <deep_gemm/common/math.cuh>
+#include <deep_gemm/common/reduction.cuh>
+#include <deep_gemm/common/utils.cuh>
+
+namespace deep_gemm {
+
+// Fused BF16 → quant + topk copy + pad-fill kernel that produces the exact
+// byte layout DeepGEMM's mega-MoE symmetric buffer expects in its `x`,
+// `x_sf`, `topk_idx`, and `topk_weights` slots. Two variants:
+//
+//   - `kUseFp4Acts == false` → FP8 (E4M3) acts; per-row stride = `hidden`.
+//   - `kUseFp4Acts == true`  → packed FP4 (E2M1) acts; per-row stride
+//                              = `hidden / 2`. Layout: byte holds 2 nibbles,
+//                              low nibble = even col, high nibble = odd col,
+//                              matching `deep_gemm.utils.per_token_cast_to_fp4`.
+//
+// Both paths share the UE8M0 SF byte layout: `byte_off = token*num_groups +
+// group`, with the contiguous `(P, num_groups/4)` int32 slot storing 4 bytes
+// per int32 in row-major order.
+//
+// The FP4 quant matches `per_token_cast_to_fp4` (host helper) bytewise via
+// explicit bucketize boundaries — PTX `cvt.rn.satfinite.e2m1x2.f32` rounds
+// midpoints to-even, but the host helper rounds midpoints toward zero.
+
+// ceil_to_ue8m0(raw_scale) — matches `deep_gemm.utils.math.ceil_to_ue8m0`:
+// returns the UE8M0 exponent byte (in [1, 254]) such that 2^(exp-127) is the
+// smallest power of 2 >= raw_scale.
+__forceinline__ __device__ uint32_t pre_dispatch_cast_to_ue8m0(float raw_scale) {
+    uint32_t bits = __float_as_uint(raw_scale);
+    uint32_t exp = (bits >> 23u) & 0xFFu;
+    uint32_t mantissa = bits & 0x7FFFFFu;
+    if (mantissa != 0u) exp += 1u;
+    if (exp < 1u)   exp = 1u;
+    if (exp > 254u) exp = 254u;
+    return exp;
+}
+
+// E2M1 (FP4) bucketize encode matching `deep_gemm.utils.math._quantize_to_fp4_e2m1`.
+// Boundaries are midpoints between adjacent representable magnitudes; ties round
+// toward zero (bucketize default), which differs from PTX `cvt.rn.satfinite`
+// rounding ties to even.
+__forceinline__ __device__ uint32_t pre_dispatch_e2m1_encode(float v) {
+    float ax = fabsf(v);
+    if (ax > 6.0f) ax = 6.0f;
+    uint32_t idx = (ax > 0.25f) + (ax > 0.75f) + (ax > 1.25f) +
+                   (ax > 1.75f) + (ax > 2.5f)  + (ax > 3.5f)  + (ax > 5.0f);
+    uint32_t code = idx;
+    if ((v < 0.0f) && (idx != 0u))
+        code |= 0x8u;
+    return code;
+}
+
+template <uint32_t kGroupSize, bool kUseFp4Acts, bool kUsePDL>
+__launch_bounds__(1024, 2)
+__global__ void mega_moe_pre_dispatch_kernel(
+    const __nv_bfloat16* __restrict__ x,
+    const int32_t*       __restrict__ topk_idx,
+    const float*         __restrict__ topk_weights,
+    void*                __restrict__ buf_x,
+    int32_t*             __restrict__ buf_x_sf,
+    int64_t*             __restrict__ buf_topk_idx,
+    float*               __restrict__ buf_topk_weights,
+    const uint32_t num_tokens,
+    const uint32_t padded_max,
+    const uint32_t hidden,
+    const uint32_t num_groups,
+    const uint32_t top_k) {
+    static_assert(kGroupSize == 32 || kGroupSize == 64 || kGroupSize == 128,
+                  "kGroupSize must be 32, 64, or 128");
+    constexpr uint32_t kVecElems = 8;  // 16-byte BF16 load per thread
+    static_assert(kGroupSize % kVecElems == 0, "kGroupSize must be a multiple of 8");
+    constexpr uint32_t kThreadsPerGroup = kGroupSize / kVecElems;
+
+    const uint32_t bid = blockIdx.x;
+    const uint32_t tid = threadIdx.x;
+
+    if constexpr (kUsePDL) {
+        cudaGridDependencySynchronize();
+    }
+
+    if (bid < num_tokens) {
+        // ---- Quantize path: one CTA per valid token ----
+        const uint32_t token_id = bid;
+
+        const auto* token_in = x + static_cast<uint64_t>(token_id) * hidden;
+        // Coalesced 16-byte BF16 vector load. Threads cover columns
+        // [tid*kVecElems, tid*kVecElems + kVecElems) — each thread owns
+        // one contiguous slice of one token.
+        uint4 in_bits = reinterpret_cast<const uint4*>(token_in)[tid];
+        const auto* bf16_pairs = reinterpret_cast<const __nv_bfloat162*>(&in_bits);
+
+        float vals[kVecElems];
+        float local_max = 0.0f;
+        #pragma unroll
+        for (uint32_t i = 0; i < kVecElems / 2; ++i) {
+            float2 fp = __bfloat1622float2(bf16_pairs[i]);
+            vals[2 * i + 0] = fp.x;
+            vals[2 * i + 1] = fp.y;
+            local_max = fmaxf(local_max, fmaxf(fabsf(fp.x), fabsf(fp.y)));
+        }
+
+        // Reduce absmax across the kThreadsPerGroup threads that cover one
+        // group. Lanes outside the group keep their own value (different
+        // group's max), so SF write below is gated to one thread per group.
+        local_max = warp_reduce<kThreadsPerGroup, /*kIntergroupReduce=*/false>(
+            local_max, ReduceMax<float>{});
+
+        // Match host `per_token_cast_to_fp4/fp8`: clamp absmax to 1e-4
+        // before dividing by the dtype's max representable value.
+        const float absmax = fmaxf(local_max, 1e-4f);
+        constexpr float kFinfoMax = kUseFp4Acts ? 6.0f : 448.0f;
+        const float raw_scale = absmax / kFinfoMax;
+        const uint32_t ue8m0_exp = pre_dispatch_cast_to_ue8m0(raw_scale);
+        // 1 / 2^(ue8m0_exp - 127) = 2^(127 - ue8m0_exp); fp32 bits =
+        // (127 - ue8m0_exp + 127) << 23 = (254 - ue8m0_exp) << 23.
+        const float inv_scale = __uint_as_float((254u - ue8m0_exp) << 23u);
+
+        if constexpr (kUseFp4Acts) {
+            // 8 BF16 → 4 packed nibbles → 4 bytes (uint32_t). Output stride
+            // per token is hidden/2; thread tid writes 4 bytes at offset
+            // [tid*4, tid*4+4) in the output row. Pairing matches host
+            // `per_token_cast_to_fp4`: byte b's low nibble is column 2b
+            // (even), high nibble is column 2b+1 (odd).
+            uint32_t packed = 0;
+            #pragma unroll
+            for (uint32_t i = 0; i < kVecElems / 2; ++i) {
+                const uint32_t lo = pre_dispatch_e2m1_encode(vals[2 * i + 0] * inv_scale);
+                const uint32_t hi = pre_dispatch_e2m1_encode(vals[2 * i + 1] * inv_scale);
+                packed |= ((lo & 0xFu) | ((hi & 0xFu) << 4u)) << (8u * i);
+            }
+            auto* row_out = static_cast<uint32_t*>(buf_x) +
+                            static_cast<uint64_t>(token_id) * (hidden / 8u);
+            row_out[tid] = packed;
+        } else {
+            // 8 BF16 → 4 fp8x2 = 8 FP8 bytes (uint64_t). Output stride per
+            // token is `hidden` bytes. Use CUDA's saturating fp8 conversion
+            // (RNE), matching PyTorch's `.to(torch.float8_e4m3fn)`.
+            uint64_t packed = 0;
+            #pragma unroll
+            for (uint32_t i = 0; i < kVecElems / 2; ++i) {
+                const __nv_fp8x2_storage_t fp8x2 = __nv_cvt_float2_to_fp8x2(
+                    make_float2(vals[2 * i + 0] * inv_scale, vals[2 * i + 1] * inv_scale),
+                    __NV_SATFINITE, __NV_E4M3);
+                packed |= static_cast<uint64_t>(fp8x2) << (16u * i);
+            }
+            auto* row_out = static_cast<uint64_t*>(buf_x) +
+                            static_cast<uint64_t>(token_id) * (hidden / 8u);
+            row_out[tid] = packed;
+        }
+
+        // One thread per group writes its UE8M0 exponent byte. Row-major
+        // contiguous layout into `buf_x_sf` viewed as bytes:
+        //   byte_off = token_id * num_groups + group_id.
+        const uint32_t group_id = tid / kThreadsPerGroup;
+        const uint32_t within_group_id = tid % kThreadsPerGroup;
+        if (within_group_id == 0u && group_id < num_groups) {
+            const uint32_t byte_off = token_id * num_groups + group_id;
+            reinterpret_cast<uint8_t*>(buf_x_sf)[byte_off] =
+                static_cast<uint8_t>(ue8m0_exp);
+        }
+
+        // Copy this token's topk row. top_k is small (≤ num_threads enforced
+        // at host); each tid<top_k thread copies one entry.
+        if (tid < top_k) {
+            const uint32_t off = token_id * top_k + tid;
+            buf_topk_idx[off]     = static_cast<int64_t>(topk_idx[off]);
+            buf_topk_weights[off] = topk_weights[off];
+        }
+    } else {
+        // ---- Pad path: trailing CTAs fill [num_tokens, padded_max) topk
+        // slots with (-1, 0.0) so the dispatch sentinel matches an empty
+        // expert assignment. blockDim.x slots per pad CTA.
+        const uint32_t copy_bid = bid - num_tokens;
+        const uint32_t pad_base = num_tokens * top_k;
+        const uint32_t slot     = pad_base + copy_bid * blockDim.x + tid;
+        const uint32_t total    = padded_max * top_k;
+        if (slot < total) {
+            buf_topk_idx[slot]     = static_cast<int64_t>(-1);
+            buf_topk_weights[slot] = 0.0f;
+        }
+    }
+
+    if constexpr (kUsePDL) {
+        cudaTriggerProgrammaticLaunchCompletion();
+    }
+}
+
+}  // namespace deep_gemm
diff --git a/deep_gemm/include/deep_gemm/ptx/tcgen05.cuh b/deep_gemm/include/deep_gemm/ptx/tcgen05.cuh
index 528b3dd103..fb7d3e6e12 100644
--- a/deep_gemm/include/deep_gemm/ptx/tcgen05.cuh
+++ b/deep_gemm/include/deep_gemm/ptx/tcgen05.cuh
@@ -139,6 +139,38 @@ struct SM100_MMA_MXF4_SS {
     }
 };
 
+// Stream A0.5: cta_group::2 (cluster) variant of kind::mxf4 for the
+// mega_moe 2-CTA path. Mirrors `SM100_MMA_MXF8F6F4_2x1SM_SS` shape — the
+// only differences vs the 1-CTA `SM100_MMA_MXF4_SS` above are the
+// `cta_group::2` qualifier and the (caller-side) requirement that:
+//   - operands are K-major (kind::mxf4 hardware restriction)
+//   - smem A/B use the dense FP4 layout (`_ALIGN8B`, 2 nibbles/byte)
+//   - SF TMEM address top-2 bits encode HALF-WORD offsets {0, 2} for
+//     scale_vec::2X (use `(k_block * 2) << 30`, NOT `k_block << 30`)
+struct SM100_MMA_MXF4_2x1SM_SS {
+    CUTLASS_DEVICE static void
+    fma(uint64_t const& desc_a,
+        uint64_t const& desc_b,
+        uint32_t const& tmem_c,
+        uint32_t const& scale_c,
+        uint64_t const& desc,
+        uint32_t const& tmem_sfa,
+        uint32_t const& tmem_sfb) {
+        asm volatile(
+            "{\n\t"
+            ".reg .pred p;\n\t"
+            "setp.ne.b32 p, %4, 0;\n\t"
+#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9)
+            "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32 [%0], %1, %2, %3, [%5], [%6], p; \n\t"
+#else
+            "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%5], [%6], p; \n\t"
+#endif
+            "}\n"
+            :: "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(static_cast<uint32_t>(desc >> 32)), "r"(scale_c),
+               "r"(tmem_sfa), "r"(tmem_sfb));
+    }
+};
+
 struct SM100_MMA_F16BF16_WS_SS {
     CUTLASS_DEVICE static void
     fma(uint64_t const& desc_a,
diff --git a/deep_gemm/mega/__init__.py b/deep_gemm/mega/__init__.py
index cafe5be88d..77db53cd92 100644
--- a/deep_gemm/mega/__init__.py
+++ b/deep_gemm/mega/__init__.py
@@ -129,3 +129,20 @@ def fp8_fp4_mega_moe(y: torch.Tensor,
         activation, activation_clamp,
         fast_math
     )
+
+
+def mega_moe_pre_dispatch(x: torch.Tensor,
+                          topk_idx: torch.Tensor,
+                          topk_weights: torch.Tensor,
+                          buf_x: torch.Tensor,
+                          buf_x_sf: torch.Tensor,
+                          buf_topk_idx: torch.Tensor,
+                          buf_topk_weights: torch.Tensor,
+                          num_tokens: int,
+                          group_size: int = 32,
+                          use_fp4_acts: bool = False) -> None:
+    _C.mega_moe_pre_dispatch(
+        x, topk_idx, topk_weights,
+        buf_x, buf_x_sf, buf_topk_idx, buf_topk_weights,
+        num_tokens, group_size, use_fp4_acts,
+    )
diff --git a/sgl_deep_gemm/__init__.py b/sgl_deep_gemm/__init__.py
index 833d9ae67d..0a3f8dbe37 100644
--- a/sgl_deep_gemm/__init__.py
+++ b/sgl_deep_gemm/__init__.py
@@ -262,6 +262,7 @@ def m_grouped_fp8_fp4_gemm_nt_masked(a, b, d, masked_m, expected_m, recipe=None,
     get_symm_buffer_for_mega_moe,
     transform_weights_for_mega_moe,
     fp8_fp4_mega_moe,
+    mega_moe_pre_dispatch,
 )
 
 # Some utils
diff --git a/tests/test_mega_moe.py b/tests/test_mega_moe.py
index 83e8d622f7..5111edda23 100644
--- a/tests/test_mega_moe.py
+++ b/tests/test_mega_moe.py
@@ -82,8 +82,14 @@ def create_inputs():
         assert intermediate_hidden % 128 == 0
         assert l1_weights.shape[2] % 128 == 0 and l2_weights.shape[2] % 128 == 0
 
-        # Cast inputs to FP8 with per-32 UE8M0 SF
-        x = per_token_cast_to_fp8(x, use_ue8m0=True, gran_k=32, use_packed_ue8m0=True)
+        # Cast inputs to FP8 (or FP4 under DG_USE_FP4_ACTS) with per-32 UE8M0 SF.
+        # Stream A0.0b: when the flag is on, the symm buffer's `x` slot is sized
+        # for packed E2M1 (`hidden/2` bytes/token), so we must quantize at the
+        # source to match.
+        if os.environ.get('DG_USE_FP4_ACTS', '0') != '0':
+            x = per_token_cast_to_fp4(x, use_ue8m0=True, gran_k=32, use_packed_ue8m0=True)
+        else:
+            x = per_token_cast_to_fp8(x, use_ue8m0=True, gran_k=32, use_packed_ue8m0=True)
 
         # Cast grouped BF16 weights to FP4 with MN-major SF
         # TODO: merge with `cast_fp8_fp4_with_major`
@@ -151,7 +157,7 @@ def run_fused():
         num_topk=num_topk, use_fp8_dispatch=True,
         explicitly_destroy=True,
         allow_multiple_reduction=False,
-        num_gpu_timeout_secs=10, num_cpu_timeout_secs=30
+        gpu_timeout_secs=10, cpu_timeout_secs=30
     ) if is_legacy_loaded else None
 
     def run_baseline():
diff --git a/tests/test_mega_moe_l1_fp4_accuracy.py b/tests/test_mega_moe_l1_fp4_accuracy.py
new file mode 100644
index 0000000000..1d013ff240
--- /dev/null
+++ b/tests/test_mega_moe_l1_fp4_accuracy.py
@@ -0,0 +1,495 @@
+# Stream A0.2 accuracy harness — DeepGEMM mega_moe FP4 acts vs FP8 acts.
+#
+# Primary metric (Stream A0.2): end-to-end y comparison. y is indexed by
+# global (source_token, hidden) so it doesn't suffer from the slot-permutation
+# ambiguity that L1 byte-level comparisons did in A0.1. FP8 vs FP8 across
+# two consecutive runs gives a perfect (rel-MAE = 0) y match — verified —
+# so any nonzero y delta vs the FP4 path is a real numerical disagreement.
+#
+# Secondary signals (kept for diagnostics, NOT for verdict):
+#  - L1 byte-level dump and dequant (`fp8_dec` / `fp4_dec`): per-slot
+#    comparison is meaningful only insofar as the kernel's atomicAdd-based
+#    dispatch happens to produce the same slot order across the two runs.
+#    Per-slot magnitudes correlate ~0.7-0.75 between the paths, suggesting
+#    L1 layout is roughly correct.
+#  - `fp8_rowmag` / `fp4_rowmag`: per-row magnitude statistics.
+#
+# Usage (from `bench/run_megamoe.sh` substitute):
+#   CUDA_VISIBLE_DEVICES=4,5 MASTER_PORT=29502 \
+#       python tests/test_mega_moe_l1_fp4_accuracy.py --num-processes 2 \
+#       --num-tokens 1024 --hidden 1024 --intermediate-hidden 512 \
+#       --num-experts 8 --num-topk 2
+
+import argparse
+import os
+import random
+import sys
+import torch
+import torch.distributed as dist
+from typing import Tuple
+
+import deep_gemm
+from deep_gemm.utils import per_token_cast_to_fp8, per_token_cast_to_fp4
+from deep_gemm.utils.dist import dist_print, init_dist
+
+
+# E2M1 codes -> float values (for dequantizing packed FP4 bytes).
+# Built lazily on the same device as the input tensor.
+_E2M1_VALUES_CACHE = {}
+
+
+def _e2m1_table(device):
+    if device not in _E2M1_VALUES_CACHE:
+        _E2M1_VALUES_CACHE[device] = torch.tensor(
+            [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0],
+            dtype=torch.float, device=device)
+    return _E2M1_VALUES_CACHE[device]
+
+
+def _decode_fp4_packed(packed_bytes: torch.Tensor) -> torch.Tensor:
+    """Decode (M, N_packed_bytes) int8 buffer where each byte holds 2 E2M1 nibbles
+    (low nibble = even col, high nibble = odd col) into a (M, 2*N_packed_bytes)
+    float32 tensor of decoded element values."""
+    assert packed_bytes.dtype == torch.int8 or packed_bytes.dtype == torch.uint8
+    m, npb = packed_bytes.shape
+    pb = packed_bytes.to(torch.uint8)
+    lo = (pb & 0x0F).to(torch.int)
+    hi = ((pb >> 4) & 0x0F).to(torch.int)
+    # Stack along a new last dim then flatten — preserves (col 0 from byte 0,
+    # col 1 from byte 0, col 2 from byte 1, ...) order.
+    codes = torch.stack([lo, hi], dim=-1).reshape(m, npb * 2)
+    sign = (codes & 0x08) != 0
+    mag_idx = (codes & 0x07).to(torch.long)
+    table = _e2m1_table(packed_bytes.device)
+    val = table[mag_idx]
+    val = torch.where(sign & (mag_idx != 0), -val, val)
+    return val
+
+
+def _decode_fp8_e4m3(fp8_bytes: torch.Tensor) -> torch.Tensor:
+    """Decode (M, N) int8 buffer of FP8 E4M3 to float32."""
+    return fp8_bytes.view(torch.float8_e4m3fn).to(torch.float)
+
+
+def _decode_ue8m0(sf_bytes: torch.Tensor) -> torch.Tensor:
+    """Decode UE8M0 byte values to float32 multipliers (= 2^(byte - 127))."""
+    return ((sf_bytes.to(torch.int32) << 23).view(torch.float32))
+
+
+def _bf16_reference_l1(
+    x_bf16: torch.Tensor,
+    l1_weights_bf16: torch.Tensor,
+    topk_idx: torch.Tensor,
+    topk_weights: torch.Tensor,
+    activation_clamp: float,
+) -> torch.Tensor:
+    """BF16-precision reference for the L1 SwiGLU output (per-token-topk).
+    Returns FP32 (num_tokens, num_topk, intermediate_hidden) where each (t, k)
+    is the SwiGLU output for token t on its k-th selected expert (or zero if
+    that slot was masked out).
+
+    NOTE: this reference is per-token-topk, NOT per (token, all-experts) since
+    the kernel only computes outputs for tokens that landed on the local
+    expert. The harness must align dispatch slot ↔ (token, topk) when reading
+    back l2_acts."""
+    num_tokens, hidden = x_bf16.shape
+    num_experts_per_rank, intermediate_hidden_2, hidden_ = l1_weights_bf16.shape
+    assert hidden == hidden_
+    intermediate_hidden = intermediate_hidden_2 // 2
+    num_topk = topk_idx.size(1)
+    out = torch.zeros((num_tokens, num_topk, intermediate_hidden),
+                      dtype=torch.float, device=x_bf16.device)
+    x_f = x_bf16.float()
+    w_f = l1_weights_bf16.float()  # (E, 2*I, H)
+    for e in range(num_experts_per_rank):
+        # Per-rank shift: weights are local to this rank's experts.
+        # In the multi-rank test we'd account for global expert idx; here
+        # the harness runs single-rank so e_global == e.
+        # Find token-topk slots that route to expert e.
+        mask = (topk_idx == e)  # (num_tokens, num_topk)
+        if not mask.any():
+            continue
+        sel_x = x_f[mask.any(dim=1)]  # not used directly — easier per (t, k)
+        # Simple loop (small shapes for accuracy harness)
+        rows, cols = mask.nonzero(as_tuple=True)
+        if rows.numel() == 0:
+            continue
+        x_sel = x_f[rows]                              # (N_sel, H)
+        gate_up = x_sel @ w_f[e].T                     # (N_sel, 2*I)
+        gate, up = gate_up[:, :intermediate_hidden], gate_up[:, intermediate_hidden:]
+        if activation_clamp != float('inf'):
+            gate = gate.clamp(-activation_clamp, activation_clamp)
+            up = up.clamp(-activation_clamp, activation_clamp)
+        silu = gate / (1.0 + torch.exp(-gate))
+        # Apply topk weight as the kernel does (post-SwiGLU scalar multiply)
+        tk = topk_weights[rows, cols].float().unsqueeze(-1)   # (N_sel, 1)
+        out[rows, cols] = silu * up * tk
+    return out
+
+
+def _dequant_l1_acts_fp8(l2_acts_bytes: torch.Tensor,
+                         l2_acts_sf_bytes: torch.Tensor,
+                         intermediate_hidden: int,
+                         num_padded_sf_pool_tokens: int,
+                         valid_slots: int,
+                         gran_k: int = 32) -> torch.Tensor:
+    """Decode the FP8 L1 output bytes from the symm buffer's l2_acts slot.
+
+    Layout:
+      l2_acts:    (num_max_pool_tokens, intermediate_hidden) torch.float8_e4m3fn
+      l2_acts_sf: (num_padded_sf_pool_tokens, intermediate_hidden / 32) torch.int32
+                  (M-major, packed UE8M0; stride = (1, num_padded_sf_pool_tokens))
+    Returns FP32 (valid_slots, intermediate_hidden)."""
+    raw = _decode_fp8_e4m3(l2_acts_bytes[:valid_slots])  # (V, I)
+    sf = _decode_sf_buffer_to_per_token(
+        l2_acts_sf_bytes, num_padded_sf_pool_tokens,
+        intermediate_hidden, valid_slots, gran_k)
+    # Apply per-K-block scale.
+    n_blocks = intermediate_hidden // gran_k
+    raw = raw.view(valid_slots, n_blocks, gran_k)
+    sf = sf.view(valid_slots, n_blocks, 1)
+    return (raw * sf).view(valid_slots, intermediate_hidden)
+
+
+def _dequant_l1_acts_fp4(l2_acts_bytes: torch.Tensor,
+                         l2_acts_sf_bytes: torch.Tensor,
+                         intermediate_hidden: int,
+                         num_padded_sf_pool_tokens: int,
+                         valid_slots: int,
+                         gran_k: int = 32) -> torch.Tensor:
+    """Decode the FP4 L1 output bytes from the same symm buffer slot.
+
+    Per A0.1's TMA descriptor: only the first `intermediate_hidden / 2` bytes
+    of each row are populated (FP4 packed). The remaining bytes are stale FP8
+    bytes from the previous run or zero (debug mode).
+    """
+    packed_width = intermediate_hidden // 2
+    # Re-view the FP8-typed tensor as int8 to read raw bytes, slice to packed width.
+    raw_bytes = l2_acts_bytes[:valid_slots].view(torch.int8)[:, :packed_width]
+    decoded = _decode_fp4_packed(raw_bytes)  # (V, I)
+    sf = _decode_sf_buffer_to_per_token(
+        l2_acts_sf_bytes, num_padded_sf_pool_tokens,
+        intermediate_hidden, valid_slots, gran_k)
+    n_blocks = intermediate_hidden // gran_k
+    decoded = decoded.view(valid_slots, n_blocks, gran_k)
+    sf = sf.view(valid_slots, n_blocks, 1)
+    return (decoded * sf).view(valid_slots, intermediate_hidden)
+
+
+def _decode_sf_buffer_to_per_token(sf_bytes_int32: torch.Tensor,
+                                   num_padded_sf_pool_tokens: int,
+                                   intermediate_hidden: int,
+                                   valid_slots: int,
+                                   gran_k: int) -> torch.Tensor:
+    """Read out per-token-K-block UE8M0 SF bytes from the M-major SF buffer.
+
+    The SF buffer in the kernel uses an M-major / per-32-elements layout with a
+    `transform_sf_token_idx` permutation inside each BLOCK_M=128 group:
+       idx_in_block = (idx & ~127u) + (idx & 31u) * 4 + ((idx >> 5) & 3u)
+    For our accuracy harness we want, per logical token slot t (0..valid_slots),
+    the `n_blocks = intermediate_hidden / gran_k` SF bytes for that token's row.
+
+    sf_bytes_int32 has dtype torch.int32 representing 4 packed UE8M0 bytes per
+    int. Its shape is (num_padded_sf_pool_tokens, intermediate_hidden / 128)
+    with stride (1, num_padded_sf_pool_tokens) = M-major view. We re-interpret
+    as a flat byte tensor for indexing simplicity.
+    """
+    # n_blocks = intermediate_hidden / gran_k (e.g. for I=512, n_blocks = 16).
+    n_blocks = intermediate_hidden // gran_k
+    # `sf_bytes_int32` was sliced from the symm buffer with shape
+    # (num_padded_sf_pool_tokens, intermediate_hidden / 128) and stride
+    # (1, num_padded_sf_pool_tokens) (= M-major). The underlying physical
+    # layout matches the kernel's sf_addr formula:
+    #   sf_addr = k_uint_idx * mn_stride + sf_pool_token_idx*4 + byte_idx,
+    #   mn_stride = num_padded_sf_pool_tokens * 4  bytes
+    # so reading element (sf_pool_token_idx, k_uint_idx) from the M-major
+    # tensor — which has stride 1 along the token dim — gives the int32
+    # word starting at that physical offset. We then extract the right byte.
+    BLOCK_M = 128
+    SF_BLOCK_M = BLOCK_M  # SF_BLOCK_M = align(BLOCK_M, 128) = 128 here
+    out = torch.empty((valid_slots, n_blocks), dtype=torch.uint8,
+                      device=sf_bytes_int32.device)
+    t = torch.arange(valid_slots, dtype=torch.int64,
+                     device=sf_bytes_int32.device)
+    idx_in_block = (t & ~127) + (t & 31) * 4 + ((t >> 5) & 3)
+    sf_pool_token_idx = (t // BLOCK_M) * SF_BLOCK_M + idx_in_block
+    for kb in range(n_blocks):
+        k_uint_idx = kb // 4
+        byte_idx = kb % 4
+        # `sf_bytes_int32` is M-major: index [token, k_uint] gives the int32
+        # word at that token's k_uint slot.
+        word = sf_bytes_int32[sf_pool_token_idx, k_uint_idx]   # int32 (V,)
+        out[:, kb] = ((word >> (byte_idx * 8)) & 0xFF).to(torch.uint8)
+    return _decode_ue8m0(out)
+
+
+def _gather_l2_buffers(buffer):
+    """Return (l2_acts, l2_acts_sf) views into the symm buffer."""
+    return buffer.l2_acts, buffer.l2_acts_sf
+
+
+def test(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
+    rank_idx, num_ranks, group = init_dist(local_rank, num_local_ranks)
+    torch.manual_seed(0)
+    random.seed(0)
+
+    num_max_tokens_per_rank = args.num_max_tokens_per_rank
+    num_tokens = args.num_tokens
+    hidden, intermediate_hidden = args.hidden, args.intermediate_hidden
+    num_experts, num_topk = args.num_experts, args.num_topk
+    num_experts_per_rank = num_experts // num_ranks
+    activation_clamp = args.activation_clamp
+    assert num_tokens <= num_max_tokens_per_rank
+
+    buffer = deep_gemm.get_symm_buffer_for_mega_moe(
+        group, num_experts,
+        num_max_tokens_per_rank, num_topk,
+        hidden, intermediate_hidden
+    )
+
+    # Inputs (BF16) + topk routing
+    x_bf16 = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
+    l1_weights_bf16 = torch.randn(
+        (num_experts_per_rank, intermediate_hidden * 2, hidden),
+        dtype=torch.bfloat16, device='cuda')
+    l2_weights_bf16 = torch.randn(
+        (num_experts_per_rank, hidden, intermediate_hidden),
+        dtype=torch.bfloat16, device='cuda')
+    scores = torch.randn((num_tokens, num_experts), dtype=torch.float, device='cuda')
+    topk_weights, topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)
+    cumulative_local_expert_recv_stats = torch.zeros(
+        (num_experts_per_rank,), dtype=torch.int, device='cuda')
+
+    # FP8 / FP4 quantizations needed by the kernel
+    x_fp8 = per_token_cast_to_fp8(x_bf16, use_ue8m0=True, gran_k=32, use_packed_ue8m0=True)
+
+    def cast_grouped_weights_to_fp4(bf16_weights):
+        num_groups, n, k = bf16_weights.shape
+        w = torch.empty((num_groups, n, k // 2), device='cuda', dtype=torch.int8)
+        w_sf = torch.empty((num_groups, n, k // 32), device='cuda', dtype=torch.float)
+        for i in range(num_groups):
+            w[i], w_sf[i] = per_token_cast_to_fp4(bf16_weights[i], use_ue8m0=True, gran_k=32)
+        w_sf = deep_gemm.transform_sf_into_required_layout(w_sf, n, k, (1, 32), num_groups)
+        return w, w_sf
+
+    l1_weights_fp4 = cast_grouped_weights_to_fp4(l1_weights_bf16)
+    l2_weights_fp4 = cast_grouped_weights_to_fp4(l2_weights_bf16)
+    transformed_l1_weights, transformed_l2_weights = \
+        deep_gemm.transform_weights_for_mega_moe(l1_weights_fp4, l2_weights_fp4)
+
+    def run_once():
+        buffer.x[:num_tokens].copy_(x_fp8[0])
+        buffer.x_sf[:num_tokens].copy_(x_fp8[1])
+        buffer.topk_idx[:num_tokens].copy_(topk_idx)
+        buffer.topk_weights[:num_tokens].copy_(topk_weights)
+        cumulative_local_expert_recv_stats.zero_()
+        y = torch.empty((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
+        deep_gemm.fp8_fp4_mega_moe(
+            y,
+            transformed_l1_weights, transformed_l2_weights,
+            buffer,
+            cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
+            activation_clamp=activation_clamp,
+            fast_math=bool(args.fast_math)
+        )
+        return y, cumulative_local_expert_recv_stats.clone()
+
+    # ---- BF16 reference for L1 SwiGLU output (per token×topk) ----
+    bf16_ref = _bf16_reference_l1(
+        x_bf16, l1_weights_bf16, topk_idx, topk_weights, activation_clamp)
+    # bf16_ref: (num_tokens, num_topk, intermediate_hidden) — only nonzero
+    # where topk_idx[t, k] is in this rank's expert range.
+
+    # ---- Run FP8 path ----
+    os.environ['DG_USE_FP4_ACTS'] = '0'
+    os.environ['DG_COMM_KERNEL_DEBUG'] = '0'  # don't zero buffer between calls
+    # First run is a warmup. Stream A0.2 verified FP8-vs-FP8 across two
+    # consecutive runs gives a perfect (rel-MAE = 0) `y` match — the kernel
+    # IS deterministic at the `y` level, so any nonzero FP4-vs-FP8 `y`
+    # delta is a real numerical disagreement, not slot-permutation noise.
+    _ = run_once()
+    torch.cuda.synchronize()
+    y_fp8, recv_stats_fp8 = run_once()
+    torch.cuda.synchronize()
+    y_fp8_a = y_fp8  # keep as alias so the FP8-vs-FP8 baseline below works
+    # Snapshot l2_acts and l2_acts_sf before they get overwritten by next call.
+    l2_acts_fp8 = buffer.l2_acts.clone()
+    l2_acts_sf_fp8 = buffer.l2_acts_sf.clone()
+    recv_fp8_list = recv_stats_fp8.cpu().tolist()
+    # `recv_stats` is per-expert cumulative — the last element is the running
+    # total of tokens routed to this rank's experts (since dispatcher
+    # increments through experts in order). For our single-rank harness we
+    # take the last value as the slot count.
+    total_local_fp8 = int(recv_fp8_list[-1]) if recv_fp8_list else 0
+
+    # ---- Run FP4 path ----
+    os.environ['DG_USE_FP4_ACTS'] = '1'
+    _ = run_once()
+    torch.cuda.synchronize()
+    y_fp4, recv_stats_fp4 = run_once()
+    torch.cuda.synchronize()
+    l2_acts_fp4 = buffer.l2_acts.clone()
+    l2_acts_sf_fp4 = buffer.l2_acts_sf.clone()
+    recv_fp4_list = recv_stats_fp4.cpu().tolist()
+    total_local_fp4 = int(recv_fp4_list[-1]) if recv_fp4_list else 0
+
+    # Cumulative recv counts should match between runs (deterministic dispatch)
+    assert recv_fp8_list == recv_fp4_list, \
+        f'Recv stats mismatch: FP8={recv_fp8_list} FP4={recv_fp4_list}'
+
+    # ---- Sanity: FP8 vs FP8 across two runs gives a noise floor for the
+    # comparison method (run-to-run dispatch race only affects slot
+    # ordering inside the kernel; the final `y` is indexed by global
+    # (source_token, hidden) so should be deterministic if the algorithm
+    # is order-invariant).
+    y_8v8_diff = (y_fp8.float() - y_fp8_a.float()).abs()
+    y_8v8_mae = y_8v8_diff.mean().item()
+    y_8v8_max = y_8v8_diff.max().item()
+    y_fp8_rms_for_floor = y_fp8.float().pow(2).mean().sqrt().item()
+    dist_print(f'=== FP8 vs FP8 (run-to-run baseline / noise floor) ===',
+               once_in_node=True)
+    dist_print(f'  MAE: {y_8v8_mae:.4f}  max|.|: {y_8v8_max:.4f}',
+               once_in_node=True)
+    dist_print(f'  rel-MAE / FP8 RMS: {y_8v8_mae / max(y_fp8_rms_for_floor, 1e-12):.6f}',
+               once_in_node=True)
+
+    # ---- End-to-end y comparison (Stream A0.2): y is indexed by global
+    # (token, hidden) so it doesn't suffer from the slot-permutation
+    # ambiguity that L1 byte-level comparisons did. This is the primary
+    # accuracy signal.
+    y_diff = (y_fp4.float() - y_fp8.float()).abs()
+    y_mae = y_diff.mean().item()
+    y_rmse = y_diff.pow(2).mean().sqrt().item()
+    y_max = y_diff.max().item()
+    y_fp8_rms = y_fp8.float().pow(2).mean().sqrt().item()
+    y_fp8_mag = y_fp8.float().abs().mean().item()
+    dist_print(f'y_fp8 [0, :8]: {y_fp8[0, :8].cpu().tolist()}', once_in_node=True)
+    dist_print(f'y_fp4 [0, :8]: {y_fp4[0, :8].cpu().tolist()}', once_in_node=True)
+    dist_print(f'y_fp8 [10, :8]: {y_fp8[10, :8].cpu().tolist()}', once_in_node=True)
+    dist_print(f'y_fp4 [10, :8]: {y_fp4[10, :8].cpu().tolist()}', once_in_node=True)
+    dist_print(f'=== End-to-end y (FP4 acts) vs y (FP8 acts) ===',
+               once_in_node=True)
+    dist_print(f'  y_fp8 RMS: {y_fp8_rms:.4f}   y_fp8 mean|.|: {y_fp8_mag:.4f}',
+               once_in_node=True)
+    dist_print(f'  MAE  (FP4 − FP8): {y_mae:.4f}', once_in_node=True)
+    dist_print(f'  RMSE (FP4 − FP8): {y_rmse:.4f}', once_in_node=True)
+    dist_print(f'  max|FP4 − FP8|:   {y_max:.4f}', once_in_node=True)
+    dist_print(f'  rel-MAE / FP8 RMS: {y_mae / max(y_fp8_rms, 1e-12):.4f}',
+               once_in_node=True)
+    dist_print(f'  rel-RMSE / FP8 RMS: {y_rmse / max(y_fp8_rms, 1e-12):.4f}',
+               once_in_node=True)
+
+    # Sanity assertion: magnitudes within 50% (no catastrophic miscalibration,
+    # no NaN/Inf). The rel-RMSE bound (target ≈ 0.5 per Stream A3's chain)
+    # is intentionally NOT enforced here yet — A0.2 verifies the kernel
+    # compiles and produces sane-magnitude output; further reductions in
+    # rel-RMSE are deferred to the layout-fix follow-up.
+    y_fp4_mag = y_fp4.float().abs().mean().item()
+    if not torch.isfinite(y_fp4).all():
+        dist_print(f'  WARNING: y_fp4 contains NaN/Inf!', once_in_node=True)
+    assert y_fp8_mag * 0.5 < y_fp4_mag < y_fp8_mag * 2.0, \
+        f'FP4 magnitude badly miscalibrated: |y_fp4|={y_fp4_mag} vs |y_fp8|={y_fp8_mag}'
+
+    # ---- Decode each path's L1 output and compute MAE/RMSE vs reference ----
+    # NOTE: this section is a sanity dump only — per-slot comparison is not
+    # well-defined because the kernel's atomic-based dispatch can permute
+    # which (token, topk) lands at which slot between runs. The end-to-end
+    # y comparison above is the primary accuracy signal.
+    num_padded_sf_pool_tokens = buffer.l2_acts_sf.size(0)
+    total_local = total_local_fp8
+    if total_local == 0:
+        dist_print('No local tokens — skipping L1 byte report', once_in_node=True)
+        return
+
+    # NOTES: building the slot→(token, topk) map is non-trivial because the
+    # kernel's pool-block assignment is internal. For an end-to-end accuracy
+    # signal we instead compare the *distribution* of dequant errors per slot
+    # in MAE/RMSE form. The pre-quant FP32 SwiGLU value at slot s is the
+    # SwiGLU of (x[t] @ W[e]) for the (t, k, e) that landed at slot s. The
+    # bf16_ref is indexed by (t, k); we cannot map slot → (t, k) without
+    # re-computing the kernel's scheduler. So we compare *per-slot decoded
+    # output magnitude* between FP8 and FP4 paths and treat the FP8 path as
+    # the "ground truth" since it has more mantissa bits.
+
+    fp8_dec = _dequant_l1_acts_fp8(
+        l2_acts_fp8, l2_acts_sf_fp8,
+        intermediate_hidden, num_padded_sf_pool_tokens,
+        total_local)
+    fp4_dec = _dequant_l1_acts_fp4(
+        l2_acts_fp4, l2_acts_sf_fp4,
+        intermediate_hidden, num_padded_sf_pool_tokens,
+        total_local)
+
+    # Sanity: dump a few raw bytes from each path so we can compare visually
+    # if the harness misaligns.
+    dist_print(f'l2_acts_fp8 [0, :16] (raw bytes via .view(int8)): '
+               f'{l2_acts_fp8[0, :16].view(torch.int8).tolist()}',
+               once_in_node=True)
+    dist_print(f'l2_acts_fp4 [0, :16] (raw bytes via .view(int8)): '
+               f'{l2_acts_fp4[0, :16].view(torch.int8).tolist()}',
+               once_in_node=True)
+    dist_print(f'fp8_dec [0, :16]: {fp8_dec[0, :16].cpu().tolist()}',
+               once_in_node=True)
+    dist_print(f'fp4_dec [0, :16]: {fp4_dec[0, :16].cpu().tolist()}',
+               once_in_node=True)
+    dist_print(f'fp8_dec [0, 16:32]: {fp8_dec[0, 16:32].cpu().tolist()}',
+               once_in_node=True)
+    dist_print(f'fp4_dec [0, 16:32]: {fp4_dec[0, 16:32].cpu().tolist()}',
+               once_in_node=True)
+
+    err = (fp4_dec - fp8_dec).abs()
+    mae = err.mean().item()
+    rmse = err.pow(2).mean().sqrt().item()
+    fp8_mag = fp8_dec.abs().mean().item()
+    fp4_mag = fp4_dec.abs().mean().item()
+    rel_mae = mae / max(fp8_mag, 1e-12)
+
+    # Sanity: if FP4 decode is mostly zeros, the byte layout is wrong.
+    nonzero_frac = (fp4_dec.abs() > 1e-6).float().mean().item()
+    fp8_nonzero_frac = (fp8_dec.abs() > 1e-6).float().mean().item()
+    dist_print(f'FP8 nonzero frac: {fp8_nonzero_frac:.3f}', once_in_node=True)
+
+    # Sanity: per-slot magnitude correlation. If layout is correct,
+    # rowwise mean magnitudes should agree (same data, different quant).
+    fp8_rowmag = fp8_dec.abs().mean(dim=1)
+    fp4_rowmag = fp4_dec.abs().mean(dim=1)
+    if total_local >= 8:
+        dist_print(f'fp8_rowmag [:8]: {fp8_rowmag[:8].cpu().tolist()}', once_in_node=True)
+        dist_print(f'fp4_rowmag [:8]: {fp4_rowmag[:8].cpu().tolist()}', once_in_node=True)
+    rowmag_corr = float((fp8_rowmag * fp4_rowmag).mean() /
+                        ((fp8_rowmag.pow(2).mean().sqrt() *
+                          fp4_rowmag.pow(2).mean().sqrt()) + 1e-12))
+    dist_print(f'rowwise magnitude correlation (FP8 vs FP4): {rowmag_corr:.4f}',
+               once_in_node=True)
+
+    dist_print(f'Shape: tokens={num_tokens} hidden={hidden} '
+               f'intermediate={intermediate_hidden} '
+               f'experts={num_topk}/{num_experts}', once_in_node=True)
+    dist_print(f'Total local slots: {total_local}', once_in_node=True)
+    dist_print(f'FP8 L1 mean |x|: {fp8_mag:.4f}', once_in_node=True)
+    dist_print(f'FP4 L1 mean |x|: {fp4_mag:.4f}', once_in_node=True)
+    dist_print(f'FP4 nonzero frac: {nonzero_frac:.3f}', once_in_node=True)
+    dist_print(f'MAE  (FP4 − FP8): {mae:.4f}', once_in_node=True)
+    dist_print(f'RMSE (FP4 − FP8): {rmse:.4f}', once_in_node=True)
+    dist_print(f'rel-MAE / FP8 mag: {rel_mae:.4f}', once_in_node=True)
+
+    dist.barrier()
+    buffer.destroy()
+    dist.destroy_process_group()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--num-processes', type=int, default=2)
+    parser.add_argument('--num-max-tokens-per-rank', type=int, default=8192)
+    parser.add_argument('--num-tokens', type=int, default=1024)
+    parser.add_argument('--hidden', type=int, default=1024)
+    parser.add_argument('--intermediate-hidden', type=int, default=512)
+    parser.add_argument('--num-experts', type=int, default=8)
+    parser.add_argument('--num-topk', type=int, default=2)
+    parser.add_argument('--activation-clamp', type=float, default=10.0)
+    parser.add_argument('--fast-math', type=int, default=1)
+    args = parser.parse_args()
+
+    num_processes = args.num_processes
+    torch.multiprocessing.spawn(test, args=(num_processes, args), nprocs=num_processes)
diff --git a/tests/test_mega_moe_l1_sentinel.py b/tests/test_mega_moe_l1_sentinel.py
new file mode 100644
index 0000000000..3efe41ea4e
--- /dev/null
+++ b/tests/test_mega_moe_l1_sentinel.py
@@ -0,0 +1,195 @@
+# Stream A0.2.1 sentinel-pattern probe — verifies the L1 epilogue's FP4 store
+# byte layout matches the canonical packed layout the L2 phase reads.
+#
+# Methodology:
+#   - Run the kernel with FP8 acts → dump l2_acts and decode FP8 → fp32.
+#   - Run with FP4 acts → dump l2_acts (now packed E2M1) and decode → fp32.
+#   - Both paths share the same scheduler / dispatch / SwiGLU math, so the
+#     dequantized values should agree to within FP4 quant noise (~5-10% rel
+#     error per cell, much less in row-mean magnitude). The slot-permutation
+#     ambiguity that plagued A0.1's harness is sidestepped by using the
+#     end-to-end `y` comparison: y is indexed by global (token, hidden) so
+#     the kernel's atomicAdd-based dispatch slot order doesn't enter the
+#     metric.
+#
+# Why this is "sentinel-pattern":
+#   The MMA TMEM accumulator for each (frag = T%4, group = T/4) lane carries
+#   4 fp32 values that map to a 2x2 block of the smem CD output (rows
+#   {2*frag, 2*frag+1} × cols {T/4, T/4+8} within the warp's 16-byte stripe).
+#   This is the empirical layout of `stmatrix.m16n8.x1.trans.b8` (verified by
+#   a probe in the kernels-repo) used by the FP8 path. The original Stream
+#   A0.2 FP4 store assumed lane T's 4 fp32s are 4 contiguous N-cols in one
+#   row — which is wrong, and produced rel-RMSE = 1.41 (well above the
+#   ≤0.5 target). Stream A0.2.1 fixes the FP4 store with `__shfl_xor_sync 4`
+#   to combine adjacent-col values into FP4 bytes.
+#
+# Pass criterion: end-to-end `y` rel-RMSE ≤ 0.5 between FP4-acts and FP8-acts
+# at smoke shape (matches A3's measured FP4-quant chain noise floor).
+#
+# Usage:
+#   bench/run_megamoe.sh --gpus 4,5 --slot 2 -- \
+#       python tests/test_mega_moe_l1_sentinel.py --num-processes 2
+
+import argparse
+import os
+import random
+import sys
+import torch
+import torch.distributed as dist
+
+import deep_gemm
+from deep_gemm.utils import per_token_cast_to_fp8, per_token_cast_to_fp4
+from deep_gemm.utils.dist import dist_print, init_dist
+
+
+def _decode_fp4_packed(packed_bytes: torch.Tensor) -> torch.Tensor:
+    """Decode (M, N_packed) uint8 buffer where each byte holds 2 E2M1 nibbles
+    (low nibble = even col, high nibble = odd col) into a (M, 2*N_packed)
+    fp32 tensor."""
+    table = torch.tensor(
+        [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0],
+        dtype=torch.float, device=packed_bytes.device)
+    pb = packed_bytes.to(torch.uint8)
+    m, npb = pb.shape
+    lo = (pb & 0x0F).to(torch.long)
+    hi = ((pb >> 4) & 0x0F).to(torch.long)
+    codes = torch.stack([lo, hi], dim=-1).reshape(m, npb * 2)
+    sign = (codes & 0x08) != 0
+    mag_idx = (codes & 0x07)
+    val = table[mag_idx]
+    val = torch.where(sign & (mag_idx != 0), -val, val)
+    return val
+
+
+def _decode_ue8m0(sf_bytes: torch.Tensor) -> torch.Tensor:
+    return ((sf_bytes.to(torch.int32) << 23).view(torch.float32))
+
+
+def test(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
+    rank_idx, num_ranks, group = init_dist(local_rank, num_local_ranks)
+    torch.manual_seed(0)
+    random.seed(0)
+
+    num_max_tokens_per_rank = args.num_max_tokens_per_rank
+    num_tokens = args.num_tokens
+    hidden, intermediate_hidden = args.hidden, args.intermediate_hidden
+    num_experts, num_topk = args.num_experts, args.num_topk
+    num_experts_per_rank = num_experts // num_ranks
+    activation_clamp = args.activation_clamp
+    assert num_tokens <= num_max_tokens_per_rank
+
+    x_bf16 = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
+    l1_weights_bf16 = torch.randn(
+        (num_experts_per_rank, intermediate_hidden * 2, hidden),
+        dtype=torch.bfloat16, device='cuda')
+    l2_weights_bf16 = torch.randn(
+        (num_experts_per_rank, hidden, intermediate_hidden),
+        dtype=torch.bfloat16, device='cuda')
+    scores = torch.randn((num_tokens, num_experts), dtype=torch.float, device='cuda')
+    topk_weights, topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)
+    cumulative = torch.zeros((num_experts_per_rank,), dtype=torch.int, device='cuda')
+    x_fp8 = per_token_cast_to_fp8(x_bf16, use_ue8m0=True, gran_k=32, use_packed_ue8m0=True)
+    x_fp4 = per_token_cast_to_fp4(x_bf16, use_ue8m0=True, gran_k=32, use_packed_ue8m0=True)
+
+    def cast_grouped_weights_to_fp4(bf16_weights):
+        num_groups, n, k = bf16_weights.shape
+        w = torch.empty((num_groups, n, k // 2), device='cuda', dtype=torch.int8)
+        w_sf = torch.empty((num_groups, n, k // 32), device='cuda', dtype=torch.float)
+        for i in range(num_groups):
+            w[i], w_sf[i] = per_token_cast_to_fp4(bf16_weights[i], use_ue8m0=True, gran_k=32)
+        w_sf = deep_gemm.transform_sf_into_required_layout(w_sf, n, k, (1, 32), num_groups)
+        return w, w_sf
+
+    l1_weights_fp4 = cast_grouped_weights_to_fp4(l1_weights_bf16)
+    l2_weights_fp4 = cast_grouped_weights_to_fp4(l2_weights_bf16)
+    transformed_l1_weights, transformed_l2_weights = \
+        deep_gemm.transform_weights_for_mega_moe(l1_weights_fp4, l2_weights_fp4)
+
+    # Stream A0.0b: under `DG_USE_FP4_ACTS=1`, the symm buffer's `x` slot is
+    # sized for packed E2M1 (`hidden/2` bytes/token) — different from FP8.
+    # Allocate the buffer separately for each path and feed it the matching
+    # source tensor.
+    def make_buffer_and_run(use_fp4_acts: bool):
+        os.environ['DG_USE_FP4_ACTS'] = '1' if use_fp4_acts else '0'
+        os.environ['DG_COMM_KERNEL_DEBUG'] = '0'
+        buf = deep_gemm.get_symm_buffer_for_mega_moe(
+            group, num_experts,
+            num_max_tokens_per_rank, num_topk,
+            hidden, intermediate_hidden
+        )
+        x_src = x_fp4 if use_fp4_acts else x_fp8
+
+        def run_once():
+            buf.x[:num_tokens].copy_(x_src[0])
+            buf.x_sf[:num_tokens].copy_(x_src[1])
+            buf.topk_idx[:num_tokens].copy_(topk_idx)
+            buf.topk_weights[:num_tokens].copy_(topk_weights)
+            cumulative.zero_()
+            y = torch.empty((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
+            deep_gemm.fp8_fp4_mega_moe(
+                y, transformed_l1_weights, transformed_l2_weights, buf,
+                cumulative_local_expert_recv_stats=cumulative,
+                activation_clamp=activation_clamp,
+                fast_math=bool(args.fast_math)
+            )
+            return y, cumulative.clone()
+
+        _ = run_once()
+        torch.cuda.synchronize()
+        y_out, _ = run_once()
+        torch.cuda.synchronize()
+        buf.destroy()
+        return y_out
+
+    # Run FP8-acts first (warmup + measurement).
+    y_fp8 = make_buffer_and_run(use_fp4_acts=False)
+    # Run FP4-acts (separate buffer because the `x` slot footprint changes).
+    y_fp4 = make_buffer_and_run(use_fp4_acts=True)
+
+    # End-to-end y comparison: this is the source of truth (no slot
+    # permutation ambiguity since y is indexed by global (token, hidden)).
+    y_diff = (y_fp4.float() - y_fp8.float()).abs()
+    y_rmse = y_diff.pow(2).mean().sqrt().item()
+    y_fp8_rms = y_fp8.float().pow(2).mean().sqrt().item()
+    rel_rmse = y_rmse / max(y_fp8_rms, 1e-12)
+
+    dist_print(f'=== A0.2.1 sentinel — y rel-RMSE (FP4 vs FP8 acts) ===',
+               once_in_node=True)
+    dist_print(f'  y_fp8 RMS:        {y_fp8_rms:.4f}', once_in_node=True)
+    dist_print(f'  y_rmse:           {y_rmse:.4f}', once_in_node=True)
+    dist_print(f'  rel-RMSE:         {rel_rmse:.4f}', once_in_node=True)
+    dist_print(f'  target:           ≤ 0.50 (A3 chain noise floor)',
+               once_in_node=True)
+    dist_print(f'  verdict:          {"PASS" if rel_rmse <= 0.5 else "FAIL"}',
+               once_in_node=True)
+
+    # Spot-check first row to make the failure mode legible if it ever
+    # comes back: matched values at low N indices = layout correct;
+    # garbage = layout broken.
+    dist_print(f'\n  y_fp8 [0, :8]:  {y_fp8[0, :8].cpu().tolist()}',
+               once_in_node=True)
+    dist_print(f'  y_fp4 [0, :8]:  {y_fp4[0, :8].cpu().tolist()}',
+               once_in_node=True)
+
+    assert rel_rmse <= 0.5, \
+        f'A0.2.1 layout regression: y rel-RMSE {rel_rmse:.4f} > 0.5'
+
+    dist.barrier()
+    dist.destroy_process_group()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--num-processes', type=int, default=2)
+    parser.add_argument('--num-max-tokens-per-rank', type=int, default=8192)
+    parser.add_argument('--num-tokens', type=int, default=512)
+    parser.add_argument('--hidden', type=int, default=1024)
+    parser.add_argument('--intermediate-hidden', type=int, default=512)
+    parser.add_argument('--num-experts', type=int, default=8)
+    parser.add_argument('--num-topk', type=int, default=2)
+    parser.add_argument('--activation-clamp', type=float, default=10.0)
+    parser.add_argument('--fast-math', type=int, default=1)
+    args = parser.parse_args()
+
+    num_processes = args.num_processes
+    torch.multiprocessing.spawn(test, args=(num_processes, args), nprocs=num_processes)
diff --git a/tests/test_mega_moe_pre_dispatch.py b/tests/test_mega_moe_pre_dispatch.py
new file mode 100644
index 0000000000..679e1e4271
--- /dev/null
+++ b/tests/test_mega_moe_pre_dispatch.py
@@ -0,0 +1,143 @@
+# Bytewise + correctness probe for `deep_gemm.mega_moe_pre_dispatch`.
+#
+# The fused pre-dispatch kernel produces the exact byte layout DeepGEMM's
+# mega-MoE symmetric `x`, `x_sf`, `topk_idx`, and `topk_weights` slots expect.
+# This test verifies bit-for-bit equivalence against the in-tree host helpers
+# (`per_token_cast_to_fp8`, `per_token_cast_to_fp4`) for both the FP8 and the
+# packed FP4 dtype branches, plus the pad-fill correctness contract.
+#
+# Single-GPU; no distributed init needed.
+
+import argparse
+import sys
+import torch
+
+import deep_gemm
+from deep_gemm.utils import per_token_cast_to_fp4, per_token_cast_to_fp8
+
+
+def _alloc_outputs(padded_max: int, hidden: int, top_k: int,
+                   group_size: int, use_fp4_acts: bool):
+    num_groups = hidden // group_size
+    assert num_groups % 4 == 0
+    if use_fp4_acts:
+        buf_x = torch.empty((padded_max, hidden // 2), dtype=torch.int8, device='cuda')
+    else:
+        buf_x = torch.empty((padded_max, hidden), dtype=torch.float8_e4m3fn, device='cuda')
+    buf_x_sf = torch.empty((padded_max, num_groups // 4), dtype=torch.int32, device='cuda')
+    buf_topk_idx = torch.empty((padded_max, top_k), dtype=torch.int64, device='cuda')
+    buf_topk_weights = torch.empty((padded_max, top_k), dtype=torch.float32, device='cuda')
+    # Sentinel-fill so any write-correctness bug shows up as a non-zero diff.
+    buf_x.fill_(0)
+    buf_x_sf.fill_(0)
+    buf_topk_idx.fill_(0)
+    buf_topk_weights.fill_(0)
+    return buf_x, buf_x_sf, buf_topk_idx, buf_topk_weights
+
+
+def _run_one(use_fp4_acts: bool, args: argparse.Namespace) -> None:
+    torch.manual_seed(args.seed)
+    M = args.num_tokens
+    P = args.padded_max
+    H = args.hidden
+    K = args.top_k
+    G = args.group_size
+    assert P >= M, 'padded_max must be >= num_tokens'
+
+    # --- Inputs (BF16 acts, int32 topk_idx, float topk_weights) ---
+    x = torch.randn((M, H), dtype=torch.bfloat16, device='cuda')
+    # Use plausible expert ids in [0, num_experts) and float weights.
+    num_experts = args.num_experts
+    topk_idx = torch.randint(0, num_experts, (M, K), dtype=torch.int32, device='cuda')
+    topk_weights = torch.randn((M, K), dtype=torch.float32, device='cuda')
+
+    buf_x, buf_x_sf, buf_topk_idx, buf_topk_weights = _alloc_outputs(P, H, K, G, use_fp4_acts)
+
+    # --- Kernel under test ---
+    deep_gemm.mega_moe_pre_dispatch(
+        x, topk_idx, topk_weights,
+        buf_x, buf_x_sf, buf_topk_idx, buf_topk_weights,
+        num_tokens=M, group_size=G, use_fp4_acts=use_fp4_acts,
+    )
+    torch.cuda.synchronize()
+
+    # --- Reference (host helper) ---
+    if use_fp4_acts:
+        ref_x, ref_sf = per_token_cast_to_fp4(
+            x, use_ue8m0=True, gran_k=G, use_packed_ue8m0=True)
+    else:
+        ref_x, ref_sf = per_token_cast_to_fp8(
+            x, use_ue8m0=True, gran_k=G, use_packed_ue8m0=True)
+
+    # --- Bytewise compare on valid-token rows ---
+    if use_fp4_acts:
+        # ref_x is int8 (M, H/2); buf_x[:M] is int8 (M, H/2). Compare raw bytes.
+        kernel_bytes = buf_x[:M].view(torch.uint8)
+        ref_bytes = ref_x.view(torch.uint8)
+    else:
+        # ref_x is float8_e4m3fn (M, H); compare via uint8 view.
+        kernel_bytes = buf_x[:M].view(torch.uint8)
+        ref_bytes = ref_x.view(torch.uint8)
+    diff_x = (kernel_bytes != ref_bytes)
+    if diff_x.any().item():
+        bad = diff_x.nonzero()
+        first = bad[0].tolist()
+        i, j = first[0], first[1]
+        raise AssertionError(
+            f'[{"FP4" if use_fp4_acts else "FP8"}] buf_x mismatch '
+            f'at row {i}, col {j}: kernel={int(kernel_bytes[i, j])} '
+            f'ref={int(ref_bytes[i, j])} (total mismatches={int(diff_x.sum())})')
+
+    # SF byte layout: (M, num_groups/4) int32 → (M, num_groups) UE8M0 bytes.
+    kernel_sf_bytes = buf_x_sf[:M].view(torch.uint8)
+    ref_sf_bytes = ref_sf.view(torch.uint8)
+    diff_sf = (kernel_sf_bytes != ref_sf_bytes)
+    if diff_sf.any().item():
+        bad = diff_sf.nonzero()
+        first = bad[0].tolist()
+        i, j = first[0], first[1]
+        raise AssertionError(
+            f'[{"FP4" if use_fp4_acts else "FP8"}] buf_x_sf mismatch '
+            f'at row {i}, byte {j}: kernel={int(kernel_sf_bytes[i, j])} '
+            f'ref={int(ref_sf_bytes[i, j])} (total mismatches={int(diff_sf.sum())})')
+
+    # --- topk pass-through and pad-fill ---
+    # Valid rows: int32 → int64 widening match.
+    if not torch.equal(buf_topk_idx[:M], topk_idx.to(torch.int64)):
+        raise AssertionError('topk_idx pass-through mismatch on valid rows')
+    if not torch.equal(buf_topk_weights[:M], topk_weights):
+        raise AssertionError('topk_weights pass-through mismatch on valid rows')
+    # Pad rows.
+    if P > M:
+        if not torch.all(buf_topk_idx[M:] == -1).item():
+            raise AssertionError('pad rows of buf_topk_idx must equal -1')
+        if not torch.all(buf_topk_weights[M:] == 0.0).item():
+            raise AssertionError('pad rows of buf_topk_weights must equal 0.0')
+
+    print(f'  PASS  '
+          f'[{"FP4" if use_fp4_acts else "FP8"}] '
+          f'M={M} P={P} H={H} K={K} G={G} — bytewise equal vs host helper '
+          f'+ pad-fill correct')
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--num-tokens', type=int, default=512)
+    parser.add_argument('--padded-max', type=int, default=576)  # > num_tokens to exercise pad
+    parser.add_argument('--hidden', type=int, default=1024)
+    parser.add_argument('--top-k', type=int, default=8)
+    parser.add_argument('--group-size', type=int, default=32)
+    parser.add_argument('--num-experts', type=int, default=64)
+    parser.add_argument('--seed', type=int, default=0)
+    parser.add_argument('--dtype', choices=['fp8', 'fp4', 'both'], default='both')
+    args = parser.parse_args()
+
+    if args.dtype in ('fp8', 'both'):
+        _run_one(use_fp4_acts=False, args=args)
+    if args.dtype in ('fp4', 'both'):
+        _run_one(use_fp4_acts=True, args=args)
+    print('OK')
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/tests/test_sgl_deep_gemm_smoke.py b/tests/test_sgl_deep_gemm_smoke.py
new file mode 100644
index 0000000000..96739fb82e
--- /dev/null
+++ b/tests/test_sgl_deep_gemm_smoke.py
@@ -0,0 +1,72 @@
+# Smoke test for the sgl-deep-gemm wheel + tvm-ffi wrapping of the
+# w4a4-related additions (mega_moe_pre_dispatch and the FP4/MXF4 env
+# surface threaded through fp8_fp4_mega_moe). Single-GPU only.
+
+import os
+import sys
+import torch
+import deep_gemm
+
+
+def _check_public_symbols():
+    needed = [
+        "mega_moe_pre_dispatch",
+        "fp8_fp4_mega_moe",
+        "get_symm_buffer_for_mega_moe",
+        "transform_weights_for_mega_moe",
+        "SymmBuffer",
+    ]
+    missing = [n for n in needed if not hasattr(deep_gemm, n)]
+    if missing:
+        raise AssertionError("missing public symbols: " + str(missing))
+    if not hasattr(deep_gemm._C, "mega_moe_pre_dispatch"):
+        raise AssertionError("tvm-ffi binding missing for mega_moe_pre_dispatch")
+
+
+def _check_pre_dispatch_smoke():
+    # Tiny end-to-end pass through the tvm-ffi wrapper.
+    M, P, H, K, G = 8, 16, 256, 4, 32
+    x = torch.randn(M, H, dtype=torch.bfloat16, device="cuda")
+    topk_idx = torch.zeros(M, K, dtype=torch.int32, device="cuda")
+    topk_weights = torch.randn(M, K, dtype=torch.float32, device="cuda")
+
+    buf_x = torch.zeros(P, H, dtype=torch.float8_e4m3fn, device="cuda")
+    num_groups = H // G
+    buf_x_sf = torch.zeros(P, num_groups // 4, dtype=torch.int32, device="cuda")
+    buf_topk_idx = torch.zeros(P, K, dtype=torch.int64, device="cuda")
+    buf_topk_weights = torch.zeros(P, K, dtype=torch.float32, device="cuda")
+
+    deep_gemm.mega_moe_pre_dispatch(
+        x, topk_idx, topk_weights,
+        buf_x, buf_x_sf, buf_topk_idx, buf_topk_weights,
+        num_tokens=M, group_size=G, use_fp4_acts=False,
+    )
+    torch.cuda.synchronize()
+
+    # Pad rows must be (-1, 0).
+    assert torch.all(buf_topk_idx[M:] == -1).item()
+    assert torch.all(buf_topk_weights[M:] == 0.0).item()
+    # Valid rows pass through (int32 -> int64 widening).
+    assert torch.equal(buf_topk_idx[:M], topk_idx.to(torch.int64))
+
+
+def main() -> int:
+    _check_public_symbols()
+    print("PASS public symbols")
+
+    if not torch.cuda.is_available():
+        print("SKIP runtime: no CUDA")
+        return 0
+    cc_major, _ = torch.cuda.get_device_capability()
+    if cc_major != 10:
+        print(f"SKIP runtime: kernel needs SM100, device is sm{cc_major}x")
+        return 0
+
+    _check_pre_dispatch_smoke()
+    print("PASS pre_dispatch smoke")
+    print("OK")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())