From 00fa0bf65a35c111c94dc8924ee4e6eda8285629 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= <ludwig.boess@outlook.de>
Date: Tue, 19 May 2026 19:25:22 +0200
Subject: [PATCH 1/7] port: team policy and vendor specific sort

---
 CMakeLists.txt                              |   66 +
 cmake/defaults.cmake                        |   15 +
 cmake/report.cmake                          |   25 +
 src/engines/srpic/currents.h                |  135 +-
 src/framework/containers/particles.h        |   58 +
 src/framework/containers/particles_sort.cpp |  341 ++++-
 src/global/arch/kokkos_aliases.h            |   27 +
 src/global/utils/sort_dispatch.h            |  171 +++
 src/global/utils/sorting.h                  |  155 ++-
 src/kernels/currents_deposit.hpp            | 1395 ++++++++++++-------
 tests/framework/CMakeLists.txt              |    6 +
 tests/framework/sort_by_key.cpp             |  110 ++
 tests/global/tiling.cpp                     |   29 +-
 tests/kernels/CMakeLists.txt                |    3 +
 tests/kernels/deposit_tiled.cpp             |  262 ++++
 15 files changed, 2286 insertions(+), 512 deletions(-)
 create mode 100644 src/global/utils/sort_dispatch.h
 create mode 100644 tests/framework/sort_by_key.cpp
 create mode 100644 tests/kernels/deposit_tiled.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd5f0c258..acac1d7d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,16 @@ set(gpu_aware_mpi
   ${default_gpu_aware_mpi}
   CACHE BOOL "Enable GPU-aware MPI")
 
+set(team_policy
+  ${default_team_policy}
+  CACHE BOOL "Enable team_policy tile-blocked deposit/pusher kernels")
+set(team_policy_tile_size
+  ${default_team_policy_tile_size}
+  CACHE STRING "team_policy tile edge length in cells")
+set(team_policy_tile_sizes
+  "4;6;8;10;12"
+  CACHE STRING "team_policy tile-size choices")
+
 # -------------------------- Compilation settings -------------------------- #
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -136,6 +146,62 @@ else()
   set(DEVICE_ENABLED OFF)
 endif()
 
+# ------------------------------ team_policy wiring ------------------------ #
+if(${team_policy})
+  list(FIND team_policy_tile_sizes "${team_policy_tile_size}" _tps_idx)
+  if(_tps_idx EQUAL -1)
+    message(FATAL_ERROR
+      "${Red}team_policy_tile_size must be one of ${team_policy_tile_sizes}, "
+      "got '${team_policy_tile_size}'${ColorReset}")
+  endif()
+  add_compile_options("-D TEAM_POLICY")
+  add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}")
+
+  # Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically
+  # when found; falls back to Kokkos::BinSort otherwise.
+  if("${Kokkos_DEVICES}" MATCHES "SYCL")
+    find_package(oneDPL QUIET)
+    if(oneDPL_FOUND)
+      message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key")
+      add_compile_options("-D ONEDPL_ENABLED")
+      set(DEPENDENCIES ${DEPENDENCIES} oneDPL)
+    else()
+      message(STATUS "team_policy: oneDPL not found; using BinSort fallback "
+                     "for SYCL sort_by_key")
+    endif()
+  endif()
+
+  if("${Kokkos_DEVICES}" MATCHES "CUDA")
+    find_package(Thrust QUIET)
+    if(Thrust_FOUND)
+      message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key")
+      add_compile_options("-D THRUST_ENABLED")
+    else()
+      message(STATUS "team_policy: Thrust not found; using BinSort fallback "
+                     "for CUDA sort_by_key")
+    endif()
+  endif()
+
+  if("${Kokkos_DEVICES}" MATCHES "HIP")
+    # rocThrust ships with ROCm and exposes the same thrust:: API. Using
+    # it lets the HIP backend build a single permutation via
+    # sort_by_key and gather all SoA members through one reused buffer,
+    # instead of the legacy per-member Kokkos::BinSort path which
+    # allocates a fresh `sorted_values` buffer for every member every
+    # step (the dominant source of allocator churn / fragmentation on
+    # ROCm).
+    find_package(rocthrust QUIET)
+    if(rocthrust_FOUND)
+      message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key")
+      add_compile_options("-D ROCTHRUST_ENABLED")
+      set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust)
+    else()
+      message(STATUS "team_policy: rocThrust not found; using BinSort "
+                     "fallback for HIP sort_by_key")
+    endif()
+  endif()
+endif()
+
 # MPI
 if(${mpi})
   find_or_fetch_dependency(MPI FALSE REQUIRED)
diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake
index fb8790019..a85accf84 100644
--- a/cmake/defaults.cmake
+++ b/cmake/defaults.cmake
@@ -92,3 +92,18 @@ else()
 endif()
 
 set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL)
+
+if(DEFINED ENV{Entity_ENABLE_TEAM_POLICY})
+  set(default_team_policy
+      $ENV{Entity_ENABLE_TEAM_POLICY}
+      CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
+else()
+  set(default_team_policy
+      OFF
+      CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
+endif()
+set_property(CACHE default_team_policy PROPERTY TYPE BOOL)
+
+set(default_team_policy_tile_size
+    8
+    CACHE INTERNAL "Default tile edge length in cells for team_policy")
diff --git a/cmake/report.cmake b/cmake/report.cmake
index 7b145bfc7..65a22a7a6 100644
--- a/cmake/report.cmake
+++ b/cmake/report.cmake
@@ -122,6 +122,26 @@ if(${mpi} AND ${DEVICE_ENABLED})
     GPU_AWARE_MPI_REPORT
     46)
 endif()
+printchoices(
+  "Team Policy"
+  "team_policy"
+  "${ON_OFF_VALUES}"
+  ${team_policy}
+  OFF
+  "${Green}"
+  TEAM_POLICY_REPORT
+  46)
+if(${team_policy})
+  printchoices(
+    "Team Tile Size"
+    "team_policy_tile_size"
+    "${team_policy_tile_sizes}"
+    ${team_policy_tile_size}
+    ${default_team_policy_tile_size}
+    "${Blue}"
+    TEAM_POLICY_TILE_SIZE_REPORT
+    46)
+endif()
 printchoices(
   "Debug mode"
   "DEBUG"
@@ -197,6 +217,11 @@ if(${mpi} AND ${DEVICE_ENABLED})
   string(APPEND REPORT_TEXT "  " ${GPU_AWARE_MPI_REPORT} "\n")
 endif()
 
+string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_REPORT} "\n")
+if(${team_policy})
+  string(APPEND REPORT_TEXT "  " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n")
+endif()
+
 string(
   APPEND
   REPORT_TEXT
diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h
index 3afabea8a..faf0bb3ad 100644
--- a/src/engines/srpic/currents.h
+++ b/src/engines/srpic/currents.h
@@ -2,7 +2,8 @@
  * @file engines/srpic/currents.h
  * @brief Current deposition and filtering routines for the SRPIC engine
  * @implements
- *   - ntt::srpic::CallDepositKernel<> -> void
+ *   - ntt::srpic::CallDepositKernel<> -> void                 (flat path)
+ *   - ntt::srpic::CallDepositKernelTiled<> -> void            (TEAM_POLICY)
  *   - ntt::srpic::CurrentsDeposit<> -> void
  *   - ntt::srpic::CurrentsFilter<> -> void
  * @namespaces:
@@ -61,11 +62,142 @@ namespace ntt {
                              dt));
     }
 
+#if defined(TEAM_POLICY)
+    /**
+     * @brief Tiled deposit launcher (TeamPolicy + per-team scratch).
+     *
+     * Iterates over `tile_layout.ntiles_total` teams; each team accumulates
+     * its tile's particle contributions in SLM scratch and atomically
+     * flushes to the global J. Requires the species to have been sorted
+     * with `team_policy` enabled (`tile_layout` populated by
+     * `SortSpatially`).
+     *
+     * Falls back to the flat kernel if `tile_offsets` is empty — this
+     * happens on the first step before the first sort, or for very small
+     * species that exited early in `SortSpatially`. The fallback uses the
+     * passed-in `scatter_cur` so the caller still composes correctly.
+     */
+    template <SRMetricClass M, unsigned short O>
+    void CallDepositKernelTiled(
+      const Particles<M::Dim, M::CoordType>& species,
+      const M&                               local_metric,
+      const ndfield_t<M::Dim, 3>&            cur,
+      real_t                                 dt) {
+      static_assert(O <= 11u, "Shape order must be <= 11");
+      constexpr unsigned short T = static_cast<unsigned short>(
+        TEAM_POLICY_TILE_SIZE);
+      const auto& layout = species.tile_layout();
+      raise::ErrorIf(layout.ntiles_total == 0u,
+                     "CallDepositKernelTiled: tile_layout has 0 tiles — call "
+                     "SortSpatially before CurrentsDeposit",
+                     HERE);
+      raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u,
+                     "CallDepositKernelTiled: tile_offsets size inconsistent "
+                     "with ntiles_total",
+                     HERE);
+
+      using kernel_t = kernel::DepositCurrents_kernel_tiled<SimEngine::SRPIC,
+                                                            M,
+                                                            O,
+                                                            T>;
+      kernel_t kern { cur,
+                      species.i1,
+                      species.i2,
+                      species.i3,
+                      species.i1_prev,
+                      species.i2_prev,
+                      species.i3_prev,
+                      species.dx1,
+                      species.dx2,
+                      species.dx3,
+                      species.dx1_prev,
+                      species.dx2_prev,
+                      species.dx3_prev,
+                      species.ux1,
+                      species.ux2,
+                      species.ux3,
+                      species.phi,
+                      species.weight,
+                      species.tag,
+                      local_metric,
+                      (real_t)(species.charge()),
+                      dt,
+                      layout };
+
+      Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
+                                  Kokkos::AUTO);
+      policy.set_scratch_size(0, Kokkos::PerTeam(kernel_t::scratch_bytes()));
+      Kokkos::parallel_for("CurrentsDepositTiled", policy, kern);
+    }
+#endif // TEAM_POLICY
+
     template <SRMetricClass M>
     void CurrentsDeposit(Domain<SimEngine::SRPIC, M>& domain,
                          const prm::Parameters&       engine_params) {
       const auto dt = engine_params.get<real_t>("dt");
       Kokkos::deep_copy(domain.fields.cur, ZERO);
+
+#if defined(TEAM_POLICY)
+
+      // First-step fallback: if any contributing species has not been
+      // sorted yet (tile_layout still empty), fall back to the flat
+      // scatter-view path for that step. Subsequent steps see populated
+      // layouts and use the tiled kernel.
+      bool any_unsorted = false;
+      for (auto& species : domain.species) {
+        if ((species.pusher() == ParticlePusher::NONE) or
+            (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+          continue;
+        }
+        if (species.tile_layout().ntiles_total == 0u or
+            species.tile_layout().tile_offsets.extent(0) == 0u) {
+          any_unsorted = true;
+          break;
+        }
+      }
+      if (any_unsorted) {
+        auto scatter_cur = Kokkos::Experimental::create_scatter_view(
+          domain.fields.cur);
+        for (auto& species : domain.species) {
+          if ((species.pusher() == ParticlePusher::NONE) or
+              (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+            continue;
+          }
+          logger::Checkpoint(
+            fmt::format("Launching currents deposit (flat fallback, no sort yet) "
+                        "for %d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
+            HERE);
+          CallDepositKernel<M, SHAPE_ORDER>(species,
+                                             domain.mesh.metric,
+                                             scatter_cur,
+                                             dt);
+        }
+        Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
+      } else {
+        for (auto& species : domain.species) {
+          if ((species.pusher() == ParticlePusher::NONE) or
+              (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
+            continue;
+          }
+          logger::Checkpoint(
+            fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f",
+                        species.index(),
+                        species.label().c_str(),
+                        species.npart(),
+                        (double)species.charge()),
+            HERE);
+
+          CallDepositKernelTiled<M, SHAPE_ORDER>(species,
+                                                  domain.mesh.metric,
+                                                  domain.fields.cur,
+                                                  dt);
+        }
+      }
+#else
       auto scatter_cur = Kokkos::Experimental::create_scatter_view(
         domain.fields.cur);
       for (auto& species : domain.species) {
@@ -84,6 +216,7 @@ namespace ntt {
         CallDepositKernel<M, SHAPE_ORDER>(species, domain.mesh.metric, scatter_cur, dt);
       }
       Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
+#endif
     }
 
     template <SRMetricClass M>
diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h
index 895026552..0a15cb5d9 100644
--- a/src/framework/containers/particles.h
+++ b/src/framework/containers/particles.h
@@ -25,6 +25,7 @@
 #include "traits/metric.h"
 #include "utils/error.h"
 #include "utils/formatting.h"
+#include "utils/sorting.h"
 
 #include "framework/containers/species.h"
 #include "framework/domain/grid.h"
@@ -90,6 +91,30 @@ namespace ntt {
     const uint8_t m_ntags { (uint8_t)(2 + math::pow(3, (int)D) - 1) };
 #endif
 
+    // team_policy: tile metadata produced by SortSpatially
+    // and consumed by the tiled deposit / pusher kernels. Lazily
+    // allocated on first sort. The sort backend itself (oneDPL on SYCL,
+    // Thrust on CUDA, std::sort on Host, Kokkos::BinSort otherwise) is
+    // selected at compile time based on the Kokkos device and the
+    // vendor libraries detected by CMake.
+    TileLayout<D> m_tile_layout {};
+
+#if defined(TEAM_POLICY) &&                                                    \
+  ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                       \
+   (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                       \
+   (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)))
+    // Persistent byte scratch reused by every SoA-member gather in
+    // `apply_permutation_to_soa`, across all members and all timesteps.
+    // Without this each member would allocate (and free) its own
+    // transient buffer every sort; recycling one persistent buffer
+    // removes that allocation churn entirely — the structural fix for
+    // the ROCm sort slowdown / fragmentation. Grown monotonically to
+    // the largest required size, never shrunk. Kokkos device
+    // allocations are over-aligned (>= 8 B), so reinterpreting the
+    // bytes as any SoA element type (<= 8 B PODs) is well-defined.
+    array_t<char*> m_perm_scratch {};
+#endif
+
   public:
     // for empty allocation
     Particles() {}
@@ -276,9 +301,42 @@ namespace ntt {
     /**
      * @brief Sort particles spatially by their cell indices
      * @param grid The grid object to get the cell information for sorting
+     * @note In team_policy mode (compile-time `team_policy=ON`), also
+     *       populates `m_tile_layout` with tile-offset and per-tile
+     *       permutation metadata that the tiled deposit/pusher kernels
+     *       consume.
      */
     void SortSpatially(const Grid<D>&);
 
+#if defined(TEAM_POLICY) &&                                                    \
+  ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                       \
+   (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                       \
+   (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)))
+  private:
+    /**
+     * @brief Apply a particle-index permutation (built by oneDPL/Thrust
+     *        sort_by_key) to every SoA member array. Sequential — one
+     *        transient buffer at a time, fenced before scope exit.
+     *        Only compiled when a vendor sort backend is enabled; the
+     *        BinSort path applies the permutation in place via
+     *        `sorter.sort(view)` instead.
+     */
+    void apply_permutation_to_soa(const prtl_perm_t& perm);
+
+  public:
+#endif
+
+    /**
+     * @brief Read-only access to the tile layout produced by the most
+     *        recent SortSpatially call. Returns a default-constructed
+     *        layout (`ntiles_total == 0`) when the species has not yet
+     *        been sorted.
+     */
+    [[nodiscard]]
+    auto tile_layout() const -> const TileLayout<D>& {
+      return m_tile_layout;
+    }
+
     /**
      * @brief Copy particle data from device to host.
      */
diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp
index 904fb3fc7..c04813e12 100644
--- a/src/framework/containers/particles_sort.cpp
+++ b/src/framework/containers/particles_sort.cpp
@@ -8,10 +8,22 @@
 #include "framework/containers/particles.h"
 #include "framework/domain/grid.h"
 
+#if defined(TEAM_POLICY)
+  #if (defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                    \
+      (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                    \
+      (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))
+    #define TEAM_POLICY_USE_VENDOR_SORT
+    #include "utils/sort_dispatch.h"
+  #endif
+#endif
+
 #include <Kokkos_Core.hpp>
 #include <Kokkos_ScatterView.hpp>
+#include <Kokkos_Sort.hpp>
 #include <Kokkos_StdAlgorithms.hpp>
 
+#include <algorithm>
+#include <cstddef>
 #include <string>
 #include <utility>
 #include <vector>
@@ -195,6 +207,192 @@ namespace ntt {
 
   template <Dimension D, Coord::type C>
   void Particles<D, C>::SortSpatially(const Grid<D>& grid) {
+#if defined(TEAM_POLICY)
+    // ---------------------- team_policy: tile-based sort ------------------ //
+    const auto npart_local = npart();
+    if (npart_local == 0u) {
+      m_tile_layout = TileLayout<D> {};
+      m_is_sorted   = true;
+      return;
+    }
+
+    constexpr unsigned short T = static_cast<unsigned short>(
+      TEAM_POLICY_TILE_SIZE);
+    static_assert(T > 0u, "TEAM_POLICY_TILE_SIZE must be > 0");
+
+    // 1. Compute per-axis tile counts and total_tiles.
+    const auto ncells_active = grid.n_active();
+    ncells_t   ntx[3] { 1u, 1u, 1u };
+    ncells_t   total_tiles { 1u };
+    if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) {
+      ntx[0]       = static_cast<ncells_t>(math::ceil(
+        static_cast<double>(ncells_active[0]) / static_cast<double>(T)));
+      total_tiles *= ntx[0];
+    }
+    if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) {
+      ntx[1]       = static_cast<ncells_t>(math::ceil(
+        static_cast<double>(ncells_active[1]) / static_cast<double>(T)));
+      total_tiles *= ntx[1];
+    }
+    if constexpr (D == Dim::_3D) {
+      ntx[2]       = static_cast<ncells_t>(math::ceil(
+        static_cast<double>(ncells_active[2]) / static_cast<double>(T)));
+      total_tiles *= ntx[2];
+    }
+
+    // 2. Compute per-particle tile key (with min(i, i_prev)).
+    array_t<ncells_t*> tile_indices { "tile_indices", npart_local };
+    Kokkos::parallel_for(
+      "FillTileIndices",
+      rangeActiveParticles(),
+      sort::PositionToTileIndex<D, false, true> { i1,
+                                                   i2,
+                                                   i3,
+                                                   tag,
+                                                   tile_indices,
+                                                   ncells_active,
+                                                   static_cast<ncells_t>(T),
+                                                   array_t<npart_t*> {},
+                                                   i1_prev,
+                                                   i2_prev,
+                                                   i3_prev });
+
+    // 3. Sort. Vendor library (oneDPL/Thrust) when compiled in;
+    //    Kokkos::BinSort otherwise. n_bins = total_tiles + 2 covers
+    //    the dead-particle sentinel bin (total_tiles + 1u).
+    const ncells_t n_bins = total_tiles + 2u;
+    const auto     slice  = prtl_slice_t(0, npart_local);
+  #if defined(TEAM_POLICY_USE_VENDOR_SORT)
+    // Vendor path: produce an explicit permutation via sort_by_key,
+    // then apply it to each SoA member with a sequential one-buffer
+    // gather (peak transient = one `npart × sizeof(member)` buffer.
+    prtl_perm_t perm { "tile_perm", npart_local };
+    #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+    sort_helpers::sort_by_key_dispatch(tile_indices,
+                                       perm,
+                                       n_bins,
+                                       sort::backend::OneDPL {});
+    #elif defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+    sort_helpers::sort_by_key_dispatch(tile_indices,
+                                       perm,
+                                       n_bins,
+                                       sort::backend::Rocthrust {});
+    #else
+    sort_helpers::sort_by_key_dispatch(tile_indices,
+                                       perm,
+                                       n_bins,
+                                       sort::backend::Thrust {});
+    #endif
+    Kokkos::fence("SortSpatially: pre-gather drain");
+    apply_permutation_to_soa(perm);
+  #else
+    // BinSort path: same mechanism as legacy SortSpatially (BinSort
+    // allocates one temp View per `sorter.sort(view)` call and frees
+    // it before the next), so peak transient memory is bounded.
+    using sorter_op_t = Kokkos::BinOp1D<array_t<ncells_t*>>;
+    using sorter_t    = Kokkos::BinSort<array_t<ncells_t*>, sorter_op_t>;
+    auto bin_op       = sorter_op_t { static_cast<int>(n_bins), 0u, n_bins };
+    auto sorter       = sorter_t { tile_indices, bin_op, false };
+    sorter.create_permute_vector();
+    if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
+      sorter.sort(Kokkos::subview(i1, slice));
+      sorter.sort(Kokkos::subview(i1_prev, slice));
+      sorter.sort(Kokkos::subview(dx1, slice));
+      sorter.sort(Kokkos::subview(dx1_prev, slice));
+    }
+    if constexpr (D == Dim::_2D or D == Dim::_3D) {
+      sorter.sort(Kokkos::subview(i2, slice));
+      sorter.sort(Kokkos::subview(i2_prev, slice));
+      sorter.sort(Kokkos::subview(dx2, slice));
+      sorter.sort(Kokkos::subview(dx2_prev, slice));
+    }
+    if constexpr (D == Dim::_3D) {
+      sorter.sort(Kokkos::subview(i3, slice));
+      sorter.sort(Kokkos::subview(i3_prev, slice));
+      sorter.sort(Kokkos::subview(dx3, slice));
+      sorter.sort(Kokkos::subview(dx3_prev, slice));
+    }
+    sorter.sort(Kokkos::subview(ux1, slice));
+    sorter.sort(Kokkos::subview(ux2, slice));
+    sorter.sort(Kokkos::subview(ux3, slice));
+    sorter.sort(Kokkos::subview(weight, slice));
+    sorter.sort(Kokkos::subview(tag, slice));
+    if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
+      sorter.sort(Kokkos::subview(phi, slice));
+    }
+    for (auto pldr { 0u }; pldr < npld_r(); ++pldr) {
+      sorter.sort(Kokkos::subview(pld_r, slice, pldr));
+    }
+    for (auto pldi { 0u }; pldi < npld_i(); ++pldi) {
+      sorter.sort(Kokkos::subview(pld_i, slice, pldi));
+    }
+    // Apply the same permutation to `tile_indices` itself so it ends
+    // monotonically non-decreasing for the offsets pass below.
+    sorter.sort(tile_indices);
+  #endif // TEAM_POLICY_USE_VENDOR_SORT
+
+    // 5. Compute per-tile prefix-sum `tile_offsets` for the tiled
+    //    pusher. `tile_indices` is now sorted (monotonically
+    //    non-decreasing for alive particles, dead sentinel
+    //    `total_tiles + 1` clustered at the end) — vendor sort_by_key
+    //    sorts keys in place; the BinSort path explicitly applies the
+    //    same permutation to `tile_indices` above. Transition-detect
+    //    directly on it: the start of each non-empty tile is the only
+    //    place a write happens — atomic-free in the dense branch.
+    //    Empty tiles (no particles) are filled by a reverse pass on a
+    //    small host mirror (`total_tiles ≈ 176K` at production scale →
+    //    ~700 KB).
+    {
+      array_t<npart_t*> tile_offsets { "tile_offsets", total_tiles + 1u };
+      Kokkos::deep_copy(tile_offsets, static_cast<npart_t>(npart_local));
+
+      const auto total_tiles_v = total_tiles;
+      auto       ti_v          = tile_indices;
+      Kokkos::parallel_for(
+        "DetectTileBoundaries",
+        rangeActiveParticles(),
+        Lambda(prtlidx_t p) {
+          const auto t_curr   = ti_v(p);
+          const bool boundary = (p == 0u) || (ti_v(p - 1u) != t_curr);
+          if (!boundary) {
+            return;
+          }
+          if (t_curr < total_tiles_v) {
+            tile_offsets(t_curr) = p;
+          } else {
+            // First dead particle — also marks the alive_count boundary
+            // stored at index total_tiles.
+            Kokkos::atomic_min(&tile_offsets(total_tiles_v), p);
+          }
+        });
+
+      auto h_offsets = Kokkos::create_mirror_view(tile_offsets);
+      Kokkos::deep_copy(h_offsets, tile_offsets);
+      for (auto t = static_cast<std::size_t>(total_tiles); t-- > 0u;) {
+        if (h_offsets(t) > h_offsets(t + 1u)) {
+          h_offsets(t) = h_offsets(t + 1u);
+        }
+      }
+      Kokkos::deep_copy(tile_offsets, h_offsets);
+
+      m_tile_layout.tile_offsets = tile_offsets;
+    }
+
+    // 6. Populate `m_tile_layout` size/shape. `tile_perm` is not used
+    //    in the current design — the SoA arrays are physically permuted
+    //    into tile order, so consumers iterate
+    //    `[tile_offsets(t), tile_offsets(t+1))` directly without a
+    //    separate permutation indirection.
+    m_tile_layout.ntiles_per_axis[0] = ntx[0];
+    m_tile_layout.ntiles_per_axis[1] = ntx[1];
+    m_tile_layout.ntiles_per_axis[2] = ntx[2];
+    m_tile_layout.ntiles_total       = total_tiles;
+    m_tile_layout.tile_size          = T;
+    m_tile_layout.tile_perm          = prtl_perm_t {};
+    m_is_sorted                      = true;
+
+    Kokkos::fence("SortSpatially: end of team_policy path");
+#else  // !TEAM_POLICY — legacy in-place BinSort by global cell index
     const auto nx2         = grid.n_active(in::x2);
     const auto nx3         = grid.n_active(in::x3);
     const auto total_cells = grid.num_active();
@@ -250,13 +448,153 @@ namespace ntt {
     for (auto pldi { 0u }; pldi < npld_i(); ++pldi) {
       sorter.sort(Kokkos::subview(pld_i, slice, pldi));
     }
+#endif // TEAM_POLICY
+  }
+
+#if defined(TEAM_POLICY_USE_VENDOR_SORT)
+  namespace permute_helpers {
+
+    // Permute a 1D SoA member array `arr` in place by `perm`, gathering
+    // through `scratch` — a persistent byte buffer reused by every
+    // member and every timestep (no per-call allocation). An unmanaged
+    // typed view aliases the scratch bytes; the caller guarantees
+    // `scratch` is large enough and that Kokkos' device over-alignment
+    // covers the element type.
+    template <typename V>
+    inline void permute_1d_inplace(V&                    arr,
+                                   const prtl_perm_t&    perm,
+                                   npart_t               n,
+                                   const array_t<char*>& scratch) {
+      if (n == 0u) {
+        return;
+      }
+      using value_t = typename V::non_const_value_type;
+      using buf_t   = Kokkos::View<value_t*,
+                                 typename V::array_layout,
+                                 typename V::memory_space,
+                                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+      buf_t buf(reinterpret_cast<value_t*>(scratch.data()), n);
+      auto  perm_v = perm;
+      auto  arr_v  = arr;
+      Kokkos::parallel_for(
+        "Permute1D",
+        n,
+        KOKKOS_LAMBDA(const npart_t p) { buf(p) = arr_v(perm_v(p)); });
+      Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n)), buf);
+      Kokkos::fence("permute_1d_inplace: end");
+    }
+
+    // 2D analogue for `pld_r` / `pld_i`.
+    template <typename V>
+    inline void permute_2d_inplace(V&                    arr,
+                                   const prtl_perm_t&    perm,
+                                   npart_t               n,
+                                   npart_t               ncols,
+                                   const array_t<char*>& scratch) {
+      if (n == 0u or ncols == 0u) {
+        return;
+      }
+      using value_t = typename V::non_const_value_type;
+      using buf_t   = Kokkos::View<value_t**,
+                                 typename V::array_layout,
+                                 typename V::memory_space,
+                                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+      buf_t buf(reinterpret_cast<value_t*>(scratch.data()), n, ncols);
+      auto  perm_v = perm;
+      auto  arr_v  = arr;
+      Kokkos::parallel_for(
+        "Permute2D",
+        CreateParticleRangePolicy<Dim::_2D>({ 0u, 0u }, { n, ncols }),
+        KOKKOS_LAMBDA(const npart_t p, const npart_t l) {
+          buf(p, l) = arr_v(perm_v(p), l);
+        });
+      Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n), Kokkos::ALL),
+                        buf);
+      Kokkos::fence("permute_2d_inplace: end");
+    }
+
+  } // namespace permute_helpers
+
+  template <Dimension D, Coord::type C>
+  void Particles<D, C>::apply_permutation_to_soa(const prtl_perm_t& perm) {
+    const auto n = npart();
+    if (n == 0u) {
+      return;
+    }
+
+    // Size the persistent scratch once to the largest gather any member
+    // needs this call: 1D members need n * sizeof(real_t) bytes (the
+    // widest element); the 2D payloads need n * ncols * elem bytes.
+    // Grown monotonically, never shrunk — so after warmup this incurs
+    // no allocation at all.
+    std::size_t need = static_cast<std::size_t>(n) * sizeof(real_t);
+    if (npld_r() > 0) {
+      need = std::max(need,
+                      static_cast<std::size_t>(n) *
+                        static_cast<std::size_t>(npld_r()) * sizeof(real_t));
+    }
+    if (npld_i() > 0) {
+      need = std::max(need,
+                      static_cast<std::size_t>(n) *
+                        static_cast<std::size_t>(npld_i()) * sizeof(npart_t));
+    }
+    if (m_perm_scratch.extent(0) < need) {
+      m_perm_scratch = array_t<char*> { "perm_scratch", need };
+    }
+    const auto& scratch = m_perm_scratch;
+
+    using permute_helpers::permute_1d_inplace;
+    using permute_helpers::permute_2d_inplace;
+
+    if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
+      permute_1d_inplace(i1, perm, n, scratch);
+      permute_1d_inplace(dx1, perm, n, scratch);
+      permute_1d_inplace(i1_prev, perm, n, scratch);
+      permute_1d_inplace(dx1_prev, perm, n, scratch);
+    }
+    if constexpr (D == Dim::_2D or D == Dim::_3D) {
+      permute_1d_inplace(i2, perm, n, scratch);
+      permute_1d_inplace(dx2, perm, n, scratch);
+      permute_1d_inplace(i2_prev, perm, n, scratch);
+      permute_1d_inplace(dx2_prev, perm, n, scratch);
+    }
+    if constexpr (D == Dim::_3D) {
+      permute_1d_inplace(i3, perm, n, scratch);
+      permute_1d_inplace(dx3, perm, n, scratch);
+      permute_1d_inplace(i3_prev, perm, n, scratch);
+      permute_1d_inplace(dx3_prev, perm, n, scratch);
+    }
+    permute_1d_inplace(ux1, perm, n, scratch);
+    permute_1d_inplace(ux2, perm, n, scratch);
+    permute_1d_inplace(ux3, perm, n, scratch);
+    permute_1d_inplace(weight, perm, n, scratch);
+    permute_1d_inplace(tag, perm, n, scratch);
+    if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
+      permute_1d_inplace(phi, perm, n, scratch);
+    }
+    if (npld_r() > 0) {
+      permute_2d_inplace(pld_r, perm, n, static_cast<npart_t>(npld_r()), scratch);
+    }
+    if (npld_i() > 0) {
+      permute_2d_inplace(pld_i, perm, n, static_cast<npart_t>(npld_i()), scratch);
+    }
   }
+#endif // TEAM_POLICY_USE_VENDOR_SORT
+
+#if defined(TEAM_POLICY_USE_VENDOR_SORT)
+  #define APPLY_PERM_INSTANTIATE(D, C)                                         \
+    template void Particles<D, C>::apply_permutation_to_soa(                   \
+      const prtl_perm_t&);
+#else
+  #define APPLY_PERM_INSTANTIATE(D, C)
+#endif
 
 #define PARTICLES_SORT(D, C)                                                   \
   template auto Particles<D, C>::NpartsPerTagAndOffsets() const                \
     -> std::pair<std::vector<npart_t>, array_t<npart_t*>>;                     \
   template void Particles<D, C>::RemoveDead();                                 \
-  template void Particles<D, C>::SortSpatially(const Grid<D>&);
+  template void Particles<D, C>::SortSpatially(const Grid<D>&);                \
+  APPLY_PERM_INSTANTIATE(D, C)
 
   PARTICLES_SORT(Dim::_1D, Coord::Cartesian)
   PARTICLES_SORT(Dim::_2D, Coord::Cartesian)
@@ -266,5 +604,6 @@ namespace ntt {
   PARTICLES_SORT(Dim::_3D, Coord::Spherical)
   PARTICLES_SORT(Dim::_3D, Coord::Qspherical)
 #undef PARTICLES_SORT
+#undef APPLY_PERM_INSTANTIATE
 
 } // namespace ntt
diff --git a/src/global/arch/kokkos_aliases.h b/src/global/arch/kokkos_aliases.h
index 314e78c0e..43f21ce47 100644
--- a/src/global/arch/kokkos_aliases.h
+++ b/src/global/arch/kokkos_aliases.h
@@ -10,6 +10,7 @@
  *   - CreateRangePolicy, CreateRangePolicyOnHost
  *   - random_number_pool_t, random_generator_t
  *   - Random function
+ *   - prtl_perm_t, TileLayout<>
  * @cpp:
  *   - arch/kokkos_aliases.cpp
  * @namespaces:
@@ -258,6 +259,32 @@ template <Dimension D>
 auto CreateRangePolicyOnHost(const tuple_t<ncells_t, D>&,
                              const tuple_t<ncells_t, D>&) -> range_h_t<D>;
 
+// --------------------------- team_policy types ---------------------------- //
+// Particle permutation index: maps a sorted-position p in [0, npart) to a
+// pre-sort particle index. Produced by SortSpatially, consumed by tiled
+// pusher and deposit kernels to walk particles tile-by-tile without
+// physically re-permuting the SoA arrays in lock step every step.
+using prtl_perm_t = array_t<npart_t*>;
+
+// Tile layout metadata: the contract between Stream 1 (sort) and Streams
+// 2/3 (tiled deposit / pusher). All members are device-resident.
+//   ntiles_per_axis : number of tiles along each axis (1 for unused axes).
+//   ntiles_total    : product of ntiles_per_axis = league size for TeamPolicy.
+//   tile_size       : tile edge length in cells (compile-time CMake knob,
+//                     replicated here for runtime checks).
+//   tile_offsets    : prefix-sum of per-tile particle counts; size
+//                     ntiles_total + 1; tile t owns particles
+//                     [tile_offsets(t), tile_offsets(t+1)).
+//   tile_perm       : size npart, particle index sorted by tile.
+template <Dimension D>
+struct TileLayout {
+  ncells_t          ntiles_per_axis[3] { 1u, 1u, 1u };
+  ncells_t          ntiles_total { 0u };
+  unsigned short    tile_size { 0u };
+  array_t<npart_t*> tile_offsets;
+  prtl_perm_t       tile_perm;
+};
+
 // Random number pool/generator type alias
 // (using math:: instead of Kokkos:: to suppress compiler warning on unused namespace alias)
 using random_number_pool_t = math::Random_XorShift64_Pool<Kokkos::DefaultExecutionSpace>;
diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h
new file mode 100644
index 000000000..506a90742
--- /dev/null
+++ b/src/global/utils/sort_dispatch.h
@@ -0,0 +1,171 @@
+/**
+ * @file utils/sort_dispatch.h
+ * @brief Backend-dispatched sort_by_key for team_policy SortSpatially.
+ * @implements
+ *   - sort_helpers::sort_by_key_dispatch -> void  (BinSort, OneDPL, Thrust, StdSort)
+ * @namespaces:
+ *   - ntt::sort_helpers::
+ * @macros:
+ *   - TEAM_POLICY
+ *   - SYCL_ENABLED, ONEDPL_ENABLED  (oneDPL overload)
+ *   - CUDA_ENABLED, THRUST_ENABLED  (Thrust overload)
+ *
+ * @note Each overload produces a permutation `perm` of size N such that
+ *       keys[perm[0]] <= keys[perm[1]] <= ... in stable order.
+ *       Always-available overloads: BinSort (uses Kokkos::BinSort) and
+ *       StdSort (host-side std::stable_sort fallback). The vendor-library
+ *       overloads (OneDPL on SYCL, Thrust on CUDA) are conditional on the
+ *       respective build flags.
+ */
+
+#ifndef GLOBAL_UTILS_SORT_DISPATCH_H
+#define GLOBAL_UTILS_SORT_DISPATCH_H
+
+#if !defined(TEAM_POLICY)
+  #error "sort_dispatch.h is only meaningful when TEAM_POLICY is defined"
+#endif
+
+#include "global.h"
+
+#include "arch/kokkos_aliases.h"
+#include "utils/sorting.h"
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Sort.hpp>
+
+#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+  #include <oneapi/dpl/algorithm>
+  #include <oneapi/dpl/execution>
+#endif
+#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
+  #include <thrust/device_ptr.h>
+  #include <thrust/sequence.h>
+  #include <thrust/sort.h>
+#endif
+#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+  #include <thrust/device_ptr.h>
+  #include <thrust/execution_policy.h>
+  #include <thrust/sequence.h>
+  #include <thrust/sort.h>
+#endif
+
+#include <algorithm>
+#include <numeric>
+
+namespace ntt::sort_helpers {
+
+  // Always-available legacy fallback: Kokkos::BinSort. n_bins must be an
+  // upper bound on distinct key values.
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t                  n_bins,
+                                   ::sort::backend::BinSort) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    using sorter_op_t = Kokkos::BinOp1D<array_t<ncells_t*>>;
+    using sorter_t    = Kokkos::BinSort<array_t<ncells_t*>, sorter_op_t>;
+    auto bin_op       = sorter_op_t { static_cast<int>(n_bins), 0u, n_bins };
+    auto sorter       = sorter_t { keys, bin_op, false };
+    sorter.create_permute_vector();
+    auto perm_v = perm;
+    Kokkos::parallel_for(
+      "PermInitIota",
+      n,
+      KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
+    Kokkos::fence("sort_by_key_dispatch BinSort: pre-sort");
+    sorter.sort(perm);
+    Kokkos::fence("sort_by_key_dispatch BinSort: post-sort");
+  }
+
+#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::OneDPL) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    auto*  keys_ptr = keys.data();
+    auto*  perm_ptr = perm.data();
+    auto   exec     = Kokkos::DefaultExecutionSpace();
+    auto   perm_v   = perm;
+    Kokkos::parallel_for(
+      "PermInitIota",
+      n,
+      KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; });
+    // Drain Kokkos's queue so oneDPL's policy sees the iota'd perm even
+    // if oneDPL submits to a different SYCL queue internally.
+    exec.fence("sort_by_key_dispatch OneDPL: pre-sort");
+    auto queue  = exec.sycl_queue();
+    auto policy = oneapi::dpl::execution::make_device_policy(queue);
+    oneapi::dpl::sort_by_key(policy, keys_ptr, keys_ptr + n, perm_ptr);
+    exec.fence("sort_by_key_dispatch OneDPL: post-sort");
+  }
+#endif
+
+#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::Thrust) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    Kokkos::fence("sort_by_key_dispatch Thrust: pre-sort");
+    thrust::device_ptr<ncells_t> kp(keys.data());
+    thrust::device_ptr<npart_t>  pp(perm.data());
+    thrust::sequence(pp, pp + n);
+    thrust::sort_by_key(kp, kp + n, pp);
+    Kokkos::fence("sort_by_key_dispatch Thrust: post-sort");
+  }
+#endif
+
+#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)
+  // rocThrust exposes the same thrust:: API as CUDA Thrust; with hipcc
+  // device_ptr-based algorithms dispatch to the HIP backend. Mirrors
+  // the CUDA Thrust overload.
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::Rocthrust) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    Kokkos::fence("sort_by_key_dispatch Rocthrust: pre-sort");
+    thrust::device_ptr<ncells_t> kp(keys.data());
+    thrust::device_ptr<npart_t>  pp(perm.data());
+    thrust::sequence(pp, pp + n);
+    thrust::sort_by_key(kp, kp + n, pp);
+    Kokkos::fence("sort_by_key_dispatch Rocthrust: post-sort");
+  }
+#endif
+
+  // Host fallback: indirect-sort via std::stable_sort.
+  inline void sort_by_key_dispatch(const array_t<ncells_t*>& keys,
+                                   prtl_perm_t&              perm,
+                                   ncells_t /*n_bins*/,
+                                   ::sort::backend::StdSort) {
+    const auto n = static_cast<npart_t>(keys.extent(0));
+    if (n == 0u) {
+      return;
+    }
+    auto keys_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
+                                                       keys);
+    auto perm_h = Kokkos::create_mirror_view(perm);
+    std::iota(perm_h.data(), perm_h.data() + n, npart_t { 0u });
+    std::stable_sort(perm_h.data(),
+                     perm_h.data() + n,
+                     [&](npart_t a, npart_t b) {
+                       return keys_h(a) < keys_h(b);
+                     });
+    Kokkos::deep_copy(perm, perm_h);
+  }
+
+} // namespace ntt::sort_helpers
+
+#endif // GLOBAL_UTILS_SORT_DISPATCH_H
diff --git a/src/global/utils/sorting.h b/src/global/utils/sorting.h
index dbe774402..8442f5ddb 100644
--- a/src/global/utils/sorting.h
+++ b/src/global/utils/sorting.h
@@ -5,6 +5,7 @@
  *   - sort::BinBool<>
  *   - sort::BinTag<>
  *   - sort::PositionToTileIndex<>
+ *   - sort::backend tag types (compile-time tag dispatch for sort_by_key)
  * @namespaces:
  *   - sort::
  * @note BinBool sorts by boolean values "true" then "false"
@@ -62,9 +63,27 @@ namespace sort {
     const int m_max_bins;
   };
 
-  template <Dimension D, bool Count>
+  /**
+   * @brief Bin a particle into a tile of edge length `tile_size` cells.
+   * @tparam D     Dimension.
+   * @tparam Count If true, atomic-increment `num_ppt[tile]` for each live
+   *               particle (used to populate per-tile counts in one pass).
+   * @tparam UsePrev If true, the bin key uses `min(i_curr, i_prev)` instead
+   *                 of `i_curr` alone. This guarantees that a particle whose
+   *                 Esirkepov stencil straddles a tile boundary (because it
+   *                 crossed the boundary during the pusher) lands in the
+   *                 lower-indexed of the two tiles. Combined with a halo of
+   *                 `O+1` cells in the deposit's per-tile scratch, this
+   *                 keeps every particle's stencil inside its assigned
+   *                 tile's interior+halo region. See plan §S2.4.
+   *
+   * Dead particles get the sentinel `total_tiles + 1u` so they sort to the
+   * end (or get skipped, depending on the consumer).
+   */
+  template <Dimension D, bool Count, bool UsePrev = false>
   struct PositionToTileIndex {
     const array_t<int*>   i1, i2, i3;
+    const array_t<int*>   i1_prev, i2_prev, i3_prev;
     const array_t<short*> tag;
     array_t<ncells_t*>    tile_indices;
     ncells_t              tile_size;
@@ -72,20 +91,37 @@ namespace sort {
 
     ncells_t ntx2 { 0u }, ntx3 { 0u };
     ncells_t total_tiles { 0u };
-
-    PositionToTileIndex(const array_t<int*>&         i1,
-                        const array_t<int*>&         i2,
-                        const array_t<int*>&         i3,
-                        const array_t<short*>&       tag,
-                        array_t<ncells_t*>&          tile_indices,
+    // Active-cell extents per axis. Used to clamp the bin key when
+    // UsePrev=true, since `i_prev` can be transiently negative after the
+    // pusher's periodic wrap (`i_prev -= ni`) or out-of-range after an
+    // MPI receive that hasn't translated frames. Without clamping, the
+    // signed-to-unsigned promotion in `int(-1) / uint32_t(T)` produces
+    // ~1.07e9, the linearised `tile_indices(p)` overflows past `n_bins`,
+    // and BinSort's internal `atomic_add(&bin_count[wild_idx], 1)`
+    // faults on an unmapped page.
+    int ncells1 { 1 }, ncells2 { 1 }, ncells3 { 1 };
+
+    PositionToTileIndex(const array_t<int*>&         i1_,
+                        const array_t<int*>&         i2_,
+                        const array_t<int*>&         i3_,
+                        const array_t<short*>&       tag_,
+                        array_t<ncells_t*>&          tile_indices_,
                         const std::vector<ncells_t>& ncells,
-                        ncells_t                     tile_size = 1u)
-      : i1 { i1 }
-      , i2 { i2 }
-      , i3 { i3 }
-      , tag { tag }
-      , tile_indices { tile_indices }
-      , tile_size { tile_size }
+                        ncells_t                     tile_size_ = 1u,
+                        const array_t<npart_t*>& num_ppt_ = { "num_ppt", 0u },
+                        const array_t<int*>& i1_prev_ = {},
+                        const array_t<int*>& i2_prev_ = {},
+                        const array_t<int*>& i3_prev_ = {})
+      : i1 { i1_ }
+      , i2 { i2_ }
+      , i3 { i3_ }
+      , i1_prev { i1_prev_ }
+      , i2_prev { i2_prev_ }
+      , i3_prev { i3_prev_ }
+      , tag { tag_ }
+      , tile_indices { tile_indices_ }
+      , tile_size { tile_size_ }
+      , num_ppt { num_ppt_ }
       , ntx2 { 1u }
       , ntx3 { 1u }
       , total_tiles { 1u } {
@@ -93,49 +129,122 @@ namespace sort {
                      "ncells size must match D",
                      HERE);
       if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) {
+        ncells1       = static_cast<int>(ncells[0]);
         npart_t ntx1  = static_cast<ncells_t>(math::ceil(
           static_cast<double>(ncells[0]) / static_cast<double>(tile_size)));
         total_tiles  *= ntx1;
       }
       if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) {
+        ncells2      = static_cast<int>(ncells[1]);
         ntx2         = static_cast<ncells_t>(math::ceil(
           static_cast<double>(ncells[1]) / static_cast<double>(tile_size)));
         total_tiles *= ntx2;
       }
       if constexpr (D == Dim::_3D) {
+        ncells3      = static_cast<int>(ncells[2]);
         ntx3         = static_cast<ncells_t>(math::ceil(
           static_cast<double>(ncells[2]) / static_cast<double>(tile_size)));
         total_tiles *= ntx3;
       }
       if constexpr (Count) {
-        num_ppt = array_t<npart_t*> { "num_ppt", total_tiles };
+        raise::ErrorIf(num_ppt.extent(0) != total_tiles,
+                       "num_ppt must have extent equal to total tiles",
+                       HERE);
+      }
+      if constexpr (UsePrev) {
+        raise::ErrorIf(
+          i1_prev.extent(0) == 0u,
+          "PositionToTileIndex<UsePrev=true> requires i1_prev to be set",
+          HERE);
+        if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) {
+          raise::ErrorIf(
+            i2_prev.extent(0) == 0u,
+            "PositionToTileIndex<UsePrev=true> requires i2_prev to be set",
+            HERE);
+        }
+        if constexpr (D == Dim::_3D) {
+          raise::ErrorIf(
+            i3_prev.extent(0) == 0u,
+            "PositionToTileIndex<UsePrev=true> requires i3_prev to be set",
+            HERE);
+        }
       }
     }
 
-    Inline auto operator()(prtldx_t p) const {
+    Inline auto operator()(prtlidx_t p) const {
       if (tag(p) != ntt::ParticleTag::alive) {
         tile_indices(p) = total_tiles + 1u;
       } else {
+        // bin key per-axis: use min(i, i_prev) when UsePrev so that a
+        // particle straddling a boundary lands in the lower tile.
+        // Then clamp to [0, ncells_axis - 1] — `i_prev` can be negative
+        // (after the pusher's periodic-wrap path: `i_prev -= ni`) or
+        // out-of-range (after MPI receive without frame translation).
+        // Without the clamp, signed-to-unsigned promotion in
+        // `int(-1) / uint32_t(T)` makes `tile_indices(p)` overflow far
+        // past `n_bins`, and BinSort's `atomic_add(&bin_count[bin],1)`
+        // faults on an unmapped page.
+        const auto clamp_axis = [](int v, int ncells) -> int {
+          return (v < 0) ? 0 : ((v >= ncells) ? (ncells - 1) : v);
+        };
+        const auto key1 = [&]() -> int {
+          if constexpr (UsePrev) {
+            const int raw = (i1(p) < i1_prev(p)) ? i1(p) : i1_prev(p);
+            return clamp_axis(raw, ncells1);
+          } else {
+            return i1(p);
+          }
+        }();
+        const auto key2 = [&]() -> int {
+          if constexpr (UsePrev) {
+            const int raw = (i2(p) < i2_prev(p)) ? i2(p) : i2_prev(p);
+            return clamp_axis(raw, ncells2);
+          } else {
+            return i2(p);
+          }
+        }();
+        const auto key3 = [&]() -> int {
+          if constexpr (UsePrev) {
+            const int raw = (i3(p) < i3_prev(p)) ? i3(p) : i3_prev(p);
+            return clamp_axis(raw, ncells3);
+          } else {
+            return i3(p);
+          }
+        }();
         if constexpr (D == Dim::_1D) {
-          tile_indices(p) = static_cast<ncells_t>(i1(p) / tile_size);
+          tile_indices(p) = static_cast<ncells_t>(key1 / tile_size);
         } else if constexpr (D == Dim::_2D) {
-          tile_indices(p) = static_cast<ncells_t>(i1(p) / tile_size) * ntx2 +
-                            static_cast<ncells_t>(i2(p) / tile_size);
+          tile_indices(p) = static_cast<ncells_t>(key1 / tile_size) * ntx2 +
+                            static_cast<ncells_t>(key2 / tile_size);
         } else if constexpr (D == Dim::_3D) {
-          tile_indices(p) = (static_cast<ncells_t>(i1(p) / tile_size) * ntx2 +
-                             static_cast<ncells_t>(i2(p) / tile_size)) *
+          tile_indices(p) = (static_cast<ncells_t>(key1 / tile_size) * ntx2 +
+                             static_cast<ncells_t>(key2 / tile_size)) *
                               ntx3 +
-                            static_cast<ncells_t>(i3(p) / tile_size);
+                            static_cast<ncells_t>(key3 / tile_size);
         } else {
           raise::KernelError(HERE, "Wrong D in SortSpatially");
         }
         if constexpr (Count) {
-          Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1u);
+          Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1);
         }
       }
     }
   };
 
+  // -------------------- Backend dispatch for sort_by_key ------------------- //
+  // Compile-time tags for tag-dispatch into backend-specific
+  // sort_by_key implementations. Selection is fully compile-time: the
+  // backend that resolves depends on the active Kokkos device and the
+  // availability of the corresponding vendor library.
+  namespace backend {
+    struct OneDPL {};
+    struct Thrust {};
+    struct Rocthrust {};
+    struct StdSort {};
+    // Always-available legacy fallback using Kokkos::BinSort.
+    struct BinSort {};
+  } // namespace backend
+
 } // namespace sort
 
 #endif // GLOBAL_UTILS_SORTING_H
diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp
index 268187af2..5eb7ff2b6 100644
--- a/src/kernels/currents_deposit.hpp
+++ b/src/kernels/currents_deposit.hpp
@@ -1,10 +1,24 @@
 /**
  * @file kernels/currents_deposit.hpp
- * @brief Covariant algorithms for the current deposition
+ * @brief Covariant algorithms for the current deposition.
+ *
+ * Two kernels share the same per-particle body
+ * (`kernel::deposit::deposit_one_particle`):
+ *   - `kernel::DepositCurrents_kernel<S, M, O>` flat (RangePolicy over particles,
+ *     writes into a `Kokkos::Experimental::ScatterView`). Always available.
+ *   - `kernel::DepositCurrents_kernel_tiled<S, M, O, T_TILE>` team-policy
+ *     (one team per spatial tile, accumulates into team SLM scratch with
+ *     atomic adds, then flushes to global J). Available when `team_policy=ON`
+ *     (`#if defined(TEAM_POLICY)`). Stream 2 of the Pattern A plan.
+ *
  * @implements
+ *   - kernel::deposit::PrtlPack<>
+ *   - kernel::deposit::deposit_one_particle<>
  *   - kernel::DepositCurrents_kernel<>
+ *   - kernel::DepositCurrents_kernel_tiled<>   (TEAM_POLICY only)
  * @namespaces:
  *   - kernel::
+ *   - kernel::deposit::
  */
 
 #ifndef KERNELS_CURRENTS_DEPOSIT_HPP
@@ -18,100 +32,99 @@
 #include "utils/error.h"
 #include "utils/numeric.h"
 
-#include "particle_shapes.hpp"
+#include "kernels/particle_shapes.hpp"
 
 #include <Kokkos_Core.hpp>
+#include <Kokkos_ScatterView.hpp>
 
 #define i_di_to_Xi(I, DI) (static_cast<real_t>((I)) + static_cast<real_t>((DI)))
 
 namespace kernel {
   using namespace ntt;
 
-  /**
-   * @brief Algorithm for the current deposition
-   */
-  template <SimEngine::type S, MetricClass M, unsigned short O = 1u>
-  class DepositCurrents_kernel {
-    static_assert(O <= 11u, "Shape function order O must be <= 11");
-    static constexpr auto D = M::Dim;
-
-    scatter_ndfield_t<D, 3>  J;
-    const array_t<int*>      i1, i2, i3;
-    const array_t<int*>      i1_prev, i2_prev, i3_prev;
-    const array_t<prtldx_t*> dx1, dx2, dx3;
-    const array_t<prtldx_t*> dx1_prev, dx2_prev, dx3_prev;
-    const array_t<real_t*>   ux1, ux2, ux3;
-    const array_t<real_t*>   phi;
-    const array_t<real_t*>   weight;
-    const array_t<short*>    tag;
-    const M                  metric;
-    const real_t             charge, inv_dt;
+  namespace deposit {
 
-  public:
     /**
-     * @brief explicit constructor.
+     * @brief Per-particle reference pack consumed by both the flat and tiled
+     *        deposit kernels. The same set of SoA references is captured by
+     *        each kernel; bundling them here keeps the helper's argument
+     *        list manageable and ensures every consumer reads the same
+     *        view aliases.
      */
-    DepositCurrents_kernel(const scatter_ndfield_t<D, 3>& scatter_cur,
-                           const array_t<int*>&           i1,
-                           const array_t<int*>&           i2,
-                           const array_t<int*>&           i3,
-                           const array_t<int*>&           i1_prev,
-                           const array_t<int*>&           i2_prev,
-                           const array_t<int*>&           i3_prev,
-                           const array_t<prtldx_t*>&      dx1,
-                           const array_t<prtldx_t*>&      dx2,
-                           const array_t<prtldx_t*>&      dx3,
-                           const array_t<prtldx_t*>&      dx1_prev,
-                           const array_t<prtldx_t*>&      dx2_prev,
-                           const array_t<prtldx_t*>&      dx3_prev,
-                           const array_t<real_t*>&        ux1,
-                           const array_t<real_t*>&        ux2,
-                           const array_t<real_t*>&        ux3,
-                           const array_t<real_t*>&        phi,
-                           const array_t<real_t*>&        weight,
-                           const array_t<short*>&         tag,
-                           const M&                       metric,
-                           real_t                         charge,
-                           const real_t                   dt)
-      : J { scatter_cur }
-      , i1 { i1 }
-      , i2 { i2 }
-      , i3 { i3 }
-      , i1_prev { i1_prev }
-      , i2_prev { i2_prev }
-      , i3_prev { i3_prev }
-      , dx1 { dx1 }
-      , dx2 { dx2 }
-      , dx3 { dx3 }
-      , dx1_prev { dx1_prev }
-      , dx2_prev { dx2_prev }
-      , dx3_prev { dx3_prev }
-      , ux1 { ux1 }
-      , ux2 { ux2 }
-      , ux3 { ux3 }
-      , phi { phi }
-      , weight { weight }
-      , tag { tag }
-      , metric { metric }
-      , charge { charge }
-      , inv_dt { ONE / dt } {
-      raise::ErrorIf(
-        (O == 2u and N_GHOSTS < 2),
-        "Order of interpolation is 2, but number of ghost cells is < 2",
-        HERE);
-    }
+    template <Dimension D>
+    struct PrtlPack {
+      array_t<int*>      i1, i2, i3;
+      array_t<int*>      i1_prev, i2_prev, i3_prev;
+      array_t<prtldx_t*> dx1, dx2, dx3;
+      array_t<prtldx_t*> dx1_prev, dx2_prev, dx3_prev;
+      array_t<real_t*>   ux1, ux2, ux3;
+      array_t<real_t*>   phi;
+      array_t<real_t*>   weight;
+      array_t<short*>    tag;
+    };
 
     /**
-     * @brief Iteration of the loop over particles.
-     * @param p index.
+     * @brief Per-particle deposit body, shared between the flat and tiled
+     *        kernels.
+     *
+     * The caller supplies a `deposit_at(idx..., comp, val)` callback that
+     * applies the contribution `val` to the J component `comp` at the
+     * **global** J cell index `idx...` (already includes the `N_GHOSTS`
+     * offset). The flat kernel's callback simply does
+     * `J_acc(idx..., comp) += val` on its scatter-view accessor; the tiled
+     * kernel's callback translates `idx...` into per-tile scratch
+     * coordinates and uses `Kokkos::atomic_add` on SLM. Either way, this
+     * function is identical numerically and contains the only deposit math
+     * in the codebase.
+     *
+     * Dead particles return early. The callback is invoked once per cell
+     * write, with the dimension-appropriate signature:
+     *   - 1D: `deposit_at(int g_i1, int comp, real_t val)`
+     *   - 2D: `deposit_at(int g_i1, int g_i2, int comp, real_t val)`
+     *   - 3D: `deposit_at(int g_i1, int g_i2, int g_i3, int comp, real_t val)`
      */
-    Inline auto operator()(prtlidx_t p) const -> void {
+    template <SimEngine::type S, MetricClass M, unsigned short O,
+              typename DepositFn>
+    Inline void deposit_one_particle(prtlidx_t       p,
+                                     const PrtlPack<M::Dim>& prtls,
+                                     const M&        metric,
+                                     real_t          charge,
+                                     real_t          inv_dt,
+                                     DepositFn       deposit_at) {
+      static_assert(O <= 11u, "Shape function order O must be <= 11");
+      constexpr auto D = M::Dim;
+
+      const auto& i1       = prtls.i1;
+      const auto& i2       = prtls.i2;
+      const auto& i3       = prtls.i3;
+      const auto& i1_prev  = prtls.i1_prev;
+      const auto& i2_prev  = prtls.i2_prev;
+      const auto& i3_prev  = prtls.i3_prev;
+      const auto& dx1      = prtls.dx1;
+      const auto& dx2      = prtls.dx2;
+      const auto& dx3      = prtls.dx3;
+      const auto& dx1_prev = prtls.dx1_prev;
+      const auto& dx2_prev = prtls.dx2_prev;
+      const auto& dx3_prev = prtls.dx3_prev;
+      const auto& ux1      = prtls.ux1;
+      const auto& ux2      = prtls.ux2;
+      const auto& ux3      = prtls.ux3;
+      const auto& phi      = prtls.phi;
+      const auto& weight   = prtls.weight;
+      const auto& tag      = prtls.tag;
+
       if (tag(p) == ParticleTag::dead) {
         return;
       }
+
       // recover particle velocity to deposit in unsimulated direction
-      vec_t<Dim::_3D> vp { ZERO };
-      {
+      [[maybe_unused]] vec_t<Dim::_3D> vp { ZERO };
+      // `vp` only feeds the unsimulated-direction current in the 1D
+      // (jx2, jx3) and 2D (jx3) branches. In 3D every J component comes
+      // from the Esirkepov/zigzag charge motion and `vp` is never read,
+      // so the metric transform + 1/sqrt + NaN/Inf guard below is pure
+      // dead work there — skip it (also frees xp/inv_energy registers).
+      if constexpr (D != Dim::_3D) {
         coord_t<M::PrtlDim> xp { ZERO };
         if constexpr (D == Dim::_1D) {
           xp[0] = i_di_to_Xi(i1(p), dx1(p));
@@ -167,7 +180,6 @@ namespace kernel {
 
       const real_t coeff { weight(p) * charge };
 
-      // ToDo: interpolation_order as parameter
       if constexpr (O == 0u) {
         /*
           Zig-zag deposit
@@ -191,8 +203,6 @@ namespace kernel {
                               dx1(p) - dxp_r_1) *
                              coeff * inv_dt };
 
-        auto J_acc = J.access();
-
         if constexpr (D == Dim::_1D) {
           const real_t Fx2_1 { HALF * vp[1] * coeff };
           const real_t Fx2_2 { HALF * vp[1] * coeff };
@@ -200,18 +210,18 @@ namespace kernel {
           const real_t Fx3_1 { HALF * vp[2] * coeff };
           const real_t Fx3_2 { HALF * vp[2] * coeff };
 
-          J_acc(i1_prev(p) + N_GHOSTS, cur::jx1) += Fx1_1;
-          J_acc(i1(p) + N_GHOSTS, cur::jx1)      += Fx1_2;
+          deposit_at(i1_prev(p) + N_GHOSTS, cur::jx1, Fx1_1);
+          deposit_at(i1(p) + N_GHOSTS, cur::jx1, Fx1_2);
 
-          J_acc(i1_prev(p) + N_GHOSTS, cur::jx2)     += Fx2_1 * (ONE - Wx1_1);
-          J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx2) += Fx2_1 * Wx1_1;
-          J_acc(i1(p) + N_GHOSTS, cur::jx2)          += Fx2_2 * (ONE - Wx1_2);
-          J_acc(i1(p) + N_GHOSTS + 1, cur::jx2)      += Fx2_2 * Wx1_2;
+          deposit_at(i1_prev(p) + N_GHOSTS, cur::jx2, Fx2_1 * (ONE - Wx1_1));
+          deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx2, Fx2_1 * Wx1_1);
+          deposit_at(i1(p) + N_GHOSTS, cur::jx2, Fx2_2 * (ONE - Wx1_2));
+          deposit_at(i1(p) + N_GHOSTS + 1, cur::jx2, Fx2_2 * Wx1_2);
 
-          J_acc(i1_prev(p) + N_GHOSTS, cur::jx3)     += Fx3_1 * (ONE - Wx1_1);
-          J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx3) += Fx3_1 * Wx1_1;
-          J_acc(i1(p) + N_GHOSTS, cur::jx3)          += Fx3_2 * (ONE - Wx1_2);
-          J_acc(i1(p) + N_GHOSTS + 1, cur::jx3)      += Fx3_2 * Wx1_2;
+          deposit_at(i1_prev(p) + N_GHOSTS, cur::jx3, Fx3_1 * (ONE - Wx1_1));
+          deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx3, Fx3_1 * Wx1_1);
+          deposit_at(i1(p) + N_GHOSTS, cur::jx3, Fx3_2 * (ONE - Wx1_2));
+          deposit_at(i1(p) + N_GHOSTS + 1, cur::jx3, Fx3_2 * Wx1_2);
         } else if constexpr (D == Dim::_2D || D == Dim::_3D) {
           const auto dxp_r_2 { static_cast<prtldx_t>(i2(p) == i2_prev(p)) *
                                (dx2(p) + dx2_prev(p)) *
@@ -236,51 +246,73 @@ namespace kernel {
             const real_t Fx3_1 { HALF * vp[2] * coeff };
             const real_t Fx3_2 { HALF * vp[2] * coeff };
 
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_1 * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_1 * Wx2_1;
-            J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx1) += Fx1_2 *
-                                                                   (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS + 1, cur::jx1) += Fx1_2 * Wx2_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * (ONE - Wx1_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * Wx1_1;
-            J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 *
-                                                                   (ONE - Wx1_2);
-            J_acc(i1(p) + N_GHOSTS + 1, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 * Wx1_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1;
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2;
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS + 1,
-                  cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2;
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_1 * (ONE - Wx2_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_1 * Wx2_1);
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_2 * (ONE - Wx2_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_2 * Wx2_2);
+
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_1 * (ONE - Wx1_1));
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_1 * Wx1_1);
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_2 * (ONE - Wx1_2));
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_2 * Wx1_2);
+
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1));
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * Wx1_1 * (ONE - Wx2_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       cur::jx3,
+                       Fx3_1 * (ONE - Wx1_1) * Wx2_1);
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       cur::jx3,
+                       Fx3_1 * Wx1_1 * Wx2_1);
+
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2));
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * Wx1_2 * (ONE - Wx2_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS + 1,
+                       cur::jx3,
+                       Fx3_2 * (ONE - Wx1_2) * Wx2_2);
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS + 1,
+                       cur::jx3,
+                       Fx3_2 * Wx1_2 * Wx2_2);
           } else {
             const auto   dxp_r_3 { static_cast<prtldx_t>(i3(p) == i3_prev(p)) *
                                  (dx3(p) + dx3_prev(p)) *
@@ -300,107 +332,131 @@ namespace kernel {
                                   dx3(p) - dxp_r_3) *
                                  coeff * inv_dt };
 
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_1 * Wx2_1 * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_1 * (ONE - Wx2_1) * Wx3_1;
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_1 * Wx2_1 * Wx3_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS,
-                  cur::jx1) += Fx1_2 * Wx2_2 * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_2 * (ONE - Wx2_2) * Wx3_2;
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx1) += Fx1_2 * Wx2_2 * Wx3_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_1 * Wx1_1 * (ONE - Wx3_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_1 * (ONE - Wx1_1) * Wx3_1;
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_1 * Wx1_1 * Wx3_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx2) += Fx2_2 * Wx1_2 * (ONE - Wx3_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_2 * (ONE - Wx1_2) * Wx3_2;
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS + 1,
-                  cur::jx2) += Fx2_2 * Wx1_2 * Wx3_2;
-
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1);
-            J_acc(i1_prev(p) + N_GHOSTS,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1;
-            J_acc(i1_prev(p) + N_GHOSTS + 1,
-                  i2_prev(p) + N_GHOSTS + 1,
-                  i3_prev(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1;
-
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2);
-            J_acc(i1(p) + N_GHOSTS,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2;
-            J_acc(i1(p) + N_GHOSTS + 1,
-                  i2(p) + N_GHOSTS + 1,
-                  i3(p) + N_GHOSTS,
-                  cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2;
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_1 * Wx2_1 * (ONE - Wx3_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_1 * (ONE - Wx2_1) * Wx3_1);
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       i3_prev(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_1 * Wx2_1 * Wx3_1);
+
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS + 1,
+                       i3(p) + N_GHOSTS,
+                       cur::jx1,
+                       Fx1_2 * Wx2_2 * (ONE - Wx3_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_2 * (ONE - Wx2_2) * Wx3_2);
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS + 1,
+                       i3(p) + N_GHOSTS + 1,
+                       cur::jx1,
+                       Fx1_2 * Wx2_2 * Wx3_2);
+
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1));
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_1 * Wx1_1 * (ONE - Wx3_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS + 1,
+                       cur::jx2,
+                       Fx2_1 * (ONE - Wx1_1) * Wx3_1);
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS + 1,
+                       cur::jx2,
+                       Fx2_1 * Wx1_1 * Wx3_1);
+
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2));
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS,
+                       cur::jx2,
+                       Fx2_2 * Wx1_2 * (ONE - Wx3_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS + 1,
+                       cur::jx2,
+                       Fx2_2 * (ONE - Wx1_2) * Wx3_2);
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS + 1,
+                       cur::jx2,
+                       Fx2_2 * Wx1_2 * Wx3_2);
+
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1));
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * Wx1_1 * (ONE - Wx2_1));
+            deposit_at(i1_prev(p) + N_GHOSTS,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * (ONE - Wx1_1) * Wx2_1);
+            deposit_at(i1_prev(p) + N_GHOSTS + 1,
+                       i2_prev(p) + N_GHOSTS + 1,
+                       i3_prev(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_1 * Wx1_1 * Wx2_1);
+
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2));
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS,
+                       i3(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * Wx1_2 * (ONE - Wx2_2));
+            deposit_at(i1(p) + N_GHOSTS,
+                       i2(p) + N_GHOSTS + 1,
+                       i3(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * (ONE - Wx1_2) * Wx2_2);
+            deposit_at(i1(p) + N_GHOSTS + 1,
+                       i2(p) + N_GHOSTS + 1,
+                       i3(p) + N_GHOSTS,
+                       cur::jx3,
+                       Fx3_2 * Wx1_2 * Wx2_2);
           }
         }
       } else if constexpr ((O >= 1u) and (O <= 11u)) {
@@ -421,33 +477,14 @@ namespace kernel {
                                    fS_x1);
 
         if constexpr (D == Dim::_1D) {
-          // define weight vectors
-          real_t Wx1[O + 2];
-          real_t Wx23[O + 2];
-
-          // Calculate weight function
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-            // Esirkepov 2001, Eq. 38 for 1D case
-            Wx1[i]  = fS_x1[i] - iS_x1[i];
-            Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]);
-          }
-
-          // contribution within the shape function stencil
-          real_t jx1[O + 2];
-
-          // prefactors for j update
+          // (1D): fused Esirkepov, no [O+2] temporaries.
+          //   jx1[i] = -Qdx1dt * sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i'])
+          //          = -Qdx1dt * P1[i]                  (Eq. 38, 1D)
+          //   Wx23[i] = HALF * (fS_x1[i] + iS_x1[i])     (computed inline)
           const real_t Qdx1dt = coeff * inv_dt;
           const real_t QVx2   = coeff * vp[1];
           const real_t QVx3   = coeff * vp[2];
 
-          // Calculate current contribution
-          jx1[0] = -Qdx1dt * Wx1[0];
-#pragma unroll
-          for (int i = 1; i < O + 2; ++i) {
-            jx1[i] = jx1[i - 1] - Qdx1dt * Wx1[i];
-          }
-
           // account for ghost cells
           i1_min += N_GHOSTS;
           i1_max += N_GHOSTS;
@@ -455,21 +492,18 @@ namespace kernel {
           // get number of update indices for asymmetric movement
           const int di_x1 = i1_max - i1_min;
 
-          /*
-              Current update
-          */
-          auto J_acc = J.access();
-
-          for (int i = 0; i < di_x1; ++i) {
-            J_acc(i1_min + i, cur::jx1) += jx1[i];
-          }
-
+          // Current update — fused over the union line so the J cell
+          // stays L1-resident across the 3 component atomic_adds.
+          real_t P1 = ZERO;
           for (int i = 0; i <= di_x1; ++i) {
-            J_acc(i1_min + i, cur::jx2) += QVx2 * Wx23[i];
-          }
-
-          for (int i = 0; i <= di_x1; ++i) {
-            J_acc(i1_min + i, cur::jx3) += QVx3 * Wx23[i];
+            P1 += fS_x1[i] - iS_x1[i];
+            const int    gi   = i1_min + i;
+            const real_t Wx23 = HALF * (fS_x1[i] + iS_x1[i]);
+            if (i < di_x1) {
+              deposit_at(gi, cur::jx1, -Qdx1dt * P1);
+            }
+            deposit_at(gi, cur::jx2, QVx2 * Wx23);
+            deposit_at(gi, cur::jx3, QVx3 * Wx23);
           }
 
         } else if constexpr (D == Dim::_2D) {
@@ -489,63 +523,23 @@ namespace kernel {
                                      iS_x2,
                                      fS_x2);
 
-          // define weight tensors
-          real_t Wx1[O + 2][O + 2];
-          real_t Wx2[O + 2][O + 2];
-          real_t Wx3[O + 2][O + 2];
-
-// Calculate weight function
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-              // Esirkepov 2001, Eq. 38 (simplified)
-              Wx1[i][j] = HALF * (fS_x1[i] - iS_x1[i]) * (fS_x2[j] + iS_x2[j]);
-
-              Wx2[i][j] = HALF * (fS_x1[i] + iS_x1[i]) * (fS_x2[j] - iS_x2[j]);
-
-              Wx3[i][j] = THIRD * (fS_x2[j] * (HALF * iS_x1[i] + fS_x1[i]) +
-                                   iS_x2[j] * (HALF * fS_x1[i] + iS_x1[i]));
-            }
-          }
-
-          // contribution within the shape function stencil
-          real_t jx1[O + 2][O + 2], jx2[O + 2][O + 2];
-
-          // prefactors for j update
-          const real_t Qdx1dt = coeff * inv_dt;
-          const real_t Qdx2dt = coeff * inv_dt;
-          const real_t QVx3   = coeff * vp[2];
-
-          // Calculate current contribution
-
-          // jx1
-#pragma unroll
-          for (int j = 0; j < O + 2; ++j) {
-            jx1[0][j] = -Qdx1dt * Wx1[0][j];
-          }
-
-#pragma unroll
-          for (int i = 1; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-              jx1[i][j] = jx1[i - 1][j] - Qdx1dt * Wx1[i][j];
-            }
-          }
-
-          // jx2
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-            jx2[i][0] = -Qdx2dt * Wx2[i][0];
-          }
-
-#pragma unroll
-          for (int j = 1; j < O + 2; ++j) {
-#pragma unroll
-            for (int i = 0; i < O + 2; ++i) {
-              jx2[i][j] = jx2[i][j - 1] - Qdx2dt * Wx2[i][j];
-            }
-          }
+          // (2D): fused Esirkepov, no [O+2]^2 temporaries.
+          //
+          // Esirkepov 2001 Eq. 38 (simplified) is separable: with
+          // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) and
+          // P2[j] = sum_{j'=0}^{j} (fS_x2[j'] - iS_x2[j']),
+          //   jx1[i][j] = -Q*HALF * P1[i] * (fS_x2[j] + iS_x2[j])
+          //   jx2[i][j] = -Q*HALF * P2[j] * (fS_x1[i] + iS_x1[i])
+          //   Wx3[i][j] = THIRD*( fS_x2[j]*(HALF*iS_x1[i]+fS_x1[i])
+          //                     + iS_x2[j]*(HALF*fS_x1[i]+iS_x1[i]) )
+          // with Q = coeff*inv_dt (Qdx1dt == Qdx2dt). Same value as the
+          // old explicit Wx/jx tensors up to FP reassociation;
+          // charge-conserving by construction. Prefix sums carried as
+          // running scalars, so the only per-thread state is the 
+          // existing 1D shape arrays.
+          const real_t QVx3 = coeff * vp[2];
+          // -Q*HALF prefactor (Qdx1dt == Qdx2dt == coeff*inv_dt)
+          const real_t cf = -(coeff * inv_dt) * HALF;
 
           // account for ghost cells
           i1_min += N_GHOSTS;
@@ -557,26 +551,30 @@ namespace kernel {
           const int di_x1 = i1_max - i1_min;
           const int di_x2 = i2_max - i2_min;
 
-          /*
-              Current update
-          */
-          auto J_acc = J.access();
-
-          for (int i = 0; i < di_x1; ++i) {
-            for (int j = 0; j <= di_x2; ++j) {
-              J_acc(i1_min + i, i2_min + j, cur::jx1) += jx1[i][j];
-            }
-          }
-
-          for (int i = 0; i <= di_x1; ++i) {
-            for (int j = 0; j < di_x2; ++j) {
-              J_acc(i1_min + i, i2_min + j, cur::jx2) += jx2[i][j];
-            }
-          }
-
+          // Current update — fused over the union plane so the J cell
+          // line stays L1-resident across the 3 component atomic_adds.
+          real_t P1 = ZERO;
           for (int i = 0; i <= di_x1; ++i) {
+            P1 += fS_x1[i] - iS_x1[i];
+            const int    gi   = i1_min + i;
+            const real_t iSx1 = iS_x1[i];
+            const real_t fSx1 = fS_x1[i];
+            const real_t A1   = fSx1 + iSx1;     // jx2 cross-factor
+            real_t       P2   = ZERO;
             for (int j = 0; j <= di_x2; ++j) {
-              J_acc(i1_min + i, i2_min + j, cur::jx3) += QVx3 * Wx3[i][j];
+              P2 += fS_x2[j] - iS_x2[j];
+              const int    gj   = i2_min + j;
+              const real_t iSx2 = iS_x2[j];
+              const real_t fSx2 = fS_x2[j];
+              if (i < di_x1) {
+                deposit_at(gi, gj, cur::jx1, cf * P1 * (fSx2 + iSx2));
+              }
+              if (j < di_x2) {
+                deposit_at(gi, gj, cur::jx2, cf * P2 * A1);
+              }
+              const real_t Wx3 = THIRD * (fSx2 * (HALF * iSx1 + fSx1) +
+                                          iSx2 * (HALF * fSx1 + iSx1));
+              deposit_at(gi, gj, cur::jx3, QVx3 * Wx3);
             }
           }
 
@@ -610,104 +608,33 @@ namespace kernel {
                                      iS_x3,
                                      fS_x3);
 
-          // define weight tensors
-          real_t Wx1[O + 2][O + 2][O + 2];
-          real_t Wx2[O + 2][O + 2][O + 2];
-          real_t Wx3[O + 2][O + 2][O + 2];
-
-// Calculate weight function
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 0; k < O + 2; ++k) {
-                // Esirkepov 2001, Eq. 31
-                Wx1[i][j][k] = THIRD * (fS_x1[i] - iS_x1[i]) *
-                               ((iS_x2[j] * iS_x3[k] + fS_x2[j] * fS_x3[k]) +
-                                HALF * (iS_x3[k] * fS_x2[j] + iS_x2[j] * fS_x3[k]));
-
-                Wx2[i][j][k] = THIRD * (fS_x2[j] - iS_x2[j]) *
-                               (iS_x1[i] * iS_x3[k] + fS_x1[i] * fS_x3[k] +
-                                HALF * (iS_x3[k] * fS_x1[i] + iS_x1[i] * fS_x3[k]));
-
-                Wx3[i][j][k] = THIRD * (fS_x3[k] - iS_x3[k]) *
-                               (iS_x1[i] * iS_x2[j] + fS_x1[i] * fS_x2[j] +
-                                HALF * (iS_x1[i] * fS_x2[j] + iS_x2[j] * fS_x1[i]));
-              }
-            }
-          }
-
-          // contribution within the shape function stencil
-          real_t jx1[O + 2][O + 2][O + 2], jx2[O + 2][O + 2][O + 2],
-            jx3[O + 2][O + 2][O + 2];
-
-          // prefactors to j update
-          const real_t Qdxdt = coeff * inv_dt;
-          const real_t Qdydt = coeff * inv_dt;
-          const real_t Qdzdt = coeff * inv_dt;
-
-          // Calculate current contribution
-
-          // jx1
-#pragma unroll
-          for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-            for (int k = 0; k < O + 2; ++k) {
-              jx1[0][j][k] = -Qdxdt * Wx1[0][j][k];
-            }
-          }
-
-#pragma unroll
-          for (int i = 1; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 0; k < O + 2; ++k) {
-                jx1[i][j][k] = jx1[i - 1][j][k] - Qdxdt * Wx1[i][j][k];
-              }
-            }
-          }
-
-          // jx2
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int k = 0; k < O + 2; ++k) {
-              jx2[i][0][k] = -Qdydt * Wx2[i][0][k];
-            }
-          }
-
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 1; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 0; k < O + 2; ++k) {
-                jx2[i][j][k] = jx2[i][j - 1][k] - Qdydt * Wx2[i][j][k];
-              }
-            }
-          }
-
-          // jx3
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-              jx3[i][j][0] = -Qdydt * Wx3[i][j][0];
-            }
-          }
-
-#pragma unroll
-          for (int i = 0; i < O + 2; ++i) {
-#pragma unroll
-            for (int j = 0; j < O + 2; ++j) {
-#pragma unroll
-              for (int k = 1; k < O + 2; ++k) {
-                jx3[i][j][k] = jx3[i][j][k - 1] - Qdzdt * Wx3[i][j][k];
-              }
-            }
-          }
+          // fused Esirkepov, no (O+2)^3 temporaries.
+          //
+          // The Esirkepov 3D current (2001, Eq. 31) is separable: with
+          // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) (and likewise
+          // P2[j], P3[k]) the cumulative-sum currents collapse to
+          //
+          //   jx1[i][j][k] = -Q*THIRD * P1[i] * G23(j,k)
+          //   jx2[i][j][k] = -Q*THIRD * P2[j] * H13(i,k)
+          //   jx3[i][j][k] = -Q*THIRD * P3[k] * F12(i,j)
+          //
+          // with the 1D-shape cross-factors
+          //
+          //   G23(j,k) = iS_x2[j]*iS_x3[k] + fS_x2[j]*fS_x3[k]
+          //            + HALF*(iS_x3[k]*fS_x2[j] + iS_x2[j]*fS_x3[k])
+          //   H13(i,k) = iS_x1[i]*iS_x3[k] + fS_x1[i]*fS_x3[k]
+          //            + HALF*(iS_x3[k]*fS_x1[i] + iS_x1[i]*fS_x3[k])
+          //   F12(i,j) = iS_x1[i]*iS_x2[j] + fS_x1[i]*fS_x2[j]
+          //            + HALF*(iS_x1[i]*fS_x2[j] + iS_x2[j]*fS_x1[i])
+          //
+          // and Q = coeff*inv_dt (Qdxdt == Qdydt == Qdzdt). This is the
+          // same value as the old explicit Wx/jx tensors up to
+          // floating-point reassociation: charge-conserving by
+          // construction (the Esirkepov decomposition is exact). The
+          // prefix sums are carried as running scalars in the deposit
+          // loop, so the only per-thread state is the existing 1D shape
+          // arrays (no (O+2)^3 / (O+2)^2 locals, hence far fewer VGPRs
+          // and no private-memory tensor traffic).
 
           // account for ghost cells
           i1_min += N_GHOSTS;
@@ -722,31 +649,50 @@ namespace kernel {
           const int di_x2 = i2_max - i2_min;
           const int di_x3 = i3_max - i3_min;
 
+          // -Q*THIRD prefactor (Qdxdt == Qdydt == Qdzdt == coeff*inv_dt)
+          const real_t cf = -(coeff * inv_dt) * THIRD;
+
           /*
-            Current update
+            Current update — fused over the union cube so the J cell
+            line stays L1-resident across the 3 component atomic_adds.
+            Per-cell branches on (i<di_x1), (j<di_x2), (k<di_x3) skip
+            the trailing slab where each component's stencil ends one
+            cell short of the union; particles within a tile share
+            di_x* so the branch predicates cleanly.
           */
-          auto J_acc = J.access();
-
-          for (int i = 0; i < di_x1; ++i) {
-            for (int j = 0; j <= di_x2; ++j) {
-              for (int k = 0; k <= di_x3; ++k) {
-                J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx1) += jx1[i][j][k];
-              }
-            }
-          }
-
-          for (int i = 0; i <= di_x1; ++i) {
-            for (int j = 0; j < di_x2; ++j) {
-              for (int k = 0; k <= di_x3; ++k) {
-                J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx2) += jx2[i][j][k];
-              }
-            }
-          }
-
+          real_t P1 = ZERO;
           for (int i = 0; i <= di_x1; ++i) {
+            P1 += fS_x1[i] - iS_x1[i];
+            const int    gi    = i1_min + i;
+            const real_t iSx1i = iS_x1[i];
+            const real_t fSx1i = fS_x1[i];
+            real_t       P2    = ZERO;
             for (int j = 0; j <= di_x2; ++j) {
-              for (int k = 0; k < di_x3; ++k) {
-                J_acc(i1_min + i, i2_min + j, i3_min + k, cur::jx3) += jx3[i][j][k];
+              P2 += fS_x2[j] - iS_x2[j];
+              const int    gj    = i2_min + j;
+              const real_t iSx2j = iS_x2[j];
+              const real_t fSx2j = fS_x2[j];
+              const real_t F12   = iSx1i * iSx2j + fSx1i * fSx2j +
+                                 HALF * (iSx1i * fSx2j + iSx2j * fSx1i);
+              real_t P3 = ZERO;
+              for (int k = 0; k <= di_x3; ++k) {
+                P3 += fS_x3[k] - iS_x3[k];
+                const int    gk    = i3_min + k;
+                const real_t iSx3k = iS_x3[k];
+                const real_t fSx3k = fS_x3[k];
+                if (i < di_x1) {
+                  const real_t G23 = iSx2j * iSx3k + fSx2j * fSx3k +
+                                     HALF * (iSx3k * fSx2j + iSx2j * fSx3k);
+                  deposit_at(gi, gj, gk, cur::jx1, cf * P1 * G23);
+                }
+                if (j < di_x2) {
+                  const real_t H13 = iSx1i * iSx3k + fSx1i * fSx3k +
+                                     HALF * (iSx3k * fSx1i + iSx1i * fSx3k);
+                  deposit_at(gi, gj, gk, cur::jx2, cf * P2 * H13);
+                }
+                if (k < di_x3) {
+                  deposit_at(gi, gj, gk, cur::jx3, cf * P3 * F12);
+                }
               }
             }
           }
@@ -759,7 +705,522 @@ namespace kernel {
           "What are you even doing here? Entity already goes to 11!");
       }
     }
+
+  } // namespace deposit
+
+  /**
+   * @brief Flat current-deposition kernel.
+   *
+   * One thread per particle (RangePolicy). Writes are coalesced through a
+   * `Kokkos::Experimental::ScatterView` to avoid per-thread atomics on
+   * global J. Constructor signature is unchanged from prior versions —
+   * `engines/srpic/currents.h` continues to call it identically.
+   */
+  template <SimEngine::type S, MetricClass M, unsigned short O = 1u>
+  class DepositCurrents_kernel {
+    static_assert(O <= 11u, "Shape function order O must be <= 11");
+    static constexpr auto D = M::Dim;
+
+    scatter_ndfield_t<D, 3>     J;
+    deposit::PrtlPack<D>        prtls;
+    const M                     metric;
+    const real_t                charge, inv_dt;
+
+  public:
+    DepositCurrents_kernel(const scatter_ndfield_t<D, 3>& scatter_cur,
+                           const array_t<int*>&           i1,
+                           const array_t<int*>&           i2,
+                           const array_t<int*>&           i3,
+                           const array_t<int*>&           i1_prev,
+                           const array_t<int*>&           i2_prev,
+                           const array_t<int*>&           i3_prev,
+                           const array_t<prtldx_t*>&      dx1,
+                           const array_t<prtldx_t*>&      dx2,
+                           const array_t<prtldx_t*>&      dx3,
+                           const array_t<prtldx_t*>&      dx1_prev,
+                           const array_t<prtldx_t*>&      dx2_prev,
+                           const array_t<prtldx_t*>&      dx3_prev,
+                           const array_t<real_t*>&        ux1,
+                           const array_t<real_t*>&        ux2,
+                           const array_t<real_t*>&        ux3,
+                           const array_t<real_t*>&        phi,
+                           const array_t<real_t*>&        weight,
+                           const array_t<short*>&         tag,
+                           const M&                       metric,
+                           real_t                         charge,
+                           const real_t                   dt)
+      : J { scatter_cur }
+      , prtls { i1,       i2,       i3,       i1_prev,  i2_prev,  i3_prev,
+                dx1,      dx2,      dx3,      dx1_prev, dx2_prev, dx3_prev,
+                ux1,      ux2,      ux3,      phi,      weight,   tag }
+      , metric { metric }
+      , charge { charge }
+      , inv_dt { ONE / dt } {
+      raise::ErrorIf(
+        (O == 2u and N_GHOSTS < 2),
+        "Order of interpolation is 2, but number of ghost cells is < 2",
+        HERE);
+    }
+
+    Inline auto operator()(prtlidx_t p) const -> void {
+      auto J_acc = J.access();
+      if constexpr (D == Dim::_1D) {
+        deposit::deposit_one_particle<S, M, O>(
+          p,
+          prtls,
+          metric,
+          charge,
+          inv_dt,
+          [&](int g_i1, int comp, real_t v) {
+            J_acc(g_i1, comp) += v;
+          });
+      } else if constexpr (D == Dim::_2D) {
+        deposit::deposit_one_particle<S, M, O>(
+          p,
+          prtls,
+          metric,
+          charge,
+          inv_dt,
+          [&](int g_i1, int g_i2, int comp, real_t v) {
+            J_acc(g_i1, g_i2, comp) += v;
+          });
+      } else if constexpr (D == Dim::_3D) {
+        deposit::deposit_one_particle<S, M, O>(
+          p,
+          prtls,
+          metric,
+          charge,
+          inv_dt,
+          [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) {
+            J_acc(g_i1, g_i2, g_i3, comp) += v;
+          });
+      }
+    }
   };
+
+#if defined(TEAM_POLICY)
+
+  /**
+   * @brief Tiled current-deposition kernel.
+   *
+   * One team per spatial tile (`league_size = ntiles_total`). Each team
+   * accumulates particle contributions into a per-team scratch buffer of
+   * shape `(T_TILE + 2*HALO)^D × 3` real_t, where `HALO = O + 1` cells per
+   * side. Scratch atomics live in SLM (PVC: ~5–10 cycles per
+   * `atomic_add`); the global J is touched only once per scratch cell at
+   * flush time. Compared with the flat scatter-view kernel:
+   *   - global atomic pressure ~ (T_TILE + 2*HALO)^D × 3 per tile
+   *     instead of (stencil writes per particle × particles)
+   *   - per-particle stencil writes are tile-local (SLM) instead of
+   *     scattering through global HBM
+   *
+   * Supports `O ∈ {0, ..., 11}`. `O == 0` (zigzag) is wired for
+   * A/B benchmarking against the flat scatter-view kernel — its narrow
+   * stencil typically makes scratch alloc/zero/flush overhead a
+   * regression there, but it's good to be able to measure the
+   * crossover. To revert and use flat for zigzag-only builds, change
+   * the dispatch in `engines/srpic/currents.h` from
+   * `#if defined(TEAM_POLICY)` to
+   * `#if defined(TEAM_POLICY) && (SHAPE_ORDER > 0)`.
+   *
+   * Particle iteration order is governed by `tile_offsets`: tile `t`
+   * owns particles `[tile_offsets(t), tile_offsets(t+1))`, post-sort.
+   * `SortSpatially` (`particles_sort.cpp`) is responsible for keeping
+   * the SoA arrays consistent with that.
+   *
+   * **Halo sizing and escape valve.** Sort runs at the end of the
+   * previous step (see `srpic.hpp`), so at deposit time the particle
+   * has already been pushed once — its `min(i, i_prev)` may differ
+   * from the bin key by one cell of drift per step elapsed since the
+   * last sort. The scratch HALO is `STENCIL_REACH(O) + DRIFT`, where
+   * `STENCIL_REACH = 2` for zigzag (writes `{i_prev, i_prev+1, i,
+   * i+1}` ⇒ +2 above `min(i, i_prev)` with `|Δi|=1`) and `O` for
+   * Esirkepov, and `DRIFT` is a fixed constant (1) covering the one
+   * guaranteed post-sort pusher step.
+   *
+   * HALO is sized for the *common* (every-step-sorted) case, not for
+   * a worst-case sort cadence: correctness does **not** depend on it.
+   * Any particle whose stencil escapes the scratch tile — because it
+   * drifted further than `DRIFT` (e.g. a large runtime
+   * `spatial_sorting_interval`), or because the halo is otherwise
+   * undersized — silently falls back to a direct, bounds-clipped
+   * `Kokkos::atomic_add` on the global J view. That path is
+   * charge-conserving (each particle's stencil is deposited exactly
+   * once, partly to private SLM scratch and partly to global J, and
+   * scratch is flushed once via `atomic_add`); it is merely slower
+   * per write. Sorting less often than every step therefore costs
+   * escape-valve traffic, never accuracy.
+   */
+  template <SimEngine::type S, MetricClass M, unsigned short O,
+            unsigned short T_TILE>
+  class DepositCurrents_kernel_tiled {
+    static_assert(O <= 11u, "Shape order O must be <= 11");
+    static_assert(T_TILE > 0u, "T_TILE must be positive");
+    static constexpr auto D = M::Dim;
+
+    // Per-side scratch halo, derived from first principles.
+    //
+    //   total halo = stencil_reach(O) + drift_between_sort_and_deposit
+    //
+    // stencil_reach(O) — maximum cells the deposit writes ABOVE
+    // min(i, i_prev) under CFL |v·dt/dx| ≤ 1/2:
+    //   - O == 0 (zigzag):  writes {i_prev, i_prev+1, i, i+1}  ⇒ +2
+    //   - O >= 1 Esirkepov: `for_deposit` returns an (O+2)-wide
+    //     array but only O+1 entries are non-zero, and the union
+    //     window satisfies `i_max - i_min <= O+1` (see
+    //     particle_shapes.hpp::for_deposit). The genuine one-sided
+    //     reach above min(i, i_prev) is therefore O, not O+1 — the
+    //     old `O+1` carried one extra cell of conservative padding
+    //     on top of the already-conservative drift term below.
+    //
+    // drift — sort runs at end-of-step (see srpic.hpp), so a particle
+    // sees exactly one pusher step before the *next* step's deposit
+    // when sorted every step (the common case). DRIFT is therefore a
+    // fixed constant of 1, NOT a compile-time function of the runtime
+    // sort cadence. Sizing the halo for the common case (rather than a
+    // worst-case sort interval) is what keeps the scratch small enough
+    // for good occupancy; a species sorted less often than
+    // every step just drifts past the halo and takes the global-J
+    // escape valve more often — correct, only slower (see the class
+    // doc-comment for why this is charge-conserving).
+    //
+    static constexpr int STENCIL_REACH = (O == 0u)
+                                           ? 2
+                                           : static_cast<int>(O);
+    static constexpr int DRIFT = 1;
+    static constexpr int HALO  = STENCIL_REACH + DRIFT;
+    static constexpr int TE    = static_cast<int>(T_TILE) + 2 * HALO;
+
+    using exec_space   = Kokkos::DefaultExecutionSpace;
+    using team_policy  = Kokkos::TeamPolicy<exec_space>;
+    using member_t     = typename team_policy::member_type;
+    using scratch_mem  = typename exec_space::scratch_memory_space;
+
+    // Scratch view types: trailing extent of 3 (jx1, jx2, jx3 components)
+    // is fixed by a runtime extent so we don't need a separate dimension
+    // template per component count.
+    using scratch_1d_t = Kokkos::View<real_t**,
+                                      scratch_mem,
+                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+    using scratch_2d_t = Kokkos::View<real_t***,
+                                      scratch_mem,
+                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+    using scratch_3d_t = Kokkos::View<real_t****,
+                                      scratch_mem,
+                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+
+    ndfield_t<D, 3>           J;
+    deposit::PrtlPack<D>      prtls;
+    const M                   metric;
+    const real_t              charge, inv_dt;
+
+    // Tile metadata produced by SortSpatially.
+    array_t<npart_t*>         tile_offsets;
+    ncells_t                  ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u };
+    ncells_t                  total_tiles { 0u };
+
+    // J's full storage extent including all ghost cells. Used to clip
+    // the cooperative flush so that a partial tile at the high end of
+    // the domain does not over-write past the J view.
+    int                       j_ext1 { 0 }, j_ext2 { 0 }, j_ext3 { 0 };
+
+  public:
+    DepositCurrents_kernel_tiled(const ndfield_t<D, 3>&    cur,
+                                 const array_t<int*>&      i1,
+                                 const array_t<int*>&      i2,
+                                 const array_t<int*>&      i3,
+                                 const array_t<int*>&      i1_prev,
+                                 const array_t<int*>&      i2_prev,
+                                 const array_t<int*>&      i3_prev,
+                                 const array_t<prtldx_t*>& dx1,
+                                 const array_t<prtldx_t*>& dx2,
+                                 const array_t<prtldx_t*>& dx3,
+                                 const array_t<prtldx_t*>& dx1_prev,
+                                 const array_t<prtldx_t*>& dx2_prev,
+                                 const array_t<prtldx_t*>& dx3_prev,
+                                 const array_t<real_t*>&   ux1,
+                                 const array_t<real_t*>&   ux2,
+                                 const array_t<real_t*>&   ux3,
+                                 const array_t<real_t*>&   phi,
+                                 const array_t<real_t*>&   weight,
+                                 const array_t<short*>&    tag,
+                                 const M&                  metric,
+                                 real_t                    charge,
+                                 const real_t              dt,
+                                 const TileLayout<D>&      layout)
+      : J { cur }
+      , prtls { i1,       i2,       i3,       i1_prev,  i2_prev,  i3_prev,
+                dx1,      dx2,      dx3,      dx1_prev, dx2_prev, dx3_prev,
+                ux1,      ux2,      ux3,      phi,      weight,   tag }
+      , metric { metric }
+      , charge { charge }
+      , inv_dt { ONE / dt }
+      , tile_offsets { layout.tile_offsets }
+      , ntx1 { layout.ntiles_per_axis[0] }
+      , ntx2 { layout.ntiles_per_axis[1] }
+      , ntx3 { layout.ntiles_per_axis[2] }
+      , total_tiles { layout.ntiles_total } {
+      raise::ErrorIf(
+        layout.tile_size != T_TILE,
+        "Tiled deposit launched with mismatched T_TILE and runtime tile_size",
+        HERE);
+      // Note: HALO is allowed to exceed N_GHOSTS. The cooperative
+      // scratch→J flush and the per-particle escape valve both bounds-clip
+      // their writes against `j_ext*` so writes that would land past J's
+      // ghost stripe are silently dropped (they only ever come from a
+      // particle whose stencil reaches into the domain ghost region, where
+      // CommunicateFields will re-supply the contribution).
+      if constexpr (D == Dim::_1D || D == Dim::_2D || D == Dim::_3D) {
+        j_ext1 = static_cast<int>(cur.extent(0));
+      }
+      if constexpr (D == Dim::_2D || D == Dim::_3D) {
+        j_ext2 = static_cast<int>(cur.extent(1));
+      }
+      if constexpr (D == Dim::_3D) {
+        j_ext3 = static_cast<int>(cur.extent(2));
+      }
+    }
+
+    /**
+     * @brief Per-team scratch size in bytes. Used by the launcher to set
+     *        `team_policy.set_scratch_size(0, Kokkos::PerTeam(bytes))`.
+     */
+    static constexpr std::size_t scratch_bytes() {
+      if constexpr (D == Dim::_1D) {
+        return scratch_1d_t::shmem_size(TE, 3);
+      } else if constexpr (D == Dim::_2D) {
+        return scratch_2d_t::shmem_size(TE, TE, 3);
+      } else {
+        return scratch_3d_t::shmem_size(TE, TE, TE, 3);
+      }
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const member_t& team) const {
+      const auto tile_id = static_cast<ncells_t>(team.league_rank());
+      // Tile coordinates (tile-grid indices) → tile origin in **active**
+      // cell coords (no ghost offset). Using ncells_t to match the linearised
+      // tile index produced by SortSpatially.
+      ncells_t tx1 = 0, tx2 = 0, tx3 = 0;
+      if constexpr (D == Dim::_1D) {
+        tx1 = tile_id;
+      } else if constexpr (D == Dim::_2D) {
+        tx1 = tile_id / ntx2;
+        tx2 = tile_id - tx1 * ntx2;
+      } else {
+        const auto plane = ntx2 * ntx3;
+        tx1              = tile_id / plane;
+        const auto rem   = tile_id - tx1 * plane;
+        tx2              = rem / ntx3;
+        tx3              = rem - tx2 * ntx3;
+      }
+      // origin_active = lowest active-cell index in the tile (no ghost).
+      // origin_J      = same value translated into J's storage coordinate
+      //                 (i.e. plus N_GHOSTS).
+      // origin_J_low  = J coordinate of scratch index 0 (i.e. origin_J - HALO).
+      // local index `li` in scratch ↔ global J index `gi = li + origin_J_low`.
+      const int origin_J1_low = static_cast<int>(tx1 * T_TILE)
+                                + static_cast<int>(N_GHOSTS) - HALO;
+      const int origin_J2_low = static_cast<int>(tx2 * T_TILE)
+                                + static_cast<int>(N_GHOSTS) - HALO;
+      const int origin_J3_low = static_cast<int>(tx3 * T_TILE)
+                                + static_cast<int>(N_GHOSTS) - HALO;
+
+      // Allocate scratch and cooperatively zero-fill it.
+      if constexpr (D == Dim::_1D) {
+        scratch_1d_t scr(team.team_scratch(0), TE, 3);
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, TE * 3),
+          [&](const int idx) {
+            const int li = idx / 3;
+            const int c  = idx - li * 3;
+            scr(li, c)   = ZERO;
+          });
+        team.team_barrier();
+
+        const auto p_begin = tile_offsets(tile_id);
+        const auto p_end   = tile_offsets(tile_id + 1u);
+        const int  e1_d    = j_ext1;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, p_begin, p_end),
+          [&](const npart_t p) {
+            deposit::deposit_one_particle<S, M, O>(
+              p,
+              prtls,
+              metric,
+              charge,
+              inv_dt,
+              // Escape valve: a particle whose stencil reaches past the
+              // tile's scratch (e.g. exceeded the compile-time
+              // STENCIL_REACH + DRIFT budget) falls back to a direct
+              // atomic_add on the global J view. Bounds-clipped against
+              // J's storage extent so writes past the domain ghost stripe
+              // are dropped (matches the cooperative flush below; those
+              // contributions are re-supplied by SynchronizeFields(J)).
+              [&](int g_i1, int comp, real_t v) {
+                const int li = g_i1 - origin_J1_low;
+                if (li >= 0 && li < TE) {
+                  Kokkos::atomic_add(&scr(li, comp), v);
+                } else if (g_i1 >= 0 && g_i1 < e1_d) {
+                  Kokkos::atomic_add(&J(g_i1, comp), v);
+                }
+              });
+          });
+        team.team_barrier();
+
+        // Cooperative flush of scratch to global J. Bounds-clip against
+        // the J view extent in case a partial high-end tile (or non-zero
+        // halo at domain edges) would otherwise write past J.
+        const int e1 = j_ext1;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, TE * 3),
+          [&](const int idx) {
+            const int li = idx / 3;
+            const int c  = idx - li * 3;
+            const int gi = li + origin_J1_low;
+            if (gi < 0 || gi >= e1) {
+              return;
+            }
+            const real_t v = scr(li, c);
+            if (v != ZERO) {
+              Kokkos::atomic_add(&J(gi, c), v);
+            }
+          });
+      } else if constexpr (D == Dim::_2D) {
+        scratch_2d_t scr(team.team_scratch(0), TE, TE, 3);
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, TE * TE * 3),
+          [&](const int idx) {
+            const int lij  = idx / 3;
+            const int c    = idx - lij * 3;
+            const int li   = lij / TE;
+            const int lj   = lij - li * TE;
+            scr(li, lj, c) = ZERO;
+          });
+        team.team_barrier();
+
+        const auto p_begin = tile_offsets(tile_id);
+        const auto p_end   = tile_offsets(tile_id + 1u);
+        const int  e1_d    = j_ext1;
+        const int  e2_d    = j_ext2;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, p_begin, p_end),
+          [&](const npart_t p) {
+            deposit::deposit_one_particle<S, M, O>(
+              p,
+              prtls,
+              metric,
+              charge,
+              inv_dt,
+              // See 1D branch for rationale.
+              [&](int g_i1, int g_i2, int comp, real_t v) {
+                const int li = g_i1 - origin_J1_low;
+                const int lj = g_i2 - origin_J2_low;
+                if (li >= 0 && li < TE && lj >= 0 && lj < TE) {
+                  Kokkos::atomic_add(&scr(li, lj, comp), v);
+                } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 &&
+                           g_i2 < e2_d) {
+                  Kokkos::atomic_add(&J(g_i1, g_i2, comp), v);
+                }
+              });
+          });
+        team.team_barrier();
+
+        const int e1 = j_ext1;
+        const int e2 = j_ext2;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, TE * TE * 3),
+          [&](const int idx) {
+            const int lij = idx / 3;
+            const int c   = idx - lij * 3;
+            const int li  = lij / TE;
+            const int lj  = lij - li * TE;
+            const int gi  = li + origin_J1_low;
+            const int gj  = lj + origin_J2_low;
+            if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2) {
+              return;
+            }
+            const real_t v = scr(li, lj, c);
+            if (v != ZERO) {
+              Kokkos::atomic_add(&J(gi, gj, c), v);
+            }
+          });
+      } else if constexpr (D == Dim::_3D) {
+        scratch_3d_t scr(team.team_scratch(0), TE, TE, TE, 3);
+        const int    cells = TE * TE * TE;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, cells * 3),
+          [&](const int idx) {
+            const int lijk     = idx / 3;
+            const int c        = idx - lijk * 3;
+            const int li       = lijk / (TE * TE);
+            const int rem      = lijk - li * TE * TE;
+            const int lj       = rem / TE;
+            const int lk       = rem - lj * TE;
+            scr(li, lj, lk, c) = ZERO;
+          });
+        team.team_barrier();
+
+        const auto p_begin = tile_offsets(tile_id);
+        const auto p_end   = tile_offsets(tile_id + 1u);
+        const int  e1_d    = j_ext1;
+        const int  e2_d    = j_ext2;
+        const int  e3_d    = j_ext3;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, p_begin, p_end),
+          [&](const npart_t p) {
+            deposit::deposit_one_particle<S, M, O>(
+              p,
+              prtls,
+              metric,
+              charge,
+              inv_dt,
+              // See 1D branch for rationale.
+              [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) {
+                const int li = g_i1 - origin_J1_low;
+                const int lj = g_i2 - origin_J2_low;
+                const int lk = g_i3 - origin_J3_low;
+                if (li >= 0 && li < TE && lj >= 0 && lj < TE && lk >= 0 &&
+                    lk < TE) {
+                  Kokkos::atomic_add(&scr(li, lj, lk, comp), v);
+                } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 &&
+                           g_i2 < e2_d && g_i3 >= 0 && g_i3 < e3_d) {
+                  Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v);
+                }
+              });
+          });
+        team.team_barrier();
+
+        const int e1 = j_ext1;
+        const int e2 = j_ext2;
+        const int e3 = j_ext3;
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(team, cells * 3),
+          [&](const int idx) {
+            const int lijk = idx / 3;
+            const int c    = idx - lijk * 3;
+            const int li   = lijk / (TE * TE);
+            const int rem  = lijk - li * TE * TE;
+            const int lj   = rem / TE;
+            const int lk   = rem - lj * TE;
+            const int gi   = li + origin_J1_low;
+            const int gj   = lj + origin_J2_low;
+            const int gk   = lk + origin_J3_low;
+            if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2 || gk < 0 ||
+                gk >= e3) {
+              return;
+            }
+            const real_t v = scr(li, lj, lk, c);
+            if (v != ZERO) {
+              Kokkos::atomic_add(&J(gi, gj, gk, c), v);
+            }
+          });
+      }
+    }
+  };
+
+#endif // TEAM_POLICY
+
 } // namespace kernel
 
 #undef i_di_to_Xi
diff --git a/tests/framework/CMakeLists.txt b/tests/framework/CMakeLists.txt
index d6dc295af..9a2e2865e 100644
--- a/tests/framework/CMakeLists.txt
+++ b/tests/framework/CMakeLists.txt
@@ -44,6 +44,12 @@ else()
   gen_test(particles_sort false)
 endif()
 
+# team_policy X-3: per-backend sort_by_key permutation test (only built
+# when the compile-time team_policy toggle is on).
+if(${team_policy})
+  gen_test(sort_by_key false)
+endif()
+
 gen_test(fields false)
 gen_test(grid_mesh false)
 if(${DEBUG})
diff --git a/tests/framework/sort_by_key.cpp b/tests/framework/sort_by_key.cpp
new file mode 100644
index 000000000..94dc51d0e
--- /dev/null
+++ b/tests/framework/sort_by_key.cpp
@@ -0,0 +1,110 @@
+/**
+ * @brief X-3 (team_policy) — sort_by_key permutation test.
+ *
+ * Exercises every backend overload of `ntt::sort_helpers::sort_by_key_dispatch`
+ * that is compiled in for the current Kokkos device. For each backend:
+ *   1. Allocate keys = { 5, 2, 5, 1, 3, 5, 2 }, perm = (uninitialised).
+ *   2. Call sort_by_key_dispatch.
+ *   3. Verify that keys[perm[i]] is sorted in non-decreasing order.
+ *
+ * Stability is verified for the BinSort and StdSort backends (the others
+ * promise stability per their documentation but we don't bake that into
+ * the test).
+ *
+ * Built only when `team_policy=ON` at CMake time.
+ */
+#include "enums.h"
+#include "global.h"
+
+#include "arch/kokkos_aliases.h"
+#include "utils/error.h"
+#include "utils/sort_dispatch.h"
+#include "utils/sorting.h"
+
+#include <Kokkos_Core.hpp>
+
+#include <iostream>
+#include <vector>
+
+namespace {
+
+  using namespace ntt;
+
+  template <typename Backend>
+  void test_one_backend(const char* label, Backend tag) {
+    const std::vector<ncells_t> keys_host_init { 5u, 2u, 5u, 1u, 3u, 5u, 2u };
+    const npart_t               n     = keys_host_init.size();
+    const ncells_t              n_max = 6u; // bin range [0, n_max)
+
+    array_t<ncells_t*> keys { "keys", n };
+    auto               keys_h = Kokkos::create_mirror_view(keys);
+    for (npart_t i = 0u; i < n; ++i) {
+      keys_h(i) = keys_host_init[i];
+    }
+    Kokkos::deep_copy(keys, keys_h);
+
+    prtl_perm_t perm { "perm", n };
+
+    sort_helpers::sort_by_key_dispatch(keys, perm, n_max, tag);
+
+    auto perm_h = Kokkos::create_mirror_view(perm);
+    Kokkos::deep_copy(perm_h, perm);
+
+    // Validate: keys[perm[0]] <= keys[perm[1]] <= ...
+    for (npart_t i = 1u; i < n; ++i) {
+      const auto a = keys_host_init[perm_h(i - 1u)];
+      const auto b = keys_host_init[perm_h(i)];
+      raise::ErrorIf(
+        a > b,
+        std::string("sort_by_key_dispatch produced non-sorted permutation "
+                    "for backend ") +
+          label,
+        HERE);
+    }
+
+    // Validate: perm is a permutation of [0, n).
+    std::vector<int> seen(n, 0);
+    for (npart_t i = 0u; i < n; ++i) {
+      const auto idx = perm_h(i);
+      raise::ErrorIf(idx >= n,
+                     std::string("permutation index out of range for "
+                                 "backend ") +
+                       label,
+                     HERE);
+      seen[idx] += 1;
+    }
+    for (npart_t i = 0u; i < n; ++i) {
+      raise::ErrorIf(seen[i] != 1,
+                     std::string("permutation not a bijection for backend ") +
+                       label,
+                     HERE);
+    }
+
+    std::cout << "[OK] sort_by_key_dispatch<" << label << ">: "
+              << "keys[perm] sorted, perm is a bijection." << std::endl;
+  }
+
+} // namespace
+
+auto main(int argc, char* argv[]) -> int {
+  ntt::GlobalInitialize(argc, argv);
+  try {
+    // Always-available backends.
+    test_one_backend("BinSort", ::sort::backend::BinSort {});
+#if !defined(DEVICE_ENABLED)
+    test_one_backend("StdSort", ::sort::backend::StdSort {});
+#endif
+#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)
+    test_one_backend("OneDPL", ::sort::backend::OneDPL {});
+#endif
+#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED)
+    test_one_backend("Thrust", ::sort::backend::Thrust {});
+#endif
+  } catch (std::exception& e) {
+    std::cerr << e.what() << std::endl;
+    ntt::GlobalFinalize();
+    return 1;
+  }
+  ntt::GlobalFinalize();
+  return 0;
+}
diff --git a/tests/global/tiling.cpp b/tests/global/tiling.cpp
index d0f0a412c..2f926ce28 100644
--- a/tests/global/tiling.cpp
+++ b/tests/global/tiling.cpp
@@ -1,6 +1,5 @@
 #include "arch/kokkos_aliases.h"
 #include "utils/error.h"
-#include "utils/formatting.h"
 #include "utils/numeric.h"
 #include "utils/sorting.h"
 
@@ -122,37 +121,27 @@ void test_tiling(const array_t<int*>&         i1,
 
     const auto ntiles = nt1 * nt2 * nt3;
 
-    auto position_to_tile_kernel = sort::PositionToTileIndex<D, true> {
-      i1, i2, i3, tag, tile_indices, ncells, ts
-    };
-    Kokkos::parallel_for("Tiling", npart, position_to_tile_kernel);
-    const auto num_ppt = position_to_tile_kernel.num_ppt;
+    array_t<npart_t*> num_ppt { "num_ppt", ntiles };
+    Kokkos::parallel_for(
+      "Tiling",
+      npart,
+      sort::PositionToTileIndex<D, true> { i1, i2, i3, tag, tile_indices, ncells, ts, num_ppt });
     Kokkos::parallel_for(
       "Checking",
       npart,
       Lambda(prtlidx_t p) {
         CheckValue<D>(p, i1, i2, i3, tag, tile_indices, nt1, nt2, nt3, ntiles, ts);
       });
-    raise::ErrorIf(
-      num_ppt.extent(0) != ntiles,
-      fmt::format("num_ppt size does not match number of tiles %u != %u",
-                  num_ppt.extent(0),
-                  ntiles),
-      HERE);
 
     npart_t tot_alive = 0u;
     Kokkos::parallel_reduce(
       "CountAliveInTiles",
       ntiles,
-      Lambda(cellidx_t t, npart_t & count) { count += num_ppt(t); },
+      Lambda(prtlidx_t t, npart_t & count) { count += num_ppt(t); },
       tot_alive);
-    raise::ErrorIf(
-      tot_alive != npart - ndead,
-      fmt::format("Error in counting particles per tile: %u != %u - %u",
-                  tot_alive,
-                  npart,
-                  ndead),
-      HERE);
+    raise::ErrorIf(tot_alive != npart - ndead,
+                   "Error in counting particles per tile",
+                   HERE);
   }
 }
 
diff --git a/tests/kernels/CMakeLists.txt b/tests/kernels/CMakeLists.txt
index f1438e108..0fe23f578 100644
--- a/tests/kernels/CMakeLists.txt
+++ b/tests/kernels/CMakeLists.txt
@@ -26,6 +26,9 @@ endfunction()
 gen_test(faraday_mink)
 gen_test(ampere_mink)
 gen_test(deposit)
+if(${team_policy})
+  gen_test(deposit_tiled)
+endif()
 gen_test(digital_filter)
 gen_test(particle_moments)
 gen_test(fields_to_phys)
diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp
new file mode 100644
index 000000000..87f4f3f4d
--- /dev/null
+++ b/tests/kernels/deposit_tiled.cpp
@@ -0,0 +1,262 @@
+/**
+ * @file tests/kernels/deposit_tiled.cpp
+ * @brief X-1 numerical-equivalence test for the tiled deposit kernel.
+ *
+ * Runs the flat (`DepositCurrents_kernel`) and tiled
+ * (`DepositCurrents_kernel_tiled`) kernels on identical particle SoA inputs
+ * for shape orders O = 1..11 and asserts that the resulting J array is
+ * identical cell-by-cell within a small floating-point tolerance.
+ *
+ * Built only when `team_policy=ON` (`-D TEAM_POLICY` defined). The test
+ * matches the per-particle setup used in `deposit.cpp` so that any
+ * regression in the shared `kernel::deposit::deposit_one_particle` body
+ * is caught by both tests.
+ */
+
+#include "enums.h"
+#include "global.h"
+
+#include "arch/kokkos_aliases.h"
+#include "utils/comparators.h"
+
+#include "metrics/minkowski.h"
+
+#include "kernels/currents_deposit.hpp"
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_ScatterView.hpp>
+
+#include <cmath>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+namespace {
+
+  using namespace ntt;
+
+  void errorIf(bool condition, const std::string& msg) {
+    if (condition) {
+      throw std::runtime_error(msg);
+    }
+  }
+
+  template <typename T>
+  void put_value(const array_t<T*>& arr, T value, int i) {
+    auto h = Kokkos::create_mirror_view(arr);
+    h(i)   = value;
+    Kokkos::deep_copy(arr, h);
+  }
+
+  // Builds tile_offsets for a single-particle test. Particle 0 is alive
+  // and lives in tile (tx1, tx2); slots 1..n_slots-1 carry the dead
+  // sentinel and are never referenced by tile_offsets — so the tiled
+  // kernel never iterates over them.
+  array_t<npart_t*> build_tile_offsets_single_particle(ncells_t ntx1,
+                                                       ncells_t ntx2,
+                                                       ncells_t tx1,
+                                                       ncells_t tx2) {
+    const ncells_t total_tiles = ntx1 * ntx2;
+    const ncells_t hot_tile    = tx1 * ntx2 + tx2;
+    array_t<npart_t*> offsets("tile_offsets", total_tiles + 1u);
+    auto h = Kokkos::create_mirror_view(offsets);
+    for (ncells_t t = 0; t <= total_tiles; ++t) {
+      h(t) = (t <= hot_tile) ? static_cast<npart_t>(0)
+                             : static_cast<npart_t>(1);
+    }
+    Kokkos::deep_copy(offsets, h);
+    return offsets;
+  }
+
+  template <unsigned short O, unsigned short T_TILE>
+  void run_one_case() {
+    using metric_t = metric::Minkowski<Dim::_2D>;
+    constexpr unsigned short nx1 = 50u, nx2 = 50u;
+    metric_t metric { { nx1, nx2 },
+                      { { 0.0, 55.0 }, { 0.0, 55.0 } },
+                      {} };
+
+    // Particle setup (mirrors deposit.cpp).
+    const int      i0 = 25, j0 = 21, i0f = 24, j0f = 20;
+    const real_t   uz = 2.5;
+    const prtldx_t dxi = static_cast<prtldx_t>(0.65);
+    const prtldx_t dxf = static_cast<prtldx_t>(0.99);
+    const prtldx_t dyi = static_cast<prtldx_t>(0.65);
+    const prtldx_t dyf = static_cast<prtldx_t>(0.80);
+
+    array_t<int*>      i1 { "i1", 10 };
+    array_t<int*>      i2 { "i2", 10 };
+    array_t<int*>      i3 { "i3", 10 };
+    array_t<int*>      i1_prev { "i1_prev", 10 };
+    array_t<int*>      i2_prev { "i2_prev", 10 };
+    array_t<int*>      i3_prev { "i3_prev", 10 };
+    array_t<prtldx_t*> dx1 { "dx1", 10 };
+    array_t<prtldx_t*> dx2 { "dx2", 10 };
+    array_t<prtldx_t*> dx3 { "dx3", 10 };
+    array_t<prtldx_t*> dx1_prev { "dx1_prev", 10 };
+    array_t<prtldx_t*> dx2_prev { "dx2_prev", 10 };
+    array_t<prtldx_t*> dx3_prev { "dx3_prev", 10 };
+    array_t<real_t*>   ux1 { "ux1", 10 };
+    array_t<real_t*>   ux2 { "ux2", 10 };
+    array_t<real_t*>   ux3 { "ux3", 10 };
+    array_t<real_t*>   phi { "phi", 10 };
+    array_t<real_t*>   weight { "weight", 10 };
+    array_t<short*>    tag { "tag", 10 };
+    const real_t       charge = 1.0, dt = 1.0;
+
+    put_value<int>(i1, i0f, 0);
+    put_value<int>(i2, j0f, 0);
+    put_value<int>(i1_prev, i0, 0);
+    put_value<int>(i2_prev, j0, 0);
+    put_value<prtldx_t>(dx1, dxf, 0);
+    put_value<prtldx_t>(dx2, dyf, 0);
+    put_value<prtldx_t>(dx1_prev, dxi, 0);
+    put_value<prtldx_t>(dx2_prev, dyi, 0);
+    put_value<real_t>(ux1, ZERO, 0);
+    put_value<real_t>(ux2, ZERO, 0);
+    put_value<real_t>(ux3, uz, 0);
+    put_value<real_t>(weight, 1.0, 0);
+    put_value<short>(tag, ParticleTag::alive, 0);
+
+    // Run the flat kernel.
+    ndfield_t<Dim::_2D, 3> J_flat { "J_flat",
+                                    nx1 + 2u * N_GHOSTS,
+                                    nx2 + 2u * N_GHOSTS };
+    {
+      auto J_scat = Kokkos::Experimental::create_scatter_view(J_flat);
+      Kokkos::parallel_for(
+        "FlatDeposit",
+        10,
+        kernel::DepositCurrents_kernel<SimEngine::SRPIC, metric_t, O>(
+          J_scat,
+          i1, i2, i3,
+          i1_prev, i2_prev, i3_prev,
+          dx1, dx2, dx3,
+          dx1_prev, dx2_prev, dx3_prev,
+          ux1, ux2, ux3,
+          phi, weight, tag,
+          metric, charge, dt));
+      Kokkos::Experimental::contribute(J_flat, J_scat);
+      Kokkos::fence("flat deposit done");
+    }
+
+    // Run the tiled kernel. Build a TileLayout with one alive particle
+    // landing in its expected tile (sort key = min(i, i_prev) / T_TILE).
+    ndfield_t<Dim::_2D, 3> J_tiled { "J_tiled",
+                                     nx1 + 2u * N_GHOSTS,
+                                     nx2 + 2u * N_GHOSTS };
+    {
+      const auto sort_i1 = static_cast<int>(
+        (i0 < i0f) ? i0 : i0f); // min(i, i_prev) before clamp
+      const auto sort_i2 = static_cast<int>((j0 < j0f) ? j0 : j0f);
+      const auto ntx1    = static_cast<ncells_t>(
+        std::ceil(static_cast<double>(nx1) / static_cast<double>(T_TILE)));
+      const auto ntx2 = static_cast<ncells_t>(
+        std::ceil(static_cast<double>(nx2) / static_cast<double>(T_TILE)));
+      const auto tx1 = static_cast<ncells_t>(sort_i1) / T_TILE;
+      const auto tx2 = static_cast<ncells_t>(sort_i2) / T_TILE;
+
+      TileLayout<Dim::_2D> layout;
+      layout.ntiles_per_axis[0] = ntx1;
+      layout.ntiles_per_axis[1] = ntx2;
+      layout.ntiles_per_axis[2] = 1u;
+      layout.ntiles_total       = ntx1 * ntx2;
+      layout.tile_size          = T_TILE;
+      layout.tile_offsets       = build_tile_offsets_single_particle(ntx1,
+                                                                      ntx2,
+                                                                      tx1,
+                                                                      tx2);
+
+      using kernel_t =
+        kernel::DepositCurrents_kernel_tiled<SimEngine::SRPIC, metric_t, O, T_TILE>;
+      kernel_t kern { J_tiled,
+                      i1, i2, i3,
+                      i1_prev, i2_prev, i3_prev,
+                      dx1, dx2, dx3,
+                      dx1_prev, dx2_prev, dx3_prev,
+                      ux1, ux2, ux3,
+                      phi, weight, tag,
+                      metric, charge, dt, layout };
+
+      Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
+                                  Kokkos::AUTO);
+      policy.set_scratch_size(0,
+                              Kokkos::PerTeam(kernel_t::scratch_bytes()));
+      Kokkos::parallel_for("TiledDeposit", policy, kern);
+      Kokkos::fence("tiled deposit done");
+    }
+
+    // Compare J_flat vs J_tiled cell-by-cell.
+    auto h_flat  = Kokkos::create_mirror_view(J_flat);
+    auto h_tiled = Kokkos::create_mirror_view(J_tiled);
+    Kokkos::deep_copy(h_flat, J_flat);
+    Kokkos::deep_copy(h_tiled, J_tiled);
+
+    const real_t eps      = static_cast<real_t>(1.0e-5);
+    real_t       max_diff = ZERO;
+    int          fail_count = 0;
+    for (ncells_t i = 0; i < h_flat.extent(0); ++i) {
+      for (ncells_t j = 0; j < h_flat.extent(1); ++j) {
+        for (int c = 0; c < 3; ++c) {
+          const real_t a    = h_flat(i, j, c);
+          const real_t b    = h_tiled(i, j, c);
+          const real_t diff = math::fabs(a - b);
+          const real_t mag  = math::max(math::fabs(a), math::fabs(b));
+          if (diff > max_diff) {
+            max_diff = diff;
+          }
+          if (diff > eps * math::max(mag, static_cast<real_t>(1.0))) {
+            if (fail_count < 5) {
+              std::cerr << "  J(" << i << "," << j << ",c=" << c
+                        << ") flat=" << a << " tiled=" << b
+                        << " diff=" << diff << '\n';
+            }
+            ++fail_count;
+          }
+        }
+      }
+    }
+    if (fail_count > 0) {
+      std::cerr << "X-1 deposit_tiled equivalence FAILED for O=" << O
+                << " T_TILE=" << T_TILE
+                << " : " << fail_count << " mismatches; max_diff=" << max_diff
+                << '\n';
+      throw std::logic_error("DepositCurrents_kernel_tiled mismatch");
+    }
+    std::cerr << "X-1 deposit_tiled OK  O=" << O << " T_TILE=" << T_TILE
+              << "  max_diff=" << max_diff << '\n';
+  }
+
+  template <unsigned short T_TILE>
+  void run_all_orders() {
+    run_one_case<0u, T_TILE>();
+    run_one_case<1u, T_TILE>();
+    run_one_case<2u, T_TILE>();
+    run_one_case<3u, T_TILE>();
+    run_one_case<4u, T_TILE>();
+    run_one_case<5u, T_TILE>();
+    run_one_case<6u, T_TILE>();
+    run_one_case<7u, T_TILE>();
+    run_one_case<8u, T_TILE>();
+    run_one_case<9u, T_TILE>();
+    run_one_case<10u, T_TILE>();
+    run_one_case<11u, T_TILE>();
+  }
+
+} // namespace
+
+auto main(int argc, char* argv[]) -> int {
+  Kokkos::initialize(argc, argv);
+  try {
+    // Run with each tile-size choice from the validated CMake list.
+    run_all_orders<4u>();
+    run_all_orders<8u>();
+    run_all_orders<12u>();
+  } catch (std::exception& e) {
+    std::cerr << e.what() << '\n';
+    Kokkos::finalize();
+    return 1;
+  }
+  Kokkos::finalize();
+  return 0;
+}

From 04d403c606147e0d651cac28f930d5592f808c84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= <ludwig.boess@outlook.de>
Date: Tue, 19 May 2026 19:25:38 +0200
Subject: [PATCH 2/7] removed redundant inner loop over species

---
 src/framework/domain/metadomain_sort.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/framework/domain/metadomain_sort.cpp b/src/framework/domain/metadomain_sort.cpp
index 791bf31a8..a2b08f8be 100644
--- a/src/framework/domain/metadomain_sort.cpp
+++ b/src/framework/domain/metadomain_sort.cpp
@@ -21,9 +21,7 @@ namespace ntt {
       const auto clearing_interval = species.clearing_interval();
       if ((clearing_interval > 0u) and (step % clearing_interval == 0u) and
           (step > 0u)) {
-        for (auto& species : domain.species) {
-          species.RemoveDead();
-        }
+        species.RemoveDead();
       }
       const auto spatial_sorting_interval = species.spatial_sorting_interval();
       if ((spatial_sorting_interval > 0u) and

From 5830c0740e43851f5d26afb8e35438160be8a007 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= <ludwig.boess@outlook.de>
Date: Tue, 19 May 2026 19:26:11 +0200
Subject: [PATCH 3/7] frontier-specific memory pool allocation

---
 src/global/global.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/src/global/global.cpp b/src/global/global.cpp
index ec22fd2f3..c3aa1bcdc 100644
--- a/src/global/global.cpp
+++ b/src/global/global.cpp
@@ -6,8 +6,51 @@
   #include <mpi.h>
 #endif // MPI_ENABLED
 
+#if defined(HIP_ENABLED)
+  #include <hip/hip_runtime.h>
+
+  #include <cstdint>
+#endif // HIP_ENABLED
+
+namespace {
+#if defined(HIP_ENABLED)
+  // Turn the ROCm stream-ordered allocator into a caching arena.
+  //
+  // This Kokkos build uses hipMallocAsync/hipFreeAsync (Kokkos option
+  // IMPL_HIP_MALLOC_ASYNC). The default memory pool has a release
+  // threshold of 0, so every freed block is handed back to the driver
+  // at the next stream sync. With ~50 GB of particle SoA permanently
+  // pinned and only ~14 GB free, the per-step churn of dozens of
+  // large, differently-sized sort/comm scratch buffers fragments that
+  // free space: allocation cost grows monotonically (ParticleSort
+  // slowdown) until no contiguous mid-size block remains and BinSort's
+  // `sorted_values` allocation fails (OOM). Raising the release
+  // threshold to "unlimited" makes the pool retain and recycle freed
+  // blocks instead, which stabilizes the working set and removes both
+  // the slowdown and the OOM.
+  void ConfigureHipMemPool() {
+    int device = 0;
+    if (hipGetDevice(&device) != hipSuccess) {
+      return;
+    }
+    hipMemPool_t pool = nullptr;
+    if (hipDeviceGetDefaultMemPool(&pool, device) != hipSuccess or
+        pool == nullptr) {
+      return;
+    }
+    uint64_t threshold = UINT64_MAX;
+    (void)hipMemPoolSetAttribute(pool,
+                                 hipMemPoolAttrReleaseThreshold,
+                                 &threshold);
+  }
+#endif // HIP_ENABLED
+} // namespace
+
 void ntt::GlobalInitialize(int argc, char* argv[]) {
   Kokkos::initialize(argc, argv);
+#if defined(HIP_ENABLED)
+  ConfigureHipMemPool();
+#endif // HIP_ENABLED
 #if defined(MPI_ENABLED)
   MPI_Init(&argc, &argv);
 #endif // MPI_ENABLED

From cf14e9f11891afc732bfb9d289b2c16c53af770c Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Tue, 26 May 2026 17:06:27 +0000
Subject: [PATCH 4/7] support more tile sizes

---
 CMakeLists.txt                  | 2 +-
 tests/kernels/deposit_tiled.cpp | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 248a34e7b..06494a7db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,7 +65,7 @@ set(team_policy_tile_size
   ${default_team_policy_tile_size}
   CACHE STRING "team_policy tile edge length in cells")
 set(team_policy_tile_sizes
-  "4;6;8;10;12"
+  "4;6;8;10;12;14;16"
   CACHE STRING "team_policy tile-size choices")
 
 # -------------------------- Compilation settings -------------------------- #
diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp
index 87f4f3f4d..504cc3818 100644
--- a/tests/kernels/deposit_tiled.cpp
+++ b/tests/kernels/deposit_tiled.cpp
@@ -250,8 +250,12 @@ auto main(int argc, char* argv[]) -> int {
   try {
     // Run with each tile-size choice from the validated CMake list.
     run_all_orders<4u>();
+    run_all_orders<6u>();
     run_all_orders<8u>();
+    run_all_orders<10u>();
     run_all_orders<12u>();
+    run_all_orders<14u>();
+    run_all_orders<16u>();
   } catch (std::exception& e) {
     std::cerr << e.what() << '\n';
     Kokkos::finalize();

From 787aa045750fa2c582b54d02c5e8d3f4265de1e0 Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Tue, 26 May 2026 17:12:06 +0000
Subject: [PATCH 5/7] removed persistent sort scratch to reduce memory overhead

---
 src/framework/containers/particles.h        |  16 ---
 src/framework/containers/particles_sort.cpp | 111 +++++++-------------
 2 files changed, 38 insertions(+), 89 deletions(-)

diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h
index 0a15cb5d9..4f0770729 100644
--- a/src/framework/containers/particles.h
+++ b/src/framework/containers/particles.h
@@ -99,22 +99,6 @@ namespace ntt {
     // vendor libraries detected by CMake.
     TileLayout<D> m_tile_layout {};
 
-#if defined(TEAM_POLICY) &&                                                    \
-  ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) ||                       \
-   (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) ||                       \
-   (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)))
-    // Persistent byte scratch reused by every SoA-member gather in
-    // `apply_permutation_to_soa`, across all members and all timesteps.
-    // Without this each member would allocate (and free) its own
-    // transient buffer every sort; recycling one persistent buffer
-    // removes that allocation churn entirely — the structural fix for
-    // the ROCm sort slowdown / fragmentation. Grown monotonically to
-    // the largest required size, never shrunk. Kokkos device
-    // allocations are over-aligned (>= 8 B), so reinterpreting the
-    // bytes as any SoA element type (<= 8 B PODs) is well-defined.
-    array_t<char*> m_perm_scratch {};
-#endif
-
   public:
     // for empty allocation
     Particles() {}
diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp
index c04813e12..0317e0b5f 100644
--- a/src/framework/containers/particles_sort.cpp
+++ b/src/framework/containers/particles_sort.cpp
@@ -454,28 +454,20 @@ namespace ntt {
 #if defined(TEAM_POLICY_USE_VENDOR_SORT)
   namespace permute_helpers {
 
-    // Permute a 1D SoA member array `arr` in place by `perm`, gathering
-    // through `scratch` — a persistent byte buffer reused by every
-    // member and every timestep (no per-call allocation). An unmanaged
-    // typed view aliases the scratch bytes; the caller guarantees
-    // `scratch` is large enough and that Kokkos' device over-alignment
-    // covers the element type.
+    // Permute a 1D SoA member array `arr` in place by `perm`, using a
+    // single transient buffer of size `n`. Buffer is freed at scope
+    // exit; the explicit fence right before that drains queued GPU
+    // work referencing it.
     template <typename V>
-    inline void permute_1d_inplace(V&                    arr,
-                                   const prtl_perm_t&    perm,
-                                   npart_t               n,
-                                   const array_t<char*>& scratch) {
+    inline void permute_1d_inplace(V&                 arr,
+                                   const prtl_perm_t& perm,
+                                   npart_t            n) {
       if (n == 0u) {
         return;
       }
-      using value_t = typename V::non_const_value_type;
-      using buf_t   = Kokkos::View<value_t*,
-                                 typename V::array_layout,
-                                 typename V::memory_space,
-                                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-      buf_t buf(reinterpret_cast<value_t*>(scratch.data()), n);
-      auto  perm_v = perm;
-      auto  arr_v  = arr;
+      V    buf(std::string(arr.label()) + "_perm_buf", n);
+      auto perm_v = perm;
+      auto arr_v  = arr;
       Kokkos::parallel_for(
         "Permute1D",
         n,
@@ -486,22 +478,16 @@ namespace ntt {
 
     // 2D analogue for `pld_r` / `pld_i`.
     template <typename V>
-    inline void permute_2d_inplace(V&                    arr,
-                                   const prtl_perm_t&    perm,
-                                   npart_t               n,
-                                   npart_t               ncols,
-                                   const array_t<char*>& scratch) {
+    inline void permute_2d_inplace(V&                 arr,
+                                   const prtl_perm_t& perm,
+                                   npart_t            n,
+                                   npart_t            ncols) {
       if (n == 0u or ncols == 0u) {
         return;
       }
-      using value_t = typename V::non_const_value_type;
-      using buf_t   = Kokkos::View<value_t**,
-                                 typename V::array_layout,
-                                 typename V::memory_space,
-                                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-      buf_t buf(reinterpret_cast<value_t*>(scratch.data()), n, ncols);
-      auto  perm_v = perm;
-      auto  arr_v  = arr;
+      V    buf(std::string(arr.label()) + "_perm_buf", n, ncols);
+      auto perm_v = perm;
+      auto arr_v  = arr;
       Kokkos::parallel_for(
         "Permute2D",
         CreateParticleRangePolicy<Dim::_2D>({ 0u, 0u }, { n, ncols }),
@@ -522,61 +508,40 @@ namespace ntt {
       return;
     }
 
-    // Size the persistent scratch once to the largest gather any member
-    // needs this call: 1D members need n * sizeof(real_t) bytes (the
-    // widest element); the 2D payloads need n * ncols * elem bytes.
-    // Grown monotonically, never shrunk — so after warmup this incurs
-    // no allocation at all.
-    std::size_t need = static_cast<std::size_t>(n) * sizeof(real_t);
-    if (npld_r() > 0) {
-      need = std::max(need,
-                      static_cast<std::size_t>(n) *
-                        static_cast<std::size_t>(npld_r()) * sizeof(real_t));
-    }
-    if (npld_i() > 0) {
-      need = std::max(need,
-                      static_cast<std::size_t>(n) *
-                        static_cast<std::size_t>(npld_i()) * sizeof(npart_t));
-    }
-    if (m_perm_scratch.extent(0) < need) {
-      m_perm_scratch = array_t<char*> { "perm_scratch", need };
-    }
-    const auto& scratch = m_perm_scratch;
-
     using permute_helpers::permute_1d_inplace;
     using permute_helpers::permute_2d_inplace;
 
     if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) {
-      permute_1d_inplace(i1, perm, n, scratch);
-      permute_1d_inplace(dx1, perm, n, scratch);
-      permute_1d_inplace(i1_prev, perm, n, scratch);
-      permute_1d_inplace(dx1_prev, perm, n, scratch);
+      permute_1d_inplace(i1, perm, n);
+      permute_1d_inplace(dx1, perm, n);
+      permute_1d_inplace(i1_prev, perm, n);
+      permute_1d_inplace(dx1_prev, perm, n);
     }
     if constexpr (D == Dim::_2D or D == Dim::_3D) {
-      permute_1d_inplace(i2, perm, n, scratch);
-      permute_1d_inplace(dx2, perm, n, scratch);
-      permute_1d_inplace(i2_prev, perm, n, scratch);
-      permute_1d_inplace(dx2_prev, perm, n, scratch);
+      permute_1d_inplace(i2, perm, n);
+      permute_1d_inplace(dx2, perm, n);
+      permute_1d_inplace(i2_prev, perm, n);
+      permute_1d_inplace(dx2_prev, perm, n);
     }
     if constexpr (D == Dim::_3D) {
-      permute_1d_inplace(i3, perm, n, scratch);
-      permute_1d_inplace(dx3, perm, n, scratch);
-      permute_1d_inplace(i3_prev, perm, n, scratch);
-      permute_1d_inplace(dx3_prev, perm, n, scratch);
-    }
-    permute_1d_inplace(ux1, perm, n, scratch);
-    permute_1d_inplace(ux2, perm, n, scratch);
-    permute_1d_inplace(ux3, perm, n, scratch);
-    permute_1d_inplace(weight, perm, n, scratch);
-    permute_1d_inplace(tag, perm, n, scratch);
+      permute_1d_inplace(i3, perm, n);
+      permute_1d_inplace(dx3, perm, n);
+      permute_1d_inplace(i3_prev, perm, n);
+      permute_1d_inplace(dx3_prev, perm, n);
+    }
+    permute_1d_inplace(ux1, perm, n);
+    permute_1d_inplace(ux2, perm, n);
+    permute_1d_inplace(ux3, perm, n);
+    permute_1d_inplace(weight, perm, n);
+    permute_1d_inplace(tag, perm, n);
     if constexpr (D == Dim::_2D and C != Coord::Cartesian) {
-      permute_1d_inplace(phi, perm, n, scratch);
+      permute_1d_inplace(phi, perm, n);
     }
     if (npld_r() > 0) {
-      permute_2d_inplace(pld_r, perm, n, static_cast<npart_t>(npld_r()), scratch);
+      permute_2d_inplace(pld_r, perm, n, static_cast<npart_t>(npld_r()));
     }
     if (npld_i() > 0) {
-      permute_2d_inplace(pld_i, perm, n, static_cast<npart_t>(npld_i()), scratch);
+      permute_2d_inplace(pld_i, perm, n, static_cast<npart_t>(npld_i()));
     }
   }
 #endif // TEAM_POLICY_USE_VENDOR_SORT

From 827cf264c2903b3179252c88f19c0a7bb090357d Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Tue, 2 Jun 2026 18:20:33 +0000
Subject: [PATCH 6/7] added team policy reporting

---
 src/engines/reporter.cpp      | 3 +++
 src/global/utils/reporter.cpp | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/src/engines/reporter.cpp b/src/engines/reporter.cpp
index f56874d23..3caeda5de 100644
--- a/src/engines/reporter.cpp
+++ b/src/engines/reporter.cpp
@@ -32,6 +32,9 @@ namespace ntt {
                        "%s",
                        params.template get<std::string>("simulation.name").c_str());
     reporter::AddParam(report, 4, "Engine", "%s", SimEngine(S).to_string());
+#if defined(TEAM_POLICY)
+    reporter::AddParam(report, 4, "Tile size", "%d", TEAM_POLICY_TILE_SIZE);
+#endif
     reporter::AddParam(report, 4, "Metric", "%s", M.to_string());
 #if SHAPE_ORDER == 0
     reporter::AddParam(report, 4, "Deposit", "%s", "zigzag");
diff --git a/src/global/utils/reporter.cpp b/src/global/utils/reporter.cpp
index 77117c4b9..a4b10eee6 100644
--- a/src/global/utils/reporter.cpp
+++ b/src/global/utils/reporter.cpp
@@ -250,6 +250,12 @@ namespace reporter {
 #else
     AddParam(report, 4, "GPU_AWARE_MPI", "%s", "OFF");
 #endif
+
+#if defined(TEAM_POLICY)
+    AddParam(report, 4, "TEAM_POLICY", "%s", "ON");
+#else
+    AddParam(report, 4, "TEAM_POLICY", "%s", "OFF");
+#endif
     report += "\n";
     return report;
   }

From 4b81914aae5cbc15cea19217b284113f617750f3 Mon Sep 17 00:00:00 2001
From: LudwigBoess <ludwig.boess@outlook.de>
Date: Tue, 2 Jun 2026 19:08:47 +0000
Subject: [PATCH 7/7] explicitly bind GPU Transport Layer for GPU aware MPI on
 Frontier

---
 CMakeLists.txt | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 06494a7db..0137671a0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,6 +211,62 @@ if(${mpi})
   if(${DEVICE_ENABLED})
     if(${gpu_aware_mpi})
       add_compile_options("-D GPU_AWARE_MPI")
+
+      # On Cray systems (e.g. Frontier) GPU-aware Cray MPICH can only
+      # handle device pointers if the GPU Transport Layer (GTL) library
+      # is linked. The Cray compiler wrappers (cc/CC) inject this
+      # automatically, but we build with hipcc/nvcc directly, so
+      # find_package(MPI) only finds base libmpi and the GTL is left
+      # out -> MPI_Sendrecv on a device pointer fails with
+      # "OFI ... Bad address". Add it explicitly here.
+      #
+      # Cray PE exports PE_MPICH_GTL_DIR_<accel> / PE_MPICH_GTL_LIBS_<accel>
+      # (e.g. amd_gfx90a -> -lmpi_gtl_hsa). Their absence means this is
+      # not a Cray MPICH build, in which case nothing extra is needed.
+      if("${Kokkos_DEVICES}" MATCHES "HIP")
+        set(_gtl_accels amd_gfx942 amd_gfx940 amd_gfx90a amd_gfx908 amd_gfx906)
+      elseif("${Kokkos_DEVICES}" MATCHES "CUDA")
+        set(_gtl_accels nvidia90 nvidia80 nvidia70)
+      elseif("${Kokkos_DEVICES}" MATCHES "SYCL")
+        set(_gtl_accels ponteVecchio)
+      else()
+        set(_gtl_accels "")
+      endif()
+
+      set(_gtl_dir "")
+      set(_gtl_libflag "")
+      foreach(_accel ${_gtl_accels})
+        if((NOT _gtl_dir) AND (DEFINED ENV{PE_MPICH_GTL_DIR_${_accel}}))
+          # strip the leading "-L" from the Cray-provided value
+          string(REGEX REPLACE "^-L" ""
+                 _gtl_dir "$ENV{PE_MPICH_GTL_DIR_${_accel}}")
+          string(REGEX REPLACE "^-l" ""
+                 _gtl_libflag "$ENV{PE_MPICH_GTL_LIBS_${_accel}}")
+        endif()
+      endforeach()
+
+      if(_gtl_dir AND _gtl_libflag)
+        find_library(MPI_GTL_LIBRARY
+          NAMES ${_gtl_libflag}
+          HINTS "${_gtl_dir}"
+          NO_DEFAULT_PATH)
+        if(MPI_GTL_LIBRARY)
+          message(STATUS
+            "GPU-aware MPI: linking Cray GTL library ${MPI_GTL_LIBRARY}")
+          set(DEPENDENCIES ${DEPENDENCIES} ${MPI_GTL_LIBRARY})
+        else()
+          message(FATAL_ERROR
+            "${Red}gpu_aware_mpi=ON: Cray MPICH detected but the GTL "
+            "library 'lib${_gtl_libflag}' was not found in '${_gtl_dir}'. "
+            "GPU-aware MPI will crash at runtime without it. Make sure the "
+            "craype-accel module is loaded, or build with gpu_aware_mpi=OFF."
+            "${ColorReset}")
+        endif()
+      else()
+        message(STATUS
+          "GPU-aware MPI: no Cray GTL environment found; assuming the MPI "
+          "implementation is GPU-aware without an extra transport library.")
+      endif()
     endif()
   else()
     set(gpu_aware_mpi