From 00fa0bf65a35c111c94dc8924ee4e6eda8285629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= Date: Tue, 19 May 2026 19:25:22 +0200 Subject: [PATCH 1/7] port: team policy and vendor specific sort --- CMakeLists.txt | 66 + cmake/defaults.cmake | 15 + cmake/report.cmake | 25 + src/engines/srpic/currents.h | 135 +- src/framework/containers/particles.h | 58 + src/framework/containers/particles_sort.cpp | 341 ++++- src/global/arch/kokkos_aliases.h | 27 + src/global/utils/sort_dispatch.h | 171 +++ src/global/utils/sorting.h | 155 ++- src/kernels/currents_deposit.hpp | 1395 ++++++++++++------- tests/framework/CMakeLists.txt | 6 + tests/framework/sort_by_key.cpp | 110 ++ tests/global/tiling.cpp | 29 +- tests/kernels/CMakeLists.txt | 3 + tests/kernels/deposit_tiled.cpp | 262 ++++ 15 files changed, 2286 insertions(+), 512 deletions(-) create mode 100644 src/global/utils/sort_dispatch.h create mode 100644 tests/framework/sort_by_key.cpp create mode 100644 tests/kernels/deposit_tiled.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index cd5f0c258..acac1d7d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,16 @@ set(gpu_aware_mpi ${default_gpu_aware_mpi} CACHE BOOL "Enable GPU-aware MPI") +set(team_policy + ${default_team_policy} + CACHE BOOL "Enable team_policy tile-blocked deposit/pusher kernels") +set(team_policy_tile_size + ${default_team_policy_tile_size} + CACHE STRING "team_policy tile edge length in cells") +set(team_policy_tile_sizes + "4;6;8;10;12" + CACHE STRING "team_policy tile-size choices") + # -------------------------- Compilation settings -------------------------- # set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -136,6 +146,62 @@ else() set(DEVICE_ENABLED OFF) endif() +# ------------------------------ team_policy wiring ------------------------ # +if(${team_policy}) + list(FIND team_policy_tile_sizes "${team_policy_tile_size}" _tps_idx) + if(_tps_idx EQUAL -1) + message(FATAL_ERROR + "${Red}team_policy_tile_size must be one of ${team_policy_tile_sizes}, " + "got '${team_policy_tile_size}'${ColorReset}") + endif() + add_compile_options("-D TEAM_POLICY") + add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}") + + # Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically + # when found; falls back to Kokkos::BinSort otherwise. + if("${Kokkos_DEVICES}" MATCHES "SYCL") + find_package(oneDPL QUIET) + if(oneDPL_FOUND) + message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key") + add_compile_options("-D ONEDPL_ENABLED") + set(DEPENDENCIES ${DEPENDENCIES} oneDPL) + else() + message(STATUS "team_policy: oneDPL not found; using BinSort fallback " + "for SYCL sort_by_key") + endif() + endif() + + if("${Kokkos_DEVICES}" MATCHES "CUDA") + find_package(Thrust QUIET) + if(Thrust_FOUND) + message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key") + add_compile_options("-D THRUST_ENABLED") + else() + message(STATUS "team_policy: Thrust not found; using BinSort fallback " + "for CUDA sort_by_key") + endif() + endif() + + if("${Kokkos_DEVICES}" MATCHES "HIP") + # rocThrust ships with ROCm and exposes the same thrust:: API. Using + # it lets the HIP backend build a single permutation via + # sort_by_key and gather all SoA members through one reused buffer, + # instead of the legacy per-member Kokkos::BinSort path which + # allocates a fresh `sorted_values` buffer for every member every + # step (the dominant source of allocator churn / fragmentation on + # ROCm). + find_package(rocthrust QUIET) + if(rocthrust_FOUND) + message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key") + add_compile_options("-D ROCTHRUST_ENABLED") + set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust) + else() + message(STATUS "team_policy: rocThrust not found; using BinSort " + "fallback for HIP sort_by_key") + endif() + endif() +endif() + # MPI if(${mpi}) find_or_fetch_dependency(MPI FALSE REQUIRED) diff --git a/cmake/defaults.cmake b/cmake/defaults.cmake index fb8790019..a85accf84 100644 --- a/cmake/defaults.cmake +++ b/cmake/defaults.cmake @@ -92,3 +92,18 @@ else() endif() set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL) + +if(DEFINED ENV{Entity_ENABLE_TEAM_POLICY}) + set(default_team_policy + $ENV{Entity_ENABLE_TEAM_POLICY} + CACHE INTERNAL "Default flag for team_policy tile-blocked kernels") +else() + set(default_team_policy + OFF + CACHE INTERNAL "Default flag for team_policy tile-blocked kernels") +endif() +set_property(CACHE default_team_policy PROPERTY TYPE BOOL) + +set(default_team_policy_tile_size + 8 + CACHE INTERNAL "Default tile edge length in cells for team_policy") diff --git a/cmake/report.cmake b/cmake/report.cmake index 7b145bfc7..65a22a7a6 100644 --- a/cmake/report.cmake +++ b/cmake/report.cmake @@ -122,6 +122,26 @@ if(${mpi} AND ${DEVICE_ENABLED}) GPU_AWARE_MPI_REPORT 46) endif() +printchoices( + "Team Policy" + "team_policy" + "${ON_OFF_VALUES}" + ${team_policy} + OFF + "${Green}" + TEAM_POLICY_REPORT + 46) +if(${team_policy}) + printchoices( + "Team Tile Size" + "team_policy_tile_size" + "${team_policy_tile_sizes}" + ${team_policy_tile_size} + ${default_team_policy_tile_size} + "${Blue}" + TEAM_POLICY_TILE_SIZE_REPORT + 46) +endif() printchoices( "Debug mode" "DEBUG" @@ -197,6 +217,11 @@ if(${mpi} AND ${DEVICE_ENABLED}) string(APPEND REPORT_TEXT " " ${GPU_AWARE_MPI_REPORT} "\n") endif() +string(APPEND REPORT_TEXT " " ${TEAM_POLICY_REPORT} "\n") +if(${team_policy}) + string(APPEND REPORT_TEXT " " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n") +endif() + string( APPEND REPORT_TEXT diff --git a/src/engines/srpic/currents.h b/src/engines/srpic/currents.h index 3afabea8a..faf0bb3ad 100644 --- a/src/engines/srpic/currents.h +++ b/src/engines/srpic/currents.h @@ -2,7 +2,8 @@ * @file engines/srpic/currents.h * @brief Current deposition and filtering routines for the SRPIC engine * @implements - * - ntt::srpic::CallDepositKernel<> -> void + * - ntt::srpic::CallDepositKernel<> -> void (flat path) + * - ntt::srpic::CallDepositKernelTiled<> -> void (TEAM_POLICY) * - ntt::srpic::CurrentsDeposit<> -> void * - ntt::srpic::CurrentsFilter<> -> void * @namespaces: @@ -61,11 +62,142 @@ namespace ntt { dt)); } +#if defined(TEAM_POLICY) + /** + * @brief Tiled deposit launcher (TeamPolicy + per-team scratch). + * + * Iterates over `tile_layout.ntiles_total` teams; each team accumulates + * its tile's particle contributions in SLM scratch and atomically + * flushes to the global J. Requires the species to have been sorted + * with `team_policy` enabled (`tile_layout` populated by + * `SortSpatially`). + * + * Falls back to the flat kernel if `tile_offsets` is empty — this + * happens on the first step before the first sort, or for very small + * species that exited early in `SortSpatially`. The fallback uses the + * passed-in `scatter_cur` so the caller still composes correctly. + */ + template + void CallDepositKernelTiled( + const Particles& species, + const M& local_metric, + const ndfield_t& cur, + real_t dt) { + static_assert(O <= 11u, "Shape order must be <= 11"); + constexpr unsigned short T = static_cast( + TEAM_POLICY_TILE_SIZE); + const auto& layout = species.tile_layout(); + raise::ErrorIf(layout.ntiles_total == 0u, + "CallDepositKernelTiled: tile_layout has 0 tiles — call " + "SortSpatially before CurrentsDeposit", + HERE); + raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u, + "CallDepositKernelTiled: tile_offsets size inconsistent " + "with ntiles_total", + HERE); + + using kernel_t = kernel::DepositCurrents_kernel_tiled; + kernel_t kern { cur, + species.i1, + species.i2, + species.i3, + species.i1_prev, + species.i2_prev, + species.i3_prev, + species.dx1, + species.dx2, + species.dx3, + species.dx1_prev, + species.dx2_prev, + species.dx3_prev, + species.ux1, + species.ux2, + species.ux3, + species.phi, + species.weight, + species.tag, + local_metric, + (real_t)(species.charge()), + dt, + layout }; + + Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), + Kokkos::AUTO); + policy.set_scratch_size(0, Kokkos::PerTeam(kernel_t::scratch_bytes())); + Kokkos::parallel_for("CurrentsDepositTiled", policy, kern); + } +#endif // TEAM_POLICY + template void CurrentsDeposit(Domain& domain, const prm::Parameters& engine_params) { const auto dt = engine_params.get("dt"); Kokkos::deep_copy(domain.fields.cur, ZERO); + +#if defined(TEAM_POLICY) + + // First-step fallback: if any contributing species has not been + // sorted yet (tile_layout still empty), fall back to the flat + // scatter-view path for that step. Subsequent steps see populated + // layouts and use the tiled kernel. + bool any_unsorted = false; + for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } + if (species.tile_layout().ntiles_total == 0u or + species.tile_layout().tile_offsets.extent(0) == 0u) { + any_unsorted = true; + break; + } + } + if (any_unsorted) { + auto scatter_cur = Kokkos::Experimental::create_scatter_view( + domain.fields.cur); + for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } + logger::Checkpoint( + fmt::format("Launching currents deposit (flat fallback, no sort yet) " + "for %d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), + HERE); + CallDepositKernel(species, + domain.mesh.metric, + scatter_cur, + dt); + } + Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur); + } else { + for (auto& species : domain.species) { + if ((species.pusher() == ParticlePusher::NONE) or + (species.npart() == 0) or cmp::AlmostZero_host(species.charge())) { + continue; + } + logger::Checkpoint( + fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f", + species.index(), + species.label().c_str(), + species.npart(), + (double)species.charge()), + HERE); + + CallDepositKernelTiled(species, + domain.mesh.metric, + domain.fields.cur, + dt); + } + } +#else auto scatter_cur = Kokkos::Experimental::create_scatter_view( domain.fields.cur); for (auto& species : domain.species) { @@ -84,6 +216,7 @@ namespace ntt { CallDepositKernel(species, domain.mesh.metric, scatter_cur, dt); } Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur); +#endif } template diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h index 895026552..0a15cb5d9 100644 --- a/src/framework/containers/particles.h +++ b/src/framework/containers/particles.h @@ -25,6 +25,7 @@ #include "traits/metric.h" #include "utils/error.h" #include "utils/formatting.h" +#include "utils/sorting.h" #include "framework/containers/species.h" #include "framework/domain/grid.h" @@ -90,6 +91,30 @@ namespace ntt { const uint8_t m_ntags { (uint8_t)(2 + math::pow(3, (int)D) - 1) }; #endif + // team_policy: tile metadata produced by SortSpatially + // and consumed by the tiled deposit / pusher kernels. Lazily + // allocated on first sort. The sort backend itself (oneDPL on SYCL, + // Thrust on CUDA, std::sort on Host, Kokkos::BinSort otherwise) is + // selected at compile time based on the Kokkos device and the + // vendor libraries detected by CMake. + TileLayout m_tile_layout {}; + +#if defined(TEAM_POLICY) && \ + ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ + (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ + (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))) + // Persistent byte scratch reused by every SoA-member gather in + // `apply_permutation_to_soa`, across all members and all timesteps. + // Without this each member would allocate (and free) its own + // transient buffer every sort; recycling one persistent buffer + // removes that allocation churn entirely — the structural fix for + // the ROCm sort slowdown / fragmentation. Grown monotonically to + // the largest required size, never shrunk. Kokkos device + // allocations are over-aligned (>= 8 B), so reinterpreting the + // bytes as any SoA element type (<= 8 B PODs) is well-defined. + array_t m_perm_scratch {}; +#endif + public: // for empty allocation Particles() {} @@ -276,9 +301,42 @@ namespace ntt { /** * @brief Sort particles spatially by their cell indices * @param grid The grid object to get the cell information for sorting + * @note In team_policy mode (compile-time `team_policy=ON`), also + * populates `m_tile_layout` with tile-offset and per-tile + * permutation metadata that the tiled deposit/pusher kernels + * consume. */ void SortSpatially(const Grid&); +#if defined(TEAM_POLICY) && \ + ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ + (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ + (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))) + private: + /** + * @brief Apply a particle-index permutation (built by oneDPL/Thrust + * sort_by_key) to every SoA member array. Sequential — one + * transient buffer at a time, fenced before scope exit. + * Only compiled when a vendor sort backend is enabled; the + * BinSort path applies the permutation in place via + * `sorter.sort(view)` instead. + */ + void apply_permutation_to_soa(const prtl_perm_t& perm); + + public: +#endif + + /** + * @brief Read-only access to the tile layout produced by the most + * recent SortSpatially call. Returns a default-constructed + * layout (`ntiles_total == 0`) when the species has not yet + * been sorted. + */ + [[nodiscard]] + auto tile_layout() const -> const TileLayout& { + return m_tile_layout; + } + /** * @brief Copy particle data from device to host. */ diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp index 904fb3fc7..c04813e12 100644 --- a/src/framework/containers/particles_sort.cpp +++ b/src/framework/containers/particles_sort.cpp @@ -8,10 +8,22 @@ #include "framework/containers/particles.h" #include "framework/domain/grid.h" +#if defined(TEAM_POLICY) + #if (defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ + (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ + (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED)) + #define TEAM_POLICY_USE_VENDOR_SORT + #include "utils/sort_dispatch.h" + #endif +#endif + #include #include +#include #include +#include +#include #include #include #include @@ -195,6 +207,192 @@ namespace ntt { template void Particles::SortSpatially(const Grid& grid) { +#if defined(TEAM_POLICY) + // ---------------------- team_policy: tile-based sort ------------------ // + const auto npart_local = npart(); + if (npart_local == 0u) { + m_tile_layout = TileLayout {}; + m_is_sorted = true; + return; + } + + constexpr unsigned short T = static_cast( + TEAM_POLICY_TILE_SIZE); + static_assert(T > 0u, "TEAM_POLICY_TILE_SIZE must be > 0"); + + // 1. Compute per-axis tile counts and total_tiles. + const auto ncells_active = grid.n_active(); + ncells_t ntx[3] { 1u, 1u, 1u }; + ncells_t total_tiles { 1u }; + if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) { + ntx[0] = static_cast(math::ceil( + static_cast(ncells_active[0]) / static_cast(T))); + total_tiles *= ntx[0]; + } + if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) { + ntx[1] = static_cast(math::ceil( + static_cast(ncells_active[1]) / static_cast(T))); + total_tiles *= ntx[1]; + } + if constexpr (D == Dim::_3D) { + ntx[2] = static_cast(math::ceil( + static_cast(ncells_active[2]) / static_cast(T))); + total_tiles *= ntx[2]; + } + + // 2. Compute per-particle tile key (with min(i, i_prev)). + array_t tile_indices { "tile_indices", npart_local }; + Kokkos::parallel_for( + "FillTileIndices", + rangeActiveParticles(), + sort::PositionToTileIndex { i1, + i2, + i3, + tag, + tile_indices, + ncells_active, + static_cast(T), + array_t {}, + i1_prev, + i2_prev, + i3_prev }); + + // 3. Sort. Vendor library (oneDPL/Thrust) when compiled in; + // Kokkos::BinSort otherwise. n_bins = total_tiles + 2 covers + // the dead-particle sentinel bin (total_tiles + 1u). + const ncells_t n_bins = total_tiles + 2u; + const auto slice = prtl_slice_t(0, npart_local); + #if defined(TEAM_POLICY_USE_VENDOR_SORT) + // Vendor path: produce an explicit permutation via sort_by_key, + // then apply it to each SoA member with a sequential one-buffer + // gather (peak transient = one `npart × sizeof(member)` buffer. + prtl_perm_t perm { "tile_perm", npart_local }; + #if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + sort_helpers::sort_by_key_dispatch(tile_indices, + perm, + n_bins, + sort::backend::OneDPL {}); + #elif defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) + sort_helpers::sort_by_key_dispatch(tile_indices, + perm, + n_bins, + sort::backend::Rocthrust {}); + #else + sort_helpers::sort_by_key_dispatch(tile_indices, + perm, + n_bins, + sort::backend::Thrust {}); + #endif + Kokkos::fence("SortSpatially: pre-gather drain"); + apply_permutation_to_soa(perm); + #else + // BinSort path: same mechanism as legacy SortSpatially (BinSort + // allocates one temp View per `sorter.sort(view)` call and frees + // it before the next), so peak transient memory is bounded. + using sorter_op_t = Kokkos::BinOp1D>; + using sorter_t = Kokkos::BinSort, sorter_op_t>; + auto bin_op = sorter_op_t { static_cast(n_bins), 0u, n_bins }; + auto sorter = sorter_t { tile_indices, bin_op, false }; + sorter.create_permute_vector(); + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { + sorter.sort(Kokkos::subview(i1, slice)); + sorter.sort(Kokkos::subview(i1_prev, slice)); + sorter.sort(Kokkos::subview(dx1, slice)); + sorter.sort(Kokkos::subview(dx1_prev, slice)); + } + if constexpr (D == Dim::_2D or D == Dim::_3D) { + sorter.sort(Kokkos::subview(i2, slice)); + sorter.sort(Kokkos::subview(i2_prev, slice)); + sorter.sort(Kokkos::subview(dx2, slice)); + sorter.sort(Kokkos::subview(dx2_prev, slice)); + } + if constexpr (D == Dim::_3D) { + sorter.sort(Kokkos::subview(i3, slice)); + sorter.sort(Kokkos::subview(i3_prev, slice)); + sorter.sort(Kokkos::subview(dx3, slice)); + sorter.sort(Kokkos::subview(dx3_prev, slice)); + } + sorter.sort(Kokkos::subview(ux1, slice)); + sorter.sort(Kokkos::subview(ux2, slice)); + sorter.sort(Kokkos::subview(ux3, slice)); + sorter.sort(Kokkos::subview(weight, slice)); + sorter.sort(Kokkos::subview(tag, slice)); + if constexpr (D == Dim::_2D and C != Coord::Cartesian) { + sorter.sort(Kokkos::subview(phi, slice)); + } + for (auto pldr { 0u }; pldr < npld_r(); ++pldr) { + sorter.sort(Kokkos::subview(pld_r, slice, pldr)); + } + for (auto pldi { 0u }; pldi < npld_i(); ++pldi) { + sorter.sort(Kokkos::subview(pld_i, slice, pldi)); + } + // Apply the same permutation to `tile_indices` itself so it ends + // monotonically non-decreasing for the offsets pass below. + sorter.sort(tile_indices); + #endif // TEAM_POLICY_USE_VENDOR_SORT + + // 5. Compute per-tile prefix-sum `tile_offsets` for the tiled + // pusher. `tile_indices` is now sorted (monotonically + // non-decreasing for alive particles, dead sentinel + // `total_tiles + 1` clustered at the end) — vendor sort_by_key + // sorts keys in place; the BinSort path explicitly applies the + // same permutation to `tile_indices` above. Transition-detect + // directly on it: the start of each non-empty tile is the only + // place a write happens — atomic-free in the dense branch. + // Empty tiles (no particles) are filled by a reverse pass on a + // small host mirror (`total_tiles ≈ 176K` at production scale → + // ~700 KB). + { + array_t tile_offsets { "tile_offsets", total_tiles + 1u }; + Kokkos::deep_copy(tile_offsets, static_cast(npart_local)); + + const auto total_tiles_v = total_tiles; + auto ti_v = tile_indices; + Kokkos::parallel_for( + "DetectTileBoundaries", + rangeActiveParticles(), + Lambda(prtlidx_t p) { + const auto t_curr = ti_v(p); + const bool boundary = (p == 0u) || (ti_v(p - 1u) != t_curr); + if (!boundary) { + return; + } + if (t_curr < total_tiles_v) { + tile_offsets(t_curr) = p; + } else { + // First dead particle — also marks the alive_count boundary + // stored at index total_tiles. + Kokkos::atomic_min(&tile_offsets(total_tiles_v), p); + } + }); + + auto h_offsets = Kokkos::create_mirror_view(tile_offsets); + Kokkos::deep_copy(h_offsets, tile_offsets); + for (auto t = static_cast(total_tiles); t-- > 0u;) { + if (h_offsets(t) > h_offsets(t + 1u)) { + h_offsets(t) = h_offsets(t + 1u); + } + } + Kokkos::deep_copy(tile_offsets, h_offsets); + + m_tile_layout.tile_offsets = tile_offsets; + } + + // 6. Populate `m_tile_layout` size/shape. `tile_perm` is not used + // in the current design — the SoA arrays are physically permuted + // into tile order, so consumers iterate + // `[tile_offsets(t), tile_offsets(t+1))` directly without a + // separate permutation indirection. + m_tile_layout.ntiles_per_axis[0] = ntx[0]; + m_tile_layout.ntiles_per_axis[1] = ntx[1]; + m_tile_layout.ntiles_per_axis[2] = ntx[2]; + m_tile_layout.ntiles_total = total_tiles; + m_tile_layout.tile_size = T; + m_tile_layout.tile_perm = prtl_perm_t {}; + m_is_sorted = true; + + Kokkos::fence("SortSpatially: end of team_policy path"); +#else // !TEAM_POLICY — legacy in-place BinSort by global cell index const auto nx2 = grid.n_active(in::x2); const auto nx3 = grid.n_active(in::x3); const auto total_cells = grid.num_active(); @@ -250,13 +448,153 @@ namespace ntt { for (auto pldi { 0u }; pldi < npld_i(); ++pldi) { sorter.sort(Kokkos::subview(pld_i, slice, pldi)); } +#endif // TEAM_POLICY + } + +#if defined(TEAM_POLICY_USE_VENDOR_SORT) + namespace permute_helpers { + + // Permute a 1D SoA member array `arr` in place by `perm`, gathering + // through `scratch` — a persistent byte buffer reused by every + // member and every timestep (no per-call allocation). An unmanaged + // typed view aliases the scratch bytes; the caller guarantees + // `scratch` is large enough and that Kokkos' device over-alignment + // covers the element type. + template + inline void permute_1d_inplace(V& arr, + const prtl_perm_t& perm, + npart_t n, + const array_t& scratch) { + if (n == 0u) { + return; + } + using value_t = typename V::non_const_value_type; + using buf_t = Kokkos::View>; + buf_t buf(reinterpret_cast(scratch.data()), n); + auto perm_v = perm; + auto arr_v = arr; + Kokkos::parallel_for( + "Permute1D", + n, + KOKKOS_LAMBDA(const npart_t p) { buf(p) = arr_v(perm_v(p)); }); + Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n)), buf); + Kokkos::fence("permute_1d_inplace: end"); + } + + // 2D analogue for `pld_r` / `pld_i`. + template + inline void permute_2d_inplace(V& arr, + const prtl_perm_t& perm, + npart_t n, + npart_t ncols, + const array_t& scratch) { + if (n == 0u or ncols == 0u) { + return; + } + using value_t = typename V::non_const_value_type; + using buf_t = Kokkos::View>; + buf_t buf(reinterpret_cast(scratch.data()), n, ncols); + auto perm_v = perm; + auto arr_v = arr; + Kokkos::parallel_for( + "Permute2D", + CreateParticleRangePolicy({ 0u, 0u }, { n, ncols }), + KOKKOS_LAMBDA(const npart_t p, const npart_t l) { + buf(p, l) = arr_v(perm_v(p), l); + }); + Kokkos::deep_copy(Kokkos::subview(arr, prtl_slice_t(0u, n), Kokkos::ALL), + buf); + Kokkos::fence("permute_2d_inplace: end"); + } + + } // namespace permute_helpers + + template + void Particles::apply_permutation_to_soa(const prtl_perm_t& perm) { + const auto n = npart(); + if (n == 0u) { + return; + } + + // Size the persistent scratch once to the largest gather any member + // needs this call: 1D members need n * sizeof(real_t) bytes (the + // widest element); the 2D payloads need n * ncols * elem bytes. + // Grown monotonically, never shrunk — so after warmup this incurs + // no allocation at all. + std::size_t need = static_cast(n) * sizeof(real_t); + if (npld_r() > 0) { + need = std::max(need, + static_cast(n) * + static_cast(npld_r()) * sizeof(real_t)); + } + if (npld_i() > 0) { + need = std::max(need, + static_cast(n) * + static_cast(npld_i()) * sizeof(npart_t)); + } + if (m_perm_scratch.extent(0) < need) { + m_perm_scratch = array_t { "perm_scratch", need }; + } + const auto& scratch = m_perm_scratch; + + using permute_helpers::permute_1d_inplace; + using permute_helpers::permute_2d_inplace; + + if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { + permute_1d_inplace(i1, perm, n, scratch); + permute_1d_inplace(dx1, perm, n, scratch); + permute_1d_inplace(i1_prev, perm, n, scratch); + permute_1d_inplace(dx1_prev, perm, n, scratch); + } + if constexpr (D == Dim::_2D or D == Dim::_3D) { + permute_1d_inplace(i2, perm, n, scratch); + permute_1d_inplace(dx2, perm, n, scratch); + permute_1d_inplace(i2_prev, perm, n, scratch); + permute_1d_inplace(dx2_prev, perm, n, scratch); + } + if constexpr (D == Dim::_3D) { + permute_1d_inplace(i3, perm, n, scratch); + permute_1d_inplace(dx3, perm, n, scratch); + permute_1d_inplace(i3_prev, perm, n, scratch); + permute_1d_inplace(dx3_prev, perm, n, scratch); + } + permute_1d_inplace(ux1, perm, n, scratch); + permute_1d_inplace(ux2, perm, n, scratch); + permute_1d_inplace(ux3, perm, n, scratch); + permute_1d_inplace(weight, perm, n, scratch); + permute_1d_inplace(tag, perm, n, scratch); + if constexpr (D == Dim::_2D and C != Coord::Cartesian) { + permute_1d_inplace(phi, perm, n, scratch); + } + if (npld_r() > 0) { + permute_2d_inplace(pld_r, perm, n, static_cast(npld_r()), scratch); + } + if (npld_i() > 0) { + permute_2d_inplace(pld_i, perm, n, static_cast(npld_i()), scratch); + } } +#endif // TEAM_POLICY_USE_VENDOR_SORT + +#if defined(TEAM_POLICY_USE_VENDOR_SORT) + #define APPLY_PERM_INSTANTIATE(D, C) \ + template void Particles::apply_permutation_to_soa( \ + const prtl_perm_t&); +#else + #define APPLY_PERM_INSTANTIATE(D, C) +#endif #define PARTICLES_SORT(D, C) \ template auto Particles::NpartsPerTagAndOffsets() const \ -> std::pair, array_t>; \ template void Particles::RemoveDead(); \ - template void Particles::SortSpatially(const Grid&); + template void Particles::SortSpatially(const Grid&); \ + APPLY_PERM_INSTANTIATE(D, C) PARTICLES_SORT(Dim::_1D, Coord::Cartesian) PARTICLES_SORT(Dim::_2D, Coord::Cartesian) @@ -266,5 +604,6 @@ namespace ntt { PARTICLES_SORT(Dim::_3D, Coord::Spherical) PARTICLES_SORT(Dim::_3D, Coord::Qspherical) #undef PARTICLES_SORT +#undef APPLY_PERM_INSTANTIATE } // namespace ntt diff --git a/src/global/arch/kokkos_aliases.h b/src/global/arch/kokkos_aliases.h index 314e78c0e..43f21ce47 100644 --- a/src/global/arch/kokkos_aliases.h +++ b/src/global/arch/kokkos_aliases.h @@ -10,6 +10,7 @@ * - CreateRangePolicy, CreateRangePolicyOnHost * - random_number_pool_t, random_generator_t * - Random function + * - prtl_perm_t, TileLayout<> * @cpp: * - arch/kokkos_aliases.cpp * @namespaces: @@ -258,6 +259,32 @@ template auto CreateRangePolicyOnHost(const tuple_t&, const tuple_t&) -> range_h_t; +// --------------------------- team_policy types ---------------------------- // +// Particle permutation index: maps a sorted-position p in [0, npart) to a +// pre-sort particle index. Produced by SortSpatially, consumed by tiled +// pusher and deposit kernels to walk particles tile-by-tile without +// physically re-permuting the SoA arrays in lock step every step. +using prtl_perm_t = array_t; + +// Tile layout metadata: the contract between Stream 1 (sort) and Streams +// 2/3 (tiled deposit / pusher). All members are device-resident. +// ntiles_per_axis : number of tiles along each axis (1 for unused axes). +// ntiles_total : product of ntiles_per_axis = league size for TeamPolicy. +// tile_size : tile edge length in cells (compile-time CMake knob, +// replicated here for runtime checks). +// tile_offsets : prefix-sum of per-tile particle counts; size +// ntiles_total + 1; tile t owns particles +// [tile_offsets(t), tile_offsets(t+1)). +// tile_perm : size npart, particle index sorted by tile. +template +struct TileLayout { + ncells_t ntiles_per_axis[3] { 1u, 1u, 1u }; + ncells_t ntiles_total { 0u }; + unsigned short tile_size { 0u }; + array_t tile_offsets; + prtl_perm_t tile_perm; +}; + // Random number pool/generator type alias // (using math:: instead of Kokkos:: to suppress compiler warning on unused namespace alias) using random_number_pool_t = math::Random_XorShift64_Pool; diff --git a/src/global/utils/sort_dispatch.h b/src/global/utils/sort_dispatch.h new file mode 100644 index 000000000..506a90742 --- /dev/null +++ b/src/global/utils/sort_dispatch.h @@ -0,0 +1,171 @@ +/** + * @file utils/sort_dispatch.h + * @brief Backend-dispatched sort_by_key for team_policy SortSpatially. + * @implements + * - sort_helpers::sort_by_key_dispatch -> void (BinSort, OneDPL, Thrust, StdSort) + * @namespaces: + * - ntt::sort_helpers:: + * @macros: + * - TEAM_POLICY + * - SYCL_ENABLED, ONEDPL_ENABLED (oneDPL overload) + * - CUDA_ENABLED, THRUST_ENABLED (Thrust overload) + * + * @note Each overload produces a permutation `perm` of size N such that + * keys[perm[0]] <= keys[perm[1]] <= ... in stable order. + * Always-available overloads: BinSort (uses Kokkos::BinSort) and + * StdSort (host-side std::stable_sort fallback). The vendor-library + * overloads (OneDPL on SYCL, Thrust on CUDA) are conditional on the + * respective build flags. + */ + +#ifndef GLOBAL_UTILS_SORT_DISPATCH_H +#define GLOBAL_UTILS_SORT_DISPATCH_H + +#if !defined(TEAM_POLICY) + #error "sort_dispatch.h is only meaningful when TEAM_POLICY is defined" +#endif + +#include "global.h" + +#include "arch/kokkos_aliases.h" +#include "utils/sorting.h" + +#include +#include + +#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + #include + #include +#endif +#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) + #include + #include + #include +#endif +#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) + #include + #include + #include + #include +#endif + +#include +#include + +namespace ntt::sort_helpers { + + // Always-available legacy fallback: Kokkos::BinSort. n_bins must be an + // upper bound on distinct key values. + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t n_bins, + ::sort::backend::BinSort) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + using sorter_op_t = Kokkos::BinOp1D>; + using sorter_t = Kokkos::BinSort, sorter_op_t>; + auto bin_op = sorter_op_t { static_cast(n_bins), 0u, n_bins }; + auto sorter = sorter_t { keys, bin_op, false }; + sorter.create_permute_vector(); + auto perm_v = perm; + Kokkos::parallel_for( + "PermInitIota", + n, + KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); + Kokkos::fence("sort_by_key_dispatch BinSort: pre-sort"); + sorter.sort(perm); + Kokkos::fence("sort_by_key_dispatch BinSort: post-sort"); + } + +#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t /*n_bins*/, + ::sort::backend::OneDPL) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + auto* keys_ptr = keys.data(); + auto* perm_ptr = perm.data(); + auto exec = Kokkos::DefaultExecutionSpace(); + auto perm_v = perm; + Kokkos::parallel_for( + "PermInitIota", + n, + KOKKOS_LAMBDA(const npart_t i) { perm_v(i) = i; }); + // Drain Kokkos's queue so oneDPL's policy sees the iota'd perm even + // if oneDPL submits to a different SYCL queue internally. + exec.fence("sort_by_key_dispatch OneDPL: pre-sort"); + auto queue = exec.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + oneapi::dpl::sort_by_key(policy, keys_ptr, keys_ptr + n, perm_ptr); + exec.fence("sort_by_key_dispatch OneDPL: post-sort"); + } +#endif + +#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t /*n_bins*/, + ::sort::backend::Thrust) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + Kokkos::fence("sort_by_key_dispatch Thrust: pre-sort"); + thrust::device_ptr kp(keys.data()); + thrust::device_ptr pp(perm.data()); + thrust::sequence(pp, pp + n); + thrust::sort_by_key(kp, kp + n, pp); + Kokkos::fence("sort_by_key_dispatch Thrust: post-sort"); + } +#endif + +#if defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED) + // rocThrust exposes the same thrust:: API as CUDA Thrust; with hipcc + // device_ptr-based algorithms dispatch to the HIP backend. Mirrors + // the CUDA Thrust overload. + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t /*n_bins*/, + ::sort::backend::Rocthrust) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + Kokkos::fence("sort_by_key_dispatch Rocthrust: pre-sort"); + thrust::device_ptr kp(keys.data()); + thrust::device_ptr pp(perm.data()); + thrust::sequence(pp, pp + n); + thrust::sort_by_key(kp, kp + n, pp); + Kokkos::fence("sort_by_key_dispatch Rocthrust: post-sort"); + } +#endif + + // Host fallback: indirect-sort via std::stable_sort. + inline void sort_by_key_dispatch(const array_t& keys, + prtl_perm_t& perm, + ncells_t /*n_bins*/, + ::sort::backend::StdSort) { + const auto n = static_cast(keys.extent(0)); + if (n == 0u) { + return; + } + auto keys_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + keys); + auto perm_h = Kokkos::create_mirror_view(perm); + std::iota(perm_h.data(), perm_h.data() + n, npart_t { 0u }); + std::stable_sort(perm_h.data(), + perm_h.data() + n, + [&](npart_t a, npart_t b) { + return keys_h(a) < keys_h(b); + }); + Kokkos::deep_copy(perm, perm_h); + } + +} // namespace ntt::sort_helpers + +#endif // GLOBAL_UTILS_SORT_DISPATCH_H diff --git a/src/global/utils/sorting.h b/src/global/utils/sorting.h index dbe774402..8442f5ddb 100644 --- a/src/global/utils/sorting.h +++ b/src/global/utils/sorting.h @@ -5,6 +5,7 @@ * - sort::BinBool<> * - sort::BinTag<> * - sort::PositionToTileIndex<> + * - sort::backend tag types (compile-time tag dispatch for sort_by_key) * @namespaces: * - sort:: * @note BinBool sorts by boolean values "true" then "false" @@ -62,9 +63,27 @@ namespace sort { const int m_max_bins; }; - template + /** + * @brief Bin a particle into a tile of edge length `tile_size` cells. + * @tparam D Dimension. + * @tparam Count If true, atomic-increment `num_ppt[tile]` for each live + * particle (used to populate per-tile counts in one pass). + * @tparam UsePrev If true, the bin key uses `min(i_curr, i_prev)` instead + * of `i_curr` alone. This guarantees that a particle whose + * Esirkepov stencil straddles a tile boundary (because it + * crossed the boundary during the pusher) lands in the + * lower-indexed of the two tiles. Combined with a halo of + * `O+1` cells in the deposit's per-tile scratch, this + * keeps every particle's stencil inside its assigned + * tile's interior+halo region. See plan §S2.4. + * + * Dead particles get the sentinel `total_tiles + 1u` so they sort to the + * end (or get skipped, depending on the consumer). + */ + template struct PositionToTileIndex { const array_t i1, i2, i3; + const array_t i1_prev, i2_prev, i3_prev; const array_t tag; array_t tile_indices; ncells_t tile_size; @@ -72,20 +91,37 @@ namespace sort { ncells_t ntx2 { 0u }, ntx3 { 0u }; ncells_t total_tiles { 0u }; - - PositionToTileIndex(const array_t& i1, - const array_t& i2, - const array_t& i3, - const array_t& tag, - array_t& tile_indices, + // Active-cell extents per axis. Used to clamp the bin key when + // UsePrev=true, since `i_prev` can be transiently negative after the + // pusher's periodic wrap (`i_prev -= ni`) or out-of-range after an + // MPI receive that hasn't translated frames. Without clamping, the + // signed-to-unsigned promotion in `int(-1) / uint32_t(T)` produces + // ~1.07e9, the linearised `tile_indices(p)` overflows past `n_bins`, + // and BinSort's internal `atomic_add(&bin_count[wild_idx], 1)` + // faults on an unmapped page. + int ncells1 { 1 }, ncells2 { 1 }, ncells3 { 1 }; + + PositionToTileIndex(const array_t& i1_, + const array_t& i2_, + const array_t& i3_, + const array_t& tag_, + array_t& tile_indices_, const std::vector& ncells, - ncells_t tile_size = 1u) - : i1 { i1 } - , i2 { i2 } - , i3 { i3 } - , tag { tag } - , tile_indices { tile_indices } - , tile_size { tile_size } + ncells_t tile_size_ = 1u, + const array_t& num_ppt_ = { "num_ppt", 0u }, + const array_t& i1_prev_ = {}, + const array_t& i2_prev_ = {}, + const array_t& i3_prev_ = {}) + : i1 { i1_ } + , i2 { i2_ } + , i3 { i3_ } + , i1_prev { i1_prev_ } + , i2_prev { i2_prev_ } + , i3_prev { i3_prev_ } + , tag { tag_ } + , tile_indices { tile_indices_ } + , tile_size { tile_size_ } + , num_ppt { num_ppt_ } , ntx2 { 1u } , ntx3 { 1u } , total_tiles { 1u } { @@ -93,49 +129,122 @@ namespace sort { "ncells size must match D", HERE); if constexpr ((D == Dim::_1D) or (D == Dim::_2D) or (D == Dim::_3D)) { + ncells1 = static_cast(ncells[0]); npart_t ntx1 = static_cast(math::ceil( static_cast(ncells[0]) / static_cast(tile_size))); total_tiles *= ntx1; } if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) { + ncells2 = static_cast(ncells[1]); ntx2 = static_cast(math::ceil( static_cast(ncells[1]) / static_cast(tile_size))); total_tiles *= ntx2; } if constexpr (D == Dim::_3D) { + ncells3 = static_cast(ncells[2]); ntx3 = static_cast(math::ceil( static_cast(ncells[2]) / static_cast(tile_size))); total_tiles *= ntx3; } if constexpr (Count) { - num_ppt = array_t { "num_ppt", total_tiles }; + raise::ErrorIf(num_ppt.extent(0) != total_tiles, + "num_ppt must have extent equal to total tiles", + HERE); + } + if constexpr (UsePrev) { + raise::ErrorIf( + i1_prev.extent(0) == 0u, + "PositionToTileIndex requires i1_prev to be set", + HERE); + if constexpr ((D == Dim::_2D) or (D == Dim::_3D)) { + raise::ErrorIf( + i2_prev.extent(0) == 0u, + "PositionToTileIndex requires i2_prev to be set", + HERE); + } + if constexpr (D == Dim::_3D) { + raise::ErrorIf( + i3_prev.extent(0) == 0u, + "PositionToTileIndex requires i3_prev to be set", + HERE); + } } } - Inline auto operator()(prtldx_t p) const { + Inline auto operator()(prtlidx_t p) const { if (tag(p) != ntt::ParticleTag::alive) { tile_indices(p) = total_tiles + 1u; } else { + // bin key per-axis: use min(i, i_prev) when UsePrev so that a + // particle straddling a boundary lands in the lower tile. + // Then clamp to [0, ncells_axis - 1] — `i_prev` can be negative + // (after the pusher's periodic-wrap path: `i_prev -= ni`) or + // out-of-range (after MPI receive without frame translation). + // Without the clamp, signed-to-unsigned promotion in + // `int(-1) / uint32_t(T)` makes `tile_indices(p)` overflow far + // past `n_bins`, and BinSort's `atomic_add(&bin_count[bin],1)` + // faults on an unmapped page. + const auto clamp_axis = [](int v, int ncells) -> int { + return (v < 0) ? 0 : ((v >= ncells) ? (ncells - 1) : v); + }; + const auto key1 = [&]() -> int { + if constexpr (UsePrev) { + const int raw = (i1(p) < i1_prev(p)) ? i1(p) : i1_prev(p); + return clamp_axis(raw, ncells1); + } else { + return i1(p); + } + }(); + const auto key2 = [&]() -> int { + if constexpr (UsePrev) { + const int raw = (i2(p) < i2_prev(p)) ? i2(p) : i2_prev(p); + return clamp_axis(raw, ncells2); + } else { + return i2(p); + } + }(); + const auto key3 = [&]() -> int { + if constexpr (UsePrev) { + const int raw = (i3(p) < i3_prev(p)) ? i3(p) : i3_prev(p); + return clamp_axis(raw, ncells3); + } else { + return i3(p); + } + }(); if constexpr (D == Dim::_1D) { - tile_indices(p) = static_cast(i1(p) / tile_size); + tile_indices(p) = static_cast(key1 / tile_size); } else if constexpr (D == Dim::_2D) { - tile_indices(p) = static_cast(i1(p) / tile_size) * ntx2 + - static_cast(i2(p) / tile_size); + tile_indices(p) = static_cast(key1 / tile_size) * ntx2 + + static_cast(key2 / tile_size); } else if constexpr (D == Dim::_3D) { - tile_indices(p) = (static_cast(i1(p) / tile_size) * ntx2 + - static_cast(i2(p) / tile_size)) * + tile_indices(p) = (static_cast(key1 / tile_size) * ntx2 + + static_cast(key2 / tile_size)) * ntx3 + - static_cast(i3(p) / tile_size); + static_cast(key3 / tile_size); } else { raise::KernelError(HERE, "Wrong D in SortSpatially"); } if constexpr (Count) { - Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1u); + Kokkos::atomic_add(&num_ppt(tile_indices(p)), 1); } } } }; + // -------------------- Backend dispatch for sort_by_key ------------------- // + // Compile-time tags for tag-dispatch into backend-specific + // sort_by_key implementations. Selection is fully compile-time: the + // backend that resolves depends on the active Kokkos device and the + // availability of the corresponding vendor library. + namespace backend { + struct OneDPL {}; + struct Thrust {}; + struct Rocthrust {}; + struct StdSort {}; + // Always-available legacy fallback using Kokkos::BinSort. + struct BinSort {}; + } // namespace backend + } // namespace sort #endif // GLOBAL_UTILS_SORTING_H diff --git a/src/kernels/currents_deposit.hpp b/src/kernels/currents_deposit.hpp index 268187af2..5eb7ff2b6 100644 --- a/src/kernels/currents_deposit.hpp +++ b/src/kernels/currents_deposit.hpp @@ -1,10 +1,24 @@ /** * @file kernels/currents_deposit.hpp - * @brief Covariant algorithms for the current deposition + * @brief Covariant algorithms for the current deposition. + * + * Two kernels share the same per-particle body + * (`kernel::deposit::deposit_one_particle`): + * - `kernel::DepositCurrents_kernel` flat (RangePolicy over particles, + * writes into a `Kokkos::Experimental::ScatterView`). Always available. + * - `kernel::DepositCurrents_kernel_tiled` team-policy + * (one team per spatial tile, accumulates into team SLM scratch with + * atomic adds, then flushes to global J). Available when `team_policy=ON` + * (`#if defined(TEAM_POLICY)`). Stream 2 of the Pattern A plan. + * * @implements + * - kernel::deposit::PrtlPack<> + * - kernel::deposit::deposit_one_particle<> * - kernel::DepositCurrents_kernel<> + * - kernel::DepositCurrents_kernel_tiled<> (TEAM_POLICY only) * @namespaces: * - kernel:: + * - kernel::deposit:: */ #ifndef KERNELS_CURRENTS_DEPOSIT_HPP @@ -18,100 +32,99 @@ #include "utils/error.h" #include "utils/numeric.h" -#include "particle_shapes.hpp" +#include "kernels/particle_shapes.hpp" #include +#include #define i_di_to_Xi(I, DI) (static_cast((I)) + static_cast((DI))) namespace kernel { using namespace ntt; - /** - * @brief Algorithm for the current deposition - */ - template - class DepositCurrents_kernel { - static_assert(O <= 11u, "Shape function order O must be <= 11"); - static constexpr auto D = M::Dim; - - scatter_ndfield_t J; - const array_t i1, i2, i3; - const array_t i1_prev, i2_prev, i3_prev; - const array_t dx1, dx2, dx3; - const array_t dx1_prev, dx2_prev, dx3_prev; - const array_t ux1, ux2, ux3; - const array_t phi; - const array_t weight; - const array_t tag; - const M metric; - const real_t charge, inv_dt; + namespace deposit { - public: /** - * @brief explicit constructor. + * @brief Per-particle reference pack consumed by both the flat and tiled + * deposit kernels. The same set of SoA references is captured by + * each kernel; bundling them here keeps the helper's argument + * list manageable and ensures every consumer reads the same + * view aliases. */ - DepositCurrents_kernel(const scatter_ndfield_t& scatter_cur, - const array_t& i1, - const array_t& i2, - const array_t& i3, - const array_t& i1_prev, - const array_t& i2_prev, - const array_t& i3_prev, - const array_t& dx1, - const array_t& dx2, - const array_t& dx3, - const array_t& dx1_prev, - const array_t& dx2_prev, - const array_t& dx3_prev, - const array_t& ux1, - const array_t& ux2, - const array_t& ux3, - const array_t& phi, - const array_t& weight, - const array_t& tag, - const M& metric, - real_t charge, - const real_t dt) - : J { scatter_cur } - , i1 { i1 } - , i2 { i2 } - , i3 { i3 } - , i1_prev { i1_prev } - , i2_prev { i2_prev } - , i3_prev { i3_prev } - , dx1 { dx1 } - , dx2 { dx2 } - , dx3 { dx3 } - , dx1_prev { dx1_prev } - , dx2_prev { dx2_prev } - , dx3_prev { dx3_prev } - , ux1 { ux1 } - , ux2 { ux2 } - , ux3 { ux3 } - , phi { phi } - , weight { weight } - , tag { tag } - , metric { metric } - , charge { charge } - , inv_dt { ONE / dt } { - raise::ErrorIf( - (O == 2u and N_GHOSTS < 2), - "Order of interpolation is 2, but number of ghost cells is < 2", - HERE); - } + template + struct PrtlPack { + array_t i1, i2, i3; + array_t i1_prev, i2_prev, i3_prev; + array_t dx1, dx2, dx3; + array_t dx1_prev, dx2_prev, dx3_prev; + array_t ux1, ux2, ux3; + array_t phi; + array_t weight; + array_t tag; + }; /** - * @brief Iteration of the loop over particles. - * @param p index. + * @brief Per-particle deposit body, shared between the flat and tiled + * kernels. + * + * The caller supplies a `deposit_at(idx..., comp, val)` callback that + * applies the contribution `val` to the J component `comp` at the + * **global** J cell index `idx...` (already includes the `N_GHOSTS` + * offset). The flat kernel's callback simply does + * `J_acc(idx..., comp) += val` on its scatter-view accessor; the tiled + * kernel's callback translates `idx...` into per-tile scratch + * coordinates and uses `Kokkos::atomic_add` on SLM. Either way, this + * function is identical numerically and contains the only deposit math + * in the codebase. + * + * Dead particles return early. The callback is invoked once per cell + * write, with the dimension-appropriate signature: + * - 1D: `deposit_at(int g_i1, int comp, real_t val)` + * - 2D: `deposit_at(int g_i1, int g_i2, int comp, real_t val)` + * - 3D: `deposit_at(int g_i1, int g_i2, int g_i3, int comp, real_t val)` */ - Inline auto operator()(prtlidx_t p) const -> void { + template + Inline void deposit_one_particle(prtlidx_t p, + const PrtlPack& prtls, + const M& metric, + real_t charge, + real_t inv_dt, + DepositFn deposit_at) { + static_assert(O <= 11u, "Shape function order O must be <= 11"); + constexpr auto D = M::Dim; + + const auto& i1 = prtls.i1; + const auto& i2 = prtls.i2; + const auto& i3 = prtls.i3; + const auto& i1_prev = prtls.i1_prev; + const auto& i2_prev = prtls.i2_prev; + const auto& i3_prev = prtls.i3_prev; + const auto& dx1 = prtls.dx1; + const auto& dx2 = prtls.dx2; + const auto& dx3 = prtls.dx3; + const auto& dx1_prev = prtls.dx1_prev; + const auto& dx2_prev = prtls.dx2_prev; + const auto& dx3_prev = prtls.dx3_prev; + const auto& ux1 = prtls.ux1; + const auto& ux2 = prtls.ux2; + const auto& ux3 = prtls.ux3; + const auto& phi = prtls.phi; + const auto& weight = prtls.weight; + const auto& tag = prtls.tag; + if (tag(p) == ParticleTag::dead) { return; } + // recover particle velocity to deposit in unsimulated direction - vec_t vp { ZERO }; - { + [[maybe_unused]] vec_t vp { ZERO }; + // `vp` only feeds the unsimulated-direction current in the 1D + // (jx2, jx3) and 2D (jx3) branches. In 3D every J component comes + // from the Esirkepov/zigzag charge motion and `vp` is never read, + // so the metric transform + 1/sqrt + NaN/Inf guard below is pure + // dead work there — skip it (also frees xp/inv_energy registers). + if constexpr (D != Dim::_3D) { coord_t xp { ZERO }; if constexpr (D == Dim::_1D) { xp[0] = i_di_to_Xi(i1(p), dx1(p)); @@ -167,7 +180,6 @@ namespace kernel { const real_t coeff { weight(p) * charge }; - // ToDo: interpolation_order as parameter if constexpr (O == 0u) { /* Zig-zag deposit @@ -191,8 +203,6 @@ namespace kernel { dx1(p) - dxp_r_1) * coeff * inv_dt }; - auto J_acc = J.access(); - if constexpr (D == Dim::_1D) { const real_t Fx2_1 { HALF * vp[1] * coeff }; const real_t Fx2_2 { HALF * vp[1] * coeff }; @@ -200,18 +210,18 @@ namespace kernel { const real_t Fx3_1 { HALF * vp[2] * coeff }; const real_t Fx3_2 { HALF * vp[2] * coeff }; - J_acc(i1_prev(p) + N_GHOSTS, cur::jx1) += Fx1_1; - J_acc(i1(p) + N_GHOSTS, cur::jx1) += Fx1_2; + deposit_at(i1_prev(p) + N_GHOSTS, cur::jx1, Fx1_1); + deposit_at(i1(p) + N_GHOSTS, cur::jx1, Fx1_2); - J_acc(i1_prev(p) + N_GHOSTS, cur::jx2) += Fx2_1 * (ONE - Wx1_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx2) += Fx2_1 * Wx1_1; - J_acc(i1(p) + N_GHOSTS, cur::jx2) += Fx2_2 * (ONE - Wx1_2); - J_acc(i1(p) + N_GHOSTS + 1, cur::jx2) += Fx2_2 * Wx1_2; + deposit_at(i1_prev(p) + N_GHOSTS, cur::jx2, Fx2_1 * (ONE - Wx1_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx2, Fx2_1 * Wx1_1); + deposit_at(i1(p) + N_GHOSTS, cur::jx2, Fx2_2 * (ONE - Wx1_2)); + deposit_at(i1(p) + N_GHOSTS + 1, cur::jx2, Fx2_2 * Wx1_2); - J_acc(i1_prev(p) + N_GHOSTS, cur::jx3) += Fx3_1 * (ONE - Wx1_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, cur::jx3) += Fx3_1 * Wx1_1; - J_acc(i1(p) + N_GHOSTS, cur::jx3) += Fx3_2 * (ONE - Wx1_2); - J_acc(i1(p) + N_GHOSTS + 1, cur::jx3) += Fx3_2 * Wx1_2; + deposit_at(i1_prev(p) + N_GHOSTS, cur::jx3, Fx3_1 * (ONE - Wx1_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, cur::jx3, Fx3_1 * Wx1_1); + deposit_at(i1(p) + N_GHOSTS, cur::jx3, Fx3_2 * (ONE - Wx1_2)); + deposit_at(i1(p) + N_GHOSTS + 1, cur::jx3, Fx3_2 * Wx1_2); } else if constexpr (D == Dim::_2D || D == Dim::_3D) { const auto dxp_r_2 { static_cast(i2(p) == i2_prev(p)) * (dx2(p) + dx2_prev(p)) * @@ -236,51 +246,73 @@ namespace kernel { const real_t Fx3_1 { HALF * vp[2] * coeff }; const real_t Fx3_2 { HALF * vp[2] * coeff }; - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx1) += Fx1_1 * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_1 * Wx2_1; - J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx1) += Fx1_2 * - (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS + 1, cur::jx1) += Fx1_2 * Wx2_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * (ONE - Wx1_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * Wx1_1; - J_acc(i1(p) + N_GHOSTS, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 * - (ONE - Wx1_2); - J_acc(i1(p) + N_GHOSTS + 1, i2(p) + N_GHOSTS, cur::jx2) += Fx2_2 * Wx1_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1; - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2; - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS + 1, - cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2; + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * (ONE - Wx2_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * Wx2_1); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * (ONE - Wx2_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * Wx2_2); + + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * (ONE - Wx1_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * Wx1_1); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * (ONE - Wx1_2)); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * Wx1_2); + + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * (ONE - Wx2_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * Wx2_1); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_1 * Wx1_1 * Wx2_1); + + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2)); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * (ONE - Wx2_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * Wx2_2); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS + 1, + cur::jx3, + Fx3_2 * Wx1_2 * Wx2_2); } else { const auto dxp_r_3 { static_cast(i3(p) == i3_prev(p)) * (dx3(p) + dx3_prev(p)) * @@ -300,107 +332,131 @@ namespace kernel { dx3(p) - dxp_r_3) * coeff * inv_dt }; - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx1) += Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx1) += Fx1_1 * Wx2_1 * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_1 * (ONE - Wx2_1) * Wx3_1; - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_1 * Wx2_1 * Wx3_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx1) += Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx1) += Fx1_2 * Wx2_2 * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_2 * (ONE - Wx2_2) * Wx3_2; - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS + 1, - cur::jx1) += Fx1_2 * Wx2_2 * Wx3_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx2) += Fx2_1 * Wx1_1 * (ONE - Wx3_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_1 * (ONE - Wx1_1) * Wx3_1; - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_1 * Wx1_1 * Wx3_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx2) += Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx2) += Fx2_2 * Wx1_2 * (ONE - Wx3_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_2 * (ONE - Wx1_2) * Wx3_2; - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS + 1, - cur::jx2) += Fx2_2 * Wx1_2 * Wx3_2; - - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * Wx1_1 * (ONE - Wx2_1); - J_acc(i1_prev(p) + N_GHOSTS, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * (ONE - Wx1_1) * Wx2_1; - J_acc(i1_prev(p) + N_GHOSTS + 1, - i2_prev(p) + N_GHOSTS + 1, - i3_prev(p) + N_GHOSTS, - cur::jx3) += Fx3_1 * Wx1_1 * Wx2_1; - - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * Wx1_2 * (ONE - Wx2_2); - J_acc(i1(p) + N_GHOSTS, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * (ONE - Wx1_2) * Wx2_2; - J_acc(i1(p) + N_GHOSTS + 1, - i2(p) + N_GHOSTS + 1, - i3(p) + N_GHOSTS, - cur::jx3) += Fx3_2 * Wx1_2 * Wx2_2; + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * (ONE - Wx2_1) * (ONE - Wx3_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS + 1, + i3_prev(p) + N_GHOSTS, + cur::jx1, + Fx1_1 * Wx2_1 * (ONE - Wx3_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * (ONE - Wx2_1) * Wx3_1); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS + 1, + i3_prev(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_1 * Wx2_1 * Wx3_1); + + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * (ONE - Wx2_2) * (ONE - Wx3_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS + 1, + i3(p) + N_GHOSTS, + cur::jx1, + Fx1_2 * Wx2_2 * (ONE - Wx3_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * (ONE - Wx2_2) * Wx3_2); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS + 1, + i3(p) + N_GHOSTS + 1, + cur::jx1, + Fx1_2 * Wx2_2 * Wx3_2); + + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * (ONE - Wx1_1) * (ONE - Wx3_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS, + cur::jx2, + Fx2_1 * Wx1_1 * (ONE - Wx3_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_1 * (ONE - Wx1_1) * Wx3_1); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_1 * Wx1_1 * Wx3_1); + + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * (ONE - Wx1_2) * (ONE - Wx3_2)); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS, + cur::jx2, + Fx2_2 * Wx1_2 * (ONE - Wx3_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_2 * (ONE - Wx1_2) * Wx3_2); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS + 1, + cur::jx2, + Fx2_2 * Wx1_2 * Wx3_2); + + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * (ONE - Wx2_1)); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS, + i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * (ONE - Wx2_1)); + deposit_at(i1_prev(p) + N_GHOSTS, + i2_prev(p) + N_GHOSTS + 1, + i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * (ONE - Wx1_1) * Wx2_1); + deposit_at(i1_prev(p) + N_GHOSTS + 1, + i2_prev(p) + N_GHOSTS + 1, + i3_prev(p) + N_GHOSTS, + cur::jx3, + Fx3_1 * Wx1_1 * Wx2_1); + + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * (ONE - Wx2_2)); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS, + i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * (ONE - Wx2_2)); + deposit_at(i1(p) + N_GHOSTS, + i2(p) + N_GHOSTS + 1, + i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * (ONE - Wx1_2) * Wx2_2); + deposit_at(i1(p) + N_GHOSTS + 1, + i2(p) + N_GHOSTS + 1, + i3(p) + N_GHOSTS, + cur::jx3, + Fx3_2 * Wx1_2 * Wx2_2); } } } else if constexpr ((O >= 1u) and (O <= 11u)) { @@ -421,33 +477,14 @@ namespace kernel { fS_x1); if constexpr (D == Dim::_1D) { - // define weight vectors - real_t Wx1[O + 2]; - real_t Wx23[O + 2]; - - // Calculate weight function -#pragma unroll - for (int i = 0; i < O + 2; ++i) { - // Esirkepov 2001, Eq. 38 for 1D case - Wx1[i] = fS_x1[i] - iS_x1[i]; - Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]); - } - - // contribution within the shape function stencil - real_t jx1[O + 2]; - - // prefactors for j update + // (1D): fused Esirkepov, no [O+2] temporaries. + // jx1[i] = -Qdx1dt * sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) + // = -Qdx1dt * P1[i] (Eq. 38, 1D) + // Wx23[i] = HALF * (fS_x1[i] + iS_x1[i]) (computed inline) const real_t Qdx1dt = coeff * inv_dt; const real_t QVx2 = coeff * vp[1]; const real_t QVx3 = coeff * vp[2]; - // Calculate current contribution - jx1[0] = -Qdx1dt * Wx1[0]; -#pragma unroll - for (int i = 1; i < O + 2; ++i) { - jx1[i] = jx1[i - 1] - Qdx1dt * Wx1[i]; - } - // account for ghost cells i1_min += N_GHOSTS; i1_max += N_GHOSTS; @@ -455,21 +492,18 @@ namespace kernel { // get number of update indices for asymmetric movement const int di_x1 = i1_max - i1_min; - /* - Current update - */ - auto J_acc = J.access(); - - for (int i = 0; i < di_x1; ++i) { - J_acc(i1_min + i, cur::jx1) += jx1[i]; - } - + // Current update — fused over the union line so the J cell + // stays L1-resident across the 3 component atomic_adds. + real_t P1 = ZERO; for (int i = 0; i <= di_x1; ++i) { - J_acc(i1_min + i, cur::jx2) += QVx2 * Wx23[i]; - } - - for (int i = 0; i <= di_x1; ++i) { - J_acc(i1_min + i, cur::jx3) += QVx3 * Wx23[i]; + P1 += fS_x1[i] - iS_x1[i]; + const int gi = i1_min + i; + const real_t Wx23 = HALF * (fS_x1[i] + iS_x1[i]); + if (i < di_x1) { + deposit_at(gi, cur::jx1, -Qdx1dt * P1); + } + deposit_at(gi, cur::jx2, QVx2 * Wx23); + deposit_at(gi, cur::jx3, QVx3 * Wx23); } } else if constexpr (D == Dim::_2D) { @@ -489,63 +523,23 @@ namespace kernel { iS_x2, fS_x2); - // define weight tensors - real_t Wx1[O + 2][O + 2]; - real_t Wx2[O + 2][O + 2]; - real_t Wx3[O + 2][O + 2]; - -// Calculate weight function -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - // Esirkepov 2001, Eq. 38 (simplified) - Wx1[i][j] = HALF * (fS_x1[i] - iS_x1[i]) * (fS_x2[j] + iS_x2[j]); - - Wx2[i][j] = HALF * (fS_x1[i] + iS_x1[i]) * (fS_x2[j] - iS_x2[j]); - - Wx3[i][j] = THIRD * (fS_x2[j] * (HALF * iS_x1[i] + fS_x1[i]) + - iS_x2[j] * (HALF * fS_x1[i] + iS_x1[i])); - } - } - - // contribution within the shape function stencil - real_t jx1[O + 2][O + 2], jx2[O + 2][O + 2]; - - // prefactors for j update - const real_t Qdx1dt = coeff * inv_dt; - const real_t Qdx2dt = coeff * inv_dt; - const real_t QVx3 = coeff * vp[2]; - - // Calculate current contribution - - // jx1 -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - jx1[0][j] = -Qdx1dt * Wx1[0][j]; - } - -#pragma unroll - for (int i = 1; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - jx1[i][j] = jx1[i - 1][j] - Qdx1dt * Wx1[i][j]; - } - } - - // jx2 -#pragma unroll - for (int i = 0; i < O + 2; ++i) { - jx2[i][0] = -Qdx2dt * Wx2[i][0]; - } - -#pragma unroll - for (int j = 1; j < O + 2; ++j) { -#pragma unroll - for (int i = 0; i < O + 2; ++i) { - jx2[i][j] = jx2[i][j - 1] - Qdx2dt * Wx2[i][j]; - } - } + // (2D): fused Esirkepov, no [O+2]^2 temporaries. + // + // Esirkepov 2001 Eq. 38 (simplified) is separable: with + // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) and + // P2[j] = sum_{j'=0}^{j} (fS_x2[j'] - iS_x2[j']), + // jx1[i][j] = -Q*HALF * P1[i] * (fS_x2[j] + iS_x2[j]) + // jx2[i][j] = -Q*HALF * P2[j] * (fS_x1[i] + iS_x1[i]) + // Wx3[i][j] = THIRD*( fS_x2[j]*(HALF*iS_x1[i]+fS_x1[i]) + // + iS_x2[j]*(HALF*fS_x1[i]+iS_x1[i]) ) + // with Q = coeff*inv_dt (Qdx1dt == Qdx2dt). Same value as the + // old explicit Wx/jx tensors up to FP reassociation; + // charge-conserving by construction. Prefix sums carried as + // running scalars, so the only per-thread state is the + // existing 1D shape arrays. + const real_t QVx3 = coeff * vp[2]; + // -Q*HALF prefactor (Qdx1dt == Qdx2dt == coeff*inv_dt) + const real_t cf = -(coeff * inv_dt) * HALF; // account for ghost cells i1_min += N_GHOSTS; @@ -557,26 +551,30 @@ namespace kernel { const int di_x1 = i1_max - i1_min; const int di_x2 = i2_max - i2_min; - /* - Current update - */ - auto J_acc = J.access(); - - for (int i = 0; i < di_x1; ++i) { - for (int j = 0; j <= di_x2; ++j) { - J_acc(i1_min + i, i2_min + j, cur::jx1) += jx1[i][j]; - } - } - - for (int i = 0; i <= di_x1; ++i) { - for (int j = 0; j < di_x2; ++j) { - J_acc(i1_min + i, i2_min + j, cur::jx2) += jx2[i][j]; - } - } - + // Current update — fused over the union plane so the J cell + // line stays L1-resident across the 3 component atomic_adds. + real_t P1 = ZERO; for (int i = 0; i <= di_x1; ++i) { + P1 += fS_x1[i] - iS_x1[i]; + const int gi = i1_min + i; + const real_t iSx1 = iS_x1[i]; + const real_t fSx1 = fS_x1[i]; + const real_t A1 = fSx1 + iSx1; // jx2 cross-factor + real_t P2 = ZERO; for (int j = 0; j <= di_x2; ++j) { - J_acc(i1_min + i, i2_min + j, cur::jx3) += QVx3 * Wx3[i][j]; + P2 += fS_x2[j] - iS_x2[j]; + const int gj = i2_min + j; + const real_t iSx2 = iS_x2[j]; + const real_t fSx2 = fS_x2[j]; + if (i < di_x1) { + deposit_at(gi, gj, cur::jx1, cf * P1 * (fSx2 + iSx2)); + } + if (j < di_x2) { + deposit_at(gi, gj, cur::jx2, cf * P2 * A1); + } + const real_t Wx3 = THIRD * (fSx2 * (HALF * iSx1 + fSx1) + + iSx2 * (HALF * fSx1 + iSx1)); + deposit_at(gi, gj, cur::jx3, QVx3 * Wx3); } } @@ -610,104 +608,33 @@ namespace kernel { iS_x3, fS_x3); - // define weight tensors - real_t Wx1[O + 2][O + 2][O + 2]; - real_t Wx2[O + 2][O + 2][O + 2]; - real_t Wx3[O + 2][O + 2][O + 2]; - -// Calculate weight function -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - // Esirkepov 2001, Eq. 31 - Wx1[i][j][k] = THIRD * (fS_x1[i] - iS_x1[i]) * - ((iS_x2[j] * iS_x3[k] + fS_x2[j] * fS_x3[k]) + - HALF * (iS_x3[k] * fS_x2[j] + iS_x2[j] * fS_x3[k])); - - Wx2[i][j][k] = THIRD * (fS_x2[j] - iS_x2[j]) * - (iS_x1[i] * iS_x3[k] + fS_x1[i] * fS_x3[k] + - HALF * (iS_x3[k] * fS_x1[i] + iS_x1[i] * fS_x3[k])); - - Wx3[i][j][k] = THIRD * (fS_x3[k] - iS_x3[k]) * - (iS_x1[i] * iS_x2[j] + fS_x1[i] * fS_x2[j] + - HALF * (iS_x1[i] * fS_x2[j] + iS_x2[j] * fS_x1[i])); - } - } - } - - // contribution within the shape function stencil - real_t jx1[O + 2][O + 2][O + 2], jx2[O + 2][O + 2][O + 2], - jx3[O + 2][O + 2][O + 2]; - - // prefactors to j update - const real_t Qdxdt = coeff * inv_dt; - const real_t Qdydt = coeff * inv_dt; - const real_t Qdzdt = coeff * inv_dt; - - // Calculate current contribution - - // jx1 -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx1[0][j][k] = -Qdxdt * Wx1[0][j][k]; - } - } - -#pragma unroll - for (int i = 1; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx1[i][j][k] = jx1[i - 1][j][k] - Qdxdt * Wx1[i][j][k]; - } - } - } - - // jx2 -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx2[i][0][k] = -Qdydt * Wx2[i][0][k]; - } - } - -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 1; j < O + 2; ++j) { -#pragma unroll - for (int k = 0; k < O + 2; ++k) { - jx2[i][j][k] = jx2[i][j - 1][k] - Qdydt * Wx2[i][j][k]; - } - } - } - - // jx3 -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { - jx3[i][j][0] = -Qdydt * Wx3[i][j][0]; - } - } - -#pragma unroll - for (int i = 0; i < O + 2; ++i) { -#pragma unroll - for (int j = 0; j < O + 2; ++j) { -#pragma unroll - for (int k = 1; k < O + 2; ++k) { - jx3[i][j][k] = jx3[i][j][k - 1] - Qdzdt * Wx3[i][j][k]; - } - } - } + // fused Esirkepov, no (O+2)^3 temporaries. + // + // The Esirkepov 3D current (2001, Eq. 31) is separable: with + // P1[i] = sum_{i'=0}^{i} (fS_x1[i'] - iS_x1[i']) (and likewise + // P2[j], P3[k]) the cumulative-sum currents collapse to + // + // jx1[i][j][k] = -Q*THIRD * P1[i] * G23(j,k) + // jx2[i][j][k] = -Q*THIRD * P2[j] * H13(i,k) + // jx3[i][j][k] = -Q*THIRD * P3[k] * F12(i,j) + // + // with the 1D-shape cross-factors + // + // G23(j,k) = iS_x2[j]*iS_x3[k] + fS_x2[j]*fS_x3[k] + // + HALF*(iS_x3[k]*fS_x2[j] + iS_x2[j]*fS_x3[k]) + // H13(i,k) = iS_x1[i]*iS_x3[k] + fS_x1[i]*fS_x3[k] + // + HALF*(iS_x3[k]*fS_x1[i] + iS_x1[i]*fS_x3[k]) + // F12(i,j) = iS_x1[i]*iS_x2[j] + fS_x1[i]*fS_x2[j] + // + HALF*(iS_x1[i]*fS_x2[j] + iS_x2[j]*fS_x1[i]) + // + // and Q = coeff*inv_dt (Qdxdt == Qdydt == Qdzdt). This is the + // same value as the old explicit Wx/jx tensors up to + // floating-point reassociation: charge-conserving by + // construction (the Esirkepov decomposition is exact). The + // prefix sums are carried as running scalars in the deposit + // loop, so the only per-thread state is the existing 1D shape + // arrays (no (O+2)^3 / (O+2)^2 locals, hence far fewer VGPRs + // and no private-memory tensor traffic). // account for ghost cells i1_min += N_GHOSTS; @@ -722,31 +649,50 @@ namespace kernel { const int di_x2 = i2_max - i2_min; const int di_x3 = i3_max - i3_min; + // -Q*THIRD prefactor (Qdxdt == Qdydt == Qdzdt == coeff*inv_dt) + const real_t cf = -(coeff * inv_dt) * THIRD; + /* - Current update + Current update — fused over the union cube so the J cell + line stays L1-resident across the 3 component atomic_adds. + Per-cell branches on (i + class DepositCurrents_kernel { + static_assert(O <= 11u, "Shape function order O must be <= 11"); + static constexpr auto D = M::Dim; + + scatter_ndfield_t J; + deposit::PrtlPack prtls; + const M metric; + const real_t charge, inv_dt; + + public: + DepositCurrents_kernel(const scatter_ndfield_t& scatter_cur, + const array_t& i1, + const array_t& i2, + const array_t& i3, + const array_t& i1_prev, + const array_t& i2_prev, + const array_t& i3_prev, + const array_t& dx1, + const array_t& dx2, + const array_t& dx3, + const array_t& dx1_prev, + const array_t& dx2_prev, + const array_t& dx3_prev, + const array_t& ux1, + const array_t& ux2, + const array_t& ux3, + const array_t& phi, + const array_t& weight, + const array_t& tag, + const M& metric, + real_t charge, + const real_t dt) + : J { scatter_cur } + , prtls { i1, i2, i3, i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, phi, weight, tag } + , metric { metric } + , charge { charge } + , inv_dt { ONE / dt } { + raise::ErrorIf( + (O == 2u and N_GHOSTS < 2), + "Order of interpolation is 2, but number of ghost cells is < 2", + HERE); + } + + Inline auto operator()(prtlidx_t p) const -> void { + auto J_acc = J.access(); + if constexpr (D == Dim::_1D) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int comp, real_t v) { + J_acc(g_i1, comp) += v; + }); + } else if constexpr (D == Dim::_2D) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int g_i2, int comp, real_t v) { + J_acc(g_i1, g_i2, comp) += v; + }); + } else if constexpr (D == Dim::_3D) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) { + J_acc(g_i1, g_i2, g_i3, comp) += v; + }); + } + } }; + +#if defined(TEAM_POLICY) + + /** + * @brief Tiled current-deposition kernel. + * + * One team per spatial tile (`league_size = ntiles_total`). Each team + * accumulates particle contributions into a per-team scratch buffer of + * shape `(T_TILE + 2*HALO)^D × 3` real_t, where `HALO = O + 1` cells per + * side. Scratch atomics live in SLM (PVC: ~5–10 cycles per + * `atomic_add`); the global J is touched only once per scratch cell at + * flush time. Compared with the flat scatter-view kernel: + * - global atomic pressure ~ (T_TILE + 2*HALO)^D × 3 per tile + * instead of (stencil writes per particle × particles) + * - per-particle stencil writes are tile-local (SLM) instead of + * scattering through global HBM + * + * Supports `O ∈ {0, ..., 11}`. `O == 0` (zigzag) is wired for + * A/B benchmarking against the flat scatter-view kernel — its narrow + * stencil typically makes scratch alloc/zero/flush overhead a + * regression there, but it's good to be able to measure the + * crossover. To revert and use flat for zigzag-only builds, change + * the dispatch in `engines/srpic/currents.h` from + * `#if defined(TEAM_POLICY)` to + * `#if defined(TEAM_POLICY) && (SHAPE_ORDER > 0)`. + * + * Particle iteration order is governed by `tile_offsets`: tile `t` + * owns particles `[tile_offsets(t), tile_offsets(t+1))`, post-sort. + * `SortSpatially` (`particles_sort.cpp`) is responsible for keeping + * the SoA arrays consistent with that. + * + * **Halo sizing and escape valve.** Sort runs at the end of the + * previous step (see `srpic.hpp`), so at deposit time the particle + * has already been pushed once — its `min(i, i_prev)` may differ + * from the bin key by one cell of drift per step elapsed since the + * last sort. The scratch HALO is `STENCIL_REACH(O) + DRIFT`, where + * `STENCIL_REACH = 2` for zigzag (writes `{i_prev, i_prev+1, i, + * i+1}` ⇒ +2 above `min(i, i_prev)` with `|Δi|=1`) and `O` for + * Esirkepov, and `DRIFT` is a fixed constant (1) covering the one + * guaranteed post-sort pusher step. + * + * HALO is sized for the *common* (every-step-sorted) case, not for + * a worst-case sort cadence: correctness does **not** depend on it. + * Any particle whose stencil escapes the scratch tile — because it + * drifted further than `DRIFT` (e.g. a large runtime + * `spatial_sorting_interval`), or because the halo is otherwise + * undersized — silently falls back to a direct, bounds-clipped + * `Kokkos::atomic_add` on the global J view. That path is + * charge-conserving (each particle's stencil is deposited exactly + * once, partly to private SLM scratch and partly to global J, and + * scratch is flushed once via `atomic_add`); it is merely slower + * per write. Sorting less often than every step therefore costs + * escape-valve traffic, never accuracy. + */ + template + class DepositCurrents_kernel_tiled { + static_assert(O <= 11u, "Shape order O must be <= 11"); + static_assert(T_TILE > 0u, "T_TILE must be positive"); + static constexpr auto D = M::Dim; + + // Per-side scratch halo, derived from first principles. + // + // total halo = stencil_reach(O) + drift_between_sort_and_deposit + // + // stencil_reach(O) — maximum cells the deposit writes ABOVE + // min(i, i_prev) under CFL |v·dt/dx| ≤ 1/2: + // - O == 0 (zigzag): writes {i_prev, i_prev+1, i, i+1} ⇒ +2 + // - O >= 1 Esirkepov: `for_deposit` returns an (O+2)-wide + // array but only O+1 entries are non-zero, and the union + // window satisfies `i_max - i_min <= O+1` (see + // particle_shapes.hpp::for_deposit). The genuine one-sided + // reach above min(i, i_prev) is therefore O, not O+1 — the + // old `O+1` carried one extra cell of conservative padding + // on top of the already-conservative drift term below. + // + // drift — sort runs at end-of-step (see srpic.hpp), so a particle + // sees exactly one pusher step before the *next* step's deposit + // when sorted every step (the common case). DRIFT is therefore a + // fixed constant of 1, NOT a compile-time function of the runtime + // sort cadence. Sizing the halo for the common case (rather than a + // worst-case sort interval) is what keeps the scratch small enough + // for good occupancy; a species sorted less often than + // every step just drifts past the halo and takes the global-J + // escape valve more often — correct, only slower (see the class + // doc-comment for why this is charge-conserving). + // + static constexpr int STENCIL_REACH = (O == 0u) + ? 2 + : static_cast(O); + static constexpr int DRIFT = 1; + static constexpr int HALO = STENCIL_REACH + DRIFT; + static constexpr int TE = static_cast(T_TILE) + 2 * HALO; + + using exec_space = Kokkos::DefaultExecutionSpace; + using team_policy = Kokkos::TeamPolicy; + using member_t = typename team_policy::member_type; + using scratch_mem = typename exec_space::scratch_memory_space; + + // Scratch view types: trailing extent of 3 (jx1, jx2, jx3 components) + // is fixed by a runtime extent so we don't need a separate dimension + // template per component count. + using scratch_1d_t = Kokkos::View>; + using scratch_2d_t = Kokkos::View>; + using scratch_3d_t = Kokkos::View>; + + ndfield_t J; + deposit::PrtlPack prtls; + const M metric; + const real_t charge, inv_dt; + + // Tile metadata produced by SortSpatially. + array_t tile_offsets; + ncells_t ntx1 { 1u }, ntx2 { 1u }, ntx3 { 1u }; + ncells_t total_tiles { 0u }; + + // J's full storage extent including all ghost cells. Used to clip + // the cooperative flush so that a partial tile at the high end of + // the domain does not over-write past the J view. + int j_ext1 { 0 }, j_ext2 { 0 }, j_ext3 { 0 }; + + public: + DepositCurrents_kernel_tiled(const ndfield_t& cur, + const array_t& i1, + const array_t& i2, + const array_t& i3, + const array_t& i1_prev, + const array_t& i2_prev, + const array_t& i3_prev, + const array_t& dx1, + const array_t& dx2, + const array_t& dx3, + const array_t& dx1_prev, + const array_t& dx2_prev, + const array_t& dx3_prev, + const array_t& ux1, + const array_t& ux2, + const array_t& ux3, + const array_t& phi, + const array_t& weight, + const array_t& tag, + const M& metric, + real_t charge, + const real_t dt, + const TileLayout& layout) + : J { cur } + , prtls { i1, i2, i3, i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, phi, weight, tag } + , metric { metric } + , charge { charge } + , inv_dt { ONE / dt } + , tile_offsets { layout.tile_offsets } + , ntx1 { layout.ntiles_per_axis[0] } + , ntx2 { layout.ntiles_per_axis[1] } + , ntx3 { layout.ntiles_per_axis[2] } + , total_tiles { layout.ntiles_total } { + raise::ErrorIf( + layout.tile_size != T_TILE, + "Tiled deposit launched with mismatched T_TILE and runtime tile_size", + HERE); + // Note: HALO is allowed to exceed N_GHOSTS. The cooperative + // scratch→J flush and the per-particle escape valve both bounds-clip + // their writes against `j_ext*` so writes that would land past J's + // ghost stripe are silently dropped (they only ever come from a + // particle whose stencil reaches into the domain ghost region, where + // CommunicateFields will re-supply the contribution). + if constexpr (D == Dim::_1D || D == Dim::_2D || D == Dim::_3D) { + j_ext1 = static_cast(cur.extent(0)); + } + if constexpr (D == Dim::_2D || D == Dim::_3D) { + j_ext2 = static_cast(cur.extent(1)); + } + if constexpr (D == Dim::_3D) { + j_ext3 = static_cast(cur.extent(2)); + } + } + + /** + * @brief Per-team scratch size in bytes. Used by the launcher to set + * `team_policy.set_scratch_size(0, Kokkos::PerTeam(bytes))`. + */ + static constexpr std::size_t scratch_bytes() { + if constexpr (D == Dim::_1D) { + return scratch_1d_t::shmem_size(TE, 3); + } else if constexpr (D == Dim::_2D) { + return scratch_2d_t::shmem_size(TE, TE, 3); + } else { + return scratch_3d_t::shmem_size(TE, TE, TE, 3); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const member_t& team) const { + const auto tile_id = static_cast(team.league_rank()); + // Tile coordinates (tile-grid indices) → tile origin in **active** + // cell coords (no ghost offset). Using ncells_t to match the linearised + // tile index produced by SortSpatially. + ncells_t tx1 = 0, tx2 = 0, tx3 = 0; + if constexpr (D == Dim::_1D) { + tx1 = tile_id; + } else if constexpr (D == Dim::_2D) { + tx1 = tile_id / ntx2; + tx2 = tile_id - tx1 * ntx2; + } else { + const auto plane = ntx2 * ntx3; + tx1 = tile_id / plane; + const auto rem = tile_id - tx1 * plane; + tx2 = rem / ntx3; + tx3 = rem - tx2 * ntx3; + } + // origin_active = lowest active-cell index in the tile (no ghost). + // origin_J = same value translated into J's storage coordinate + // (i.e. plus N_GHOSTS). + // origin_J_low = J coordinate of scratch index 0 (i.e. origin_J - HALO). + // local index `li` in scratch ↔ global J index `gi = li + origin_J_low`. + const int origin_J1_low = static_cast(tx1 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + const int origin_J2_low = static_cast(tx2 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + const int origin_J3_low = static_cast(tx3 * T_TILE) + + static_cast(N_GHOSTS) - HALO; + + // Allocate scratch and cooperatively zero-fill it. + if constexpr (D == Dim::_1D) { + scratch_1d_t scr(team.team_scratch(0), TE, 3); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, TE * 3), + [&](const int idx) { + const int li = idx / 3; + const int c = idx - li * 3; + scr(li, c) = ZERO; + }); + team.team_barrier(); + + const auto p_begin = tile_offsets(tile_id); + const auto p_end = tile_offsets(tile_id + 1u); + const int e1_d = j_ext1; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, p_begin, p_end), + [&](const npart_t p) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + // Escape valve: a particle whose stencil reaches past the + // tile's scratch (e.g. exceeded the compile-time + // STENCIL_REACH + DRIFT budget) falls back to a direct + // atomic_add on the global J view. Bounds-clipped against + // J's storage extent so writes past the domain ghost stripe + // are dropped (matches the cooperative flush below; those + // contributions are re-supplied by SynchronizeFields(J)). + [&](int g_i1, int comp, real_t v) { + const int li = g_i1 - origin_J1_low; + if (li >= 0 && li < TE) { + Kokkos::atomic_add(&scr(li, comp), v); + } else if (g_i1 >= 0 && g_i1 < e1_d) { + Kokkos::atomic_add(&J(g_i1, comp), v); + } + }); + }); + team.team_barrier(); + + // Cooperative flush of scratch to global J. Bounds-clip against + // the J view extent in case a partial high-end tile (or non-zero + // halo at domain edges) would otherwise write past J. + const int e1 = j_ext1; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, TE * 3), + [&](const int idx) { + const int li = idx / 3; + const int c = idx - li * 3; + const int gi = li + origin_J1_low; + if (gi < 0 || gi >= e1) { + return; + } + const real_t v = scr(li, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, c), v); + } + }); + } else if constexpr (D == Dim::_2D) { + scratch_2d_t scr(team.team_scratch(0), TE, TE, 3); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, TE * TE * 3), + [&](const int idx) { + const int lij = idx / 3; + const int c = idx - lij * 3; + const int li = lij / TE; + const int lj = lij - li * TE; + scr(li, lj, c) = ZERO; + }); + team.team_barrier(); + + const auto p_begin = tile_offsets(tile_id); + const auto p_end = tile_offsets(tile_id + 1u); + const int e1_d = j_ext1; + const int e2_d = j_ext2; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, p_begin, p_end), + [&](const npart_t p) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + // See 1D branch for rationale. + [&](int g_i1, int g_i2, int comp, real_t v) { + const int li = g_i1 - origin_J1_low; + const int lj = g_i2 - origin_J2_low; + if (li >= 0 && li < TE && lj >= 0 && lj < TE) { + Kokkos::atomic_add(&scr(li, lj, comp), v); + } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 && + g_i2 < e2_d) { + Kokkos::atomic_add(&J(g_i1, g_i2, comp), v); + } + }); + }); + team.team_barrier(); + + const int e1 = j_ext1; + const int e2 = j_ext2; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, TE * TE * 3), + [&](const int idx) { + const int lij = idx / 3; + const int c = idx - lij * 3; + const int li = lij / TE; + const int lj = lij - li * TE; + const int gi = li + origin_J1_low; + const int gj = lj + origin_J2_low; + if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2) { + return; + } + const real_t v = scr(li, lj, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, gj, c), v); + } + }); + } else if constexpr (D == Dim::_3D) { + scratch_3d_t scr(team.team_scratch(0), TE, TE, TE, 3); + const int cells = TE * TE * TE; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, cells * 3), + [&](const int idx) { + const int lijk = idx / 3; + const int c = idx - lijk * 3; + const int li = lijk / (TE * TE); + const int rem = lijk - li * TE * TE; + const int lj = rem / TE; + const int lk = rem - lj * TE; + scr(li, lj, lk, c) = ZERO; + }); + team.team_barrier(); + + const auto p_begin = tile_offsets(tile_id); + const auto p_end = tile_offsets(tile_id + 1u); + const int e1_d = j_ext1; + const int e2_d = j_ext2; + const int e3_d = j_ext3; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, p_begin, p_end), + [&](const npart_t p) { + deposit::deposit_one_particle( + p, + prtls, + metric, + charge, + inv_dt, + // See 1D branch for rationale. + [&](int g_i1, int g_i2, int g_i3, int comp, real_t v) { + const int li = g_i1 - origin_J1_low; + const int lj = g_i2 - origin_J2_low; + const int lk = g_i3 - origin_J3_low; + if (li >= 0 && li < TE && lj >= 0 && lj < TE && lk >= 0 && + lk < TE) { + Kokkos::atomic_add(&scr(li, lj, lk, comp), v); + } else if (g_i1 >= 0 && g_i1 < e1_d && g_i2 >= 0 && + g_i2 < e2_d && g_i3 >= 0 && g_i3 < e3_d) { + Kokkos::atomic_add(&J(g_i1, g_i2, g_i3, comp), v); + } + }); + }); + team.team_barrier(); + + const int e1 = j_ext1; + const int e2 = j_ext2; + const int e3 = j_ext3; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, cells * 3), + [&](const int idx) { + const int lijk = idx / 3; + const int c = idx - lijk * 3; + const int li = lijk / (TE * TE); + const int rem = lijk - li * TE * TE; + const int lj = rem / TE; + const int lk = rem - lj * TE; + const int gi = li + origin_J1_low; + const int gj = lj + origin_J2_low; + const int gk = lk + origin_J3_low; + if (gi < 0 || gi >= e1 || gj < 0 || gj >= e2 || gk < 0 || + gk >= e3) { + return; + } + const real_t v = scr(li, lj, lk, c); + if (v != ZERO) { + Kokkos::atomic_add(&J(gi, gj, gk, c), v); + } + }); + } + } + }; + +#endif // TEAM_POLICY + } // namespace kernel #undef i_di_to_Xi diff --git a/tests/framework/CMakeLists.txt b/tests/framework/CMakeLists.txt index d6dc295af..9a2e2865e 100644 --- a/tests/framework/CMakeLists.txt +++ b/tests/framework/CMakeLists.txt @@ -44,6 +44,12 @@ else() gen_test(particles_sort false) endif() +# team_policy X-3: per-backend sort_by_key permutation test (only built +# when the compile-time team_policy toggle is on). +if(${team_policy}) + gen_test(sort_by_key false) +endif() + gen_test(fields false) gen_test(grid_mesh false) if(${DEBUG}) diff --git a/tests/framework/sort_by_key.cpp b/tests/framework/sort_by_key.cpp new file mode 100644 index 000000000..94dc51d0e --- /dev/null +++ b/tests/framework/sort_by_key.cpp @@ -0,0 +1,110 @@ +/** + * @brief X-3 (team_policy) — sort_by_key permutation test. + * + * Exercises every backend overload of `ntt::sort_helpers::sort_by_key_dispatch` + * that is compiled in for the current Kokkos device. For each backend: + * 1. Allocate keys = { 5, 2, 5, 1, 3, 5, 2 }, perm = (uninitialised). + * 2. Call sort_by_key_dispatch. + * 3. Verify that keys[perm[i]] is sorted in non-decreasing order. + * + * Stability is verified for the BinSort and StdSort backends (the others + * promise stability per their documentation but we don't bake that into + * the test). + * + * Built only when `team_policy=ON` at CMake time. + */ +#include "enums.h" +#include "global.h" + +#include "arch/kokkos_aliases.h" +#include "utils/error.h" +#include "utils/sort_dispatch.h" +#include "utils/sorting.h" + +#include + +#include +#include + +namespace { + + using namespace ntt; + + template + void test_one_backend(const char* label, Backend tag) { + const std::vector keys_host_init { 5u, 2u, 5u, 1u, 3u, 5u, 2u }; + const npart_t n = keys_host_init.size(); + const ncells_t n_max = 6u; // bin range [0, n_max) + + array_t keys { "keys", n }; + auto keys_h = Kokkos::create_mirror_view(keys); + for (npart_t i = 0u; i < n; ++i) { + keys_h(i) = keys_host_init[i]; + } + Kokkos::deep_copy(keys, keys_h); + + prtl_perm_t perm { "perm", n }; + + sort_helpers::sort_by_key_dispatch(keys, perm, n_max, tag); + + auto perm_h = Kokkos::create_mirror_view(perm); + Kokkos::deep_copy(perm_h, perm); + + // Validate: keys[perm[0]] <= keys[perm[1]] <= ... + for (npart_t i = 1u; i < n; ++i) { + const auto a = keys_host_init[perm_h(i - 1u)]; + const auto b = keys_host_init[perm_h(i)]; + raise::ErrorIf( + a > b, + std::string("sort_by_key_dispatch produced non-sorted permutation " + "for backend ") + + label, + HERE); + } + + // Validate: perm is a permutation of [0, n). + std::vector seen(n, 0); + for (npart_t i = 0u; i < n; ++i) { + const auto idx = perm_h(i); + raise::ErrorIf(idx >= n, + std::string("permutation index out of range for " + "backend ") + + label, + HERE); + seen[idx] += 1; + } + for (npart_t i = 0u; i < n; ++i) { + raise::ErrorIf(seen[i] != 1, + std::string("permutation not a bijection for backend ") + + label, + HERE); + } + + std::cout << "[OK] sort_by_key_dispatch<" << label << ">: " + << "keys[perm] sorted, perm is a bijection." << std::endl; + } + +} // namespace + +auto main(int argc, char* argv[]) -> int { + ntt::GlobalInitialize(argc, argv); + try { + // Always-available backends. + test_one_backend("BinSort", ::sort::backend::BinSort {}); +#if !defined(DEVICE_ENABLED) + test_one_backend("StdSort", ::sort::backend::StdSort {}); +#endif +#if defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED) + test_one_backend("OneDPL", ::sort::backend::OneDPL {}); +#endif +#if defined(CUDA_ENABLED) && defined(THRUST_ENABLED) + test_one_backend("Thrust", ::sort::backend::Thrust {}); +#endif + } catch (std::exception& e) { + std::cerr << e.what() << std::endl; + ntt::GlobalFinalize(); + return 1; + } + ntt::GlobalFinalize(); + return 0; +} diff --git a/tests/global/tiling.cpp b/tests/global/tiling.cpp index d0f0a412c..2f926ce28 100644 --- a/tests/global/tiling.cpp +++ b/tests/global/tiling.cpp @@ -1,6 +1,5 @@ #include "arch/kokkos_aliases.h" #include "utils/error.h" -#include "utils/formatting.h" #include "utils/numeric.h" #include "utils/sorting.h" @@ -122,37 +121,27 @@ void test_tiling(const array_t& i1, const auto ntiles = nt1 * nt2 * nt3; - auto position_to_tile_kernel = sort::PositionToTileIndex { - i1, i2, i3, tag, tile_indices, ncells, ts - }; - Kokkos::parallel_for("Tiling", npart, position_to_tile_kernel); - const auto num_ppt = position_to_tile_kernel.num_ppt; + array_t num_ppt { "num_ppt", ntiles }; + Kokkos::parallel_for( + "Tiling", + npart, + sort::PositionToTileIndex { i1, i2, i3, tag, tile_indices, ncells, ts, num_ppt }); Kokkos::parallel_for( "Checking", npart, Lambda(prtlidx_t p) { CheckValue(p, i1, i2, i3, tag, tile_indices, nt1, nt2, nt3, ntiles, ts); }); - raise::ErrorIf( - num_ppt.extent(0) != ntiles, - fmt::format("num_ppt size does not match number of tiles %u != %u", - num_ppt.extent(0), - ntiles), - HERE); npart_t tot_alive = 0u; Kokkos::parallel_reduce( "CountAliveInTiles", ntiles, - Lambda(cellidx_t t, npart_t & count) { count += num_ppt(t); }, + Lambda(prtlidx_t t, npart_t & count) { count += num_ppt(t); }, tot_alive); - raise::ErrorIf( - tot_alive != npart - ndead, - fmt::format("Error in counting particles per tile: %u != %u - %u", - tot_alive, - npart, - ndead), - HERE); + raise::ErrorIf(tot_alive != npart - ndead, + "Error in counting particles per tile", + HERE); } } diff --git a/tests/kernels/CMakeLists.txt b/tests/kernels/CMakeLists.txt index f1438e108..0fe23f578 100644 --- a/tests/kernels/CMakeLists.txt +++ b/tests/kernels/CMakeLists.txt @@ -26,6 +26,9 @@ endfunction() gen_test(faraday_mink) gen_test(ampere_mink) gen_test(deposit) +if(${team_policy}) + gen_test(deposit_tiled) +endif() gen_test(digital_filter) gen_test(particle_moments) gen_test(fields_to_phys) diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp new file mode 100644 index 000000000..87f4f3f4d --- /dev/null +++ b/tests/kernels/deposit_tiled.cpp @@ -0,0 +1,262 @@ +/** + * @file tests/kernels/deposit_tiled.cpp + * @brief X-1 numerical-equivalence test for the tiled deposit kernel. + * + * Runs the flat (`DepositCurrents_kernel`) and tiled + * (`DepositCurrents_kernel_tiled`) kernels on identical particle SoA inputs + * for shape orders O = 1..11 and asserts that the resulting J array is + * identical cell-by-cell within a small floating-point tolerance. + * + * Built only when `team_policy=ON` (`-D TEAM_POLICY` defined). The test + * matches the per-particle setup used in `deposit.cpp` so that any + * regression in the shared `kernel::deposit::deposit_one_particle` body + * is caught by both tests. + */ + +#include "enums.h" +#include "global.h" + +#include "arch/kokkos_aliases.h" +#include "utils/comparators.h" + +#include "metrics/minkowski.h" + +#include "kernels/currents_deposit.hpp" + +#include +#include + +#include +#include +#include +#include + +namespace { + + using namespace ntt; + + void errorIf(bool condition, const std::string& msg) { + if (condition) { + throw std::runtime_error(msg); + } + } + + template + void put_value(const array_t& arr, T value, int i) { + auto h = Kokkos::create_mirror_view(arr); + h(i) = value; + Kokkos::deep_copy(arr, h); + } + + // Builds tile_offsets for a single-particle test. Particle 0 is alive + // and lives in tile (tx1, tx2); slots 1..n_slots-1 carry the dead + // sentinel and are never referenced by tile_offsets — so the tiled + // kernel never iterates over them. + array_t build_tile_offsets_single_particle(ncells_t ntx1, + ncells_t ntx2, + ncells_t tx1, + ncells_t tx2) { + const ncells_t total_tiles = ntx1 * ntx2; + const ncells_t hot_tile = tx1 * ntx2 + tx2; + array_t offsets("tile_offsets", total_tiles + 1u); + auto h = Kokkos::create_mirror_view(offsets); + for (ncells_t t = 0; t <= total_tiles; ++t) { + h(t) = (t <= hot_tile) ? static_cast(0) + : static_cast(1); + } + Kokkos::deep_copy(offsets, h); + return offsets; + } + + template + void run_one_case() { + using metric_t = metric::Minkowski; + constexpr unsigned short nx1 = 50u, nx2 = 50u; + metric_t metric { { nx1, nx2 }, + { { 0.0, 55.0 }, { 0.0, 55.0 } }, + {} }; + + // Particle setup (mirrors deposit.cpp). + const int i0 = 25, j0 = 21, i0f = 24, j0f = 20; + const real_t uz = 2.5; + const prtldx_t dxi = static_cast(0.65); + const prtldx_t dxf = static_cast(0.99); + const prtldx_t dyi = static_cast(0.65); + const prtldx_t dyf = static_cast(0.80); + + array_t i1 { "i1", 10 }; + array_t i2 { "i2", 10 }; + array_t i3 { "i3", 10 }; + array_t i1_prev { "i1_prev", 10 }; + array_t i2_prev { "i2_prev", 10 }; + array_t i3_prev { "i3_prev", 10 }; + array_t dx1 { "dx1", 10 }; + array_t dx2 { "dx2", 10 }; + array_t dx3 { "dx3", 10 }; + array_t dx1_prev { "dx1_prev", 10 }; + array_t dx2_prev { "dx2_prev", 10 }; + array_t dx3_prev { "dx3_prev", 10 }; + array_t ux1 { "ux1", 10 }; + array_t ux2 { "ux2", 10 }; + array_t ux3 { "ux3", 10 }; + array_t phi { "phi", 10 }; + array_t weight { "weight", 10 }; + array_t tag { "tag", 10 }; + const real_t charge = 1.0, dt = 1.0; + + put_value(i1, i0f, 0); + put_value(i2, j0f, 0); + put_value(i1_prev, i0, 0); + put_value(i2_prev, j0, 0); + put_value(dx1, dxf, 0); + put_value(dx2, dyf, 0); + put_value(dx1_prev, dxi, 0); + put_value(dx2_prev, dyi, 0); + put_value(ux1, ZERO, 0); + put_value(ux2, ZERO, 0); + put_value(ux3, uz, 0); + put_value(weight, 1.0, 0); + put_value(tag, ParticleTag::alive, 0); + + // Run the flat kernel. + ndfield_t J_flat { "J_flat", + nx1 + 2u * N_GHOSTS, + nx2 + 2u * N_GHOSTS }; + { + auto J_scat = Kokkos::Experimental::create_scatter_view(J_flat); + Kokkos::parallel_for( + "FlatDeposit", + 10, + kernel::DepositCurrents_kernel( + J_scat, + i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag, + metric, charge, dt)); + Kokkos::Experimental::contribute(J_flat, J_scat); + Kokkos::fence("flat deposit done"); + } + + // Run the tiled kernel. Build a TileLayout with one alive particle + // landing in its expected tile (sort key = min(i, i_prev) / T_TILE). + ndfield_t J_tiled { "J_tiled", + nx1 + 2u * N_GHOSTS, + nx2 + 2u * N_GHOSTS }; + { + const auto sort_i1 = static_cast( + (i0 < i0f) ? i0 : i0f); // min(i, i_prev) before clamp + const auto sort_i2 = static_cast((j0 < j0f) ? j0 : j0f); + const auto ntx1 = static_cast( + std::ceil(static_cast(nx1) / static_cast(T_TILE))); + const auto ntx2 = static_cast( + std::ceil(static_cast(nx2) / static_cast(T_TILE))); + const auto tx1 = static_cast(sort_i1) / T_TILE; + const auto tx2 = static_cast(sort_i2) / T_TILE; + + TileLayout layout; + layout.ntiles_per_axis[0] = ntx1; + layout.ntiles_per_axis[1] = ntx2; + layout.ntiles_per_axis[2] = 1u; + layout.ntiles_total = ntx1 * ntx2; + layout.tile_size = T_TILE; + layout.tile_offsets = build_tile_offsets_single_particle(ntx1, + ntx2, + tx1, + tx2); + + using kernel_t = + kernel::DepositCurrents_kernel_tiled; + kernel_t kern { J_tiled, + i1, i2, i3, + i1_prev, i2_prev, i3_prev, + dx1, dx2, dx3, + dx1_prev, dx2_prev, dx3_prev, + ux1, ux2, ux3, + phi, weight, tag, + metric, charge, dt, layout }; + + Kokkos::TeamPolicy<> policy(static_cast(layout.ntiles_total), + Kokkos::AUTO); + policy.set_scratch_size(0, + Kokkos::PerTeam(kernel_t::scratch_bytes())); + Kokkos::parallel_for("TiledDeposit", policy, kern); + Kokkos::fence("tiled deposit done"); + } + + // Compare J_flat vs J_tiled cell-by-cell. + auto h_flat = Kokkos::create_mirror_view(J_flat); + auto h_tiled = Kokkos::create_mirror_view(J_tiled); + Kokkos::deep_copy(h_flat, J_flat); + Kokkos::deep_copy(h_tiled, J_tiled); + + const real_t eps = static_cast(1.0e-5); + real_t max_diff = ZERO; + int fail_count = 0; + for (ncells_t i = 0; i < h_flat.extent(0); ++i) { + for (ncells_t j = 0; j < h_flat.extent(1); ++j) { + for (int c = 0; c < 3; ++c) { + const real_t a = h_flat(i, j, c); + const real_t b = h_tiled(i, j, c); + const real_t diff = math::fabs(a - b); + const real_t mag = math::max(math::fabs(a), math::fabs(b)); + if (diff > max_diff) { + max_diff = diff; + } + if (diff > eps * math::max(mag, static_cast(1.0))) { + if (fail_count < 5) { + std::cerr << " J(" << i << "," << j << ",c=" << c + << ") flat=" << a << " tiled=" << b + << " diff=" << diff << '\n'; + } + ++fail_count; + } + } + } + } + if (fail_count > 0) { + std::cerr << "X-1 deposit_tiled equivalence FAILED for O=" << O + << " T_TILE=" << T_TILE + << " : " << fail_count << " mismatches; max_diff=" << max_diff + << '\n'; + throw std::logic_error("DepositCurrents_kernel_tiled mismatch"); + } + std::cerr << "X-1 deposit_tiled OK O=" << O << " T_TILE=" << T_TILE + << " max_diff=" << max_diff << '\n'; + } + + template + void run_all_orders() { + run_one_case<0u, T_TILE>(); + run_one_case<1u, T_TILE>(); + run_one_case<2u, T_TILE>(); + run_one_case<3u, T_TILE>(); + run_one_case<4u, T_TILE>(); + run_one_case<5u, T_TILE>(); + run_one_case<6u, T_TILE>(); + run_one_case<7u, T_TILE>(); + run_one_case<8u, T_TILE>(); + run_one_case<9u, T_TILE>(); + run_one_case<10u, T_TILE>(); + run_one_case<11u, T_TILE>(); + } + +} // namespace + +auto main(int argc, char* argv[]) -> int { + Kokkos::initialize(argc, argv); + try { + // Run with each tile-size choice from the validated CMake list. + run_all_orders<4u>(); + run_all_orders<8u>(); + run_all_orders<12u>(); + } catch (std::exception& e) { + std::cerr << e.what() << '\n'; + Kokkos::finalize(); + return 1; + } + Kokkos::finalize(); + return 0; +} From 04d403c606147e0d651cac28f930d5592f808c84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= Date: Tue, 19 May 2026 19:25:38 +0200 Subject: [PATCH 2/7] removed redundant inner loop over species --- src/framework/domain/metadomain_sort.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/framework/domain/metadomain_sort.cpp b/src/framework/domain/metadomain_sort.cpp index 791bf31a8..a2b08f8be 100644 --- a/src/framework/domain/metadomain_sort.cpp +++ b/src/framework/domain/metadomain_sort.cpp @@ -21,9 +21,7 @@ namespace ntt { const auto clearing_interval = species.clearing_interval(); if ((clearing_interval > 0u) and (step % clearing_interval == 0u) and (step > 0u)) { - for (auto& species : domain.species) { - species.RemoveDead(); - } + species.RemoveDead(); } const auto spatial_sorting_interval = species.spatial_sorting_interval(); if ((spatial_sorting_interval > 0u) and From 5830c0740e43851f5d26afb8e35438160be8a007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludwig=20B=C3=B6ss?= Date: Tue, 19 May 2026 19:26:11 +0200 Subject: [PATCH 3/7] frontier-specific memory pool allocation --- src/global/global.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/global/global.cpp b/src/global/global.cpp index ec22fd2f3..c3aa1bcdc 100644 --- a/src/global/global.cpp +++ b/src/global/global.cpp @@ -6,8 +6,51 @@ #include #endif // MPI_ENABLED +#if defined(HIP_ENABLED) + #include + + #include +#endif // HIP_ENABLED + +namespace { +#if defined(HIP_ENABLED) + // Turn the ROCm stream-ordered allocator into a caching arena. + // + // This Kokkos build uses hipMallocAsync/hipFreeAsync (Kokkos option + // IMPL_HIP_MALLOC_ASYNC). The default memory pool has a release + // threshold of 0, so every freed block is handed back to the driver + // at the next stream sync. With ~50 GB of particle SoA permanently + // pinned and only ~14 GB free, the per-step churn of dozens of + // large, differently-sized sort/comm scratch buffers fragments that + // free space: allocation cost grows monotonically (ParticleSort + // slowdown) until no contiguous mid-size block remains and BinSort's + // `sorted_values` allocation fails (OOM). Raising the release + // threshold to "unlimited" makes the pool retain and recycle freed + // blocks instead, which stabilizes the working set and removes both + // the slowdown and the OOM. + void ConfigureHipMemPool() { + int device = 0; + if (hipGetDevice(&device) != hipSuccess) { + return; + } + hipMemPool_t pool = nullptr; + if (hipDeviceGetDefaultMemPool(&pool, device) != hipSuccess or + pool == nullptr) { + return; + } + uint64_t threshold = UINT64_MAX; + (void)hipMemPoolSetAttribute(pool, + hipMemPoolAttrReleaseThreshold, + &threshold); + } +#endif // HIP_ENABLED +} // namespace + void ntt::GlobalInitialize(int argc, char* argv[]) { Kokkos::initialize(argc, argv); +#if defined(HIP_ENABLED) + ConfigureHipMemPool(); +#endif // HIP_ENABLED #if defined(MPI_ENABLED) MPI_Init(&argc, &argv); #endif // MPI_ENABLED From cf14e9f11891afc732bfb9d289b2c16c53af770c Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Tue, 26 May 2026 17:06:27 +0000 Subject: [PATCH 4/7] support more tile sizes --- CMakeLists.txt | 2 +- tests/kernels/deposit_tiled.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 248a34e7b..06494a7db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,7 +65,7 @@ set(team_policy_tile_size ${default_team_policy_tile_size} CACHE STRING "team_policy tile edge length in cells") set(team_policy_tile_sizes - "4;6;8;10;12" + "4;6;8;10;12;14;16" CACHE STRING "team_policy tile-size choices") # -------------------------- Compilation settings -------------------------- # diff --git a/tests/kernels/deposit_tiled.cpp b/tests/kernels/deposit_tiled.cpp index 87f4f3f4d..504cc3818 100644 --- a/tests/kernels/deposit_tiled.cpp +++ b/tests/kernels/deposit_tiled.cpp @@ -250,8 +250,12 @@ auto main(int argc, char* argv[]) -> int { try { // Run with each tile-size choice from the validated CMake list. run_all_orders<4u>(); + run_all_orders<6u>(); run_all_orders<8u>(); + run_all_orders<10u>(); run_all_orders<12u>(); + run_all_orders<14u>(); + run_all_orders<16u>(); } catch (std::exception& e) { std::cerr << e.what() << '\n'; Kokkos::finalize(); From 787aa045750fa2c582b54d02c5e8d3f4265de1e0 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Tue, 26 May 2026 17:12:06 +0000 Subject: [PATCH 5/7] removed persistent sort scratch to reduce memory overhead --- src/framework/containers/particles.h | 16 --- src/framework/containers/particles_sort.cpp | 111 +++++++------------- 2 files changed, 38 insertions(+), 89 deletions(-) diff --git a/src/framework/containers/particles.h b/src/framework/containers/particles.h index 0a15cb5d9..4f0770729 100644 --- a/src/framework/containers/particles.h +++ b/src/framework/containers/particles.h @@ -99,22 +99,6 @@ namespace ntt { // vendor libraries detected by CMake. TileLayout m_tile_layout {}; -#if defined(TEAM_POLICY) && \ - ((defined(SYCL_ENABLED) && defined(ONEDPL_ENABLED)) || \ - (defined(CUDA_ENABLED) && defined(THRUST_ENABLED)) || \ - (defined(HIP_ENABLED) && defined(ROCTHRUST_ENABLED))) - // Persistent byte scratch reused by every SoA-member gather in - // `apply_permutation_to_soa`, across all members and all timesteps. - // Without this each member would allocate (and free) its own - // transient buffer every sort; recycling one persistent buffer - // removes that allocation churn entirely — the structural fix for - // the ROCm sort slowdown / fragmentation. Grown monotonically to - // the largest required size, never shrunk. Kokkos device - // allocations are over-aligned (>= 8 B), so reinterpreting the - // bytes as any SoA element type (<= 8 B PODs) is well-defined. - array_t m_perm_scratch {}; -#endif - public: // for empty allocation Particles() {} diff --git a/src/framework/containers/particles_sort.cpp b/src/framework/containers/particles_sort.cpp index c04813e12..0317e0b5f 100644 --- a/src/framework/containers/particles_sort.cpp +++ b/src/framework/containers/particles_sort.cpp @@ -454,28 +454,20 @@ namespace ntt { #if defined(TEAM_POLICY_USE_VENDOR_SORT) namespace permute_helpers { - // Permute a 1D SoA member array `arr` in place by `perm`, gathering - // through `scratch` — a persistent byte buffer reused by every - // member and every timestep (no per-call allocation). An unmanaged - // typed view aliases the scratch bytes; the caller guarantees - // `scratch` is large enough and that Kokkos' device over-alignment - // covers the element type. + // Permute a 1D SoA member array `arr` in place by `perm`, using a + // single transient buffer of size `n`. Buffer is freed at scope + // exit; the explicit fence right before that drains queued GPU + // work referencing it. template - inline void permute_1d_inplace(V& arr, - const prtl_perm_t& perm, - npart_t n, - const array_t& scratch) { + inline void permute_1d_inplace(V& arr, + const prtl_perm_t& perm, + npart_t n) { if (n == 0u) { return; } - using value_t = typename V::non_const_value_type; - using buf_t = Kokkos::View>; - buf_t buf(reinterpret_cast(scratch.data()), n); - auto perm_v = perm; - auto arr_v = arr; + V buf(std::string(arr.label()) + "_perm_buf", n); + auto perm_v = perm; + auto arr_v = arr; Kokkos::parallel_for( "Permute1D", n, @@ -486,22 +478,16 @@ namespace ntt { // 2D analogue for `pld_r` / `pld_i`. template - inline void permute_2d_inplace(V& arr, - const prtl_perm_t& perm, - npart_t n, - npart_t ncols, - const array_t& scratch) { + inline void permute_2d_inplace(V& arr, + const prtl_perm_t& perm, + npart_t n, + npart_t ncols) { if (n == 0u or ncols == 0u) { return; } - using value_t = typename V::non_const_value_type; - using buf_t = Kokkos::View>; - buf_t buf(reinterpret_cast(scratch.data()), n, ncols); - auto perm_v = perm; - auto arr_v = arr; + V buf(std::string(arr.label()) + "_perm_buf", n, ncols); + auto perm_v = perm; + auto arr_v = arr; Kokkos::parallel_for( "Permute2D", CreateParticleRangePolicy({ 0u, 0u }, { n, ncols }), @@ -522,61 +508,40 @@ namespace ntt { return; } - // Size the persistent scratch once to the largest gather any member - // needs this call: 1D members need n * sizeof(real_t) bytes (the - // widest element); the 2D payloads need n * ncols * elem bytes. - // Grown monotonically, never shrunk — so after warmup this incurs - // no allocation at all. - std::size_t need = static_cast(n) * sizeof(real_t); - if (npld_r() > 0) { - need = std::max(need, - static_cast(n) * - static_cast(npld_r()) * sizeof(real_t)); - } - if (npld_i() > 0) { - need = std::max(need, - static_cast(n) * - static_cast(npld_i()) * sizeof(npart_t)); - } - if (m_perm_scratch.extent(0) < need) { - m_perm_scratch = array_t { "perm_scratch", need }; - } - const auto& scratch = m_perm_scratch; - using permute_helpers::permute_1d_inplace; using permute_helpers::permute_2d_inplace; if constexpr (D == Dim::_1D or D == Dim::_2D or D == Dim::_3D) { - permute_1d_inplace(i1, perm, n, scratch); - permute_1d_inplace(dx1, perm, n, scratch); - permute_1d_inplace(i1_prev, perm, n, scratch); - permute_1d_inplace(dx1_prev, perm, n, scratch); + permute_1d_inplace(i1, perm, n); + permute_1d_inplace(dx1, perm, n); + permute_1d_inplace(i1_prev, perm, n); + permute_1d_inplace(dx1_prev, perm, n); } if constexpr (D == Dim::_2D or D == Dim::_3D) { - permute_1d_inplace(i2, perm, n, scratch); - permute_1d_inplace(dx2, perm, n, scratch); - permute_1d_inplace(i2_prev, perm, n, scratch); - permute_1d_inplace(dx2_prev, perm, n, scratch); + permute_1d_inplace(i2, perm, n); + permute_1d_inplace(dx2, perm, n); + permute_1d_inplace(i2_prev, perm, n); + permute_1d_inplace(dx2_prev, perm, n); } if constexpr (D == Dim::_3D) { - permute_1d_inplace(i3, perm, n, scratch); - permute_1d_inplace(dx3, perm, n, scratch); - permute_1d_inplace(i3_prev, perm, n, scratch); - permute_1d_inplace(dx3_prev, perm, n, scratch); - } - permute_1d_inplace(ux1, perm, n, scratch); - permute_1d_inplace(ux2, perm, n, scratch); - permute_1d_inplace(ux3, perm, n, scratch); - permute_1d_inplace(weight, perm, n, scratch); - permute_1d_inplace(tag, perm, n, scratch); + permute_1d_inplace(i3, perm, n); + permute_1d_inplace(dx3, perm, n); + permute_1d_inplace(i3_prev, perm, n); + permute_1d_inplace(dx3_prev, perm, n); + } + permute_1d_inplace(ux1, perm, n); + permute_1d_inplace(ux2, perm, n); + permute_1d_inplace(ux3, perm, n); + permute_1d_inplace(weight, perm, n); + permute_1d_inplace(tag, perm, n); if constexpr (D == Dim::_2D and C != Coord::Cartesian) { - permute_1d_inplace(phi, perm, n, scratch); + permute_1d_inplace(phi, perm, n); } if (npld_r() > 0) { - permute_2d_inplace(pld_r, perm, n, static_cast(npld_r()), scratch); + permute_2d_inplace(pld_r, perm, n, static_cast(npld_r())); } if (npld_i() > 0) { - permute_2d_inplace(pld_i, perm, n, static_cast(npld_i()), scratch); + permute_2d_inplace(pld_i, perm, n, static_cast(npld_i())); } } #endif // TEAM_POLICY_USE_VENDOR_SORT From 827cf264c2903b3179252c88f19c0a7bb090357d Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Tue, 2 Jun 2026 18:20:33 +0000 Subject: [PATCH 6/7] added team policy reporting --- src/engines/reporter.cpp | 3 +++ src/global/utils/reporter.cpp | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/src/engines/reporter.cpp b/src/engines/reporter.cpp index f56874d23..3caeda5de 100644 --- a/src/engines/reporter.cpp +++ b/src/engines/reporter.cpp @@ -32,6 +32,9 @@ namespace ntt { "%s", params.template get("simulation.name").c_str()); reporter::AddParam(report, 4, "Engine", "%s", SimEngine(S).to_string()); +#if defined(TEAM_POLICY) + reporter::AddParam(report, 4, "Tile size", "%d", TEAM_POLICY_TILE_SIZE); +#endif reporter::AddParam(report, 4, "Metric", "%s", M.to_string()); #if SHAPE_ORDER == 0 reporter::AddParam(report, 4, "Deposit", "%s", "zigzag"); diff --git a/src/global/utils/reporter.cpp b/src/global/utils/reporter.cpp index 77117c4b9..a4b10eee6 100644 --- a/src/global/utils/reporter.cpp +++ b/src/global/utils/reporter.cpp @@ -250,6 +250,12 @@ namespace reporter { #else AddParam(report, 4, "GPU_AWARE_MPI", "%s", "OFF"); #endif + +#if defined(TEAM_POLICY) + AddParam(report, 4, "TEAM_POLICY", "%s", "ON"); +#else + AddParam(report, 4, "TEAM_POLICY", "%s", "OFF"); +#endif report += "\n"; return report; } From 4b81914aae5cbc15cea19217b284113f617750f3 Mon Sep 17 00:00:00 2001 From: LudwigBoess Date: Tue, 2 Jun 2026 19:08:47 +0000 Subject: [PATCH 7/7] explicitly bind GPU Transport Layer for GPU aware MPI on Frontier --- CMakeLists.txt | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06494a7db..0137671a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,6 +211,62 @@ if(${mpi}) if(${DEVICE_ENABLED}) if(${gpu_aware_mpi}) add_compile_options("-D GPU_AWARE_MPI") + + # On Cray systems (e.g. Frontier) GPU-aware Cray MPICH can only + # handle device pointers if the GPU Transport Layer (GTL) library + # is linked. The Cray compiler wrappers (cc/CC) inject this + # automatically, but we build with hipcc/nvcc directly, so + # find_package(MPI) only finds base libmpi and the GTL is left + # out -> MPI_Sendrecv on a device pointer fails with + # "OFI ... Bad address". Add it explicitly here. + # + # Cray PE exports PE_MPICH_GTL_DIR_ / PE_MPICH_GTL_LIBS_ + # (e.g. amd_gfx90a -> -lmpi_gtl_hsa). Their absence means this is + # not a Cray MPICH build, in which case nothing extra is needed. + if("${Kokkos_DEVICES}" MATCHES "HIP") + set(_gtl_accels amd_gfx942 amd_gfx940 amd_gfx90a amd_gfx908 amd_gfx906) + elseif("${Kokkos_DEVICES}" MATCHES "CUDA") + set(_gtl_accels nvidia90 nvidia80 nvidia70) + elseif("${Kokkos_DEVICES}" MATCHES "SYCL") + set(_gtl_accels ponteVecchio) + else() + set(_gtl_accels "") + endif() + + set(_gtl_dir "") + set(_gtl_libflag "") + foreach(_accel ${_gtl_accels}) + if((NOT _gtl_dir) AND (DEFINED ENV{PE_MPICH_GTL_DIR_${_accel}})) + # strip the leading "-L" from the Cray-provided value + string(REGEX REPLACE "^-L" "" + _gtl_dir "$ENV{PE_MPICH_GTL_DIR_${_accel}}") + string(REGEX REPLACE "^-l" "" + _gtl_libflag "$ENV{PE_MPICH_GTL_LIBS_${_accel}}") + endif() + endforeach() + + if(_gtl_dir AND _gtl_libflag) + find_library(MPI_GTL_LIBRARY + NAMES ${_gtl_libflag} + HINTS "${_gtl_dir}" + NO_DEFAULT_PATH) + if(MPI_GTL_LIBRARY) + message(STATUS + "GPU-aware MPI: linking Cray GTL library ${MPI_GTL_LIBRARY}") + set(DEPENDENCIES ${DEPENDENCIES} ${MPI_GTL_LIBRARY}) + else() + message(FATAL_ERROR + "${Red}gpu_aware_mpi=ON: Cray MPICH detected but the GTL " + "library 'lib${_gtl_libflag}' was not found in '${_gtl_dir}'. " + "GPU-aware MPI will crash at runtime without it. Make sure the " + "craype-accel module is loaded, or build with gpu_aware_mpi=OFF." + "${ColorReset}") + endif() + else() + message(STATUS + "GPU-aware MPI: no Cray GTL environment found; assuming the MPI " + "implementation is GPU-aware without an extra transport library.") + endif() endif() else() set(gpu_aware_mpi