Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,16 @@ set(gpu_aware_mpi
${default_gpu_aware_mpi}
CACHE BOOL "Enable GPU-aware MPI")

set(team_policy
${default_team_policy}
CACHE BOOL "Enable team_policy tile-blocked deposit/pusher kernels")
set(team_policy_tile_size
${default_team_policy_tile_size}
CACHE STRING "team_policy tile edge length in cells")
set(team_policy_tile_sizes
"4;6;8;10;12;14;16"
CACHE STRING "team_policy tile-size choices")

# -------------------------- Compilation settings -------------------------- #
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
Expand Down Expand Up @@ -136,6 +146,62 @@ else()
set(DEVICE_ENABLED OFF)
endif()

# ------------------------------ team_policy wiring ------------------------ #
if(${team_policy})
list(FIND team_policy_tile_sizes "${team_policy_tile_size}" _tps_idx)
if(_tps_idx EQUAL -1)
message(FATAL_ERROR
"${Red}team_policy_tile_size must be one of ${team_policy_tile_sizes}, "
"got '${team_policy_tile_size}'${ColorReset}")
endif()
add_compile_options("-D TEAM_POLICY")
add_compile_options("-D TEAM_POLICY_TILE_SIZE=${team_policy_tile_size}")

# Vendor sort: oneDPL on SYCL, Thrust on CUDA. Used automatically
# when found; falls back to Kokkos::BinSort otherwise.
if("${Kokkos_DEVICES}" MATCHES "SYCL")
find_package(oneDPL QUIET)
if(oneDPL_FOUND)
message(STATUS "team_policy: oneDPL found, enabling SYCL sort_by_key")
add_compile_options("-D ONEDPL_ENABLED")
set(DEPENDENCIES ${DEPENDENCIES} oneDPL)
else()
message(STATUS "team_policy: oneDPL not found; using BinSort fallback "
"for SYCL sort_by_key")
endif()
endif()

if("${Kokkos_DEVICES}" MATCHES "CUDA")
find_package(Thrust QUIET)
if(Thrust_FOUND)
message(STATUS "team_policy: Thrust enabled for CUDA sort_by_key")
add_compile_options("-D THRUST_ENABLED")
else()
message(STATUS "team_policy: Thrust not found; using BinSort fallback "
"for CUDA sort_by_key")
endif()
endif()

if("${Kokkos_DEVICES}" MATCHES "HIP")
# rocThrust ships with ROCm and exposes the same thrust:: API. Using
# it lets the HIP backend build a single permutation via
# sort_by_key and gather all SoA members through one reused buffer,
# instead of the legacy per-member Kokkos::BinSort path which
# allocates a fresh `sorted_values` buffer for every member every
# step (the dominant source of allocator churn / fragmentation on
# ROCm).
find_package(rocthrust QUIET)
if(rocthrust_FOUND)
message(STATUS "team_policy: rocThrust enabled for HIP sort_by_key")
add_compile_options("-D ROCTHRUST_ENABLED")
set(DEPENDENCIES ${DEPENDENCIES} roc::rocthrust)
else()
message(STATUS "team_policy: rocThrust not found; using BinSort "
"fallback for HIP sort_by_key")
endif()
endif()
endif()

# MPI
if(${mpi})
find_or_fetch_dependency(MPI FALSE REQUIRED)
Expand All @@ -145,6 +211,62 @@ if(${mpi})
if(${DEVICE_ENABLED})
if(${gpu_aware_mpi})
add_compile_options("-D GPU_AWARE_MPI")

# On Cray systems (e.g. Frontier) GPU-aware Cray MPICH can only
# handle device pointers if the GPU Transport Layer (GTL) library
# is linked. The Cray compiler wrappers (cc/CC) inject this
# automatically, but we build with hipcc/nvcc directly, so
# find_package(MPI) only finds base libmpi and the GTL is left
# out -> MPI_Sendrecv on a device pointer fails with
# "OFI ... Bad address". Add it explicitly here.
#
# Cray PE exports PE_MPICH_GTL_DIR_<accel> / PE_MPICH_GTL_LIBS_<accel>
# (e.g. amd_gfx90a -> -lmpi_gtl_hsa). Their absence means this is
# not a Cray MPICH build, in which case nothing extra is needed.
if("${Kokkos_DEVICES}" MATCHES "HIP")
set(_gtl_accels amd_gfx942 amd_gfx940 amd_gfx90a amd_gfx908 amd_gfx906)
elseif("${Kokkos_DEVICES}" MATCHES "CUDA")
set(_gtl_accels nvidia90 nvidia80 nvidia70)
elseif("${Kokkos_DEVICES}" MATCHES "SYCL")
set(_gtl_accels ponteVecchio)
else()
set(_gtl_accels "")
endif()

set(_gtl_dir "")
set(_gtl_libflag "")
foreach(_accel ${_gtl_accels})
if((NOT _gtl_dir) AND (DEFINED ENV{PE_MPICH_GTL_DIR_${_accel}}))
# strip the leading "-L" from the Cray-provided value
string(REGEX REPLACE "^-L" ""
_gtl_dir "$ENV{PE_MPICH_GTL_DIR_${_accel}}")
string(REGEX REPLACE "^-l" ""
_gtl_libflag "$ENV{PE_MPICH_GTL_LIBS_${_accel}}")
endif()
endforeach()

if(_gtl_dir AND _gtl_libflag)
find_library(MPI_GTL_LIBRARY
NAMES ${_gtl_libflag}
HINTS "${_gtl_dir}"
NO_DEFAULT_PATH)
if(MPI_GTL_LIBRARY)
message(STATUS
"GPU-aware MPI: linking Cray GTL library ${MPI_GTL_LIBRARY}")
set(DEPENDENCIES ${DEPENDENCIES} ${MPI_GTL_LIBRARY})
else()
message(FATAL_ERROR
"${Red}gpu_aware_mpi=ON: Cray MPICH detected but the GTL "
"library 'lib${_gtl_libflag}' was not found in '${_gtl_dir}'. "
"GPU-aware MPI will crash at runtime without it. Make sure the "
"craype-accel module is loaded, or build with gpu_aware_mpi=OFF."
"${ColorReset}")
endif()
else()
message(STATUS
"GPU-aware MPI: no Cray GTL environment found; assuming the MPI "
"implementation is GPU-aware without an extra transport library.")
endif()
endif()
else()
set(gpu_aware_mpi
Expand Down
15 changes: 15 additions & 0 deletions cmake/defaults.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,18 @@ else()
endif()

set_property(CACHE default_gpu_aware_mpi PROPERTY TYPE BOOL)

if(DEFINED ENV{Entity_ENABLE_TEAM_POLICY})
set(default_team_policy
$ENV{Entity_ENABLE_TEAM_POLICY}
CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
else()
set(default_team_policy
OFF
CACHE INTERNAL "Default flag for team_policy tile-blocked kernels")
endif()
set_property(CACHE default_team_policy PROPERTY TYPE BOOL)

set(default_team_policy_tile_size
8
CACHE INTERNAL "Default tile edge length in cells for team_policy")
25 changes: 25 additions & 0 deletions cmake/report.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,26 @@ if(${mpi} AND ${DEVICE_ENABLED})
GPU_AWARE_MPI_REPORT
46)
endif()
printchoices(
"Team Policy"
"team_policy"
"${ON_OFF_VALUES}"
${team_policy}
OFF
"${Green}"
TEAM_POLICY_REPORT
46)
if(${team_policy})
printchoices(
"Team Tile Size"
"team_policy_tile_size"
"${team_policy_tile_sizes}"
${team_policy_tile_size}
${default_team_policy_tile_size}
"${Blue}"
TEAM_POLICY_TILE_SIZE_REPORT
46)
endif()
printchoices(
"Debug mode"
"DEBUG"
Expand Down Expand Up @@ -197,6 +217,11 @@ if(${mpi} AND ${DEVICE_ENABLED})
string(APPEND REPORT_TEXT " " ${GPU_AWARE_MPI_REPORT} "\n")
endif()

string(APPEND REPORT_TEXT " " ${TEAM_POLICY_REPORT} "\n")
if(${team_policy})
string(APPEND REPORT_TEXT " " ${TEAM_POLICY_TILE_SIZE_REPORT} "\n")
endif()

string(
APPEND
REPORT_TEXT
Expand Down
3 changes: 3 additions & 0 deletions src/engines/reporter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ namespace ntt {
"%s",
params.template get<std::string>("simulation.name").c_str());
reporter::AddParam(report, 4, "Engine", "%s", SimEngine(S).to_string());
#if defined(TEAM_POLICY)
reporter::AddParam(report, 4, "Tile size", "%d", TEAM_POLICY_TILE_SIZE);
#endif
reporter::AddParam(report, 4, "Metric", "%s", M.to_string());
#if SHAPE_ORDER == 0
reporter::AddParam(report, 4, "Deposit", "%s", "zigzag");
Expand Down
135 changes: 134 additions & 1 deletion src/engines/srpic/currents.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
* @file engines/srpic/currents.h
* @brief Current deposition and filtering routines for the SRPIC engine
* @implements
* - ntt::srpic::CallDepositKernel<> -> void
* - ntt::srpic::CallDepositKernel<> -> void (flat path)
* - ntt::srpic::CallDepositKernelTiled<> -> void (TEAM_POLICY)
* - ntt::srpic::CurrentsDeposit<> -> void
* - ntt::srpic::CurrentsFilter<> -> void
* @namespaces:
Expand Down Expand Up @@ -61,11 +62,142 @@ namespace ntt {
dt));
}

#if defined(TEAM_POLICY)
/**
* @brief Tiled deposit launcher (TeamPolicy + per-team scratch).
*
* Iterates over `tile_layout.ntiles_total` teams; each team accumulates
* its tile's particle contributions in SLM scratch and atomically
* flushes to the global J. Requires the species to have been sorted
* with `team_policy` enabled (`tile_layout` populated by
* `SortSpatially`).
*
* Falls back to the flat kernel if `tile_offsets` is empty — this
* happens on the first step before the first sort, or for very small
* species that exited early in `SortSpatially`. The fallback uses the
* passed-in `scatter_cur` so the caller still composes correctly.
*/
template <SRMetricClass M, unsigned short O>
void CallDepositKernelTiled(
const Particles<M::Dim, M::CoordType>& species,
const M& local_metric,
const ndfield_t<M::Dim, 3>& cur,
real_t dt) {
static_assert(O <= 11u, "Shape order must be <= 11");
constexpr unsigned short T = static_cast<unsigned short>(
TEAM_POLICY_TILE_SIZE);
const auto& layout = species.tile_layout();
raise::ErrorIf(layout.ntiles_total == 0u,
"CallDepositKernelTiled: tile_layout has 0 tiles — call "
"SortSpatially before CurrentsDeposit",
HERE);
raise::ErrorIf(layout.tile_offsets.extent(0) != layout.ntiles_total + 1u,
"CallDepositKernelTiled: tile_offsets size inconsistent "
"with ntiles_total",
HERE);

using kernel_t = kernel::DepositCurrents_kernel_tiled<SimEngine::SRPIC,
M,
O,
T>;
kernel_t kern { cur,
species.i1,
species.i2,
species.i3,
species.i1_prev,
species.i2_prev,
species.i3_prev,
species.dx1,
species.dx2,
species.dx3,
species.dx1_prev,
species.dx2_prev,
species.dx3_prev,
species.ux1,
species.ux2,
species.ux3,
species.phi,
species.weight,
species.tag,
local_metric,
(real_t)(species.charge()),
dt,
layout };

Kokkos::TeamPolicy<> policy(static_cast<int>(layout.ntiles_total),
Kokkos::AUTO);
policy.set_scratch_size(0, Kokkos::PerTeam(kernel_t::scratch_bytes()));
Kokkos::parallel_for("CurrentsDepositTiled", policy, kern);
}
#endif // TEAM_POLICY

template <SRMetricClass M>
void CurrentsDeposit(Domain<SimEngine::SRPIC, M>& domain,
const prm::Parameters& engine_params) {
const auto dt = engine_params.get<real_t>("dt");
Kokkos::deep_copy(domain.fields.cur, ZERO);

#if defined(TEAM_POLICY)

// First-step fallback: if any contributing species has not been
// sorted yet (tile_layout still empty), fall back to the flat
// scatter-view path for that step. Subsequent steps see populated
// layouts and use the tiled kernel.
bool any_unsorted = false;
for (auto& species : domain.species) {
if ((species.pusher() == ParticlePusher::NONE) or
(species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
continue;
}
if (species.tile_layout().ntiles_total == 0u or
species.tile_layout().tile_offsets.extent(0) == 0u) {
any_unsorted = true;
break;
}
}
if (any_unsorted) {
auto scatter_cur = Kokkos::Experimental::create_scatter_view(
domain.fields.cur);
for (auto& species : domain.species) {
if ((species.pusher() == ParticlePusher::NONE) or
(species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
continue;
}
logger::Checkpoint(
fmt::format("Launching currents deposit (flat fallback, no sort yet) "
"for %d [%s] : %lu %f",
species.index(),
species.label().c_str(),
species.npart(),
(double)species.charge()),
HERE);
CallDepositKernel<M, SHAPE_ORDER>(species,
domain.mesh.metric,
scatter_cur,
dt);
}
Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
} else {
for (auto& species : domain.species) {
if ((species.pusher() == ParticlePusher::NONE) or
(species.npart() == 0) or cmp::AlmostZero_host(species.charge())) {
continue;
}
logger::Checkpoint(
fmt::format("Launching tiled currents deposit for %d [%s] : %lu %f",
species.index(),
species.label().c_str(),
species.npart(),
(double)species.charge()),
HERE);

CallDepositKernelTiled<M, SHAPE_ORDER>(species,
domain.mesh.metric,
domain.fields.cur,
dt);
}
}
#else
auto scatter_cur = Kokkos::Experimental::create_scatter_view(
domain.fields.cur);
for (auto& species : domain.species) {
Expand All @@ -84,6 +216,7 @@ namespace ntt {
CallDepositKernel<M, SHAPE_ORDER>(species, domain.mesh.metric, scatter_cur, dt);
}
Kokkos::Experimental::contribute(domain.fields.cur, scatter_cur);
#endif
}

template <SRMetricClass M>
Expand Down
Loading
Loading