From 322f82eab23cf7566c210d8225b264a9d7453bcc Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Tue, 16 Dec 2025 11:24:55 +0100
Subject: [PATCH 01/33] run.py

---
 .gitattributes                                |    2 +
 Makefile                                      |    2 +-
 include/alpaka/acc/AccCpuOmp2Blocks.hpp       |  234 +++
 include/alpaka/acc/AccCpuOmp2Threads.hpp      |  237 +++
 include/alpaka/acc/AccCpuSerial.hpp           |  227 +++
 include/alpaka/acc/AccCpuSycl.hpp             |   38 +
 include/alpaka/acc/AccCpuTbbBlocks.hpp        |  228 +++
 include/alpaka/acc/AccCpuThreads.hpp          |  245 +++
 include/alpaka/acc/AccDevProps.hpp            |   34 +
 include/alpaka/acc/AccFpgaSyclIntel.hpp       |   38 +
 include/alpaka/acc/AccGenericSycl.hpp         |  214 +++
 include/alpaka/acc/AccGpuCudaRt.hpp           |   34 +
 include/alpaka/acc/AccGpuHipRt.hpp            |   34 +
 include/alpaka/acc/AccGpuSyclIntel.hpp        |   38 +
 include/alpaka/acc/AccGpuUniformCudaHipRt.hpp |  307 ++++
 include/alpaka/acc/Tag.hpp                    |   72 +
 include/alpaka/acc/TagAccIsEnabled.hpp        |   36 +
 include/alpaka/acc/Traits.hpp                 |  115 ++
 include/alpaka/alpaka.hpp                     |  229 +++
 include/alpaka/atomic/AtomicAtomicRef.hpp     |  237 +++
 include/alpaka/atomic/AtomicCpu.hpp           |   30 +
 include/alpaka/atomic/AtomicGenericSycl.hpp   |  263 +++
 include/alpaka/atomic/AtomicHierarchy.hpp     |   34 +
 include/alpaka/atomic/AtomicNoOp.hpp          |   37 +
 include/alpaka/atomic/AtomicOmpBuiltIn.hpp    |  320 ++++
 include/alpaka/atomic/AtomicStdLibLock.hpp    |  103 ++
 .../alpaka/atomic/AtomicUniformCudaHip.hpp    |  512 ++++++
 .../atomic/AtomicUniformCudaHipBuiltIn.hpp    |  321 ++++
 include/alpaka/atomic/Op.hpp                  |  249 +++
 include/alpaka/atomic/Traits.hpp              |  304 ++++
 .../dyn/BlockSharedDynMemberAllocKiB.hpp      |   15 +
 .../dyn/BlockSharedMemDynGenericSycl.hpp      |   43 +
 .../shared/dyn/BlockSharedMemDynMember.hpp    |  113 ++
 ...BlockSharedMemDynUniformCudaHipBuiltIn.hpp |   57 +
 include/alpaka/block/shared/dyn/Traits.hpp    |   44 +
 .../shared/st/BlockSharedMemStGenericSycl.hpp |   67 +
 .../shared/st/BlockSharedMemStMember.hpp      |   59 +
 .../st/BlockSharedMemStMemberMasterSync.hpp   |   86 +
 .../BlockSharedMemStUniformCudaHipBuiltIn.hpp |   60 +
 include/alpaka/block/shared/st/Traits.hpp     |   59 +
 .../st/detail/BlockSharedMemStMemberImpl.hpp  |  145 ++
 .../alpaka/block/sync/BlockSyncBarrierOmp.hpp |  109 ++
 .../block/sync/BlockSyncBarrierThread.hpp     |   62 +
 .../block/sync/BlockSyncGenericSycl.hpp       |   79 +
 include/alpaka/block/sync/BlockSyncNoOp.hpp   |   40 +
 .../sync/BlockSyncUniformCudaHipBuiltIn.hpp   |  122 ++
 include/alpaka/block/sync/Traits.hpp          |  107 ++
 include/alpaka/core/Align.hpp                 |   65 +
 include/alpaka/core/AlignedAlloc.hpp          |   23 +
 include/alpaka/core/ApiCudaRt.hpp             |  402 +++++
 include/alpaka/core/ApiHipRt.hpp              |  441 +++++
 include/alpaka/core/Assert.hpp                |  105 ++
 include/alpaka/core/BarrierThread.hpp         |  168 ++
 include/alpaka/core/BoostPredef.hpp           |   79 +
 include/alpaka/core/CallbackThread.hpp        |  171 ++
 include/alpaka/core/ClipCast.hpp              |   27 +
 include/alpaka/core/Common.hpp                |  221 +++
 include/alpaka/core/Concepts.hpp              |   67 +
 include/alpaka/core/Cuda.hpp                  |   58 +
 include/alpaka/core/CudaHipCommon.hpp         |  161 ++
 include/alpaka/core/Debug.hpp                 |   77 +
 include/alpaka/core/Decay.hpp                 |   16 +
 include/alpaka/core/DemangleTypeNames.hpp     |   23 +
 include/alpaka/core/Hip.hpp                   |   14 +
 include/alpaka/core/OmpSchedule.hpp           |   88 +
 include/alpaka/core/Positioning.hpp           |   49 +
 include/alpaka/core/RemoveRestrict.hpp        |   35 +
 include/alpaka/core/RuntimeMacros.hpp         |   52 +
 include/alpaka/core/Sycl.hpp                  |  199 +++
 include/alpaka/core/ThreadPool.hpp            |  104 ++
 include/alpaka/core/UniformCudaHip.hpp        |  113 ++
 include/alpaka/core/Unreachable.hpp           |   25 +
 include/alpaka/core/Unroll.hpp                |   25 +
 include/alpaka/core/Utility.hpp               |   62 +
 include/alpaka/core/Vectorize.hpp             |  358 ++++
 include/alpaka/dev/DevCpu.hpp                 |  207 +++
 include/alpaka/dev/DevCpuSycl.hpp             |   17 +
 include/alpaka/dev/DevCudaRt.hpp              |   18 +
 include/alpaka/dev/DevFpgaSyclIntel.hpp       |   17 +
 include/alpaka/dev/DevGenericSycl.hpp         |  282 ++++
 include/alpaka/dev/DevGpuSyclIntel.hpp        |   17 +
 include/alpaka/dev/DevHipRt.hpp               |   18 +
 include/alpaka/dev/DevUniformCudaHipRt.hpp    |  269 +++
 include/alpaka/dev/Traits.hpp                 |  140 ++
 include/alpaka/dev/common/QueueRegistry.hpp   |   59 +
 include/alpaka/dev/cpu/SysInfo.hpp            |  237 +++
 include/alpaka/dev/cpu/Wait.hpp               |   27 +
 include/alpaka/dim/DimArithmetic.hpp          |   19 +
 include/alpaka/dim/DimIntegralConst.hpp       |   16 +
 include/alpaka/dim/Traits.hpp                 |   20 +
 include/alpaka/elem/Traits.hpp                |   33 +
 include/alpaka/event/EventCpu.hpp             |   13 +
 include/alpaka/event/EventCpuSycl.hpp         |   17 +
 include/alpaka/event/EventCudaRt.hpp          |   18 +
 include/alpaka/event/EventFpgaSyclIntel.hpp   |   17 +
 include/alpaka/event/EventGenericSycl.hpp     |  161 ++
 include/alpaka/event/EventGenericThreads.hpp  |  395 +++++
 include/alpaka/event/EventGpuSyclIntel.hpp    |   17 +
 include/alpaka/event/EventHipRt.hpp           |   18 +
 .../alpaka/event/EventUniformCudaHipRt.hpp    |  263 +++
 include/alpaka/event/Traits.hpp               |   38 +
 include/alpaka/example/ExampleDefaultAcc.hpp  |   41 +
 .../alpaka/example/ExecuteForEachAccTag.hpp   |   27 +
 include/alpaka/exec/ElementIndex.hpp          |   18 +
 include/alpaka/exec/IndependentElements.hpp   |  454 +++++
 include/alpaka/exec/Once.hpp                  |   56 +
 include/alpaka/exec/UniformElements.hpp       | 1145 +++++++++++++
 include/alpaka/extent/Traits.hpp              |  162 ++
 include/alpaka/idx/Accessors.hpp              |  116 ++
 include/alpaka/idx/MapIdx.hpp                 |   98 ++
 include/alpaka/idx/Traits.hpp                 |   44 +
 include/alpaka/idx/bt/IdxBtGenericSycl.hpp    |   77 +
 include/alpaka/idx/bt/IdxBtLinear.hpp         |   72 +
 include/alpaka/idx/bt/IdxBtOmp.hpp            |   77 +
 include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp |   77 +
 .../idx/bt/IdxBtUniformCudaHipBuiltIn.hpp     |   81 +
 include/alpaka/idx/bt/IdxBtZero.hpp           |   53 +
 include/alpaka/idx/gb/IdxGbGenericSycl.hpp    |   77 +
 include/alpaka/idx/gb/IdxGbLinear.hpp         |   73 +
 include/alpaka/idx/gb/IdxGbRef.hpp            |   59 +
 .../idx/gb/IdxGbUniformCudaHipBuiltIn.hpp     |   81 +
 include/alpaka/intrinsic/IntrinsicCpu.hpp     |   88 +
 .../alpaka/intrinsic/IntrinsicFallback.hpp    |   77 +
 .../alpaka/intrinsic/IntrinsicGenericSycl.hpp |   57 +
 .../IntrinsicUniformCudaHipBuiltIn.hpp        |   78 +
 include/alpaka/intrinsic/Traits.hpp           |   84 +
 .../kernel/KernelFunctionAttributes.hpp       |   25 +
 include/alpaka/kernel/SyclSubgroupSize.hpp    |  120 ++
 .../alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp |  991 +++++++++++
 .../kernel/TaskKernelCpuOmp2Threads.hpp       |  232 +++
 include/alpaka/kernel/TaskKernelCpuSerial.hpp |  171 ++
 include/alpaka/kernel/TaskKernelCpuSycl.hpp   |   20 +
 .../alpaka/kernel/TaskKernelCpuTbbBlocks.hpp  |  183 ++
 .../alpaka/kernel/TaskKernelCpuThreads.hpp    |  240 +++
 .../alpaka/kernel/TaskKernelFpgaSyclIntel.hpp |   20 +
 .../alpaka/kernel/TaskKernelGenericSycl.hpp   |  314 ++++
 include/alpaka/kernel/TaskKernelGpuCudaRt.hpp |   19 +
 include/alpaka/kernel/TaskKernelGpuHipRt.hpp  |   18 +
 .../alpaka/kernel/TaskKernelGpuSyclIntel.hpp  |   20 +
 .../kernel/TaskKernelGpuUniformCudaHipRt.hpp  |  373 +++++
 include/alpaka/kernel/Traits.hpp              |  383 +++++
 include/alpaka/math/Complex.hpp               |  582 +++++++
 include/alpaka/math/FloatEqualExact.hpp       |   50 +
 include/alpaka/math/MathGenericSycl.hpp       |  751 +++++++++
 include/alpaka/math/MathStdLib.hpp            |  299 ++++
 .../alpaka/math/MathUniformCudaHipBuiltIn.hpp | 1373 +++++++++++++++
 include/alpaka/math/Traits.hpp                | 1488 +++++++++++++++++
 include/alpaka/mem/alloc/AllocCpuAligned.hpp  |   67 +
 include/alpaka/mem/alloc/AllocCpuNew.hpp      |   39 +
 include/alpaka/mem/alloc/Traits.hpp           |   46 +
 include/alpaka/mem/buf/BufCpu.hpp             |  314 ++++
 include/alpaka/mem/buf/BufCpuSycl.hpp         |   19 +
 include/alpaka/mem/buf/BufCudaRt.hpp          |   18 +
 include/alpaka/mem/buf/BufFpgaSyclIntel.hpp   |   19 +
 include/alpaka/mem/buf/BufGenericSycl.hpp     |  272 +++
 include/alpaka/mem/buf/BufGpuSyclIntel.hpp    |   19 +
 include/alpaka/mem/buf/BufHipRt.hpp           |   18 +
 .../alpaka/mem/buf/BufUniformCudaHipRt.hpp    |  422 +++++
 include/alpaka/mem/buf/SetKernel.hpp          |   58 +
 include/alpaka/mem/buf/Traits.hpp             |  192 +++
 include/alpaka/mem/buf/cpu/Copy.hpp           |  220 +++
 include/alpaka/mem/buf/cpu/Set.hpp            |  186 +++
 include/alpaka/mem/buf/sycl/Common.hpp        |   57 +
 include/alpaka/mem/buf/sycl/Copy.hpp          |  240 +++
 include/alpaka/mem/buf/sycl/Set.hpp           |  212 +++
 .../alpaka/mem/buf/uniformCudaHip/Copy.hpp    |  643 +++++++
 include/alpaka/mem/buf/uniformCudaHip/Set.hpp |  385 +++++
 include/alpaka/mem/fence/MemFenceCpu.hpp      |   61 +
 .../alpaka/mem/fence/MemFenceCpuSerial.hpp    |   49 +
 .../alpaka/mem/fence/MemFenceGenericSycl.hpp  |   60 +
 .../alpaka/mem/fence/MemFenceOmp2Blocks.hpp   |   54 +
 .../alpaka/mem/fence/MemFenceOmp2Threads.hpp  |   68 +
 .../fence/MemFenceUniformCudaHipBuiltIn.hpp   |   65 +
 include/alpaka/mem/fence/Traits.hpp           |   66 +
 include/alpaka/mem/global/DeviceGlobalCpu.hpp |  151 ++
 .../mem/global/DeviceGlobalGenericSycl.hpp    |   96 ++
 .../DeviceGlobalUniformCudaHipBuiltIn.hpp     |  187 +++
 include/alpaka/mem/global/Traits.hpp          |   45 +
 include/alpaka/mem/view/Traits.hpp            |  614 +++++++
 include/alpaka/mem/view/ViewAccessOps.hpp     |  151 ++
 include/alpaka/mem/view/ViewConst.hpp         |  115 ++
 include/alpaka/mem/view/ViewPlainPtr.hpp      |  192 +++
 include/alpaka/mem/view/ViewStdArray.hpp      |   94 ++
 include/alpaka/mem/view/ViewStdVector.hpp     |   92 +
 include/alpaka/mem/view/ViewSubView.hpp       |  217 +++
 include/alpaka/meta/Apply.hpp                 |   22 +
 include/alpaka/meta/CartesianProduct.hpp      |   84 +
 include/alpaka/meta/Concatenate.hpp           |   29 +
 include/alpaka/meta/DependentFalseType.hpp    |   17 +
 include/alpaka/meta/Filter.hpp                |   47 +
 include/alpaka/meta/Fold.hpp                  |   24 +
 include/alpaka/meta/ForEachType.hpp           |   52 +
 include/alpaka/meta/Functional.hpp            |   30 +
 include/alpaka/meta/InheritFromList.hpp       |   16 +
 include/alpaka/meta/IntegerSequence.hpp       |  125 ++
 include/alpaka/meta/Integral.hpp              |   56 +
 include/alpaka/meta/IsArrayOrVector.hpp       |   65 +
 include/alpaka/meta/IsStrictBase.hpp          |   15 +
 include/alpaka/meta/NdLoop.hpp                |   85 +
 include/alpaka/meta/NonZero.hpp               |   27 +
 include/alpaka/meta/Set.hpp                   |   60 +
 include/alpaka/meta/Transform.hpp             |   22 +
 include/alpaka/meta/TypeListOps.hpp           |   95 ++
 include/alpaka/meta/Unique.hpp                |   41 +
 include/alpaka/offset/Traits.hpp              |  132 ++
 include/alpaka/platform/PlatformCpu.hpp       |   69 +
 include/alpaka/platform/PlatformCpuSycl.hpp   |   33 +
 include/alpaka/platform/PlatformCudaRt.hpp    |   18 +
 .../alpaka/platform/PlatformFpgaSyclIntel.hpp |   51 +
 .../alpaka/platform/PlatformGenericSycl.hpp   |  746 +++++++++
 .../alpaka/platform/PlatformGpuSyclIntel.hpp  |   36 +
 include/alpaka/platform/PlatformHipRt.hpp     |   18 +
 .../platform/PlatformUniformCudaHipRt.hpp     |  265 +++
 include/alpaka/platform/Traits.hpp            |   94 ++
 include/alpaka/queue/Properties.hpp           |   20 +
 include/alpaka/queue/QueueCpuBlocking.hpp     |   13 +
 include/alpaka/queue/QueueCpuNonBlocking.hpp  |   13 +
 include/alpaka/queue/QueueCpuSyclBlocking.hpp |   17 +
 .../alpaka/queue/QueueCpuSyclNonBlocking.hpp  |   17 +
 include/alpaka/queue/QueueCudaRtBlocking.hpp  |   18 +
 .../alpaka/queue/QueueCudaRtNonBlocking.hpp   |   18 +
 .../queue/QueueFpgaSyclIntelBlocking.hpp      |   17 +
 .../queue/QueueFpgaSyclIntelNonBlocking.hpp   |   17 +
 .../alpaka/queue/QueueGenericSyclBlocking.hpp |   17 +
 .../queue/QueueGenericSyclNonBlocking.hpp     |   17 +
 .../queue/QueueGenericThreadsBlocking.hpp     |  166 ++
 .../queue/QueueGenericThreadsNonBlocking.hpp  |  156 ++
 .../queue/QueueGpuSyclIntelBlocking.hpp       |   17 +
 .../queue/QueueGpuSyclIntelNonBlocking.hpp    |   17 +
 include/alpaka/queue/QueueHipRtBlocking.hpp   |   18 +
 .../alpaka/queue/QueueHipRtNonBlocking.hpp    |   18 +
 .../queue/QueueUniformCudaHipRtBlocking.hpp   |   19 +
 .../QueueUniformCudaHipRtNonBlocking.hpp      |   19 +
 include/alpaka/queue/Traits.hpp               |   71 +
 include/alpaka/queue/cpu/ICpuQueue.hpp        |   14 +
 .../alpaka/queue/cpu/IGenericThreadsQueue.hpp |   35 +
 .../queue/cuda_hip/QueueUniformCudaHipRt.hpp  |  245 +++
 .../queue/sycl/QueueGenericSyclBase.hpp       |  289 ++++
 .../rand/Philox/MultiplyAndSplit64to32.hpp    |   43 +
 .../alpaka/rand/Philox/PhiloxBaseCommon.hpp   |   92 +
 .../alpaka/rand/Philox/PhiloxConstants.hpp    |   70 +
 include/alpaka/rand/Philox/PhiloxSingle.hpp   |  148 ++
 .../alpaka/rand/Philox/PhiloxStateless.hpp    |  125 ++
 .../rand/Philox/PhiloxStatelessKeyedBase.hpp  |   36 +
 include/alpaka/rand/Philox/PhiloxVector.hpp   |  102 ++
 include/alpaka/rand/RandDefault.hpp           |  216 +++
 include/alpaka/rand/RandGenericSycl.hpp       |  198 +++
 include/alpaka/rand/RandPhilox.hpp            |  201 +++
 include/alpaka/rand/RandPhiloxStateless.hpp   |   30 +
 include/alpaka/rand/RandStdLib.hpp            |  279 ++++
 .../alpaka/rand/RandUniformCudaHipRand.hpp    |  283 ++++
 include/alpaka/rand/TinyMT/Engine.hpp         |   66 +
 include/alpaka/rand/TinyMT/LICENSE.txt        |   38 +
 include/alpaka/rand/TinyMT/tinymt32.h         |  429 +++++
 include/alpaka/rand/Traits.hpp                |  100 ++
 include/alpaka/standalone/CpuOmp2Blocks.hpp   |    9 +
 include/alpaka/standalone/CpuOmp2Threads.hpp  |    9 +
 include/alpaka/standalone/CpuSerial.hpp       |    9 +
 include/alpaka/standalone/CpuSycl.hpp         |   13 +
 include/alpaka/standalone/CpuTbbBlocks.hpp    |    9 +
 include/alpaka/standalone/CpuThreads.hpp      |    9 +
 include/alpaka/standalone/FpgaSyclIntel.hpp   |   13 +
 include/alpaka/standalone/GenericSycl.hpp     |    9 +
 include/alpaka/standalone/GpuCudaRt.hpp       |   21 +
 include/alpaka/standalone/GpuHipRt.hpp        |    9 +
 include/alpaka/standalone/GpuSyclIntel.hpp    |   13 +
 include/alpaka/test/Array.hpp                 |   29 +
 include/alpaka/test/Check.hpp                 |   19 +
 include/alpaka/test/Extent.hpp                |   42 +
 .../alpaka/test/KernelExecutionFixture.hpp    |  105 ++
 include/alpaka/test/MeasureKernelRunTime.hpp  |   47 +
 include/alpaka/test/acc/TestAccs.hpp          |  183 ++
 include/alpaka/test/dim/TestDims.hpp          |   34 +
 .../test/event/EventHostManualTrigger.hpp     |  779 +++++++++
 include/alpaka/test/idx/TestIdxs.hpp          |   28 +
 include/alpaka/test/mem/view/Iterator.hpp     |  143 ++
 include/alpaka/test/mem/view/ViewTest.hpp     |  264 +++
 include/alpaka/test/queue/Queue.hpp           |  146 ++
 .../test/queue/QueueCpuOmp2Collective.hpp     |  297 ++++
 .../alpaka/test/queue/QueueTestFixture.hpp    |   23 +
 include/alpaka/traits/Traits.hpp              |   37 +
 include/alpaka/vec/Traits.hpp                 |  102 ++
 include/alpaka/vec/Vec.hpp                    |  799 +++++++++
 include/alpaka/version.hpp                    |   14 +
 include/alpaka/wait/Traits.hpp                |   50 +
 include/alpaka/warp/Traits.hpp                |  317 ++++
 include/alpaka/warp/WarpGenericSycl.hpp       |  200 +++
 include/alpaka/warp/WarpSingleThread.hpp      |  121 ++
 .../alpaka/warp/WarpUniformCudaHipBuiltIn.hpp |  189 +++
 include/alpaka/workdiv/Traits.hpp             |   77 +
 include/alpaka/workdiv/WorkDivGenericSycl.hpp |  119 ++
 include/alpaka/workdiv/WorkDivHelpers.hpp     |  554 ++++++
 include/alpaka/workdiv/WorkDivMembers.hpp     |  159 ++
 .../workdiv/WorkDivUniformCudaHipBuiltIn.hpp  |  117 ++
 run.py                                        |   68 +
 295 files changed, 39671 insertions(+), 1 deletion(-)
 create mode 100644 .gitattributes
 create mode 100644 include/alpaka/acc/AccCpuOmp2Blocks.hpp
 create mode 100644 include/alpaka/acc/AccCpuOmp2Threads.hpp
 create mode 100644 include/alpaka/acc/AccCpuSerial.hpp
 create mode 100644 include/alpaka/acc/AccCpuSycl.hpp
 create mode 100644 include/alpaka/acc/AccCpuTbbBlocks.hpp
 create mode 100644 include/alpaka/acc/AccCpuThreads.hpp
 create mode 100644 include/alpaka/acc/AccDevProps.hpp
 create mode 100644 include/alpaka/acc/AccFpgaSyclIntel.hpp
 create mode 100644 include/alpaka/acc/AccGenericSycl.hpp
 create mode 100644 include/alpaka/acc/AccGpuCudaRt.hpp
 create mode 100644 include/alpaka/acc/AccGpuHipRt.hpp
 create mode 100644 include/alpaka/acc/AccGpuSyclIntel.hpp
 create mode 100644 include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
 create mode 100644 include/alpaka/acc/Tag.hpp
 create mode 100644 include/alpaka/acc/TagAccIsEnabled.hpp
 create mode 100644 include/alpaka/acc/Traits.hpp
 create mode 100644 include/alpaka/alpaka.hpp
 create mode 100644 include/alpaka/atomic/AtomicAtomicRef.hpp
 create mode 100644 include/alpaka/atomic/AtomicCpu.hpp
 create mode 100644 include/alpaka/atomic/AtomicGenericSycl.hpp
 create mode 100644 include/alpaka/atomic/AtomicHierarchy.hpp
 create mode 100644 include/alpaka/atomic/AtomicNoOp.hpp
 create mode 100644 include/alpaka/atomic/AtomicOmpBuiltIn.hpp
 create mode 100644 include/alpaka/atomic/AtomicStdLibLock.hpp
 create mode 100644 include/alpaka/atomic/AtomicUniformCudaHip.hpp
 create mode 100644 include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/atomic/Op.hpp
 create mode 100644 include/alpaka/atomic/Traits.hpp
 create mode 100644 include/alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp
 create mode 100644 include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
 create mode 100644 include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp
 create mode 100644 include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/block/shared/dyn/Traits.hpp
 create mode 100644 include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
 create mode 100644 include/alpaka/block/shared/st/BlockSharedMemStMember.hpp
 create mode 100644 include/alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp
 create mode 100644 include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/block/shared/st/Traits.hpp
 create mode 100644 include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp
 create mode 100644 include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
 create mode 100644 include/alpaka/block/sync/BlockSyncBarrierThread.hpp
 create mode 100644 include/alpaka/block/sync/BlockSyncGenericSycl.hpp
 create mode 100644 include/alpaka/block/sync/BlockSyncNoOp.hpp
 create mode 100644 include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/block/sync/Traits.hpp
 create mode 100644 include/alpaka/core/Align.hpp
 create mode 100644 include/alpaka/core/AlignedAlloc.hpp
 create mode 100644 include/alpaka/core/ApiCudaRt.hpp
 create mode 100644 include/alpaka/core/ApiHipRt.hpp
 create mode 100644 include/alpaka/core/Assert.hpp
 create mode 100644 include/alpaka/core/BarrierThread.hpp
 create mode 100644 include/alpaka/core/BoostPredef.hpp
 create mode 100644 include/alpaka/core/CallbackThread.hpp
 create mode 100644 include/alpaka/core/ClipCast.hpp
 create mode 100644 include/alpaka/core/Common.hpp
 create mode 100644 include/alpaka/core/Concepts.hpp
 create mode 100644 include/alpaka/core/Cuda.hpp
 create mode 100644 include/alpaka/core/CudaHipCommon.hpp
 create mode 100644 include/alpaka/core/Debug.hpp
 create mode 100644 include/alpaka/core/Decay.hpp
 create mode 100644 include/alpaka/core/DemangleTypeNames.hpp
 create mode 100644 include/alpaka/core/Hip.hpp
 create mode 100644 include/alpaka/core/OmpSchedule.hpp
 create mode 100644 include/alpaka/core/Positioning.hpp
 create mode 100644 include/alpaka/core/RemoveRestrict.hpp
 create mode 100644 include/alpaka/core/RuntimeMacros.hpp
 create mode 100644 include/alpaka/core/Sycl.hpp
 create mode 100644 include/alpaka/core/ThreadPool.hpp
 create mode 100644 include/alpaka/core/UniformCudaHip.hpp
 create mode 100644 include/alpaka/core/Unreachable.hpp
 create mode 100644 include/alpaka/core/Unroll.hpp
 create mode 100644 include/alpaka/core/Utility.hpp
 create mode 100644 include/alpaka/core/Vectorize.hpp
 create mode 100644 include/alpaka/dev/DevCpu.hpp
 create mode 100644 include/alpaka/dev/DevCpuSycl.hpp
 create mode 100644 include/alpaka/dev/DevCudaRt.hpp
 create mode 100644 include/alpaka/dev/DevFpgaSyclIntel.hpp
 create mode 100644 include/alpaka/dev/DevGenericSycl.hpp
 create mode 100644 include/alpaka/dev/DevGpuSyclIntel.hpp
 create mode 100644 include/alpaka/dev/DevHipRt.hpp
 create mode 100644 include/alpaka/dev/DevUniformCudaHipRt.hpp
 create mode 100644 include/alpaka/dev/Traits.hpp
 create mode 100644 include/alpaka/dev/common/QueueRegistry.hpp
 create mode 100644 include/alpaka/dev/cpu/SysInfo.hpp
 create mode 100644 include/alpaka/dev/cpu/Wait.hpp
 create mode 100644 include/alpaka/dim/DimArithmetic.hpp
 create mode 100644 include/alpaka/dim/DimIntegralConst.hpp
 create mode 100644 include/alpaka/dim/Traits.hpp
 create mode 100644 include/alpaka/elem/Traits.hpp
 create mode 100644 include/alpaka/event/EventCpu.hpp
 create mode 100644 include/alpaka/event/EventCpuSycl.hpp
 create mode 100644 include/alpaka/event/EventCudaRt.hpp
 create mode 100644 include/alpaka/event/EventFpgaSyclIntel.hpp
 create mode 100644 include/alpaka/event/EventGenericSycl.hpp
 create mode 100644 include/alpaka/event/EventGenericThreads.hpp
 create mode 100644 include/alpaka/event/EventGpuSyclIntel.hpp
 create mode 100644 include/alpaka/event/EventHipRt.hpp
 create mode 100644 include/alpaka/event/EventUniformCudaHipRt.hpp
 create mode 100644 include/alpaka/event/Traits.hpp
 create mode 100644 include/alpaka/example/ExampleDefaultAcc.hpp
 create mode 100644 include/alpaka/example/ExecuteForEachAccTag.hpp
 create mode 100644 include/alpaka/exec/ElementIndex.hpp
 create mode 100644 include/alpaka/exec/IndependentElements.hpp
 create mode 100644 include/alpaka/exec/Once.hpp
 create mode 100644 include/alpaka/exec/UniformElements.hpp
 create mode 100644 include/alpaka/extent/Traits.hpp
 create mode 100644 include/alpaka/idx/Accessors.hpp
 create mode 100644 include/alpaka/idx/MapIdx.hpp
 create mode 100644 include/alpaka/idx/Traits.hpp
 create mode 100644 include/alpaka/idx/bt/IdxBtGenericSycl.hpp
 create mode 100644 include/alpaka/idx/bt/IdxBtLinear.hpp
 create mode 100644 include/alpaka/idx/bt/IdxBtOmp.hpp
 create mode 100644 include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
 create mode 100644 include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/idx/bt/IdxBtZero.hpp
 create mode 100644 include/alpaka/idx/gb/IdxGbGenericSycl.hpp
 create mode 100644 include/alpaka/idx/gb/IdxGbLinear.hpp
 create mode 100644 include/alpaka/idx/gb/IdxGbRef.hpp
 create mode 100644 include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/intrinsic/IntrinsicCpu.hpp
 create mode 100644 include/alpaka/intrinsic/IntrinsicFallback.hpp
 create mode 100644 include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
 create mode 100644 include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/intrinsic/Traits.hpp
 create mode 100644 include/alpaka/kernel/KernelFunctionAttributes.hpp
 create mode 100644 include/alpaka/kernel/SyclSubgroupSize.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelCpuSerial.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelCpuSycl.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelCpuThreads.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelGenericSycl.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelGpuHipRt.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
 create mode 100644 include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
 create mode 100644 include/alpaka/kernel/Traits.hpp
 create mode 100644 include/alpaka/math/Complex.hpp
 create mode 100644 include/alpaka/math/FloatEqualExact.hpp
 create mode 100644 include/alpaka/math/MathGenericSycl.hpp
 create mode 100644 include/alpaka/math/MathStdLib.hpp
 create mode 100644 include/alpaka/math/MathUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/math/Traits.hpp
 create mode 100644 include/alpaka/mem/alloc/AllocCpuAligned.hpp
 create mode 100644 include/alpaka/mem/alloc/AllocCpuNew.hpp
 create mode 100644 include/alpaka/mem/alloc/Traits.hpp
 create mode 100644 include/alpaka/mem/buf/BufCpu.hpp
 create mode 100644 include/alpaka/mem/buf/BufCpuSycl.hpp
 create mode 100644 include/alpaka/mem/buf/BufCudaRt.hpp
 create mode 100644 include/alpaka/mem/buf/BufFpgaSyclIntel.hpp
 create mode 100644 include/alpaka/mem/buf/BufGenericSycl.hpp
 create mode 100644 include/alpaka/mem/buf/BufGpuSyclIntel.hpp
 create mode 100644 include/alpaka/mem/buf/BufHipRt.hpp
 create mode 100644 include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
 create mode 100644 include/alpaka/mem/buf/SetKernel.hpp
 create mode 100644 include/alpaka/mem/buf/Traits.hpp
 create mode 100644 include/alpaka/mem/buf/cpu/Copy.hpp
 create mode 100644 include/alpaka/mem/buf/cpu/Set.hpp
 create mode 100644 include/alpaka/mem/buf/sycl/Common.hpp
 create mode 100644 include/alpaka/mem/buf/sycl/Copy.hpp
 create mode 100644 include/alpaka/mem/buf/sycl/Set.hpp
 create mode 100644 include/alpaka/mem/buf/uniformCudaHip/Copy.hpp
 create mode 100644 include/alpaka/mem/buf/uniformCudaHip/Set.hpp
 create mode 100644 include/alpaka/mem/fence/MemFenceCpu.hpp
 create mode 100644 include/alpaka/mem/fence/MemFenceCpuSerial.hpp
 create mode 100644 include/alpaka/mem/fence/MemFenceGenericSycl.hpp
 create mode 100644 include/alpaka/mem/fence/MemFenceOmp2Blocks.hpp
 create mode 100644 include/alpaka/mem/fence/MemFenceOmp2Threads.hpp
 create mode 100644 include/alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/mem/fence/Traits.hpp
 create mode 100644 include/alpaka/mem/global/DeviceGlobalCpu.hpp
 create mode 100644 include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp
 create mode 100644 include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/mem/global/Traits.hpp
 create mode 100644 include/alpaka/mem/view/Traits.hpp
 create mode 100644 include/alpaka/mem/view/ViewAccessOps.hpp
 create mode 100644 include/alpaka/mem/view/ViewConst.hpp
 create mode 100644 include/alpaka/mem/view/ViewPlainPtr.hpp
 create mode 100644 include/alpaka/mem/view/ViewStdArray.hpp
 create mode 100644 include/alpaka/mem/view/ViewStdVector.hpp
 create mode 100644 include/alpaka/mem/view/ViewSubView.hpp
 create mode 100644 include/alpaka/meta/Apply.hpp
 create mode 100644 include/alpaka/meta/CartesianProduct.hpp
 create mode 100644 include/alpaka/meta/Concatenate.hpp
 create mode 100644 include/alpaka/meta/DependentFalseType.hpp
 create mode 100644 include/alpaka/meta/Filter.hpp
 create mode 100644 include/alpaka/meta/Fold.hpp
 create mode 100644 include/alpaka/meta/ForEachType.hpp
 create mode 100644 include/alpaka/meta/Functional.hpp
 create mode 100644 include/alpaka/meta/InheritFromList.hpp
 create mode 100644 include/alpaka/meta/IntegerSequence.hpp
 create mode 100644 include/alpaka/meta/Integral.hpp
 create mode 100644 include/alpaka/meta/IsArrayOrVector.hpp
 create mode 100644 include/alpaka/meta/IsStrictBase.hpp
 create mode 100644 include/alpaka/meta/NdLoop.hpp
 create mode 100644 include/alpaka/meta/NonZero.hpp
 create mode 100644 include/alpaka/meta/Set.hpp
 create mode 100644 include/alpaka/meta/Transform.hpp
 create mode 100644 include/alpaka/meta/TypeListOps.hpp
 create mode 100644 include/alpaka/meta/Unique.hpp
 create mode 100644 include/alpaka/offset/Traits.hpp
 create mode 100644 include/alpaka/platform/PlatformCpu.hpp
 create mode 100644 include/alpaka/platform/PlatformCpuSycl.hpp
 create mode 100644 include/alpaka/platform/PlatformCudaRt.hpp
 create mode 100644 include/alpaka/platform/PlatformFpgaSyclIntel.hpp
 create mode 100644 include/alpaka/platform/PlatformGenericSycl.hpp
 create mode 100644 include/alpaka/platform/PlatformGpuSyclIntel.hpp
 create mode 100644 include/alpaka/platform/PlatformHipRt.hpp
 create mode 100644 include/alpaka/platform/PlatformUniformCudaHipRt.hpp
 create mode 100644 include/alpaka/platform/Traits.hpp
 create mode 100644 include/alpaka/queue/Properties.hpp
 create mode 100644 include/alpaka/queue/QueueCpuBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueCpuNonBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueCpuSyclBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueCudaRtBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueCudaRtNonBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueGenericSyclBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueGenericSyclNonBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueGenericThreadsBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueHipRtBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueHipRtNonBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp
 create mode 100644 include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp
 create mode 100644 include/alpaka/queue/Traits.hpp
 create mode 100644 include/alpaka/queue/cpu/ICpuQueue.hpp
 create mode 100644 include/alpaka/queue/cpu/IGenericThreadsQueue.hpp
 create mode 100644 include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp
 create mode 100644 include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
 create mode 100644 include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp
 create mode 100644 include/alpaka/rand/Philox/PhiloxBaseCommon.hpp
 create mode 100644 include/alpaka/rand/Philox/PhiloxConstants.hpp
 create mode 100644 include/alpaka/rand/Philox/PhiloxSingle.hpp
 create mode 100644 include/alpaka/rand/Philox/PhiloxStateless.hpp
 create mode 100644 include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp
 create mode 100644 include/alpaka/rand/Philox/PhiloxVector.hpp
 create mode 100644 include/alpaka/rand/RandDefault.hpp
 create mode 100644 include/alpaka/rand/RandGenericSycl.hpp
 create mode 100644 include/alpaka/rand/RandPhilox.hpp
 create mode 100644 include/alpaka/rand/RandPhiloxStateless.hpp
 create mode 100644 include/alpaka/rand/RandStdLib.hpp
 create mode 100644 include/alpaka/rand/RandUniformCudaHipRand.hpp
 create mode 100644 include/alpaka/rand/TinyMT/Engine.hpp
 create mode 100644 include/alpaka/rand/TinyMT/LICENSE.txt
 create mode 100644 include/alpaka/rand/TinyMT/tinymt32.h
 create mode 100644 include/alpaka/rand/Traits.hpp
 create mode 100644 include/alpaka/standalone/CpuOmp2Blocks.hpp
 create mode 100644 include/alpaka/standalone/CpuOmp2Threads.hpp
 create mode 100644 include/alpaka/standalone/CpuSerial.hpp
 create mode 100644 include/alpaka/standalone/CpuSycl.hpp
 create mode 100644 include/alpaka/standalone/CpuTbbBlocks.hpp
 create mode 100644 include/alpaka/standalone/CpuThreads.hpp
 create mode 100644 include/alpaka/standalone/FpgaSyclIntel.hpp
 create mode 100644 include/alpaka/standalone/GenericSycl.hpp
 create mode 100644 include/alpaka/standalone/GpuCudaRt.hpp
 create mode 100644 include/alpaka/standalone/GpuHipRt.hpp
 create mode 100644 include/alpaka/standalone/GpuSyclIntel.hpp
 create mode 100644 include/alpaka/test/Array.hpp
 create mode 100644 include/alpaka/test/Check.hpp
 create mode 100644 include/alpaka/test/Extent.hpp
 create mode 100644 include/alpaka/test/KernelExecutionFixture.hpp
 create mode 100644 include/alpaka/test/MeasureKernelRunTime.hpp
 create mode 100644 include/alpaka/test/acc/TestAccs.hpp
 create mode 100644 include/alpaka/test/dim/TestDims.hpp
 create mode 100644 include/alpaka/test/event/EventHostManualTrigger.hpp
 create mode 100644 include/alpaka/test/idx/TestIdxs.hpp
 create mode 100644 include/alpaka/test/mem/view/Iterator.hpp
 create mode 100644 include/alpaka/test/mem/view/ViewTest.hpp
 create mode 100644 include/alpaka/test/queue/Queue.hpp
 create mode 100644 include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
 create mode 100644 include/alpaka/test/queue/QueueTestFixture.hpp
 create mode 100644 include/alpaka/traits/Traits.hpp
 create mode 100644 include/alpaka/vec/Traits.hpp
 create mode 100644 include/alpaka/vec/Vec.hpp
 create mode 100644 include/alpaka/version.hpp
 create mode 100644 include/alpaka/wait/Traits.hpp
 create mode 100644 include/alpaka/warp/Traits.hpp
 create mode 100644 include/alpaka/warp/WarpGenericSycl.hpp
 create mode 100644 include/alpaka/warp/WarpSingleThread.hpp
 create mode 100644 include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
 create mode 100644 include/alpaka/workdiv/Traits.hpp
 create mode 100644 include/alpaka/workdiv/WorkDivGenericSycl.hpp
 create mode 100644 include/alpaka/workdiv/WorkDivHelpers.hpp
 create mode 100644 include/alpaka/workdiv/WorkDivMembers.hpp
 create mode 100644 include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp
 create mode 100644 run.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..f5d92e1
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+# External libraries
+include/alpaka linguist-vendored
diff --git a/Makefile b/Makefile
index 172fcf6..039bb19 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ CXXFLAGS ?= -std=c++17 -O2 -Wall
 
 KERNEL_DIR ?= kernels
 TEST_DIR   ?= tests
-ALPAKA_DIR ?= $(CURDIR)/../alpaka/include
+ALPAKA_DIR ?= $(CURDIR)/include
 BIN_DIR    ?= bin
 
 KERNEL_HEADERS := $(wildcard $(KERNEL_DIR)/*.hpp)
diff --git a/include/alpaka/acc/AccCpuOmp2Blocks.hpp b/include/alpaka/acc/AccCpuOmp2Blocks.hpp
new file mode 100644
index 0000000..27661f5
--- /dev/null
+++ b/include/alpaka/acc/AccCpuOmp2Blocks.hpp
@@ -0,0 +1,234 @@
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Base classes.
+#include "alpaka/atomic/AtomicCpu.hpp"
+#include "alpaka/atomic/AtomicHierarchy.hpp"
+#include "alpaka/atomic/AtomicNoOp.hpp"
+#include "alpaka/atomic/AtomicOmpBuiltIn.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
+#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
+#include "alpaka/block/sync/BlockSyncNoOp.hpp"
+#include "alpaka/core/DemangleTypeNames.hpp"
+#include "alpaka/idx/bt/IdxBtZero.hpp"
+#include "alpaka/idx/gb/IdxGbRef.hpp"
+#include "alpaka/intrinsic/IntrinsicCpu.hpp"
+#include "alpaka/math/MathStdLib.hpp"
+#include "alpaka/mem/fence/MemFenceOmp2Blocks.hpp"
+#include "alpaka/rand/RandDefault.hpp"
+#include "alpaka/rand/RandStdLib.hpp"
+#include "alpaka/warp/WarpSingleThread.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/ClipCast.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+
+#include <limits>
+#include <typeinfo>
+
+#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
+#    include <omp.h>
+namespace alpaka
+{
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuOmp2Blocks; // Forward declaration; befriended by AccCpuOmp2Blocks below.
+
+    //! The CPU OpenMP 2.0 block accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on a CPU device.
+    //! It uses OpenMP 2.0 to implement the grid block parallelism.
+    //! The block thread extent is restricted to 1x1x1, i.e. every block runs exactly one thread.
+    template<typename TDim, typename TIdx>
+    class AccCpuOmp2Blocks final
+        : public WorkDivMembers<TDim, TIdx>
+        , public gb::IdxGbRef<TDim, TIdx>
+        , public bt::IdxBtZero<TDim, TIdx>
+        , public AtomicHierarchy<
+              AtomicCpu, // grid atomics
+              AtomicOmpBuiltIn, // block atomics
+              AtomicNoOp> // thread atomics
+        , public math::MathStdLib
+        , public BlockSharedMemDynMember<>
+        , public BlockSharedMemStMember<>
+        , public BlockSyncNoOp
+        , public IntrinsicCpu
+        , public MemFenceOmp2Blocks
+#    ifdef ALPAKA_DISABLE_VENDOR_RNG
+        , public rand::RandDefault
+#    else
+        , public rand::RandStdLib
+#    endif
+        , public warp::WarpSingleThread
+        , public concepts::Implements<ConceptAcc, AccCpuOmp2Blocks<TDim, TIdx>>
+    {
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+
+    public:
+        // A friend declaration cannot be partially specialized, so every TaskKernelCpuOmp2Blocks instantiation is a friend.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuOmp2Blocks;
+
+        AccCpuOmp2Blocks(AccCpuOmp2Blocks const&) = delete;
+        AccCpuOmp2Blocks(AccCpuOmp2Blocks&&) = delete;
+        auto operator=(AccCpuOmp2Blocks const&) -> AccCpuOmp2Blocks& = delete;
+        auto operator=(AccCpuOmp2Blocks&&) -> AccCpuOmp2Blocks& = delete;
+
+    private: // The constructor is private: only the class itself and its friends can create an accelerator.
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuOmp2Blocks(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
+            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
+        {
+        }
+
+    private:
+        // getIdx: backing storage that is handed to the gb::IdxGbRef base in the constructor.
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+    };
+
+    namespace trait
+    {
+        //! The CPU OpenMP 2.0 block accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuOmp2Blocks<TDim, TIdx>>
+        {
+            using type = AccCpuOmp2Blocks<TDim, TIdx>;
+        };
+
+        //! Single thread accelerator trait specialization: AccCpuOmp2Blocks is flagged std::true_type.
+        template<typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccCpuOmp2Blocks<TDim, TIdx>> : std::true_type
+        {
+        };
+
+        //! Multi thread accelerator trait specialization: AccCpuOmp2Blocks is flagged std::false_type.
+        template<typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccCpuOmp2Blocks<TDim, TIdx>> : std::false_type
+        {
+        };
+
+        //! The CPU OpenMP 2.0 block accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuOmp2Blocks<TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> alpaka::AccDevProps<TDim, TIdx>
+            {
+                return {// m_multiProcessorCount: OpenMP's maximum thread count, clipped to TIdx
+                        alpaka::core::clipCast<TIdx>(::omp_get_max_threads()),
+                        // m_gridBlockExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax: exactly one thread per block
+                        Vec<TDim, TIdx>::ones(),
+                        // m_blockThreadCountMax
+                        static_cast<TIdx>(1),
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        static_cast<std::size_t>(AccCpuOmp2Blocks<TDim, TIdx>::staticAllocBytes()),
+                        // m_globalMemSizeBytes
+                        getMemBytes(dev)};
+            }
+        };
+
+        //! The CPU OpenMP 2.0 block accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuOmp2Blocks<TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
+            {
+                return "AccCpuOmp2Blocks<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
+            }
+        };
+
+        //! The CPU OpenMP 2.0 block accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuOmp2Blocks<TDim, TIdx>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU OpenMP 2.0 block accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuOmp2Blocks<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! Kernel task creation trait specialization; throws std::runtime_error unless each block has exactly one thread.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuOmp2Blocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+        {
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
+            {
+                if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
+                {
+                    throw std::runtime_error(
+                        "The given work division is not valid for a single thread Acc: "
+                        + getAccName<AccCpuOmp2Blocks<TDim, TIdx>>() + ". Threads per block should be 1!");
+                }
+
+                return TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //! The CPU OpenMP 2.0 block accelerator platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PlatformType<AccCpuOmp2Blocks<TDim, TIdx>>
+        {
+            using type = PlatformCpu;
+        };
+
+        //! The CPU OpenMP 2.0 block accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuOmp2Blocks<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccCpuOmp2Blocks<TDim, TIdx>>
+        {
+            using type = alpaka::TagCpuOmp2Blocks; // accelerator type -> tag type
+        };
+
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagCpuOmp2Blocks, TDim, TIdx>
+        {
+            using type = alpaka::AccCpuOmp2Blocks<TDim, TIdx>; // tag type -> accelerator type
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/acc/AccCpuOmp2Threads.hpp b/include/alpaka/acc/AccCpuOmp2Threads.hpp
new file mode 100644
index 0000000..bc326bc
--- /dev/null
+++ b/include/alpaka/acc/AccCpuOmp2Threads.hpp
@@ -0,0 +1,237 @@
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Base classes.
+#include "alpaka/atomic/AtomicCpu.hpp"
+#include "alpaka/atomic/AtomicHierarchy.hpp"
+#include "alpaka/atomic/AtomicOmpBuiltIn.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
+#include "alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp"
+#include "alpaka/block/sync/BlockSyncBarrierOmp.hpp"
+#include "alpaka/core/DemangleTypeNames.hpp"
+#include "alpaka/idx/bt/IdxBtOmp.hpp"
+#include "alpaka/idx/gb/IdxGbRef.hpp"
+#include "alpaka/intrinsic/IntrinsicCpu.hpp"
+#include "alpaka/math/MathStdLib.hpp"
+#include "alpaka/mem/fence/MemFenceOmp2Threads.hpp"
+#include "alpaka/rand/RandDefault.hpp"
+#include "alpaka/rand/RandStdLib.hpp"
+#include "alpaka/warp/WarpSingleThread.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/ClipCast.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+
+#include <limits>
+#include <typeinfo>
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
+
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
+
+#    include <omp.h>
+
+namespace alpaka
+{
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuOmp2Threads; // Forward declaration; befriended by AccCpuOmp2Threads below.
+
+    //! The CPU OpenMP 2.0 thread accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on a CPU device.
+    //! It uses OpenMP 2.0 to implement the block thread parallelism.
+    template<typename TDim, typename TIdx>
+    class AccCpuOmp2Threads final
+        : public WorkDivMembers<TDim, TIdx>
+        , public gb::IdxGbRef<TDim, TIdx>
+        , public bt::IdxBtOmp<TDim, TIdx>
+        , public AtomicHierarchy<
+              AtomicCpu, // grid atomics
+              AtomicOmpBuiltIn, // block atomics
+              AtomicOmpBuiltIn> // thread atomics
+        , public math::MathStdLib
+        , public BlockSharedMemDynMember<>
+        , public BlockSharedMemStMemberMasterSync<>
+        , public BlockSyncBarrierOmp
+        , public IntrinsicCpu
+        , public MemFenceOmp2Threads
+#    ifdef ALPAKA_DISABLE_VENDOR_RNG
+        , public rand::RandDefault
+#    else
+        , public rand::RandStdLib
+#    endif
+        , public warp::WarpSingleThread
+        , public concepts::Implements<ConceptAcc, AccCpuOmp2Threads<TDim, TIdx>>
+    {
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+
+    public:
+        // A friend declaration cannot be partially specialized, so every TaskKernelCpuOmp2Threads instantiation is a friend.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuOmp2Threads;
+
+        AccCpuOmp2Threads(AccCpuOmp2Threads const&) = delete;
+        AccCpuOmp2Threads(AccCpuOmp2Threads&&) = delete;
+        auto operator=(AccCpuOmp2Threads const&) -> AccCpuOmp2Threads& = delete;
+        auto operator=(AccCpuOmp2Threads&&) -> AccCpuOmp2Threads& = delete;
+
+    private: // The constructor is private: only the class itself and its friends can create an accelerator.
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuOmp2Threads(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
+            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMemberMasterSync<>(
+                  staticMemBegin(),
+                  staticMemCapacity(),
+                  [this]() { syncBlockThreads(*this); }, // callback: calls syncBlockThreads on this accelerator
+                  []() noexcept { return (::omp_get_thread_num() == 0); }) // predicate: true on OpenMP thread 0
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
+        {
+        }
+
+    private:
+        // getIdx: backing storage that is handed to the gb::IdxGbRef base in the constructor.
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+    };
+
+    namespace trait
+    {
+        //! The CPU OpenMP 2.0 thread accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuOmp2Threads<TDim, TIdx>>
+        {
+            using type = AccCpuOmp2Threads<TDim, TIdx>;
+        };
+
+        //! Single thread accelerator trait specialization: AccCpuOmp2Threads is flagged std::false_type.
+        template<typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccCpuOmp2Threads<TDim, TIdx>> : std::false_type
+        {
+        };
+
+        //! Multi thread accelerator trait specialization: AccCpuOmp2Threads is flagged std::true_type.
+        template<typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccCpuOmp2Threads<TDim, TIdx>> : std::true_type
+        {
+        };
+
+        //! The CPU OpenMP 2.0 thread accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuOmp2Threads<TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> alpaka::AccDevProps<TDim, TIdx>
+            {
+#    ifdef ALPAKA_CI // When ALPAKA_CI is defined the block thread count is capped at 4.
+                auto const blockThreadCountMax = alpaka::core::clipCast<TIdx>(std::min(4, ::omp_get_max_threads()));
+#    else
+                auto const blockThreadCountMax = alpaka::core::clipCast<TIdx>(::omp_get_max_threads());
+#    endif
+                auto const memBytes = getMemBytes(dev);
+                return {// m_multiProcessorCount
+                        static_cast<TIdx>(1),
+                        // m_gridBlockExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax: each dimension may use up to the full block thread count
+                        Vec<TDim, TIdx>::all(blockThreadCountMax),
+                        // m_blockThreadCountMax
+                        blockThreadCountMax,
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes -- NOTE(review): same value as m_globalMemSizeBytes; confirm intended
+                        memBytes,
+                        // m_globalMemSizeBytes
+                        memBytes};
+            }
+        };
+
+        //! The CPU OpenMP 2.0 thread accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuOmp2Threads<TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
+            {
+                return "AccCpuOmp2Threads<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
+            }
+        };
+
+        //! The CPU OpenMP 2.0 thread accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuOmp2Threads<TDim, TIdx>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU OpenMP 2.0 thread accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuOmp2Threads<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! Kernel task creation trait specialization; forwards the work division, kernel and arguments unchecked.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuOmp2Threads<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+        {
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
+            {
+                return TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //! The CPU OpenMP 2.0 thread accelerator platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PlatformType<AccCpuOmp2Threads<TDim, TIdx>>
+        {
+            using type = PlatformCpu;
+        };
+
+        //! The CPU OpenMP 2.0 thread accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuOmp2Threads<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccCpuOmp2Threads<TDim, TIdx>>
+        {
+            using type = alpaka::TagCpuOmp2Threads; // accelerator type -> tag type
+        };
+
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagCpuOmp2Threads, TDim, TIdx>
+        {
+            using type = alpaka::AccCpuOmp2Threads<TDim, TIdx>; // tag type -> accelerator type
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/acc/AccCpuSerial.hpp b/include/alpaka/acc/AccCpuSerial.hpp
new file mode 100644
index 0000000..e1b223f
--- /dev/null
+++ b/include/alpaka/acc/AccCpuSerial.hpp
@@ -0,0 +1,227 @@
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Base classes.
+#include "alpaka/atomic/AtomicCpu.hpp"
+#include "alpaka/atomic/AtomicHierarchy.hpp"
+#include "alpaka/atomic/AtomicNoOp.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
+#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
+#include "alpaka/block/sync/BlockSyncNoOp.hpp"
+#include "alpaka/core/DemangleTypeNames.hpp"
+#include "alpaka/idx/bt/IdxBtZero.hpp"
+#include "alpaka/idx/gb/IdxGbRef.hpp"
+#include "alpaka/intrinsic/IntrinsicCpu.hpp"
+#include "alpaka/math/MathStdLib.hpp"
+#include "alpaka/mem/fence/MemFenceCpuSerial.hpp"
+#include "alpaka/rand/RandDefault.hpp"
+#include "alpaka/rand/RandStdLib.hpp"
+#include "alpaka/warp/WarpSingleThread.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+
+#include <memory>
+#include <typeinfo>
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+
+namespace alpaka
+{
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuSerial; // Forward declaration; befriended by AccCpuSerial below.
+
+    //! The CPU serial accelerator.
+    //!
+    //! This accelerator allows serial kernel execution on a CPU device.
+    //! The block thread extent is restricted to 1x1x1 and all blocks are executed serially so there is no parallelism at all.
+    template<typename TDim, typename TIdx>
+    class AccCpuSerial final
+        : public WorkDivMembers<TDim, TIdx>
+        , public gb::IdxGbRef<TDim, TIdx>
+        , public bt::IdxBtZero<TDim, TIdx>
+        , public AtomicHierarchy<
+              AtomicCpu, // grid atomics
+              AtomicNoOp, // block atomics
+              AtomicNoOp> // thread atomics
+        , public math::MathStdLib
+        , public BlockSharedMemDynMember<>
+        , public BlockSharedMemStMember<>
+        , public BlockSyncNoOp
+        , public IntrinsicCpu
+        , public MemFenceCpuSerial
+#    ifdef ALPAKA_DISABLE_VENDOR_RNG
+        , public rand::RandDefault
+#    else
+        , public rand::RandStdLib
+#    endif
+        , public warp::WarpSingleThread
+        , public concepts::Implements<ConceptAcc, AccCpuSerial<TDim, TIdx>>
+    {
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+
+    public:
+        // A friend declaration cannot be partially specialized, so every TaskKernelCpuSerial instantiation is a friend.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuSerial;
+
+        AccCpuSerial(AccCpuSerial const&) = delete;
+        AccCpuSerial(AccCpuSerial&&) = delete;
+        auto operator=(AccCpuSerial const&) -> AccCpuSerial& = delete;
+        auto operator=(AccCpuSerial&&) -> AccCpuSerial& = delete;
+
+    private: // The constructor is private: only the class itself and its friends can create an accelerator.
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuSerial(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
+            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
+        {
+        }
+
+    private:
+        // getIdx: backing storage that is handed to the gb::IdxGbRef base in the constructor.
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+    };
+
+    namespace trait
+    {
+        //! The CPU serial accelerator type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuSerial<TDim, TIdx>>
+        {
+            using type = AccCpuSerial<TDim, TIdx>;
+        };
+
+        //! Single thread accelerator trait specialization: AccCpuSerial is flagged std::true_type.
+        template<typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccCpuSerial<TDim, TIdx>> : std::true_type
+        {
+        };
+
+        //! Multi thread accelerator trait specialization: AccCpuSerial is flagged std::false_type.
+        template<typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccCpuSerial<TDim, TIdx>> : std::false_type
+        {
+        };
+
+        //! The CPU serial accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuSerial<TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
+            {
+                return {// m_multiProcessorCount
+                        static_cast<TIdx>(1),
+                        // m_gridBlockExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax: exactly one thread per block
+                        Vec<TDim, TIdx>::ones(),
+                        // m_blockThreadCountMax
+                        static_cast<TIdx>(1),
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        static_cast<std::size_t>(AccCpuSerial<TDim, TIdx>::staticAllocBytes()),
+                        // m_globalMemSizeBytes
+                        getMemBytes(dev)};
+            }
+        };
+
+        //! The CPU serial accelerator name trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuSerial<TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
+            {
+                return "AccCpuSerial<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
+            }
+        };
+
+        //! The CPU serial accelerator device type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuSerial<TDim, TIdx>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU serial accelerator dimension getter trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuSerial<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! Kernel task creation trait specialization; throws std::runtime_error unless each block has exactly one thread.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuSerial<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+        {
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
+            {
+                if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
+                {
+                    throw std::runtime_error(
+                        "The given work division is not valid for a single thread Acc: "
+                        + getAccName<AccCpuSerial<TDim, TIdx>>() + ". Threads per block should be 1!");
+                }
+
+                return TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //! The CPU serial accelerator platform type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct PlatformType<AccCpuSerial<TDim, TIdx>>
+        {
+            using type = PlatformCpu;
+        };
+
+        //! The CPU serial accelerator idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuSerial<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccCpuSerial<TDim, TIdx>>
+        {
+            using type = alpaka::TagCpuSerial; // accelerator type -> tag type
+        };
+
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagCpuSerial, TDim, TIdx>
+        {
+            using type = alpaka::AccCpuSerial<TDim, TIdx>; // tag type -> accelerator type
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/acc/AccCpuSycl.hpp b/include/alpaka/acc/AccCpuSycl.hpp
new file mode 100644
index 0000000..e4e7378
--- /dev/null
+++ b/include/alpaka/acc/AccCpuSycl.hpp
@@ -0,0 +1,38 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/AccGenericSycl.hpp"
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/Sycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
+
+namespace alpaka
+{
+    //! The CPU SYCL accelerator: an alias for AccGenericSycl instantiated with TagCpuSycl.
+    //!
+    //! This accelerator allows parallel kernel execution on a oneAPI-capable CPU target device.
+    template<typename TDim, typename TIdx>
+    using AccCpuSycl = AccGenericSycl<TagCpuSycl, TDim, TIdx>;
+
+    namespace trait
+    {
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccCpuSycl<TDim, TIdx>>
+        {
+            using type = alpaka::TagCpuSycl; // accelerator type -> tag type
+        };
+
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagCpuSycl, TDim, TIdx>
+        {
+            using type = alpaka::AccCpuSycl<TDim, TIdx>; // tag type -> accelerator type
+        };
+    } // namespace trait
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/acc/AccCpuTbbBlocks.hpp b/include/alpaka/acc/AccCpuTbbBlocks.hpp
new file mode 100644
index 0000000..d283523
--- /dev/null
+++ b/include/alpaka/acc/AccCpuTbbBlocks.hpp
@@ -0,0 +1,228 @@
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, Erik Zenker, René Widera, Jan Stephan, Bernhard Manfred Gruber,
+ *                Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Base classes.
+#include "alpaka/atomic/AtomicCpu.hpp"
+#include "alpaka/atomic/AtomicHierarchy.hpp"
+#include "alpaka/atomic/AtomicNoOp.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
+#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
+#include "alpaka/block/sync/BlockSyncNoOp.hpp"
+#include "alpaka/core/DemangleTypeNames.hpp"
+#include "alpaka/idx/bt/IdxBtZero.hpp"
+#include "alpaka/idx/gb/IdxGbRef.hpp"
+#include "alpaka/intrinsic/IntrinsicCpu.hpp"
+#include "alpaka/math/MathStdLib.hpp"
+#include "alpaka/mem/fence/MemFenceCpu.hpp"
+#include "alpaka/rand/RandDefault.hpp"
+#include "alpaka/rand/RandStdLib.hpp"
+#include "alpaka/warp/WarpSingleThread.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/ClipCast.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+
+#include <memory>
+#include <typeinfo>
+
+#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+
+#    include <tbb/tbb.h>
+
+namespace alpaka
+{
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuTbbBlocks;
+
+    //! The CPU TBB block accelerator (a single thread accelerator, see IsSingleThreadAcc).
+    template<typename TDim, typename TIdx>
+    class AccCpuTbbBlocks final
+        : public WorkDivMembers<TDim, TIdx>
+        , public gb::IdxGbRef<TDim, TIdx>
+        , public bt::IdxBtZero<TDim, TIdx>
+        , public AtomicHierarchy<
+              AtomicCpu, // grid atomics
+              AtomicCpu, // block atomics
+              AtomicNoOp> // thread atomics (no-op implementation)
+        , public math::MathStdLib
+        , public BlockSharedMemDynMember<>
+        , public BlockSharedMemStMember<>
+        , public BlockSyncNoOp
+        , public IntrinsicCpu
+        , public MemFenceCpu
+#    ifdef ALPAKA_DISABLE_VENDOR_RNG
+        , public rand::RandDefault
+#    else
+        , public rand::RandStdLib
+#    endif
+        , public warp::WarpSingleThread
+        , public concepts::Implements<ConceptAcc, AccCpuTbbBlocks<TDim, TIdx>>
+    {
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+
+    public:
+        // All TaskKernelCpuTbbBlocks instantiations are friends: a friend cannot be partially specialized on TDim/TIdx.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuTbbBlocks;
+
+        AccCpuTbbBlocks(AccCpuTbbBlocks const&) = delete;
+        AccCpuTbbBlocks(AccCpuTbbBlocks&&) = delete;
+        auto operator=(AccCpuTbbBlocks const&) -> AccCpuTbbBlocks& = delete;
+        auto operator=(AccCpuTbbBlocks&&) -> AccCpuTbbBlocks& = delete;
+
+    private: // The constructor is private: only the befriended TaskKernelCpuTbbBlocks can create the accelerator.
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuTbbBlocks(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx) // m_gridBlockIdx is initialized only after the base classes
+            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
+        {
+        }
+
+    private:
+        // getIdx
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+    };
+
+    namespace trait
+    {
+        //! The CPU TBB block accelerator type trait specialization: the accelerator type is the type itself.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuTbbBlocks<TDim, TIdx>>
+        {
+            using type = AccCpuTbbBlocks<TDim, TIdx>;
+        };
+
+        //! Marks the CPU TBB block accelerator as a single thread accelerator (one thread per block).
+        template<typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccCpuTbbBlocks<TDim, TIdx>> : std::true_type
+        {
+        };
+
+        //! Marks the CPU TBB block accelerator as not being a multi thread accelerator.
+        template<typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccCpuTbbBlocks<TDim, TIdx>> : std::false_type
+        {
+        };
+
+        //! The CPU TBB block accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuTbbBlocks<TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
+            { // The values follow the declaration order of the AccDevProps members (aggregate initialization).
+                return {// m_multiProcessorCount: the maximum concurrency of the current TBB task arena
+                        alpaka::core::clipCast<TIdx>(tbb::this_task_arena::max_concurrency()),
+                        // m_gridBlockExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax: a single thread in every dimension
+                        Vec<TDim, TIdx>::ones(),
+                        // m_blockThreadCountMax: a single thread per block
+                        static_cast<TIdx>(1),
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes: the size reported by staticAllocBytes()
+                        static_cast<size_t>(AccCpuTbbBlocks<TDim, TIdx>::staticAllocBytes()),
+                        // m_globalMemSizeBytes
+                        getMemBytes(dev)};
+            }
+        };
+
+        //! The CPU TBB block accelerator name trait specialization: "AccCpuTbbBlocks<dim,demangled index type>".
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuTbbBlocks<TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
+            {
+                return "AccCpuTbbBlocks<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
+            }
+        };
+
+        //! The CPU TBB block accelerator device type trait specialization: the device type is DevCpu.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuTbbBlocks<TDim, TIdx>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU TBB block accelerator dimension getter trait specialization: the dimension is TDim.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuTbbBlocks<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! Creates the CPU TBB block kernel task; throws std::runtime_error unless each block has exactly one thread.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuTbbBlocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+        {
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
+            { // Work divisions whose blocks do not consist of exactly one thread are rejected.
+                if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
+                {
+                    throw std::runtime_error(
+                        "The given work division is not valid for a single thread Acc: "
+                        + getAccName<AccCpuTbbBlocks<TDim, TIdx>>() + ". Threads per block should be 1!");
+                }
+
+                return TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //! The CPU TBB block accelerator platform type trait specialization: the platform type is PlatformCpu.
+        template<typename TDim, typename TIdx>
+        struct PlatformType<AccCpuTbbBlocks<TDim, TIdx>>
+        {
+            using type = PlatformCpu;
+        };
+
+        //! The CPU TBB block accelerator idx type trait specialization: the index type is TIdx.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuTbbBlocks<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccCpuTbbBlocks<TDim, TIdx>> // Maps the accelerator type to its tag type.
+        {
+            using type = alpaka::TagCpuTbbBlocks;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagCpuTbbBlocks, TDim, TIdx> // Maps the tag type back to the accelerator type.
+        {
+            using type = alpaka::AccCpuTbbBlocks<TDim, TIdx>;
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/acc/AccCpuThreads.hpp b/include/alpaka/acc/AccCpuThreads.hpp
new file mode 100644
index 0000000..ce8f04a
--- /dev/null
+++ b/include/alpaka/acc/AccCpuThreads.hpp
@@ -0,0 +1,245 @@
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Base classes.
+#include "alpaka/atomic/AtomicCpu.hpp"
+#include "alpaka/atomic/AtomicHierarchy.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
+#include "alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp"
+#include "alpaka/block/sync/BlockSyncBarrierThread.hpp"
+#include "alpaka/core/DemangleTypeNames.hpp"
+#include "alpaka/idx/bt/IdxBtRefThreadIdMap.hpp"
+#include "alpaka/idx/gb/IdxGbRef.hpp"
+#include "alpaka/intrinsic/IntrinsicCpu.hpp"
+#include "alpaka/math/MathStdLib.hpp"
+#include "alpaka/mem/fence/MemFenceCpu.hpp"
+#include "alpaka/rand/RandDefault.hpp"
+#include "alpaka/rand/RandStdLib.hpp"
+#include "alpaka/warp/WarpSingleThread.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/ClipCast.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+
+#include <memory>
+#include <thread>
+#include <typeinfo>
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+
+namespace alpaka
+{
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuThreads;
+
+    //! The CPU threads accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on a CPU device.
+    //! It uses std::thread to implement the parallelism.
+    template<typename TDim, typename TIdx>
+    class AccCpuThreads final
+        : public WorkDivMembers<TDim, TIdx>
+        , public gb::IdxGbRef<TDim, TIdx>
+        , public bt::IdxBtRefThreadIdMap<TDim, TIdx>
+        , public AtomicHierarchy<
+              AtomicCpu, // grid atomics
+              AtomicCpu, // block atomics
+              AtomicCpu> // thread atomics
+        , public math::MathStdLib
+        , public BlockSharedMemDynMember<>
+        , public BlockSharedMemStMemberMasterSync<>
+        , public BlockSyncBarrierThread<TIdx>
+        , public IntrinsicCpu
+        , public MemFenceCpu
+#    ifdef ALPAKA_DISABLE_VENDOR_RNG
+        , public rand::RandDefault
+#    else
+        , public rand::RandStdLib
+#    endif
+        , public warp::WarpSingleThread
+        , public concepts::Implements<ConceptAcc, AccCpuThreads<TDim, TIdx>>
+    {
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+
+    public:
+        // All TaskKernelCpuThreads instantiations are friends: a friend cannot be partially specialized on TDim/TIdx.
+        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
+        friend class ::alpaka::TaskKernelCpuThreads;
+
+        AccCpuThreads(AccCpuThreads const&) = delete;
+        AccCpuThreads(AccCpuThreads&&) = delete;
+        auto operator=(AccCpuThreads const&) -> AccCpuThreads& = delete;
+        auto operator=(AccCpuThreads&&) -> AccCpuThreads& = delete;
+
+    private: // The constructor is private: only the befriended TaskKernelCpuThreads can create the accelerator.
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST AccCpuThreads(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
+            : WorkDivMembers<TDim, TIdx>(workDiv)
+            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx) // m_gridBlockIdx is initialized only after the base classes
+            , bt::IdxBtRefThreadIdMap<TDim, TIdx>(m_threadToIndexMap)
+            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
+            , BlockSharedMemStMemberMasterSync<>( // callbacks: block synchronization and master thread test
+                  staticMemBegin(),
+                  staticMemCapacity(),
+                  [this]() { syncBlockThreads(*this); },
+                  [this]() noexcept { return (m_idMasterThread == std::this_thread::get_id()); })
+            , BlockSyncBarrierThread<TIdx>(getWorkDiv<Block, Threads>(workDiv).prod()) // threads per block
+            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
+        {
+        }
+
+    private:
+        // getIdx
+        std::mutex mutable m_mtxMapInsert; //!< The mutex used to secure insertion into the ThreadIdToIdxMap.
+        typename bt::IdxBtRefThreadIdMap<TDim, TIdx>::
+            ThreadIdToIdxMap mutable m_threadToIndexMap; //!< The mapping of thread ids to indices.
+        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
+
+        // allocBlockSharedArr
+        std::thread::id mutable m_idMasterThread; //!< The id of the master thread.
+    };
+
+    namespace trait
+    {
+        //! The CPU threads accelerator type trait specialization: the accelerator type is the type itself.
+        template<typename TDim, typename TIdx>
+        struct AccType<AccCpuThreads<TDim, TIdx>>
+        {
+            using type = AccCpuThreads<TDim, TIdx>;
+        };
+
+        //! Marks the CPU threads accelerator as not being a single thread accelerator.
+        template<typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccCpuThreads<TDim, TIdx>> : std::false_type
+        {
+        };
+
+        //! Marks the CPU threads accelerator as a multi thread accelerator.
+        template<typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccCpuThreads<TDim, TIdx>> : std::true_type
+        {
+        };
+
+        //! The CPU threads accelerator device properties get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetAccDevProps<AccCpuThreads<TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
+            { // The values follow the declaration order of the AccDevProps members (aggregate initialization).
+#    ifdef ALPAKA_CI
+                auto const blockThreadCountMax = static_cast<TIdx>(8); // fixed limit when ALPAKA_CI is defined
+#    else
+                // \TODO: Magic number. What is the maximum? Just set a reasonable value? There is an
+                // implementation-defined maximum where the creation of a new thread crashes.
+                // std::thread::hardware_concurrency can return 0, so the result is clamped to at least 1.
+                auto const blockThreadCountMax = std::max(
+                    static_cast<TIdx>(1),
+                    alpaka::core::clipCast<TIdx>(std::thread::hardware_concurrency() * 8));
+#    endif
+                auto const memBytes = getMemBytes(dev);
+                return {// m_multiProcessorCount
+                        static_cast<TIdx>(1),
+                        // m_gridBlockExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax
+                        Vec<TDim, TIdx>::all(blockThreadCountMax),
+                        // m_blockThreadCountMax
+                        blockThreadCountMax,
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes: reported as the same value as the global memory size
+                        memBytes,
+                        // m_globalMemSizeBytes
+                        memBytes};
+            }
+        };
+
+        //! The CPU threads accelerator name trait specialization: "AccCpuThreads<dim,demangled index type>".
+        template<typename TDim, typename TIdx>
+        struct GetAccName<AccCpuThreads<TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
+            {
+                return "AccCpuThreads<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
+            }
+        };
+
+        //! The CPU threads accelerator device type trait specialization: the device type is DevCpu.
+        template<typename TDim, typename TIdx>
+        struct DevType<AccCpuThreads<TDim, TIdx>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU threads accelerator dimension getter trait specialization: the dimension is TDim.
+        template<typename TDim, typename TIdx>
+        struct DimType<AccCpuThreads<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! Creates the CPU threads kernel task from the work division, the kernel function object and its arguments.
+        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+        struct CreateTaskKernel<AccCpuThreads<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+        {
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
+            { // Unlike the single thread accelerators, the work division is not validated here.
+                return TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>(
+                    workDiv,
+                    kernelFnObj,
+                    std::forward<TArgs>(args)...);
+            }
+        };
+
+        //! The CPU threads accelerator platform type trait specialization: the platform type is PlatformCpu.
+        template<typename TDim, typename TIdx>
+        struct PlatformType<AccCpuThreads<TDim, TIdx>>
+        {
+            using type = PlatformCpu;
+        };
+
+        //! The CPU threads accelerator idx type trait specialization: the index type is TIdx.
+        template<typename TDim, typename TIdx>
+        struct IdxType<AccCpuThreads<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccCpuThreads<TDim, TIdx>> // Maps the accelerator type to its tag type.
+        {
+            using type = alpaka::TagCpuThreads;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagCpuThreads, TDim, TIdx> // Maps the tag type back to the accelerator type.
+        {
+            using type = alpaka::AccCpuThreads<TDim, TIdx>;
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/acc/AccDevProps.hpp b/include/alpaka/acc/AccDevProps.hpp
new file mode 100644
index 0000000..a199d54
--- /dev/null
+++ b/include/alpaka/acc/AccDevProps.hpp
@@ -0,0 +1,34 @@
+/* Copyright 2024 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+namespace alpaka
+{
+    //! The accelerator properties of a device (work division limits and memory sizes).
+    //
+    // \TODO:
+    //  TIdx m_maxClockFrequencyHz;            //!< Maximum clock frequency of the device in Hz.
+    template<typename TDim, typename TIdx>
+    struct AccDevProps
+    {
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+
+        // Please keep the order of data members: the GetAccDevProps specializations use aggregate initialization!
+        TIdx m_multiProcessorCount; //!< The number of multiprocessors.
+        Vec<TDim, TIdx> m_gridBlockExtentMax; //!< The maximum number of blocks in each dimension of the grid.
+        TIdx m_gridBlockCountMax; //!< The maximum number of blocks in a grid.
+        Vec<TDim, TIdx> m_blockThreadExtentMax; //!< The maximum number of threads in each dimension of a block.
+        TIdx m_blockThreadCountMax; //!< The maximum number of threads in a block.
+        Vec<TDim, TIdx> m_threadElemExtentMax; //!< The maximum number of elements in each dimension of a thread.
+        TIdx m_threadElemCountMax; //!< The maximum number of elements in a thread.
+        size_t m_sharedMemSizeBytes; //!< The size of shared memory per block in bytes.
+        size_t m_globalMemSizeBytes; //!< The size of global memory in bytes.
+    };
+} // namespace alpaka
diff --git a/include/alpaka/acc/AccFpgaSyclIntel.hpp b/include/alpaka/acc/AccFpgaSyclIntel.hpp
new file mode 100644
index 0000000..d0e099f
--- /dev/null
+++ b/include/alpaka/acc/AccFpgaSyclIntel.hpp
@@ -0,0 +1,38 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/AccGenericSycl.hpp"
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/Sycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
+
+namespace alpaka
+{
+    //! The Intel FPGA SYCL accelerator: the generic SYCL accelerator instantiated with the Intel FPGA SYCL tag.
+    //!
+    //! This accelerator allows parallel kernel execution on a oneAPI-capable Intel FPGA target device.
+    template<typename TDim, typename TIdx>
+    using AccFpgaSyclIntel = AccGenericSycl<TagFpgaSyclIntel, TDim, TIdx>;
+
+    namespace trait
+    {
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccFpgaSyclIntel<TDim, TIdx>> // Maps the accelerator type to its tag type.
+        {
+            using type = alpaka::TagFpgaSyclIntel;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagFpgaSyclIntel, TDim, TIdx> // Maps the tag type back to the accelerator type.
+        {
+            using type = alpaka::AccFpgaSyclIntel<TDim, TIdx>;
+        };
+    } // namespace trait
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp
new file mode 100644
index 0000000..4679344
--- /dev/null
+++ b/include/alpaka/acc/AccGenericSycl.hpp
@@ -0,0 +1,214 @@
+/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Base classes.
+#include "alpaka/atomic/AtomicGenericSycl.hpp"
+#include "alpaka/atomic/AtomicHierarchy.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp"
+#include "alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp"
+#include "alpaka/block/sync/BlockSyncGenericSycl.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/idx/bt/IdxBtGenericSycl.hpp"
+#include "alpaka/idx/gb/IdxGbGenericSycl.hpp"
+#include "alpaka/intrinsic/IntrinsicGenericSycl.hpp"
+#include "alpaka/math/MathGenericSycl.hpp"
+#include "alpaka/mem/fence/MemFenceGenericSycl.hpp"
+#include "alpaka/platform/PlatformGenericSycl.hpp"
+#include "alpaka/rand/RandDefault.hpp"
+#include "alpaka/rand/RandGenericSycl.hpp"
+#include "alpaka/warp/WarpGenericSycl.hpp"
+#include "alpaka/workdiv/WorkDivGenericSycl.hpp"
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+// Implementation details.
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/ClipCast.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Sycl.hpp"
+
+#include <cstddef>
+#include <string>
+#include <type_traits>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelGenericSycl;
+
+    //! The SYCL accelerator, shared by the SYCL back-ends and distinguished by the tag type TTag.
+    //!
+    //! This accelerator allows parallel kernel execution on SYCL devices.
+    template<typename TTag, typename TDim, typename TIdx>
+    class AccGenericSycl
+        : public WorkDivGenericSycl<TDim, TIdx>
+        , public gb::IdxGbGenericSycl<TDim, TIdx>
+        , public bt::IdxBtGenericSycl<TDim, TIdx>
+        , public AtomicHierarchy<AtomicGenericSycl, AtomicGenericSycl, AtomicGenericSycl>
+        , public math::MathGenericSycl
+        , public BlockSharedMemDynGenericSycl
+        , public BlockSharedMemStGenericSycl
+        , public BlockSyncGenericSycl<TDim>
+        , public IntrinsicGenericSycl
+        , public MemFenceGenericSycl
+#    ifdef ALPAKA_DISABLE_VENDOR_RNG
+        , public rand::RandDefault
+#    else
+        , public rand::RandGenericSycl<TDim>
+#    endif
+        , public warp::WarpGenericSycl<TDim>
+        , public concepts::Implements<ConceptAcc, AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        static_assert(TDim::value > 0, "The SYCL accelerator must have a dimension greater than zero.");
+
+    public: // The accelerator is neither copyable nor movable.
+        AccGenericSycl(AccGenericSycl const&) = delete;
+        AccGenericSycl(AccGenericSycl&&) = delete;
+        auto operator=(AccGenericSycl const&) -> AccGenericSycl& = delete;
+        auto operator=(AccGenericSycl&&) -> AccGenericSycl& = delete;
+
+        AccGenericSycl( // Takes the thread element extent, the SYCL nd_item and both shared memory accessors.
+            Vec<TDim, TIdx> const& threadElemExtent,
+            sycl::nd_item<TDim::value> work_item,
+            sycl::local_accessor<std::byte> dyn_shared_acc,
+            sycl::local_accessor<std::byte> st_shared_acc)
+            : WorkDivGenericSycl<TDim, TIdx>{threadElemExtent, work_item}
+            , gb::IdxGbGenericSycl<TDim, TIdx>{work_item}
+            , bt::IdxBtGenericSycl<TDim, TIdx>{work_item}
+            , BlockSharedMemDynGenericSycl{dyn_shared_acc}
+            , BlockSharedMemStGenericSycl{st_shared_acc}
+            , BlockSyncGenericSycl<TDim>{work_item}
+#    ifndef ALPAKA_DISABLE_VENDOR_RNG // otherwise the rand::RandDefault base is default-initialized
+            , rand::RandGenericSycl<TDim>{work_item}
+#    endif
+            , warp::WarpGenericSycl<TDim>{work_item}
+        {
+        }
+    };
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    //! The SYCL accelerator type trait specialization: the accelerator type is the type itself.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct AccType<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        using type = AccGenericSycl<TTag, TDim, TIdx>;
+    };
+
+    //! Marks the SYCL accelerator as not being a single thread accelerator.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct IsSingleThreadAcc<AccGenericSycl<TTag, TDim, TIdx>> : std::false_type
+    {
+    };
+
+    //! Marks the SYCL accelerator as a multi thread accelerator.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct IsMultiThreadAcc<AccGenericSycl<TTag, TDim, TIdx>> : std::true_type
+    {
+    };
+
+    //! The SYCL accelerator device properties get trait specialization.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct GetAccDevProps<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        static auto getAccDevProps(DevGenericSycl<TTag> const& dev) -> AccDevProps<TDim, TIdx>
+        { // The values follow the declaration order of the AccDevProps members (aggregate initialization).
+            auto const device = dev.getNativeHandle().first; // the SYCL device that is queried below
+            auto const max_threads_dim
+                = device.template get_info<sycl::info::device::max_work_item_sizes<TDim::value>>();
+            Vec<TDim, TIdx> max_threads_dim_vec{}; // the per-dimension work-item limits converted to TIdx
+            for(int i = 0; i < static_cast<int>(TDim::value); i++)
+                max_threads_dim_vec[i] = alpaka::core::clipCast<TIdx>(max_threads_dim[i]);
+            return {// m_multiProcessorCount: the number of compute units
+                    alpaka::core::clipCast<TIdx>(device.template get_info<sycl::info::device::max_compute_units>()),
+                    // m_gridBlockExtentMax
+                    getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
+                        // WARNING: There is no SYCL way to determine these values
+                        std::numeric_limits<TIdx>::max(),
+                        std::numeric_limits<TIdx>::max(),
+                        std::numeric_limits<TIdx>::max())),
+                    // m_gridBlockCountMax
+                    std::numeric_limits<TIdx>::max(),
+                    // m_blockThreadExtentMax: the maximum work-item sizes
+                    max_threads_dim_vec,
+                    // m_blockThreadCountMax: the maximum work-group size
+                    alpaka::core::clipCast<TIdx>(device.template get_info<sycl::info::device::max_work_group_size>()),
+                    // m_threadElemExtentMax
+                    Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                    // m_threadElemCountMax
+                    std::numeric_limits<TIdx>::max(),
+                    // m_sharedMemSizeBytes: the local memory size
+                    device.template get_info<sycl::info::device::local_mem_size>(),
+                    // m_globalMemSizeBytes
+                    getMemBytes(dev)};
+        }
+    };
+
+    //! The SYCL accelerator name trait specialization; the name is derived from the demangled tag type name.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct GetAccName<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        static auto getAccName() -> std::string
+        { // Skip the "alpaka::Tag" prefix of the demangled tag name (standard length, no compiler builtin).
+            auto const tag = core::demangled<TTag>.substr(std::char_traits<char>::length("alpaka::Tag"));
+            return std::string("Acc") + tag + "<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
+        }
+    };
+
+    //! The SYCL accelerator device type trait specialization: the SYCL device type with the same tag.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct DevType<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        using type = DevGenericSycl<TTag>;
+    };
+
+    //! The SYCL accelerator dimension getter trait specialization: the dimension is TDim.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct DimType<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        using type = TDim;
+    };
+
+    //! Creates the SYCL kernel task from the work division, the kernel function object and its arguments.
+    template<typename TTag, typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+    struct CreateTaskKernel<AccGenericSycl<TTag, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+    {
+        static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+        { // The accelerator type itself is passed on as a template argument of the task.
+            return TaskKernelGenericSycl<TTag, AccGenericSycl<TTag, TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>{
+                workDiv,
+                kernelFnObj,
+                std::forward<TArgs>(args)...};
+        }
+    };
+
+    //! The SYCL accelerator platform type trait specialization: the SYCL platform type with the same tag.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct PlatformType<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        using type = PlatformGenericSycl<TTag>;
+    };
+
+    //! The SYCL accelerator idx type trait specialization: the index type is TIdx.
+    template<typename TTag, typename TDim, typename TIdx>
+    struct IdxType<AccGenericSycl<TTag, TDim, TIdx>>
+    {
+        using type = TIdx;
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/acc/AccGpuCudaRt.hpp b/include/alpaka/acc/AccGpuCudaRt.hpp
new file mode 100644
index 0000000..5f27e51
--- /dev/null
+++ b/include/alpaka/acc/AccGpuCudaRt.hpp
@@ -0,0 +1,34 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/AccGpuUniformCudaHipRt.hpp"
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/ApiCudaRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+namespace alpaka
+{
+    template<typename TDim, typename TIdx> // The CUDA accelerator: the uniform CUDA/HIP accelerator with ApiCudaRt.
+    using AccGpuCudaRt = AccGpuUniformCudaHipRt<ApiCudaRt, TDim, TIdx>;
+
+    namespace trait
+    {
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccGpuCudaRt<TDim, TIdx>> // Maps the accelerator type to its tag type.
+        {
+            using type = alpaka::TagGpuCudaRt;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagGpuCudaRt, TDim, TIdx> // Maps the tag type back to the accelerator type.
+        {
+            using type = alpaka::AccGpuCudaRt<TDim, TIdx>;
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/acc/AccGpuHipRt.hpp b/include/alpaka/acc/AccGpuHipRt.hpp
new file mode 100644
index 0000000..43c94ab
--- /dev/null
+++ b/include/alpaka/acc/AccGpuHipRt.hpp
@@ -0,0 +1,34 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/AccGpuUniformCudaHipRt.hpp"
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/ApiHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+namespace alpaka
+{
+    template<typename TDim, typename TIdx>
+    using AccGpuHipRt = AccGpuUniformCudaHipRt<ApiHipRt, TDim, TIdx>;
+
+    namespace trait
+    {
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccGpuHipRt<TDim, TIdx>>
+        {
+            using type = alpaka::TagGpuHipRt;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagGpuHipRt, TDim, TIdx>
+        {
+            using type = alpaka::AccGpuHipRt<TDim, TIdx>;
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/acc/AccGpuSyclIntel.hpp b/include/alpaka/acc/AccGpuSyclIntel.hpp
new file mode 100644
index 0000000..2e75b43
--- /dev/null
+++ b/include/alpaka/acc/AccGpuSyclIntel.hpp
@@ -0,0 +1,38 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/AccGenericSycl.hpp"
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/Sycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
+
+namespace alpaka
+{
+    //! The Intel GPU SYCL accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on a oneAPI-capable Intel GPU target device.
+    template<typename TDim, typename TIdx>
+    using AccGpuSyclIntel = AccGenericSycl<TagGpuSyclIntel, TDim, TIdx>;
+
+    namespace trait
+    {
+        template<typename TDim, typename TIdx>
+        struct AccToTag<alpaka::AccGpuSyclIntel<TDim, TIdx>>
+        {
+            using type = alpaka::TagGpuSyclIntel;
+        };
+
+        template<typename TDim, typename TIdx>
+        struct TagToAcc<alpaka::TagGpuSyclIntel, TDim, TIdx>
+        {
+            using type = alpaka::AccGpuSyclIntel<TDim, TIdx>;
+        };
+    } // namespace trait
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp b/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
new file mode 100644
index 0000000..bc0e8cb
--- /dev/null
+++ b/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
@@ -0,0 +1,307 @@
+/* Copyright 2024 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Base classes.
+#include "alpaka/atomic/AtomicHierarchy.hpp"
+#include "alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp"
+#include "alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp"
+#include "alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp"
+#include "alpaka/core/DemangleTypeNames.hpp"
+#include "alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp"
+#include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp"
+#include "alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp"
+#include "alpaka/math/MathUniformCudaHipBuiltIn.hpp"
+#include "alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp"
+#include "alpaka/rand/RandDefault.hpp"
+#include "alpaka/rand/RandUniformCudaHipRand.hpp"
+#include "alpaka/warp/WarpUniformCudaHipBuiltIn.hpp"
+#include "alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp"
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/core/ClipCast.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+
+#include <typeinfo>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelGpuUniformCudaHipRt;
+
+    //! The GPU CUDA/HIP accelerator.
+    //!
+    //! This accelerator allows parallel kernel execution on devices supporting CUDA or HIP, selected by TApi.
+    template<typename TApi, typename TDim, typename TIdx>
+    class AccGpuUniformCudaHipRt final
+        : public WorkDivUniformCudaHipBuiltIn<TDim, TIdx>
+        , public gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>
+        , public bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>
+        , public AtomicHierarchy<
+              AtomicUniformCudaHipBuiltIn, // grid atomics
+              AtomicUniformCudaHipBuiltIn, // block atomics
+              AtomicUniformCudaHipBuiltIn> // thread atomics
+        , public math::MathUniformCudaHipBuiltIn
+        , public BlockSharedMemDynUniformCudaHipBuiltIn
+        , public BlockSharedMemStUniformCudaHipBuiltIn
+        , public BlockSyncUniformCudaHipBuiltIn
+        , public IntrinsicUniformCudaHipBuiltIn
+        , public MemFenceUniformCudaHipBuiltIn
+#    ifdef ALPAKA_DISABLE_VENDOR_RNG
+        , public rand::RandDefault
+#    else
+        , public rand::RandUniformCudaHipRand<TApi>
+#    endif
+        , public warp::WarpUniformCudaHipBuiltIn
+        , public concepts::Implements<ConceptAcc, AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
+    {
+        static_assert(
+            sizeof(TIdx) >= sizeof(int),
+            "Index type is not supported, consider using int or a larger type.");
+
+    public:
+        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt const&) = delete;
+        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt&&) = delete;
+        auto operator=(AccGpuUniformCudaHipRt const&) -> AccGpuUniformCudaHipRt& = delete;
+        auto operator=(AccGpuUniformCudaHipRt&&) -> AccGpuUniformCudaHipRt& = delete;
+
+        ALPAKA_FN_HOST_ACC AccGpuUniformCudaHipRt(Vec<TDim, TIdx> const& threadElemExtent)
+            : WorkDivUniformCudaHipBuiltIn<TDim, TIdx>(threadElemExtent)
+        {
+        }
+    };
+
+    namespace trait
+    {
+        //! The GPU CUDA accelerator type trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct AccType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
+        {
+            using type = AccGpuUniformCudaHipRt<TApi, TDim, TIdx>;
+        };
+
+        //! The GPU CUDA single thread accelerator type trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct IsSingleThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::false_type
+        {
+        };
+
+        //! The GPU CUDA multi thread accelerator type trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct IsMultiThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::true_type
+        {
+        };
+
+        //! The GPU CUDA accelerator device properties get trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct GetAccDevProps<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccDevProps(DevUniformCudaHipRt<TApi> const& dev) -> AccDevProps<TDim, TIdx>
+            {
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                // Reading only the necessary attributes with cudaDeviceGetAttribute is faster than reading all of
+                // them with cudaGetDeviceProperties, see https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
+                int multiProcessorCount = {};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
+                    &multiProcessorCount,
+                    TApi::deviceAttributeMultiprocessorCount,
+                    dev.getNativeHandle()));
+
+                int maxGridSize[3] = {};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
+                    &maxGridSize[0],
+                    TApi::deviceAttributeMaxGridDimX,
+                    dev.getNativeHandle()));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
+                    &maxGridSize[1],
+                    TApi::deviceAttributeMaxGridDimY,
+                    dev.getNativeHandle()));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
+                    &maxGridSize[2],
+                    TApi::deviceAttributeMaxGridDimZ,
+                    dev.getNativeHandle()));
+
+                int maxBlockDim[3] = {};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
+                    &maxBlockDim[0],
+                    TApi::deviceAttributeMaxBlockDimX,
+                    dev.getNativeHandle()));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
+                    &maxBlockDim[1],
+                    TApi::deviceAttributeMaxBlockDimY,
+                    dev.getNativeHandle()));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
+                    &maxBlockDim[2],
+                    TApi::deviceAttributeMaxBlockDimZ,
+                    dev.getNativeHandle()));
+
+                int maxThreadsPerBlock = {};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
+                    &maxThreadsPerBlock,
+                    TApi::deviceAttributeMaxThreadsPerBlock,
+                    dev.getNativeHandle()));
+
+                int sharedMemSizeBytes = {};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
+                    &sharedMemSizeBytes,
+                    TApi::deviceAttributeMaxSharedMemoryPerBlock,
+                    dev.getNativeHandle()));
+
+                return {// m_multiProcessorCount
+                        alpaka::core::clipCast<TIdx>(multiProcessorCount),
+                        // m_gridBlockExtentMax
+                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
+                            alpaka::core::clipCast<TIdx>(maxGridSize[2u]),
+                            alpaka::core::clipCast<TIdx>(maxGridSize[1u]),
+                            alpaka::core::clipCast<TIdx>(maxGridSize[0u]))),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax
+                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
+                            alpaka::core::clipCast<TIdx>(maxBlockDim[2u]),
+                            alpaka::core::clipCast<TIdx>(maxBlockDim[1u]),
+                            alpaka::core::clipCast<TIdx>(maxBlockDim[0u]))),
+                        // m_blockThreadCountMax
+                        alpaka::core::clipCast<TIdx>(maxThreadsPerBlock),
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        static_cast<size_t>(sharedMemSizeBytes),
+                        // m_globalMemSizeBytes
+                        getMemBytes(dev)};
+
+#    else // ALPAKA_ACC_GPU_CUDA_ENABLED is not defined (HIP-only build): read the whole property struct at once
+                typename TApi::DeviceProp_t properties;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&properties, dev.getNativeHandle()));
+
+                return {// m_multiProcessorCount
+                        alpaka::core::clipCast<TIdx>(properties.multiProcessorCount),
+                        // m_gridBlockExtentMax
+                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
+                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[2u]),
+                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[1u]),
+                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[0u]))),
+                        // m_gridBlockCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_blockThreadExtentMax
+                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
+                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[2u]),
+                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[1u]),
+                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[0u]))),
+                        // m_blockThreadCountMax
+                        alpaka::core::clipCast<TIdx>(properties.maxThreadsPerBlock),
+                        // m_threadElemExtentMax
+                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
+                        // m_threadElemCountMax
+                        std::numeric_limits<TIdx>::max(),
+                        // m_sharedMemSizeBytes
+                        static_cast<size_t>(properties.sharedMemPerBlock),
+                        // m_globalMemSizeBytes
+                        getMemBytes(dev)};
+#    endif
+            }
+        };
+
+        //! The GPU CUDA accelerator name trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct GetAccName<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
+            {
+                return std::string("AccGpu") + TApi::name + "Rt<" + std::to_string(TDim::value) + ","
+                       + core::demangled<TIdx> + ">";
+            }
+        };
+
+        //! The GPU CUDA accelerator device type trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct DevType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
+        {
+            using type = DevUniformCudaHipRt<TApi>;
+        };
+
+        //! The GPU CUDA accelerator dimension getter trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct DimType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+    } // namespace trait
+
+    namespace detail
+    {
+        //! specialization of the TKernelFnObj return type evaluation
+        //
+        // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side.
+        // https://github.com/alpaka-group/alpaka/pull/695#issuecomment-446103194
+        // The execution task TaskKernelGpuUniformCudaHipRt is therefore performing this check on device side.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct CheckFnReturnType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
+        {
+            template<typename TKernelFnObj, typename... TArgs>
+            void operator()(TKernelFnObj const&, TArgs const&...)
+            {
+            }
+        };
+    } // namespace detail
+
+    namespace trait
+    {
+        //! The GPU CUDA accelerator execution task type trait specialization.
+        template<
+            typename TApi,
+            typename TDim,
+            typename TIdx,
+            typename TWorkDiv,
+            typename TKernelFnObj,
+            typename... TArgs>
+        struct CreateTaskKernel<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+        {
+            ALPAKA_FN_HOST static auto createTaskKernel(
+                TWorkDiv const& workDiv,
+                TKernelFnObj const& kernelFnObj,
+                TArgs&&... args)
+            {
+                return TaskKernelGpuUniformCudaHipRt<
+                    TApi,
+                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
+                    TDim,
+                    TIdx,
+                    TKernelFnObj,
+                    TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
+            }
+        };
+
+        //! The GPU CUDA accelerator platform type trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct PlatformType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
+        {
+            using type = PlatformUniformCudaHipRt<TApi>;
+        };
+
+        //! The GPU CUDA accelerator idx type trait specialization.
+        template<typename TApi, typename TDim, typename TIdx>
+        struct IdxType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/acc/Tag.hpp b/include/alpaka/acc/Tag.hpp
new file mode 100644
index 0000000..f7880af
--- /dev/null
+++ b/include/alpaka/acc/Tag.hpp
@@ -0,0 +1,74 @@
+/* Copyright 2023 Simeon Ehrig, Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+
+#define CREATE_ACC_TAG(tag_name)                                                                                      \
+    struct tag_name                                                                                                   \
+    {                                                                                                                 \
+        static std::string get_name()                                                                                 \
+        {                                                                                                             \
+            return #tag_name;                                                                                         \
+        }                                                                                                             \
+    }
+
+namespace alpaka
+{
+    CREATE_ACC_TAG(TagCpuOmp2Blocks);
+    CREATE_ACC_TAG(TagCpuOmp2Threads);
+    CREATE_ACC_TAG(TagCpuSerial);
+    CREATE_ACC_TAG(TagCpuSycl);
+    CREATE_ACC_TAG(TagCpuTbbBlocks);
+    CREATE_ACC_TAG(TagCpuThreads);
+    CREATE_ACC_TAG(TagFpgaSyclIntel);
+    CREATE_ACC_TAG(TagGenericSycl);
+    CREATE_ACC_TAG(TagGpuCudaRt);
+    CREATE_ACC_TAG(TagGpuHipRt);
+    CREATE_ACC_TAG(TagGpuSyclIntel);
+
+    namespace trait
+    {
+        template<typename TAcc>
+        struct AccToTag;
+
+        template<typename TTag, typename TDim, typename TIdx>
+        struct TagToAcc;
+    } // namespace trait
+
+    //! \brief maps an acc type to a tag type
+    //! \tparam TAcc alpaka acc type
+    template<typename TAcc>
+    using AccToTag = typename trait::AccToTag<TAcc>::type;
+
+    //! \brief maps a tag type to an acc type
+    //! \tparam TTag alpaka tag type
+    //! \tparam TDim dimension of the mapped acc type
+    //! \tparam TIdx index type of the mapped acc type
+    template<typename TTag, typename TDim, typename TIdx>
+    using TagToAcc = typename trait::TagToAcc<TTag, TDim, TIdx>::type;
+
+    template<typename TAcc, typename... TTag>
+    inline constexpr bool accMatchesTags = (std::is_same_v<alpaka::AccToTag<TAcc>, TTag> || ...); // true if AccToTag<TAcc> is the same type as at least one TTag (false for an empty pack)
+
+    //! list of all available tags
+    using AccTags = std::tuple<
+        alpaka::TagCpuSerial,
+        alpaka::TagCpuThreads,
+        alpaka::TagCpuTbbBlocks,
+        alpaka::TagCpuOmp2Blocks,
+        alpaka::TagCpuOmp2Threads,
+        alpaka::TagGpuCudaRt,
+        alpaka::TagGpuHipRt,
+        alpaka::TagCpuSycl,
+        alpaka::TagFpgaSyclIntel,
+        alpaka::TagGpuSyclIntel>;
+
+} // namespace alpaka
diff --git a/include/alpaka/acc/TagAccIsEnabled.hpp b/include/alpaka/acc/TagAccIsEnabled.hpp
new file mode 100644
index 0000000..c21fd2b
--- /dev/null
+++ b/include/alpaka/acc/TagAccIsEnabled.hpp
@@ -0,0 +1,36 @@
+#pragma once
+
+// Include all accelerators because AccIsEnabled relies on their TagToAcc specializations being visible.
+// If an accelerator header is not included, its tag is reported as not enabled regardless of the compiler flags.
+#include "alpaka/acc/AccCpuOmp2Blocks.hpp"
+#include "alpaka/acc/AccCpuOmp2Threads.hpp"
+#include "alpaka/acc/AccCpuSerial.hpp"
+#include "alpaka/acc/AccCpuSycl.hpp"
+#include "alpaka/acc/AccCpuTbbBlocks.hpp"
+#include "alpaka/acc/AccCpuThreads.hpp"
+#include "alpaka/acc/AccFpgaSyclIntel.hpp"
+#include "alpaka/acc/AccGpuCudaRt.hpp"
+#include "alpaka/acc/AccGpuHipRt.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/meta/Filter.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    //! \brief check if the accelerator is enabled for a given tag (false unless the specialization below matches)
+    //! \tparam TTag alpaka tag type
+    template<typename TTag, typename = void>
+    struct AccIsEnabled : std::false_type
+    {
+    };
+
+    template<typename TTag>
+    struct AccIsEnabled<TTag, std::void_t<TagToAcc<TTag, alpaka::DimInt<1>, int>>> : std::true_type
+    { // selected only when TagToAcc<TTag, DimInt<1>, int> names a type, i.e. a trait::TagToAcc specialization is visible
+    };
+
+    //! list of all tags where the related accelerator is enabled
+    using EnabledAccTags = alpaka::meta::Filter<AccTags, alpaka::AccIsEnabled>;
+
+} // namespace alpaka
diff --git a/include/alpaka/acc/Traits.hpp b/include/alpaka/acc/Traits.hpp
new file mode 100644
index 0000000..48fa0b1
--- /dev/null
+++ b/include/alpaka/acc/Traits.hpp
@@ -0,0 +1,115 @@
+/* Copyright 2024 Benjamin Worpitz, Bernhard Manfred Gruber, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/AccDevProps.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/DemangleTypeNames.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+#include "alpaka/queue/Traits.hpp"
+
+#include <string>
+#include <type_traits>
+#include <typeinfo>
+
+namespace alpaka
+{
+    struct ConceptAcc
+    {
+    };
+
+    //! True if TAcc is an accelerator, i.e. if it implements the ConceptAcc concept.
+    template<typename TAcc>
+    inline constexpr bool isAccelerator = concepts::ImplementsConcept<ConceptAcc, TAcc>::value;
+
+    //! The accelerator traits.
+    namespace trait
+    {
+        //! The accelerator type trait.
+        template<typename T, typename TSfinae = void>
+        struct AccType;
+
+        //! The single thread accelerator trait.
+        //!
+        //! If TAcc is an accelerator that supports only a single thread per block, inherit from std::true_type.
+        //! If TAcc is not an accelerator, or an accelerator that supports multiple threads per block, inherit from
+        //! std::false_type.
+        template<typename TAcc, typename TSfinae = void>
+        struct IsSingleThreadAcc : std::false_type
+        {
+        };
+
+        //! The multi thread accelerator trait.
+        //!
+        //! If TAcc is an accelerator that supports multiple threads per block, inherit from std::true_type.
+        //! If TAcc is not an accelerator, or an accelerator that supports only a single thread per block, inherit from
+        //! std::false_type.
+        template<typename TAcc, typename TSfinae = void>
+        struct IsMultiThreadAcc : std::false_type
+        {
+        };
+
+        //! The device properties get trait.
+        template<typename TAcc, typename TSfinae = void>
+        struct GetAccDevProps;
+
+        //! The accelerator name trait.
+        //!
+        //! The default implementation returns the demangled class name.
+        template<typename TAcc, typename TSfinae = void>
+        struct GetAccName
+        {
+            ALPAKA_FN_HOST static auto getAccName() -> std::string
+            {
+                return core::demangled<TAcc>;
+            }
+        };
+    } // namespace trait
+
+    //! The accelerator type trait alias template to remove the ::type.
+    template<typename T>
+    using Acc = typename trait::AccType<T>::type;
+
+    //! True if TAcc is an accelerator that supports only a single thread per block, false otherwise.
+    template<typename TAcc>
+    inline constexpr bool isSingleThreadAcc = trait::IsSingleThreadAcc<TAcc>::value;
+
+    //! True if TAcc is an accelerator that supports multiple threads per block, false otherwise.
+    template<typename TAcc>
+    inline constexpr bool isMultiThreadAcc = trait::IsMultiThreadAcc<TAcc>::value;
+
+    //! \return The accelerator properties of TAcc on the given device.
+    template<typename TAcc, typename TDev>
+    ALPAKA_FN_HOST auto getAccDevProps(TDev const& dev) -> AccDevProps<Dim<TAcc>, Idx<TAcc>>
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptAcc, TAcc>;
+        return trait::GetAccDevProps<ImplementationBase>::getAccDevProps(dev);
+    }
+
+    //! \return The accelerator name
+    //!
+    //! \tparam TAcc The accelerator type.
+    template<typename TAcc>
+    ALPAKA_FN_HOST auto getAccName() -> std::string
+    {
+        return trait::GetAccName<TAcc>::getAccName();
+    }
+
+    namespace trait
+    {
+        template<typename TAcc, typename TProperty>
+        struct QueueType<TAcc, TProperty, std::enable_if_t<concepts::ImplementsConcept<ConceptAcc, TAcc>::value>>
+        {
+            using type = typename QueueType<typename alpaka::trait::PlatformType<TAcc>::type, TProperty>::type;
+        };
+
+    } // namespace trait
+
+} // namespace alpaka
diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp
new file mode 100644
index 0000000..fe410cf
--- /dev/null
+++ b/include/alpaka/alpaka.hpp
@@ -0,0 +1,229 @@
+/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber,
+ *                Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Include the whole library.
+
+// version number
+#include "alpaka/version.hpp"
+// acc
+#include "alpaka/acc/AccCpuOmp2Blocks.hpp"
+#include "alpaka/acc/AccCpuOmp2Threads.hpp"
+#include "alpaka/acc/AccCpuSerial.hpp"
+#include "alpaka/acc/AccCpuSycl.hpp"
+#include "alpaka/acc/AccCpuTbbBlocks.hpp"
+#include "alpaka/acc/AccCpuThreads.hpp"
+#include "alpaka/acc/AccDevProps.hpp"
+#include "alpaka/acc/AccFpgaSyclIntel.hpp"
+#include "alpaka/acc/AccGenericSycl.hpp"
+#include "alpaka/acc/AccGpuCudaRt.hpp"
+#include "alpaka/acc/AccGpuHipRt.hpp"
+#include "alpaka/acc/AccGpuSyclIntel.hpp"
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/acc/TagAccIsEnabled.hpp"
+#include "alpaka/acc/Traits.hpp"
+// atomic
+#include "alpaka/atomic/AtomicCpu.hpp"
+#include "alpaka/atomic/AtomicGenericSycl.hpp"
+#include "alpaka/atomic/AtomicNoOp.hpp"
+#include "alpaka/atomic/AtomicOmpBuiltIn.hpp"
+#include "alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp"
+#include "alpaka/atomic/Op.hpp"
+#include "alpaka/atomic/Traits.hpp"
+// block
+// shared
+// dynamic
+#include "alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp"
+#include "alpaka/block/shared/dyn/Traits.hpp"
+// static
+#include "alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp"
+#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
+#include "alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp"
+#include "alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp"
+#include "alpaka/block/shared/st/Traits.hpp"
+// sync
+#include "alpaka/block/sync/BlockSyncBarrierOmp.hpp"
+#include "alpaka/block/sync/BlockSyncBarrierThread.hpp"
+#include "alpaka/block/sync/BlockSyncGenericSycl.hpp"
+#include "alpaka/block/sync/BlockSyncNoOp.hpp"
+#include "alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp"
+#include "alpaka/block/sync/Traits.hpp"
+// core
+#include "alpaka/core/Align.hpp"
+#include "alpaka/core/AlignedAlloc.hpp"
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/BarrierThread.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/ClipCast.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Debug.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/core/OmpSchedule.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/core/RemoveRestrict.hpp"
+#include "alpaka/core/RuntimeMacros.hpp"
+#include "alpaka/core/Sycl.hpp"
+#include "alpaka/core/ThreadPool.hpp"
+#include "alpaka/core/Unreachable.hpp"
+#include "alpaka/core/Unroll.hpp"
+#include "alpaka/core/Utility.hpp"
+#include "alpaka/core/Vectorize.hpp"
+// dev
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
+#include "alpaka/dev/DevCudaRt.hpp"
+#include "alpaka/dev/DevFpgaSyclIntel.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/dev/DevGpuSyclIntel.hpp"
+#include "alpaka/dev/DevHipRt.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dev/cpu/Wait.hpp"
+// dim
+#include "alpaka/dim/DimArithmetic.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/dim/Traits.hpp"
+// event
+#include "alpaka/event/EventCpu.hpp"
+#include "alpaka/event/EventCpuSycl.hpp"
+#include "alpaka/event/EventCudaRt.hpp"
+#include "alpaka/event/EventFpgaSyclIntel.hpp"
+#include "alpaka/event/EventGenericSycl.hpp"
+#include "alpaka/event/EventGpuSyclIntel.hpp"
+#include "alpaka/event/EventHipRt.hpp"
+#include "alpaka/event/Traits.hpp"
+// exec
+#include "alpaka/exec/ElementIndex.hpp"
+#include "alpaka/exec/IndependentElements.hpp"
+#include "alpaka/exec/Once.hpp"
+#include "alpaka/exec/UniformElements.hpp"
+// extent
+#include "alpaka/extent/Traits.hpp"
+// idx
+#include "alpaka/idx/Accessors.hpp"
+#include "alpaka/idx/MapIdx.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/idx/bt/IdxBtGenericSycl.hpp"
+#include "alpaka/idx/bt/IdxBtOmp.hpp"
+#include "alpaka/idx/bt/IdxBtRefThreadIdMap.hpp"
+#include "alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp"
+#include "alpaka/idx/bt/IdxBtZero.hpp"
+#include "alpaka/idx/gb/IdxGbGenericSycl.hpp"
+#include "alpaka/idx/gb/IdxGbRef.hpp"
+#include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp"
+// kernel
+#include "alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp"
+#include "alpaka/kernel/TaskKernelCpuOmp2Threads.hpp"
+#include "alpaka/kernel/TaskKernelCpuSerial.hpp"
+#include "alpaka/kernel/TaskKernelCpuSycl.hpp"
+#include "alpaka/kernel/TaskKernelCpuTbbBlocks.hpp"
+#include "alpaka/kernel/TaskKernelCpuThreads.hpp"
+#include "alpaka/kernel/TaskKernelFpgaSyclIntel.hpp"
+#include "alpaka/kernel/TaskKernelGenericSycl.hpp"
+#include "alpaka/kernel/TaskKernelGpuCudaRt.hpp"
+#include "alpaka/kernel/TaskKernelGpuHipRt.hpp"
+#include "alpaka/kernel/TaskKernelGpuSyclIntel.hpp"
+#include "alpaka/kernel/Traits.hpp"
+// math
+#include "alpaka/math/Complex.hpp"
+#include "alpaka/math/MathGenericSycl.hpp"
+#include "alpaka/math/MathStdLib.hpp"
+#include "alpaka/math/MathUniformCudaHipBuiltIn.hpp"
+// mem
+#include "alpaka/mem/alloc/AllocCpuAligned.hpp"
+#include "alpaka/mem/alloc/AllocCpuNew.hpp"
+#include "alpaka/mem/alloc/Traits.hpp"
+#include "alpaka/mem/buf/BufCpu.hpp"
+#include "alpaka/mem/buf/BufCpuSycl.hpp"
+#include "alpaka/mem/buf/BufCudaRt.hpp"
+#include "alpaka/mem/buf/BufFpgaSyclIntel.hpp"
+#include "alpaka/mem/buf/BufGenericSycl.hpp"
+#include "alpaka/mem/buf/BufGpuSyclIntel.hpp"
+#include "alpaka/mem/buf/BufHipRt.hpp"
+#include "alpaka/mem/buf/Traits.hpp"
+#include "alpaka/mem/fence/MemFenceCpu.hpp"
+#include "alpaka/mem/fence/MemFenceCpuSerial.hpp"
+#include "alpaka/mem/fence/MemFenceGenericSycl.hpp"
+#include "alpaka/mem/fence/MemFenceOmp2Blocks.hpp"
+#include "alpaka/mem/fence/MemFenceOmp2Threads.hpp"
+#include "alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp"
+#include "alpaka/mem/fence/Traits.hpp"
+#include "alpaka/mem/global/DeviceGlobalCpu.hpp"
+#include "alpaka/mem/global/DeviceGlobalGenericSycl.hpp"
+#include "alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp"
+#include "alpaka/mem/global/Traits.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/mem/view/ViewConst.hpp"
+#include "alpaka/mem/view/ViewPlainPtr.hpp"
+#include "alpaka/mem/view/ViewStdArray.hpp"
+#include "alpaka/mem/view/ViewStdVector.hpp"
+#include "alpaka/mem/view/ViewSubView.hpp"
+// meta
+#include "alpaka/meta/Apply.hpp"
+#include "alpaka/meta/CartesianProduct.hpp"
+#include "alpaka/meta/Concatenate.hpp"
+#include "alpaka/meta/DependentFalseType.hpp"
+#include "alpaka/meta/Filter.hpp"
+#include "alpaka/meta/Fold.hpp"
+#include "alpaka/meta/ForEachType.hpp"
+#include "alpaka/meta/Functional.hpp"
+#include "alpaka/meta/IntegerSequence.hpp"
+#include "alpaka/meta/Integral.hpp"
+#include "alpaka/meta/IsArrayOrVector.hpp"
+#include "alpaka/meta/IsStrictBase.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/meta/NonZero.hpp"
+#include "alpaka/meta/Set.hpp"
+#include "alpaka/meta/Transform.hpp"
+#include "alpaka/meta/TypeListOps.hpp"
+// offset
+#include "alpaka/offset/Traits.hpp"
+// platform
+#include "alpaka/platform/PlatformCpu.hpp"
+#include "alpaka/platform/PlatformCpuSycl.hpp"
+#include "alpaka/platform/PlatformCudaRt.hpp"
+#include "alpaka/platform/PlatformFpgaSyclIntel.hpp"
+#include "alpaka/platform/PlatformGpuSyclIntel.hpp"
+#include "alpaka/platform/PlatformHipRt.hpp"
+#include "alpaka/platform/Traits.hpp"
+// rand
+#include "alpaka/rand/RandDefault.hpp"
+#include "alpaka/rand/RandGenericSycl.hpp"
+#include "alpaka/rand/RandPhilox.hpp"
+#include "alpaka/rand/RandStdLib.hpp"
+#include "alpaka/rand/RandUniformCudaHipRand.hpp"
+#include "alpaka/rand/Traits.hpp"
+// idx
+#include "alpaka/idx/Traits.hpp"
+// queue
+#include "alpaka/queue/Properties.hpp"
+#include "alpaka/queue/QueueCpuBlocking.hpp"
+#include "alpaka/queue/QueueCpuNonBlocking.hpp"
+#include "alpaka/queue/QueueCpuSyclBlocking.hpp"
+#include "alpaka/queue/QueueCpuSyclNonBlocking.hpp"
+#include "alpaka/queue/QueueCudaRtBlocking.hpp"
+#include "alpaka/queue/QueueCudaRtNonBlocking.hpp"
+#include "alpaka/queue/QueueFpgaSyclIntelBlocking.hpp"
+#include "alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp"
+#include "alpaka/queue/QueueGpuSyclIntelBlocking.hpp"
+#include "alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp"
+#include "alpaka/queue/QueueHipRtBlocking.hpp"
+#include "alpaka/queue/QueueHipRtNonBlocking.hpp"
+#include "alpaka/queue/Traits.hpp"
+// traits
+#include "alpaka/traits/Traits.hpp"
+// wait
+#include "alpaka/wait/Traits.hpp"
+// workdiv
+#include "alpaka/workdiv/Traits.hpp"
+#include "alpaka/workdiv/WorkDivHelpers.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+// vec
+#include "alpaka/vec/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
diff --git a/include/alpaka/atomic/AtomicAtomicRef.hpp b/include/alpaka/atomic/AtomicAtomicRef.hpp
new file mode 100644
index 0000000..61b825c
--- /dev/null
+++ b/include/alpaka/atomic/AtomicAtomicRef.hpp
@@ -0,0 +1,237 @@
+/* Copyright 2022 Felice Pantaleo, Andrea Bocci, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/atomic/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+
+#include <array>
+#include <atomic>
+#include <type_traits>
+
+#ifndef ALPAKA_DISABLE_ATOMIC_ATOMICREF
+#    ifndef ALPAKA_HAS_STD_ATOMIC_REF
+#        include <boost/atomic.hpp>
+#    endif
+
+namespace alpaka
+{
+    namespace detail
+    {
+#    ifndef ALPAKA_HAS_STD_ATOMIC_REF
+        template<typename T> // ALPAKA_HAS_STD_ATOMIC_REF is not defined: take atomic_ref from Boost.Atomic
+        using atomic_ref = boost::atomic_ref<T>;
+#    else
+        template<typename T> // ALPAKA_HAS_STD_ATOMIC_REF is defined: take atomic_ref from the standard library
+        using atomic_ref = std::atomic_ref<T>;
+#    endif
+    } // namespace detail
+
+    //! The atomic ops based on atomic_ref for CPU accelerators.
+    //
+    //  Atomics can be used in the grids, blocks and threads hierarchy levels.
+    //  Empty tag type: the operations are supplied by the trait::AtomicOp specializations in this file.
+
+    class AtomicAtomicRef
+    {
+    };
+
+    template<typename T>
+    void isSupportedByAtomicAtomicRef() // compile-time check only: the body contains nothing but a static_assert
+    {
+        constexpr bool isTrivial = std::is_trivially_copyable_v<T>;
+        constexpr bool isAligned = alpaka::detail::atomic_ref<T>::required_alignment <= alignof(T);
+        static_assert(isTrivial && isAligned, "Type not supported by AtomicAtomicRef, please recompile defining "
+                                              "ALPAKA_DISABLE_ATOMIC_ATOMICREF.");
+    }
+
+    namespace trait
+    {
+        //! AtomicAdd for AtomicAtomicRef: adds `value` to `*addr` atomically and returns the previous content.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicAdd, AtomicAtomicRef, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>(); // rejects unsupported T at compile time
+                alpaka::detail::atomic_ref<T> target{*addr};
+                return target.fetch_add(value);
+            }
+        };
+
+        //! AtomicSub for AtomicAtomicRef: subtracts `value` from `*addr` atomically and returns the previous content.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicSub, AtomicAtomicRef, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>(); // rejects unsupported T at compile time
+                alpaka::detail::atomic_ref<T> target{*addr};
+                return target.fetch_sub(value);
+            }
+        };
+
+        //! AtomicMin for AtomicAtomicRef: stores min(`*addr`, `value`) into `*addr` and returns the previous content.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicMin, AtomicAtomicRef, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>();
+                alpaka::detail::atomic_ref<T> target{*addr};
+                // The minimum is installed with a compare-exchange retry loop.
+                T seen = target;
+                T wanted = std::min(seen, value);
+                // A failed exchange overwrites `seen` with the content found in memory;
+                // the candidate is then recomputed from it before the next attempt.
+                for(; !target.compare_exchange_weak(seen, wanted); wanted = std::min(seen, value))
+                {
+                }
+                return seen;
+            }
+        };
+
+        //! AtomicMax for AtomicAtomicRef: stores max(`*addr`, `value`) into `*addr` and returns the previous content.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicMax, AtomicAtomicRef, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>();
+                alpaka::detail::atomic_ref<T> target{*addr};
+                // The maximum is installed with a compare-exchange retry loop.
+                T seen = target;
+                T wanted = std::max(seen, value);
+                // A failed exchange overwrites `seen` with the content found in memory;
+                // the candidate is then recomputed from it before the next attempt.
+                for(; !target.compare_exchange_weak(seen, wanted); wanted = std::max(seen, value))
+                {
+                }
+                return seen;
+            }
+        };
+
+        //! AtomicExch for AtomicAtomicRef: atomically replaces the content of `*addr` with `value`.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicExch, AtomicAtomicRef, T, THierarchy>
+        {
+            //! \param addr  Location whose content is replaced.
+            //! \param value New content to store.
+            //! \return The content `*addr` held immediately before the store.
+            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>(); // rejects unsupported T at compile time
+                alpaka::detail::atomic_ref<T> ref(*addr);
+                // atomic_ref offers a native exchange: one unconditional read-modify-write,
+                // so no compare-exchange retry loop is required here.
+                return ref.exchange(value);
+            }
+        };
+
+
+        //! AtomicInc for AtomicAtomicRef: wrap-around increment of `*addr`; returns the previous content.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicInc, AtomicAtomicRef, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>();
+                alpaka::detail::atomic_ref<T> target{*addr};
+                // Next content for a given current one: back to 0 once `value` is reached, otherwise one more.
+                auto const next = [&value](T const& cur) -> T { return (cur >= value) ? 0 : static_cast<T>(cur + 1); };
+                T seen = target;
+                for(T wanted = next(seen); !target.compare_exchange_weak(seen, wanted); wanted = next(seen))
+                {
+                }
+                return seen;
+            }
+        };
+
+        //! AtomicDec for AtomicAtomicRef: wrap-around decrement of `*addr`; returns the previous content.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicDec, AtomicAtomicRef, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>(); // rejects unsupported T at compile time
+                alpaka::detail::atomic_ref<T> ref(*addr);
+                auto const next = [&value](T const& cur) -> T // wrap to `value` at 0 or above `value`, else subtract 1
+                { return ((cur == static_cast<T>(0)) || (cur > value)) ? value : static_cast<T>(cur - 1); };
+                T old = ref; // refreshed by compare_exchange_weak whenever the exchange fails
+                for(T result = next(old); !ref.compare_exchange_weak(old, result); result = next(old))
+                {
+                }
+                return old;
+            }
+        };
+
+        //! AtomicAnd for AtomicAtomicRef: bitwise-ANDs `value` into `*addr` atomically; returns the previous content.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicAnd, AtomicAtomicRef, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>(); // rejects unsupported T at compile time
+                alpaka::detail::atomic_ref<T> target{*addr};
+                return target.fetch_and(value);
+            }
+        };
+
+        //! AtomicOr for AtomicAtomicRef: bitwise-ORs `value` into `*addr` atomically; returns the previous content.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicOr, AtomicAtomicRef, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>(); // rejects unsupported T at compile time
+                alpaka::detail::atomic_ref<T> target{*addr};
+                return target.fetch_or(value);
+            }
+        };
+
+        //! AtomicXor for AtomicAtomicRef: bitwise-XORs `value` into `*addr` atomically; returns the previous content.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicXor, AtomicAtomicRef, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>(); // rejects unsupported T at compile time
+                alpaka::detail::atomic_ref<T> target{*addr};
+                return target.fetch_xor(value);
+            }
+        };
+
+        //! The CPU accelerators AtomicCas: stores `value` into `*addr` if the content compares equal to `compare`.
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicCas, AtomicAtomicRef, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(
+                AtomicAtomicRef const&,
+                T* const addr,
+                T const& compare,
+                T const& value) -> T
+            {
+                isSupportedByAtomicAtomicRef<T>();
+                alpaka::detail::atomic_ref<T> ref(*addr);
+                T old = ref; // snapshot of the content; compare_exchange_weak refreshes it when the exchange fails
+                T result; // content to write: `value` on a match, otherwise the unchanged `old`
+                do
+                {
+#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG
+#        pragma GCC diagnostic push
+#        pragma GCC diagnostic ignored "-Wfloat-equal"
+#    endif
+                    result = ((old == compare) ? value : old); // compares with operator==, hence the pragma above
+#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG
+#        pragma GCC diagnostic pop
+#    endif
+                } while(!ref.compare_exchange_weak(old, result)); // retry until the write goes through
+                return old; // the content observed by the successful exchange
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/atomic/AtomicCpu.hpp b/include/alpaka/atomic/AtomicCpu.hpp
new file mode 100644
index 0000000..5667bd0
--- /dev/null
+++ b/include/alpaka/atomic/AtomicCpu.hpp
@@ -0,0 +1,30 @@
+/* Copyright 2024 Andrea Bocci, Felice Pantaleo
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+// clang 9/10/11 as host compiler of nvcc < 11.6.0 fails to compile boost::atomic_ref: disable that backend there
+#ifdef BOOST_COMP_CLANG_AVAILABLE
+#    if(BOOST_COMP_CLANG < BOOST_VERSION_NUMBER(12, 0, 0) && BOOST_COMP_NVCC                                          \
+        && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 6, 0))
+#        if !defined(ALPAKA_DISABLE_ATOMIC_ATOMICREF)
+#            define ALPAKA_DISABLE_ATOMIC_ATOMICREF
+#        endif // !defined(ALPAKA_DISABLE_ATOMIC_ATOMICREF)
+#    endif // clang < 12 together with nvcc < 11.6
+#endif // BOOST_COMP_CLANG_AVAILABLE
+
+#include "alpaka/atomic/AtomicAtomicRef.hpp"
+#include "alpaka/atomic/AtomicStdLibLock.hpp"
+
+namespace alpaka
+{
+#ifdef ALPAKA_DISABLE_ATOMIC_ATOMICREF
+    using AtomicCpu = AtomicStdLibLock<16>; // atomic_ref backend disabled: lock-based implementation
+#else
+    using AtomicCpu = AtomicAtomicRef; // default: atomic_ref-based implementation
+#endif // ALPAKA_DISABLE_ATOMIC_ATOMICREF
+
+} // namespace alpaka
diff --git a/include/alpaka/atomic/AtomicGenericSycl.hpp b/include/alpaka/atomic/AtomicGenericSycl.hpp
new file mode 100644
index 0000000..bdfa53b
--- /dev/null
+++ b/include/alpaka/atomic/AtomicGenericSycl.hpp
@@ -0,0 +1,263 @@
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/atomic/Op.hpp"
+#include "alpaka/atomic/Traits.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/meta/DependentFalseType.hpp"
+
+#include <cstdint>
+#include <type_traits>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    //! The SYCL accelerator atomic ops.
+    //
+    //  Atomics can be used in the hierarchy levels grids, blocks and threads.
+    //  Atomics are not guaranteed to be safe between devices.
+    class AtomicGenericSycl
+    {
+    };
+
+    namespace detail
+    {
+        template<typename THierarchy> // primary template: has no `value`; only the specializations provide one
+        struct SyclMemoryScope
+        {
+        };
+
+        template<> // grid level -> device scope
+        struct SyclMemoryScope<hierarchy::Grids>
+        {
+            static constexpr sycl::memory_scope value{sycl::memory_scope::device};
+        };
+
+        template<> // block level -> device scope
+        struct SyclMemoryScope<hierarchy::Blocks>
+        {
+            static constexpr sycl::memory_scope value{sycl::memory_scope::device};
+        };
+
+        template<> // thread level -> work-group scope
+        struct SyclMemoryScope<hierarchy::Threads>
+        {
+            static constexpr sycl::memory_scope value{sycl::memory_scope::work_group};
+        };
+
+        template<typename T, typename THierarchy> // relaxed-order atomic_ref; its memory scope follows THierarchy
+        using sycl_atomic_ref = sycl::atomic_ref<T, sycl::memory_order::relaxed, SyclMemoryScope<THierarchy>::value>;
+
+        template<typename THierarchy, typename T, typename TOp>
+        inline auto callAtomicOp(T* const addr, TOp&& op) // wraps *addr in a sycl_atomic_ref and passes it to op
+        {
+            sycl_atomic_ref<T, THierarchy> target{*addr};
+            return op(target);
+        }
+
+        template<typename TRef, typename T, typename TEval>
+        inline auto casWithCondition(T* const addr, TEval&& eval)
+        {
+            // Replaces the content of *addr with eval(content) and returns the content that was replaced.
+            TRef target{*addr};
+            auto seen = target.load();
+            // compare_exchange_weak is preferred inside a loop, assuming that eval is not expensive.
+            for(bool done = false; !done;)
+            {
+                done = target.compare_exchange_weak(seen, eval(seen));
+            }
+            return seen;
+        }
+    } // namespace detail
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    // Add.
+    //! SYCL atomic add: adds `value` to `*addr` and returns the content held before.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicAdd, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(std::is_arithmetic_v<T>, "SYCL atomics do not support this type");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
+        {
+            // The helper builds the atomic reference; the lambda selects the operation to run on it.
+            auto const add = [&value](auto& target) { return target.fetch_add(value); };
+            return alpaka::detail::callAtomicOp<THierarchy>(addr, add);
+        }
+    };
+
+    // Sub.
+    //! SYCL atomic subtract: subtracts `value` from `*addr` and returns the content held before.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicSub, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(std::is_arithmetic_v<T>, "SYCL atomics do not support this type");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
+        {
+            // The helper builds the atomic reference; the lambda selects the operation to run on it.
+            auto const subtract = [&value](auto& target) { return target.fetch_sub(value); };
+            return alpaka::detail::callAtomicOp<THierarchy>(addr, subtract);
+        }
+    };
+
+    // Min.
+    //! SYCL atomic minimum: applies fetch_min with `value` to `*addr` and returns its result.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicMin, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(std::is_arithmetic_v<T>, "SYCL atomics do not support this type");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
+        {
+            // The helper builds the atomic reference; the lambda selects the operation to run on it.
+            auto const takeMin = [&value](auto& target) { return target.fetch_min(value); };
+            return alpaka::detail::callAtomicOp<THierarchy>(addr, takeMin);
+        }
+    };
+
+    // Max.
+    //! SYCL atomic maximum: applies fetch_max with `value` to `*addr` and returns its result.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicMax, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(std::is_arithmetic_v<T>, "SYCL atomics do not support this type");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
+        {
+            // The helper builds the atomic reference; the lambda selects the operation to run on it.
+            auto const takeMax = [&value](auto& target) { return target.fetch_max(value); };
+            return alpaka::detail::callAtomicOp<THierarchy>(addr, takeMax);
+        }
+    };
+
+    // Exch.
+    //! SYCL atomic exchange: replaces the content of `*addr` with `value` via exchange() and returns its result.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicExch, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(
+            std::is_arithmetic_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
+            "SYCL atomics do not support this type");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
+        {
+            return alpaka::detail::callAtomicOp<THierarchy>(addr, [&](auto& target) { return target.exchange(value); });
+        }
+    };
+
+    // Inc.
+    //! SYCL atomic wrap-around increment: back to 0 once `value` is reached, otherwise one more.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicInc, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(
+            std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
+            "SYCL atomics support only 32- and 64-bits unsigned integral types");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
+        {
+            auto const wrapUp = [&value](T const cur) -> T
+            { return (cur < value) ? static_cast<T>(cur + static_cast<T>(1)) : static_cast<T>(0); };
+            return alpaka::detail::casWithCondition<alpaka::detail::sycl_atomic_ref<T, THierarchy>>(addr, wrapUp);
+        }
+    };
+
+    // Dec.
+    //! SYCL atomic wrap-around decrement: wraps to `value` at 0 or above `value`, otherwise one less.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicDec, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(
+            std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
+            "SYCL atomics support only 32- and 64-bits unsigned integral types");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
+        {
+            auto const wrapDown = [&value](T const& cur) -> T
+            { return ((cur != 0) && (cur <= value)) ? static_cast<T>(cur - static_cast<T>(1)) : value; };
+            return alpaka::detail::casWithCondition<alpaka::detail::sycl_atomic_ref<T, THierarchy>>(addr, wrapDown);
+        }
+    };
+
+    // And.
+    //! SYCL atomic bitwise AND: ANDs `value` into `*addr` and returns the content held before.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicAnd, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
+        {
+            // The helper builds the atomic reference; the lambda selects the operation to run on it.
+            auto const bitAnd = [&value](auto& target) { return target.fetch_and(value); };
+            return alpaka::detail::callAtomicOp<THierarchy>(addr, bitAnd);
+        }
+    };
+
+    // Or.
+    //! SYCL atomic bitwise OR: ORs `value` into `*addr` and returns the content held before.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicOr, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
+        {
+            return alpaka::detail::callAtomicOp<THierarchy>(addr, [&](auto& target) { return target.fetch_or(value); });
+        }
+    };
+
+    // Xor.
+    //! SYCL atomic bitwise XOR: XORs `value` into `*addr` and returns the content held before.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicXor, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
+        {
+            // The helper builds the atomic reference; the lambda selects the operation to run on it.
+            auto const bitXor = [&value](auto& target) { return target.fetch_xor(value); };
+            return alpaka::detail::callAtomicOp<THierarchy>(addr, bitXor);
+        }
+    };
+
+    // Cas.
+    //! SYCL atomic compare-and-swap: stores `desired` into `*addr` if it holds `expected`.
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicCas, AtomicGenericSycl, T, THierarchy>
+    {
+        static_assert(std::is_arithmetic_v<T>, "SYCL atomics do not support this type");
+
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& expected, T const& desired) -> T
+        {
+            auto const compareAndSwap = [&](auto& target)
+            {
+                // compare_exchange_strong may overwrite its first argument,
+                // so it is given a local copy of the caller's `expected`.
+                auto observed = expected;
+
+                // If `target` holds `observed`, `desired` is stored in `target`.
+                // Otherwise `observed` is updated with the content of `target`.
+                // The returned bool (did the exchange happen?) is not used by the alpaka API.
+                target.compare_exchange_strong(observed, desired);
+
+                // On success this is the previous content of `target`, otherwise its current content.
+                return observed;
+            };
+
+            return alpaka::detail::callAtomicOp<THierarchy>(addr, compareAndSwap);
+        }
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/atomic/AtomicHierarchy.hpp b/include/alpaka/atomic/AtomicHierarchy.hpp
new file mode 100644
index 0000000..d9c3c3a
--- /dev/null
+++ b/include/alpaka/atomic/AtomicHierarchy.hpp
@@ -0,0 +1,34 @@
+/* Copyright 2020 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/atomic/Traits.hpp"
+#include "alpaka/meta/InheritFromList.hpp"
+#include "alpaka/meta/Unique.hpp"
+
+#include <tuple>
+
+namespace alpaka
+{
+    //! Builds a single class that inherits from the different atomic implementations.
+    //
+    //  The resulting type inherits from the implementations of all three hierarchy levels.
+    //  Using the same type for several levels is allowed.
+    //  The class provides the feature that each atomic operation can be focused
+    //  on a hierarchy level in alpaka. An operation on a hierarchy level is independent
+    //  of the memory hierarchy.
+    //
+    //  \tparam TGridAtomic atomic implementation for atomic operations between grids within a device
+    //  \tparam TBlockAtomic atomic implementation for atomic operations between blocks within a grid
+    //  \tparam TThreadAtomic atomic implementation for atomic operations between threads within a block
+    template<typename TGridAtomic, typename TBlockAtomic, typename TThreadAtomic>
+    using AtomicHierarchy = alpaka::meta::InheritFromList<alpaka::meta::Unique<std::tuple<
+        TGridAtomic,
+        TBlockAtomic,
+        TThreadAtomic,
+        concepts::Implements<ConceptAtomicGrids, TGridAtomic>,
+        concepts::Implements<ConceptAtomicBlocks, TBlockAtomic>,
+        concepts::Implements<ConceptAtomicThreads, TThreadAtomic>>>>;
+} // namespace alpaka
diff --git a/include/alpaka/atomic/AtomicNoOp.hpp b/include/alpaka/atomic/AtomicNoOp.hpp
new file mode 100644
index 0000000..d51a2c3
--- /dev/null
+++ b/include/alpaka/atomic/AtomicNoOp.hpp
@@ -0,0 +1,37 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/atomic/Traits.hpp"
+
+namespace alpaka
+{
+    //! The NoOp atomic ops (empty tag type selecting the trait::AtomicOp specialization for AtomicNoOp).
+    class AtomicNoOp
+    {
+    };
+
+    namespace trait
+    {
+        //! The NoOp atomic operation: invokes the operation functor directly; this layer adds no locking.
+        template<typename TOp, typename T, typename THierarchy>
+        struct AtomicOp<TOp, AtomicNoOp, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicNoOp const& /* unused */, T* const addr, T const& value) -> T
+            {
+                return TOp{}(addr, value); // two-operand operations
+            }
+
+            ALPAKA_FN_HOST static auto atomicOp(
+                AtomicNoOp const& /* unused */,
+                T* const addr,
+                T const& compare,
+                T const& value) -> T
+            {
+                return TOp{}(addr, compare, value); // three-operand form (compare-and-swap style)
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/atomic/AtomicOmpBuiltIn.hpp b/include/alpaka/atomic/AtomicOmpBuiltIn.hpp
new file mode 100644
index 0000000..e1f0ba0
--- /dev/null
+++ b/include/alpaka/atomic/AtomicOmpBuiltIn.hpp
@@ -0,0 +1,320 @@
+/* Copyright 2022 René Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/atomic/Op.hpp"
+#include "alpaka/atomic/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+
+#ifdef _OPENMP
+
+namespace alpaka
+{
+    //! The OpenMP accelerators atomic ops (empty tag type; the operations are trait::AtomicOp specializations).
+    //
+    //  Atomics can be used in the blocks and threads hierarchy levels.
+    //  Atomics are not guaranteed to be safe between devices or grids.
+    class AtomicOmpBuiltIn
+    {
+    };
+
+    namespace trait
+    {
+// check for OpenMP 3.1+
+// "omp atomic capture" is not supported before OpenMP 3.1
+#    if _OPENMP >= 201107
+
+        //! The OpenMP accelerators atomic operation: ADD (adds `value` to `*addr`, returns the previous content)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicAdd, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old; // assigned inside the capture block below
+                auto& ref(*addr); // the location being updated
+// atomically update ref, but capture the original value in old (GCC's -Wconversion is silenced meanwhile)
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic push
+#            pragma GCC diagnostic ignored "-Wconversion"
+#        endif
+#        pragma omp atomic capture
+                {
+                    old = ref;
+                    ref += value;
+                }
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic pop
+#        endif
+                return old;
+            }
+        };
+
+        //! The OpenMP accelerators atomic operation: SUB (subtracts `value` from `*addr`, returns the previous content)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicSub, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old; // assigned inside the capture block below
+                auto& ref(*addr); // the location being updated
+// atomically update ref, but capture the original value in old (GCC's -Wconversion is silenced meanwhile)
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic push
+#            pragma GCC diagnostic ignored "-Wconversion"
+#        endif
+#        pragma omp atomic capture
+                {
+                    old = ref;
+                    ref -= value;
+                }
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic pop
+#        endif
+                return old;
+            }
+        };
+
+        //! The OpenMP accelerators atomic operation: EXCH (stores `value` in `*addr`, returns the previous content)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicExch, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old; // assigned inside the capture block below
+                auto& ref(*addr); // the location being updated
+// atomically overwrite ref with value, but capture the original value in old
+#        pragma omp atomic capture
+                {
+                    old = ref;
+                    ref = value;
+                }
+                return old;
+            }
+        };
+
+        //! The OpenMP accelerators atomic operation: AND (ANDs `value` into `*addr`, returns the previous content)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicAnd, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old; // assigned inside the capture block below
+                auto& ref(*addr); // the location being updated
+// atomically update ref, but capture the original value in old (GCC's -Wconversion is silenced meanwhile)
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic push
+#            pragma GCC diagnostic ignored "-Wconversion"
+#        endif
+#        pragma omp atomic capture
+                {
+                    old = ref;
+                    ref &= value;
+                }
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic pop
+#        endif
+                return old;
+            }
+        };
+
+        //! The OpenMP accelerators atomic operation: OR (ORs `value` into `*addr`, returns the previous content)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicOr, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old; // assigned inside the capture block below
+                auto& ref(*addr); // the location being updated
+// atomically update ref, but capture the original value in old (GCC's -Wconversion is silenced meanwhile)
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic push
+#            pragma GCC diagnostic ignored "-Wconversion"
+#        endif
+#        pragma omp atomic capture
+                {
+                    old = ref;
+                    ref |= value;
+                }
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic pop
+#        endif
+                return old;
+            }
+        };
+
+        //! The OpenMP accelerators atomic operation: XOR (XORs `value` into `*addr`, returns the previous content)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicXor, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old; // assigned inside the capture block below
+                auto& ref(*addr); // the location being updated
+// atomically update ref, but capture the original value in old (GCC's -Wconversion is silenced meanwhile)
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic push
+#            pragma GCC diagnostic ignored "-Wconversion"
+#        endif
+#        pragma omp atomic capture
+                {
+                    old = ref;
+                    ref ^= value;
+                }
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic pop
+#        endif
+                return old;
+            }
+        };
+
+#    endif // _OPENMP >= 201107
+
+// check for OpenMP 5.1+
+// "omp atomic compare" was introduced with OpenMP 5.1
+#    if _OPENMP >= 202011
+
+        //! The OpenMP accelerators atomic operation: Min (stores `value` if it is smaller; returns the previous content)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicMin, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T value) -> T
+            {
+                T old; // assigned inside the capture block below
+                auto& ref(*addr); // the location being updated
+// atomically update ref if value is smaller, but capture the original value in old
+#        pragma omp atomic capture compare
+                {
+                    old = ref;
+                    // Do not remove the curly brackets of the if body else
+                    // icpx 2024.0 is not able to compile the atomics.
+                    if(value < ref)
+                    {
+                        ref = value;
+                    }
+                }
+                return old;
+            }
+        };
+
+        //! The OpenMP accelerators atomic operation: Max (stores `value` if it is larger; returns the previous content)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicMax, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T value) -> T
+            {
+                T old; // assigned inside the capture block below
+                auto& ref(*addr); // the location being updated
+// atomically update ref if value is larger, but capture the original value in old
+#        pragma omp atomic capture compare
+                {
+                    old = ref;
+                    // Do not remove the curly brackets of the if body else
+                    // icpx 2024.0 is not able to compile the atomics.
+                    if(value > ref)
+                    {
+                        ref = value;
+                    }
+                }
+                return old;
+            }
+        };
+
+        //! The OpenMP accelerators atomic operation: Inc (runs the plain AtomicInc functor in a critical section)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicInc, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                // TODO(bgruber): atomic increment with wrap around is not implementable in OpenMP 5.1
+                T previous; // assigned inside the critical section
+#        pragma omp critical(AlpakaOmpAtomicOp)
+                {
+                    previous = AtomicInc{}(addr, value);
+                }
+                return previous;
+            }
+        };
+
+        //! The OpenMP accelerators atomic operation: Dec (runs the plain AtomicDec functor in a critical section)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicDec, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                // TODO(bgruber): atomic decrement with wrap around is not implementable in OpenMP 5.1
+                T previous; // assigned inside the critical section
+#        pragma omp critical(AlpakaOmpAtomicOp)
+                {
+                    previous = AtomicDec{}(addr, value);
+                }
+                return previous;
+            }
+        };
+
+        //! The OpenMP accelerators atomic operation: Cas (stores `value` if `*addr` equals `compare`)
+        template<typename T, typename THierarchy>
+        struct AtomicOp<AtomicCas, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T compare, T value) -> T
+            {
+                T old; // assigned inside the capture block below
+                auto& ref(*addr); // the location being updated
+// atomically update ref if it equals compare, but capture the original value in old
+#        pragma omp atomic capture compare
+                {
+                    old = ref;
+                    // Do not remove the curly brackets of the if body else
+                    // icpx 2024.0 is not able to compile the atomics.
+                    if(ref == compare)
+                    {
+                        ref = value;
+                    }
+                }
+                return old;
+            }
+        };
+
+#    else
+        //! The OpenMP accelerators atomic operation
+        //
+        // generic implementation for operations without a native atomic: runs the plain functor in a critical section
+        template<typename TOp, typename T, typename THierarchy>
+        struct AtomicOp<TOp, AtomicOmpBuiltIn, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
+            {
+                T old;
+                // \TODO: Currently not only the access to the same memory location is protected by a mutex but all
+                // atomic ops on all threads.
+#        pragma omp critical(AlpakaOmpAtomicOp)
+                {
+                    old = TOp()(addr, value);
+                }
+                return old;
+            }
+
+            ALPAKA_FN_HOST static auto atomicOp(
+                AtomicOmpBuiltIn const&,
+                T* const addr,
+                T const& compare,
+                T const& value) -> T
+            {
+                T old;
+                // Same critical section name as the two-operand overload: critical sections with different names
+                // do not exclude each other, so the three-operand form must not use a name of its own.
+#        pragma omp critical(AlpakaOmpAtomicOp)
+                {
+                    old = TOp()(addr, compare, value);
+                }
+                return old;
+            }
+        };
+
+#    endif // _OPENMP >= 202011
+
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/atomic/AtomicStdLibLock.hpp b/include/alpaka/atomic/AtomicStdLibLock.hpp
new file mode 100644
index 0000000..16a659f
--- /dev/null
+++ b/include/alpaka/atomic/AtomicStdLibLock.hpp
@@ -0,0 +1,103 @@
+/* Copyright 2022 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/atomic/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+
+#include <array>
+#include <mutex>
+
+#ifdef ALPAKA_DISABLE_ATOMIC_ATOMICREF
+
+namespace alpaka
+{
+    //! The CPU threads accelerator atomic ops.
+    //
+    //  Atomics can be used in the grids, blocks and threads hierarchy levels.
+    //  Atomics are not guaranteed to be safe between devices.
+    //
+    // \tparam THashTableSize size of the hash table to allow concurrency between
+    //                        atomics to different addresses
+    template<size_t THashTableSize>
+    class AtomicStdLibLock
+    {
+    public:
+        template<typename TAtomic, typename TOp, typename T, typename THierarchy, typename TSfinae>
+        friend struct trait::AtomicOp;
+
+        static constexpr auto nextPowerOf2(size_t const value, size_t const bit = 0u) -> size_t
+        {
+            return value <= (static_cast<size_t>(1u) << bit) ? (static_cast<size_t>(1u) << bit)
+                                                             : nextPowerOf2(value, bit + 1u);
+        }
+
+        //! get a hash value of the pointer
+        //
+        // This is not a perfect hash: there will be collisions if sizeof(TPtr)
+        // is not a power of two.
+        template<typename TPtr>
+        static auto hash(TPtr const* const ptr) -> size_t
+        {
+            auto const ptrAddr = reinterpret_cast<size_t>(ptr);
+            // using power of two for the next division will increase the performance
+            constexpr size_t typeSizePowerOf2 = nextPowerOf2(sizeof(TPtr));
+            // division removes the stride between indices
+            return (ptrAddr / typeSizePowerOf2);
+        }
+
+        template<typename TPtr>
+        auto getMutex(TPtr const* const ptr) const -> std::mutex&
+        {
+            //! get the size of the hash table
+            //
+            // The size is at least 1 or THashTableSize rounded up to the next power of 2
+            constexpr size_t hashTableSize = THashTableSize == 0u ? 1u : nextPowerOf2(THashTableSize);
+
+            size_t const hashedAddr = hash(ptr) & (hashTableSize - 1u);
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wexit-time-destructors"
+#    endif
+            static std::array<
+                std::mutex,
+                hashTableSize>
+                m_mtxAtomic; //!< The mutexes protecting access for atomic operations, one per hash bucket.
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+            return m_mtxAtomic[hashedAddr];
+        }
+    };
+
+    namespace trait
+    {
+        //! The CPU threads accelerator atomic operation: applies TOp while holding the mutex selected for addr.
+        template<typename TOp, typename T, typename THierarchy, size_t THashTableSize>
+        struct AtomicOp<TOp, AtomicStdLibLock<THashTableSize>, T, THierarchy>
+        {
+            ALPAKA_FN_HOST static auto atomicOp(
+                AtomicStdLibLock<THashTableSize> const& atomic,
+                T* const addr,
+                T const& value) -> T
+            {
+                std::lock_guard<std::mutex> lock(atomic.getMutex(addr));
+                return TOp()(addr, value);
+            }
+
+            ALPAKA_FN_HOST static auto atomicOp(
+                AtomicStdLibLock<THashTableSize> const& atomic,
+                T* const addr,
+                T const& compare,
+                T const& value) -> T
+            {
+                std::lock_guard<std::mutex> lock(atomic.getMutex(addr));
+                return TOp()(addr, compare, value);
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/atomic/AtomicUniformCudaHip.hpp b/include/alpaka/atomic/AtomicUniformCudaHip.hpp
new file mode 100644
index 0000000..330e3a4
--- /dev/null
+++ b/include/alpaka/atomic/AtomicUniformCudaHip.hpp
@@ -0,0 +1,512 @@
+/* Copyright 2022 René Widera
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/atomic/Op.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/core/Utility.hpp"
+
+#include <type_traits>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    //! The GPU CUDA/HIP accelerator atomic ops.
+    //
+    //  Atomics can be used in the hierarchy level grids, blocks and threads.
+    //  Atomics are not guaranteed to be safe between devices.
+    class AtomicUniformCudaHipBuiltIn
+    {
+    };
+} // namespace alpaka
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+//! clang provides builtins for several atomic functions even if these are not supported for architectures < 6.0
+#        define CLANG_CUDA_PTX_WORKAROUND                                                                             \
+            (BOOST_COMP_CLANG && BOOST_LANG_CUDA && BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(6, 0, 0))
+
+//! These types must be in the global namespace for checking existence of respective functions in global namespace via
+//! SFINAE, so we use inline namespace.
+inline namespace alpakaGlobal
+{
+    //! Provide an interface to builtin atomic functions.
+    //
+    // To check for the existence of builtin functions located in the global namespace :: directly.
+    // This would not be possible without having these types in global namespace.
+    // If the functor inherits from std::false_type, a signature is explicitly not available. This can be used to
+    // explicitly disable a builtin function in case the builtin is broken.
+    // If the functor is inheriting from std::true_type a specialization must implement one of the following
+    // interfaces.
+    // \code{.cpp}
+    //    // interface for all atomics except atomicCas
+    //    __device__ static T atomic( T* add, T value);
+    //    // interface for atomicCas only
+    //    __device__ static T atomic( T* add, T compare, T value);
+    // \endcode
+    template<typename TOp, typename T, typename THierarchy, typename TSfinae = void>
+    struct AlpakaBuiltInAtomic : std::false_type
+    {
+    };
+
+    // Cas.
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicCas,
+        T,
+        THierarchy,
+        typename std::void_t<
+            decltype(atomicCAS(alpaka::core::declval<T*>(), alpaka::core::declval<T>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T compare, T value)
+        {
+            return atomicCAS(add, compare, value);
+        }
+    };
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicCas,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicCAS_block(
+            alpaka::core::declval<T*>(),
+            alpaka::core::declval<T>(),
+            alpaka::core::declval<T>()))>> : std::true_type
+    {
+        static __device__ T atomic(T* add, T compare, T value)
+        {
+            return atomicCAS_block(add, compare, value);
+        }
+    };
+#        endif
+
+
+    // Add.
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicAdd,
+        T,
+        THierarchy,
+        typename std::void_t<decltype(atomicAdd(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicAdd(add, value);
+        }
+    };
+
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicAdd,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicAdd_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicAdd_block(add, value);
+        }
+    };
+#        endif
+
+#        if CLANG_CUDA_PTX_WORKAROUND
+    // clang is providing a builtin for atomicAdd even if it is not supported by the current architecture
+    template<typename THierarchy>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicAdd, double, THierarchy> : std::false_type
+    {
+    };
+#        endif
+
+#        if(BOOST_LANG_HIP)
+    // HIP shows bad performance with builtin atomicAdd(float*,float) for the hierarchy threads therefore we do not
+    // call the builtin method and instead use the atomicCAS emulation. For details see:
+    // https://github.com/alpaka-group/alpaka/issues/1657
+    template<>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicAdd, float, alpaka::hierarchy::Threads> : std::false_type
+    {
+    };
+#        endif
+
+    // Sub.
+
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicSub,
+        T,
+        THierarchy,
+        typename std::void_t<decltype(atomicSub(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicSub(add, value);
+        }
+    };
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicSub,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicSub_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicSub_block(add, value);
+        }
+    };
+#        endif
+
+    // Min.
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicMin,
+        T,
+        THierarchy,
+        typename std::void_t<decltype(atomicMin(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicMin(add, value);
+        }
+    };
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicMin,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicMin_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicMin_block(add, value);
+        }
+    };
+#        endif
+
+// disable HIP atomicMin: see https://github.com/ROCm-Developer-Tools/hipamd/pull/40
+#        if(BOOST_LANG_HIP)
+    template<typename THierarchy>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, float, THierarchy> : std::false_type
+    {
+    };
+
+    template<>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, float, alpaka::hierarchy::Threads> : std::false_type
+    {
+    };
+
+    template<typename THierarchy>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, double, THierarchy> : std::false_type
+    {
+    };
+
+    template<>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, double, alpaka::hierarchy::Threads> : std::false_type
+    {
+    };
+
+#            if !__has_builtin(__hip_atomic_compare_exchange_strong)
+    template<typename THierarchy>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, unsigned long long, THierarchy> : std::false_type
+    {
+    };
+
+    template<>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, unsigned long long, alpaka::hierarchy::Threads> : std::false_type
+    {
+    };
+#            endif
+#        endif
+
+    // Max.
+
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicMax,
+        T,
+        THierarchy,
+        typename std::void_t<decltype(atomicMax(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicMax(add, value);
+        }
+    };
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicMax,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicMax_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicMax_block(add, value);
+        }
+    };
+#        endif
+
+    // disable HIP atomicMax: see https://github.com/ROCm-Developer-Tools/hipamd/pull/40
+#        if(BOOST_LANG_HIP)
+    template<typename THierarchy>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, float, THierarchy> : std::false_type
+    {
+    };
+
+    template<>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, float, alpaka::hierarchy::Threads> : std::false_type
+    {
+    };
+
+    template<typename THierarchy>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, double, THierarchy> : std::false_type
+    {
+    };
+
+    template<>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, double, alpaka::hierarchy::Threads> : std::false_type
+    {
+    };
+
+#            if !__has_builtin(__hip_atomic_compare_exchange_strong)
+    template<typename THierarchy>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, unsigned long long, THierarchy> : std::false_type
+    {
+    };
+
+    template<>
+    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, unsigned long long, alpaka::hierarchy::Threads> : std::false_type
+    {
+    };
+#            endif
+#        endif
+
+
+    // Exch.
+
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicExch,
+        T,
+        THierarchy,
+        typename std::void_t<decltype(atomicExch(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicExch(add, value);
+        }
+    };
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicExch,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicExch_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicExch_block(add, value);
+        }
+    };
+#        endif
+
+    // Inc.
+
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicInc,
+        T,
+        THierarchy,
+        typename std::void_t<decltype(atomicInc(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicInc(add, value);
+        }
+    };
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicInc,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicInc_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicInc_block(add, value);
+        }
+    };
+#        endif
+
+    // Dec.
+
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicDec,
+        T,
+        THierarchy,
+        typename std::void_t<decltype(atomicDec(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicDec(add, value);
+        }
+    };
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicDec,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicDec_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicDec_block(add, value);
+        }
+    };
+#        endif
+
+    // And.
+
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicAnd,
+        T,
+        THierarchy,
+        typename std::void_t<decltype(atomicAnd(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicAnd(add, value);
+        }
+    };
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicAnd,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicAnd_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicAnd_block(add, value);
+        }
+    };
+#        endif
+
+    // Or.
+
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicOr,
+        T,
+        THierarchy,
+        typename std::void_t<decltype(atomicOr(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicOr(add, value);
+        }
+    };
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicOr,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicOr_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicOr_block(add, value);
+        }
+    };
+#        endif
+
+    // Xor.
+
+    template<typename T, typename THierarchy>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicXor,
+        T,
+        THierarchy,
+        typename std::void_t<decltype(atomicXor(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicXor(add, value);
+        }
+    };
+
+#        if !CLANG_CUDA_PTX_WORKAROUND
+    template<typename T>
+    struct AlpakaBuiltInAtomic<
+        alpaka::AtomicXor,
+        T,
+        alpaka::hierarchy::Threads,
+        typename std::void_t<decltype(atomicXor_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
+        : std::true_type
+    {
+        static __device__ T atomic(T* add, T value)
+        {
+            return atomicXor_block(add, value);
+        }
+    };
+#        endif
+
+} // namespace alpakaGlobal
+
+#        undef CLANG_CUDA_PTX_WORKAROUND
+#    endif
+
+#endif
diff --git a/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp b/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..86c5120
--- /dev/null
+++ b/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,321 @@
+/* Copyright 2022 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/atomic/AtomicUniformCudaHip.hpp"
+#include "alpaka/atomic/Op.hpp"
+#include "alpaka/atomic/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Decay.hpp"
+#include "alpaka/core/Unreachable.hpp"
+
+#include <limits>
+#include <type_traits>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+namespace alpaka::trait
+{
+    namespace detail
+    {
+        struct EmulationBase
+        {
+            //! reinterprets the address of a 4 byte type as unsigned int* for atomicCas emulation usage
+            template<typename TAddressType>
+            static __device__ auto reinterpretAddress(TAddressType* address)
+                -> std::enable_if_t<sizeof(TAddressType) == 4u, unsigned int*>
+            {
+                return reinterpret_cast<unsigned int*>(address);
+            }
+
+            //! reinterprets the address of an 8 byte type as unsigned long long int* for atomicCas emulation usage
+            template<typename TAddressType>
+            static __device__ auto reinterpretAddress(TAddressType* address)
+                -> std::enable_if_t<sizeof(TAddressType) == 8u, unsigned long long int*>
+            {
+                return reinterpret_cast<unsigned long long int*>(address);
+            }
+
+            //! reinterprets a value as the equally sized unsigned integral type used by the atomicCAS emulation
+            template<typename T_Type>
+            static __device__ auto reinterpretValue(T_Type value)
+            {
+                return *reinterpretAddress(&value);
+            }
+        };
+
+        //! Emulate atomic
+        //
+        // The default implementation will emulate all atomic functions with atomicCAS.
+        template<
+            typename TOp,
+            typename TAtomic,
+            typename T,
+            typename THierarchy,
+            typename TSfinae = void,
+            typename TDefer = void>
+        struct EmulateAtomic : private EmulationBase
+        {
+        public:
+            static __device__ auto atomic(
+                alpaka::AtomicUniformCudaHipBuiltIn const& ctx,
+                T* const addr,
+                T const& value) -> T
+            {
+                auto* const addressAsIntegralType = reinterpretAddress(addr);
+                using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;
+
+                // Emulating atomics with atomicCAS is mentioned in the programming guide too.
+                // http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
+#        if BOOST_LANG_HIP
+#            if __has_builtin(__hip_atomic_load)
+                EmulatedType old{__hip_atomic_load(addressAsIntegralType, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)};
+#            else
+                EmulatedType old{__atomic_load_n(addressAsIntegralType, __ATOMIC_RELAXED)};
+#            endif
+#        else
+                EmulatedType old{*addressAsIntegralType};
+#        endif
+                EmulatedType assumed;
+                do
+                {
+                    assumed = old;
+                    T v = *(reinterpret_cast<T*>(&assumed));
+                    TOp{}(&v, value);
+                    using Cas = alpaka::trait::
+                        AtomicOp<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, EmulatedType, THierarchy>;
+                    old = Cas::atomicOp(ctx, addressAsIntegralType, assumed, reinterpretValue(v));
+                    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+                } while(assumed != old);
+                return *(reinterpret_cast<T*>(&old));
+            }
+        };
+
+        //! Emulate AtomicCas with equivalent unsigned integral type
+        template<typename T, typename THierarchy>
+        struct EmulateAtomic<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, T, THierarchy>
+            : private EmulationBase
+        {
+            static __device__ auto atomic(
+                alpaka::AtomicUniformCudaHipBuiltIn const& ctx,
+                T* const addr,
+                T const& compare,
+                T const& value) -> T
+            {
+                auto* const addressAsIntegralType = reinterpretAddress(addr);
+                using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;
+                EmulatedType reinterpretedCompare = reinterpretValue(compare);
+                EmulatedType reinterpretedValue = reinterpretValue(value);
+
+                auto old = alpaka::trait::
+                    AtomicOp<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, EmulatedType, THierarchy>::
+                        atomicOp(ctx, addressAsIntegralType, reinterpretedCompare, reinterpretedValue);
+
+                return *(reinterpret_cast<T*>(&old));
+            }
+        };
+
+        //! Emulate AtomicSub with atomicAdd
+        template<typename T, typename THierarchy>
+        struct EmulateAtomic<alpaka::AtomicSub, alpaka::AtomicUniformCudaHipBuiltIn, T, THierarchy>
+        {
+            static __device__ auto atomic(
+                alpaka::AtomicUniformCudaHipBuiltIn const& ctx,
+                T* const addr,
+                T const& value) -> T
+            {
+                return alpaka::trait::AtomicOp<alpaka::AtomicAdd, alpaka::AtomicUniformCudaHipBuiltIn, T, THierarchy>::
+                    atomicOp(ctx, addr, -value);
+            }
+        };
+
+        //! AtomicDec can not be implemented for floating point types!
+        template<typename T, typename THierarchy>
+        struct EmulateAtomic<
+            alpaka::AtomicDec,
+            alpaka::AtomicUniformCudaHipBuiltIn,
+            T,
+            THierarchy,
+            std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
+            {
+                static_assert(
+                    !sizeof(T),
+                    "EmulateAtomic<alpaka::AtomicDec> is not supported for floating point data types!");
+                return T{};
+            }
+        };
+
+        //! AtomicInc can not be implemented for floating point types!
+        template<typename T, typename THierarchy>
+        struct EmulateAtomic<
+            alpaka::AtomicInc,
+            alpaka::AtomicUniformCudaHipBuiltIn,
+            T,
+            THierarchy,
+            std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
+            {
+                static_assert(
+                    !sizeof(T),
+                    "EmulateAtomic<alpaka::AtomicInc> is not supported for floating point data types!");
+                return T{};
+            }
+        };
+
+        //! AtomicAnd can not be implemented for floating point types!
+        template<typename T, typename THierarchy>
+        struct EmulateAtomic<
+            alpaka::AtomicAnd,
+            alpaka::AtomicUniformCudaHipBuiltIn,
+            T,
+            THierarchy,
+            std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
+            {
+                static_assert(
+                    !sizeof(T),
+                    "EmulateAtomic<alpaka::AtomicAnd> is not supported for floating point data types!");
+                return T{};
+            }
+        };
+
+        //! AtomicOr can not be implemented for floating point types!
+        template<typename T, typename THierarchy>
+        struct EmulateAtomic<
+            alpaka::AtomicOr,
+            alpaka::AtomicUniformCudaHipBuiltIn,
+            T,
+            THierarchy,
+            std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
+            {
+                static_assert(
+                    !sizeof(T),
+                    "EmulateAtomic<alpaka::AtomicOr> is not supported for floating point data types!");
+                return T{};
+            }
+        };
+
+        //! AtomicXor can not be implemented for floating point types!
+        template<typename T, typename THierarchy>
+        struct EmulateAtomic<
+            alpaka::AtomicXor,
+            alpaka::AtomicUniformCudaHipBuiltIn,
+            T,
+            THierarchy,
+            std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
+            {
+                static_assert(
+                    !sizeof(T),
+                    "EmulateAtomic<alpaka::AtomicXor> is not supported for floating point data types!");
+                return T{};
+            }
+        };
+
+    } // namespace detail
+
+    //! Generic atomic implementation
+    //
+    // - unsigned long int will be redirected to unsigned long long int or unsigned int implementation depending if
+    //   unsigned long int is a 64 or 32bit data type.
+    // - Atomics which are not available as builtin atomic will be emulated.
+    template<typename TOp, typename T, typename THierarchy>
+    struct AtomicOp<TOp, AtomicUniformCudaHipBuiltIn, T, THierarchy>
+    {
+        static __device__ auto atomicOp(
+            AtomicUniformCudaHipBuiltIn const& ctx,
+            [[maybe_unused]] T* const addr,
+            [[maybe_unused]] T const& value) -> T
+        {
+            static_assert(
+                sizeof(T) == 4u || sizeof(T) == 8u,
+                "atomicOp<TOp, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, value) is not supported! Only 64 and "
+                "32bit atomics are supported.");
+
+            if constexpr(::AlpakaBuiltInAtomic<TOp, T, THierarchy>::value)
+                return ::AlpakaBuiltInAtomic<TOp, T, THierarchy>::atomic(addr, value);
+
+            else if constexpr(std::is_same_v<unsigned long int, T>)
+            {
+                if constexpr(sizeof(T) == 4u && ::AlpakaBuiltInAtomic<TOp, unsigned int, THierarchy>::value)
+                    return ::AlpakaBuiltInAtomic<TOp, unsigned int, THierarchy>::atomic(
+                        reinterpret_cast<unsigned int*>(addr),
+                        static_cast<unsigned int>(value));
+                else if constexpr(
+                    sizeof(T) == 8u && ::AlpakaBuiltInAtomic<TOp, unsigned long long int, THierarchy>::value) // LP64
+                {
+                    return ::AlpakaBuiltInAtomic<TOp, unsigned long long int, THierarchy>::atomic(
+                        reinterpret_cast<unsigned long long int*>(addr),
+                        static_cast<unsigned long long int>(value));
+                }
+            }
+
+            return detail::EmulateAtomic<TOp, AtomicUniformCudaHipBuiltIn, T, THierarchy>::atomic(ctx, addr, value);
+        }
+    };
+
+    template<typename T, typename THierarchy>
+    struct AtomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, T, THierarchy>
+    {
+        static __device__ auto atomicOp(
+            [[maybe_unused]] AtomicUniformCudaHipBuiltIn const& ctx,
+            [[maybe_unused]] T* const addr,
+            [[maybe_unused]] T const& compare,
+            [[maybe_unused]] T const& value) -> T
+        {
+            static_assert(
+                sizeof(T) == 4u || sizeof(T) == 8u,
+                "atomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, compare, value) is not "
+                "supported! Only 64 and "
+                "32bit atomics are supported.");
+
+            if constexpr(::AlpakaBuiltInAtomic<AtomicCas, T, THierarchy>::value)
+                return ::AlpakaBuiltInAtomic<AtomicCas, T, THierarchy>::atomic(addr, compare, value);
+
+            else if constexpr(std::is_same_v<unsigned long int, T>)
+            {
+                if constexpr(sizeof(T) == 4u && ::AlpakaBuiltInAtomic<AtomicCas, unsigned int, THierarchy>::value)
+                    return ::AlpakaBuiltInAtomic<AtomicCas, unsigned int, THierarchy>::atomic(
+                        reinterpret_cast<unsigned int*>(addr),
+                        static_cast<unsigned int>(compare),
+                        static_cast<unsigned int>(value));
+                else if constexpr(
+                    sizeof(T) == 8u
+                    && ::AlpakaBuiltInAtomic<AtomicCas, unsigned long long int, THierarchy>::value) // LP64
+                {
+                    return ::AlpakaBuiltInAtomic<AtomicCas, unsigned long long int, THierarchy>::atomic(
+                        reinterpret_cast<unsigned long long int*>(addr),
+                        static_cast<unsigned long long int>(compare),
+                        static_cast<unsigned long long int>(value));
+                }
+            }
+
+            return detail::EmulateAtomic<AtomicCas, AtomicUniformCudaHipBuiltIn, T, THierarchy>::atomic(
+                ctx,
+                addr,
+                compare,
+                value);
+        }
+    };
+} // namespace alpaka::trait
+#    endif
+#endif
diff --git a/include/alpaka/atomic/Op.hpp b/include/alpaka/atomic/Op.hpp
new file mode 100644
index 0000000..2912556
--- /dev/null
+++ b/include/alpaka/atomic/Op.hpp
@@ -0,0 +1,249 @@
+/* Copyright 2020 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Common.hpp"
+
+#include <algorithm>
+#include <type_traits>
+
+namespace alpaka
+{
+    //! The addition function object.
+    struct AtomicAdd
+    {
+        //! Adds value to *addr. \return The value *addr held before the addition.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const old = *addr;
+            auto& ref = *addr;
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wconversion"
+#endif
+            ref += value;
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+            return old;
+        }
+    };
+
+    //! The subtraction function object.
+    struct AtomicSub
+    {
+        //! Subtracts value from *addr. \return The value *addr held before the subtraction.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto& target = *addr;
+            auto const previous = target;
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wconversion"
+#endif
+            target -= value;
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+            return previous;
+        }
+    };
+
+    //! The minimum function object.
+    struct AtomicMin
+    {
+        //! Stores std::min(*addr, value) in *addr. \return The value *addr held before the update.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto& target = *addr;
+            auto const previous = target;
+            target = std::min(target, value);
+            return previous;
+        }
+    };
+
+    //! The maximum function object.
+    struct AtomicMax
+    {
+        //! Stores std::max(*addr, value) in *addr. \return The value *addr held before the update.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto& target = *addr;
+            auto const previous = target;
+            target = std::max(target, value);
+            return previous;
+        }
+    };
+
+    //! The exchange function object.
+    struct AtomicExch
+    {
+        //! Replaces *addr with value. \return The value *addr held before the exchange.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto& target = *addr;
+            auto const previous = target;
+            target = value;
+            return previous;
+        }
+    };
+
+    //! The increment function object.
+    struct AtomicInc
+    {
+        //! Wrapping increment: *addr becomes 0 if *addr >= value, otherwise *addr + 1.
+        //!
+        //! \return The value *addr held before the update.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto& target = *addr;
+            auto const previous = target;
+            target = ((previous >= value) ? static_cast<T>(0) : static_cast<T>(previous + static_cast<T>(1)));
+            return previous;
+        }
+    };
+
+    //! The decrement function object.
+    struct AtomicDec
+    {
+        //! Wrapping decrement: *addr becomes value if *addr == 0 or *addr > value, otherwise *addr - 1.
+        //!
+        //! \return The value *addr held before the update.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto const previous = *addr;
+            bool const wrap = (previous == static_cast<T>(0)) || (previous > value);
+            *addr = (wrap ? value : static_cast<T>(previous - static_cast<T>(1)));
+            return previous;
+        }
+    };
+
+    //! The and function object.
+    struct AtomicAnd
+    {
+        //! Stores (*addr & value) in *addr. \return The value *addr held before the update.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto& target = *addr;
+            auto const previous = target;
+            target &= value;
+            return previous;
+        }
+    };
+
+    //! The or function object.
+    struct AtomicOr
+    {
+        //! Stores (*addr | value) in *addr. \return The value *addr held before the update.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto& target = *addr;
+            auto const previous = target;
+            target |= value;
+            return previous;
+        }
+    };
+
+    //! The exclusive or function object.
+    struct AtomicXor
+    {
+        //! Stores (*addr ^ value) in *addr. \return The value *addr held before the update.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
+        {
+            auto& target = *addr;
+            auto const previous = target;
+            target ^= value;
+            return previous;
+        }
+    };
+
+    //! The compare and swap function object.
+    struct AtomicCas
+    {
+        //! AtomicCas for non-floating-point values: stores value in *addr if *addr == compare.
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, std::enable_if_t<!std::is_floating_point_v<T>, bool> = true>
+        ALPAKA_FN_HOST_ACC auto operator()(T* addr, T const& compare, T const& value) const -> T
+        {
+            auto const old = *addr;
+            auto& ref = *addr;
+
+// gcc-7.4.0 assumes for an optimization that a signed overflow does not occur here.
+// That's fine, so ignore that warning.
+#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wstrict-overflow"
+#endif
+            // swap only if the stored value compares equal (operator==) to compare
+            ref = ((old == compare) ? value : old);
+#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
+#    pragma GCC diagnostic pop
+#endif
+            return old;
+        }
+
+        //! AtomicCas for floating point values: stores value in *addr if the bit patterns of *addr and compare match.
+        //! \return The old value of addr.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
+        ALPAKA_FN_HOST_ACC auto operator()(T* addr, T const& compare, T const& value) const -> T
+        {
+            static_assert(sizeof(T) == 4u || sizeof(T) == 8u, "AtomicCas is supporting only 32bit and 64bit values!");
+            // Unsigned integer type, selected by sizeof(T), that is used to perform the bit comparison
+            using BitType = std::conditional_t<sizeof(T) == 4u, unsigned int, unsigned long long>;
+
+            // union used to reinterpret the bits of a T as a BitType
+            // std::variant cannot be used because clang 8 has issues compiling std::variant
+            struct BitUnion
+            {
+                union
+                {
+                    T value;
+                    BitType r;
+                };
+            };
+
+            auto const old = *addr;
+            auto& ref = *addr;
+
+// gcc-7.4.0 assumes for an optimization that a signed overflow does not occur here.
+// That's fine, so ignore that warning.
+#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wstrict-overflow"
+#endif
+            BitUnion o{old};
+            BitUnion c{compare};
+
+            ref = ((o.r == c.r) ? value : old);
+#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
+#    pragma GCC diagnostic pop
+#endif
+            return old;
+        }
+    };
+} // namespace alpaka
diff --git a/include/alpaka/atomic/Traits.hpp b/include/alpaka/atomic/Traits.hpp
new file mode 100644
index 0000000..160da8c
--- /dev/null
+++ b/include/alpaka/atomic/Traits.hpp
@@ -0,0 +1,304 @@
+/* Copyright 2022 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/atomic/Op.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    struct ConceptAtomicGrids
+    {
+    };
+
+    struct ConceptAtomicBlocks
+    {
+    };
+
+    struct ConceptAtomicThreads
+    {
+    };
+
+    namespace detail
+    {
+        template<typename THierarchy>
+        struct AtomicHierarchyConceptType;
+
+        template<>
+        struct AtomicHierarchyConceptType<hierarchy::Threads>
+        {
+            using type = ConceptAtomicThreads;
+        };
+
+        template<>
+        struct AtomicHierarchyConceptType<hierarchy::Blocks>
+        {
+            using type = ConceptAtomicBlocks;
+        };
+
+        template<>
+        struct AtomicHierarchyConceptType<hierarchy::Grids>
+        {
+            using type = ConceptAtomicGrids;
+        };
+    } // namespace detail
+
+    template<typename THierarchy>
+    using AtomicHierarchyConcept = typename detail::AtomicHierarchyConceptType<THierarchy>::type;
+
+    //! The atomic operation traits.
+    namespace trait
+    {
+        //! The atomic operation trait.
+        template<typename TOp, typename TAtomic, typename T, typename THierarchy, typename TSfinae = void>
+        struct AtomicOp;
+    } // namespace trait
+
+    //! Executes the given operation atomically.
+    //! \tparam TOp The operation type.
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type (defaults to hierarchy::Grids); selects the atomic concept.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOp, typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicOp(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& = THierarchy()) -> T
+    {
+        using ImplementationBase = typename concepts::ImplementationBase<AtomicHierarchyConcept<THierarchy>, TAtomic>;
+        return trait::AtomicOp<TOp, ImplementationBase, T, THierarchy>::atomicOp(atomic, addr, value);
+    }
+
+    //! Executes the given compare-style operation atomically.
+    //! \tparam TOp The operation type.
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type (defaults to hierarchy::Grids); selects the atomic concept.
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param compare The comparison value used in the atomic operation.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOp, typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicOp(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& compare,
+        T const& value,
+        THierarchy const& = THierarchy()) -> T
+    {
+        using ImplementationBase = typename concepts::ImplementationBase<AtomicHierarchyConcept<THierarchy>, TAtomic>;
+        return trait::AtomicOp<TOp, ImplementationBase, T, THierarchy>::atomicOp(atomic, addr, compare, value);
+    }
+
+    //! Executes an atomic add operation (forwards to atomicOp<AtomicAdd>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicAdd(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicAdd>(atomic, addr, value, hier);
+    }
+
+    //! Executes an atomic sub operation (forwards to atomicOp<AtomicSub>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicSub(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicSub>(atomic, addr, value, hier);
+    }
+
+    //! Executes an atomic min operation (forwards to atomicOp<AtomicMin>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicMin(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicMin>(atomic, addr, value, hier);
+    }
+
+    //! Executes an atomic max operation (forwards to atomicOp<AtomicMax>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicMax(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicMax>(atomic, addr, value, hier);
+    }
+
+    //! Executes an atomic exchange operation (forwards to atomicOp<AtomicExch>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicExch(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicExch>(atomic, addr, value, hier);
+    }
+
+    //! Executes an atomic increment operation (forwards to atomicOp<AtomicInc>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicInc(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicInc>(atomic, addr, value, hier);
+    }
+
+    //! Executes an atomic decrement operation (forwards to atomicOp<AtomicDec>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicDec(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicDec>(atomic, addr, value, hier);
+    }
+
+    //! Executes an atomic and operation (forwards to atomicOp<AtomicAnd>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicAnd(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicAnd>(atomic, addr, value, hier);
+    }
+
+    //! Executes an atomic or operation (forwards to atomicOp<AtomicOr>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicOr(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicOr>(atomic, addr, value, hier);
+    }
+
+    //! Executes an atomic xor operation (forwards to atomicOp<AtomicXor>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicXor(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicXor>(atomic, addr, value, hier);
+    }
+
+    //! Executes an atomic compare-and-swap operation (forwards to atomicOp<AtomicCas>).
+    //! \tparam TAtomic The atomic implementation type.
+    //! \tparam T The value type.
+    //! \tparam THierarchy The hierarchy tag type of the hier argument (defaults to hierarchy::Grids).
+    //! \param atomic The atomic implementation.
+    //! \param addr The value to change atomically.
+    //! \param compare The comparison value used in the atomic operation.
+    //! \param value The value used in the atomic operation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
+    ALPAKA_FN_HOST_ACC auto atomicCas(
+        TAtomic const& atomic,
+        T* const addr,
+        T const& compare,
+        T const& value,
+        THierarchy const& hier = THierarchy()) -> T
+    {
+        return atomicOp<AtomicCas>(atomic, addr, compare, value, hier);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp b/include/alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp
new file mode 100644
index 0000000..88e4d4b
--- /dev/null
+++ b/include/alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp
@@ -0,0 +1,15 @@
+/* Copyright 2022 Jeffrey Kelling
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace alpaka
+{
+#ifndef ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
+#    define ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB 47u
+#endif
+    constexpr std::uint32_t BlockSharedDynMemberAllocKiB = ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB;
+} // namespace alpaka
diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
new file mode 100644
index 0000000..0c09cf1
--- /dev/null
+++ b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
@@ -0,0 +1,43 @@
+/* Copyright 2023 Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/shared/dyn/Traits.hpp"
+
+#include <cstddef>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    //! The SYCL block shared memory allocator.
+    class BlockSharedMemDynGenericSycl
+        : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynGenericSycl>
+    {
+    public:
+        using BlockSharedMemDynBase = BlockSharedMemDynGenericSycl;
+
+        BlockSharedMemDynGenericSycl(sycl::local_accessor<std::byte> accessor) : m_accessor{accessor}
+        {
+        }
+
+        sycl::local_accessor<std::byte> m_accessor;
+    };
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    template<typename T>
+    struct GetDynSharedMem<T, BlockSharedMemDynGenericSycl>
+    {
+        static auto getMem(BlockSharedMemDynGenericSycl const& shared) -> T*
+        {
+            return reinterpret_cast<T*>(shared.m_accessor.get_multi_ptr<sycl::access::decorated::no>().get());
+        }
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp
new file mode 100644
index 0000000..c6a3239
--- /dev/null
+++ b/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp
@@ -0,0 +1,113 @@
+/* Copyright 2023 Jeffrey Kelling, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp"
+#include "alpaka/block/shared/dyn/Traits.hpp"
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Vectorize.hpp"
+
+#include <array>
+#include <cstdint>
+#include <type_traits>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //! "namespace" for static constexpr members that should be in BlockSharedMemDynMember
+        //! but cannot be because having a static const member breaks GCC 10
+        //! OpenMP target: type not mappable.
+        template<std::size_t TStaticAllocKiB>
+        struct BlockSharedMemDynMemberStatic
+        {
+            //! Storage size in bytes
+            static constexpr std::uint32_t staticAllocBytes = static_cast<std::uint32_t>(TStaticAllocKiB << 10u);
+        };
+    } // namespace detail
+
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(push)
+#    pragma warning(disable : 4324) // warning C4324: structure was padded due to alignment specifier
+#endif
+    //! Dynamic block shared memory provider using fixed-size
+    //! member array to allocate memory on the stack or in shared
+    //! memory.
+    template<std::size_t TStaticAllocKiB = BlockSharedDynMemberAllocKiB>
+    class alignas(core::vectorization::defaultAlignment) BlockSharedMemDynMember
+        : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynMember<TStaticAllocKiB>>
+    {
+    public:
+        BlockSharedMemDynMember(std::size_t sizeBytes) : m_dynPitch(getPitch(sizeBytes))
+        {
+            ALPAKA_ASSERT_ACC(sizeBytes <= staticAllocBytes()); // full-width compare: no truncation of sizeBytes
+        }
+
+        auto dynMemBegin() const -> std::uint8_t*
+        {
+            return std::data(m_mem);
+        }
+
+        //! \return the pointer to the begin of data after the portion allocated as dynamic shared memory,
+        //!         i.e. the start of the member buffer advanced by the aligned dynamic size.
+        auto staticMemBegin() const -> std::uint8_t*
+        {
+            return std::data(m_mem) + m_dynPitch;
+        }
+
+        //! \return the remaining capacity for static block shared memory
+        //!         (total buffer size minus the aligned dynamic size),
+        //!         returns a 32-bit type for register efficiency on GPUs
+        auto staticMemCapacity() const -> std::uint32_t
+        {
+            return staticAllocBytes() - m_dynPitch;
+        }
+
+        //! \return size of statically allocated memory available for both
+        //!         dynamic and static shared memory. Value is of a 32-bit type
+        //!         for register efficiency on GPUs
+        static constexpr auto staticAllocBytes() -> std::uint32_t
+        {
+            return detail::BlockSharedMemDynMemberStatic<TStaticAllocKiB>::staticAllocBytes;
+        }
+
+    private:
+        static auto getPitch(std::size_t sizeBytes) -> std::uint32_t
+        {
+            constexpr auto alignment = core::vectorization::defaultAlignment; // sizeBytes is rounded up to this
+            return static_cast<std::uint32_t>((sizeBytes / alignment + (sizeBytes % alignment > 0u)) * alignment);
+        }
+
+        mutable std::array<std::uint8_t, detail::BlockSharedMemDynMemberStatic<TStaticAllocKiB>::staticAllocBytes> m_mem;
+        std::uint32_t m_dynPitch;
+    };
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(pop)
+#endif
+
+    namespace trait
+    {
+        template<typename T, std::size_t TStaticAllocKiB>
+        struct GetDynSharedMem<T, BlockSharedMemDynMember<TStaticAllocKiB>>
+        {
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
+#endif
+            static auto getMem(BlockSharedMemDynMember<TStaticAllocKiB> const& mem) -> T*
+            {
+                static_assert(
+                    core::vectorization::defaultAlignment >= alignof(T),
+                    "Unable to get block shared dynamic memory for types with alignment higher than "
+                    "defaultAlignment!");
+                return reinterpret_cast<T*>(mem.dynMemBegin());
+            }
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..8364019
--- /dev/null
+++ b/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,57 @@
+/* Copyright 2022 Benjamin Worpitz, René Widera, Andrea Bocci, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/shared/dyn/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#include <cstddef>
+#include <type_traits>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    //! The GPU CUDA/HIP block shared memory allocator.
+    class BlockSharedMemDynUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynUniformCudaHipBuiltIn>
+    {
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+    namespace trait
+    {
+        template<typename T>
+        struct GetDynSharedMem<T, BlockSharedMemDynUniformCudaHipBuiltIn>
+        {
+            __device__ static auto getMem(BlockSharedMemDynUniformCudaHipBuiltIn const&) -> T*
+            {
+                // Because unaligned access to variables is not allowed in device code,
+                // we use the widest possible alignment supported by CUDA types to have
+                // all types aligned correctly.
+                // See:
+                //   - http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared
+                //   - http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vector-types
+                extern __shared__ std::byte shMem alignas(std::max_align_t)[];
+                return reinterpret_cast<T*>(shMem);
+            }
+        };
+    } // namespace trait
+
+#    endif
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/block/shared/dyn/Traits.hpp b/include/alpaka/block/shared/dyn/Traits.hpp
new file mode 100644
index 0000000..17df89c
--- /dev/null
+++ b/include/alpaka/block/shared/dyn/Traits.hpp
@@ -0,0 +1,44 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    struct ConceptBlockSharedDyn
+    {
+    };
+
+    //! The block shared dynamic memory operation traits.
+    namespace trait
+    {
+        //! The block shared dynamic memory get trait.
+        template<typename T, typename TBlockSharedMemDyn, typename TSfinae = void>
+        struct GetDynSharedMem;
+    } // namespace trait
+
+    //! Get block shared dynamic memory.
+    //!
+    //! The available size of the memory can be defined by specializing the trait
+    //! BlockSharedMemDynSizeBytes for a kernel.
+    //! The Memory can be accessed by all threads within a block.
+    //! Access to the memory is not thread safe.
+    //!
+    //! \tparam T The element type.
+    //! \tparam TBlockSharedMemDyn The block shared dynamic memory implementation type.
+    //! \param blockSharedMemDyn The block shared dynamic memory implementation.
+    //! \return Pointer to pre-allocated contiguous memory.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TBlockSharedMemDyn>
+    ALPAKA_FN_ACC auto getDynSharedMem(TBlockSharedMemDyn const& blockSharedMemDyn) -> T*
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedDyn, TBlockSharedMemDyn>;
+        return trait::GetDynSharedMem<T, ImplementationBase>::getMem(blockSharedMemDyn);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
new file mode 100644
index 0000000..060414d
--- /dev/null
+++ b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
@@ -0,0 +1,67 @@
+/* Copyright 2023 Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/shared/st/Traits.hpp"
+#include "alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    //! The generic SYCL shared memory allocator.
+    class BlockSharedMemStGenericSycl
+        : public alpaka::detail::BlockSharedMemStMemberImpl<>
+        , public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStGenericSycl>
+    {
+    public:
+        BlockSharedMemStGenericSycl(sycl::local_accessor<std::byte> accessor)
+            : BlockSharedMemStMemberImpl(
+                reinterpret_cast<std::uint8_t*>(accessor.get_multi_ptr<sycl::access::decorated::no>().get()),
+                accessor.size())
+            , m_accessor{accessor}
+        {
+        }
+
+    private:
+        sycl::local_accessor<std::byte> m_accessor;
+    };
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    template<typename T, std::size_t TUniqueId>
+    struct DeclareSharedVar<T, TUniqueId, BlockSharedMemStGenericSycl>
+    {
+        static auto declareVar(BlockSharedMemStGenericSycl const& smem) -> T&
+        {
+            auto* data = smem.template getVarPtr<T>(TUniqueId);
+
+            if(!data)
+            {
+                smem.template alloc<T>(TUniqueId);
+                data = smem.template getLatestVarPtr<T>();
+            }
+            ALPAKA_ASSERT(data != nullptr);
+            return *data;
+        }
+    };
+
+    template<>
+    struct FreeSharedVars<BlockSharedMemStGenericSycl>
+    {
+        static auto freeVars(BlockSharedMemStGenericSycl const&) -> void
+        {
+            // shared memory block data will be reused
+        }
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp b/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp
new file mode 100644
index 0000000..93c65e5
--- /dev/null
+++ b/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp
@@ -0,0 +1,59 @@
+/* Copyright 2022 Jeffrey Kelling, Rene Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/shared/st/Traits.hpp"
+#include "alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp"
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Vectorize.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <type_traits>
+
+namespace alpaka
+{
+    //! Static block shared memory provider using a pointer to externally allocated fixed-size memory,
+    //! likely provided by BlockSharedMemDynMember. All bookkeeping lives in BlockSharedMemStMemberImpl.
+    //! \tparam TDataAlignBytes Alignment forwarded to BlockSharedMemStMemberImpl as its minimum data alignment.
+    //! \warning This class is not thread safe!
+    template<std::size_t TDataAlignBytes = core::vectorization::defaultAlignment>
+    class BlockSharedMemStMember
+        : public detail::BlockSharedMemStMemberImpl<TDataAlignBytes>
+        , public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStMember<TDataAlignBytes>>
+    {
+    public:
+        using detail::BlockSharedMemStMemberImpl<TDataAlignBytes>::BlockSharedMemStMemberImpl; // inherit constructors
+    };
+
+    namespace trait
+    {
+        template<typename T, std::size_t TDataAlignBytes, std::size_t TuniqueId>
+        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStMember<TDataAlignBytes>>
+        {
+            static auto declareVar(BlockSharedMemStMember<TDataAlignBytes> const& smem) -> T&
+            {
+                // Fast path: a variable registered under TuniqueId already exists, hand it out directly.
+                if(auto* found = smem.template getVarPtr<T>(TuniqueId); found != nullptr)
+                    return *found;
+
+                // Slow path: allocate the variable and return the chunk that was just created.
+                smem.template alloc<T>(TuniqueId);
+                auto* data = smem.template getLatestVarPtr<T>();
+                ALPAKA_ASSERT(data != nullptr);
+                return *data;
+            }
+        };
+
+        template<std::size_t TDataAlignBytes>
+        struct FreeSharedVars<BlockSharedMemStMember<TDataAlignBytes>>
+        {
+            static auto freeVars(BlockSharedMemStMember<TDataAlignBytes> const&) -> void
+            {
+                // Intentionally empty: nothing is released here, the shared memory block data will be reused.
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp b/include/alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp
new file mode 100644
index 0000000..65bd304
--- /dev/null
+++ b/include/alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp
@@ -0,0 +1,86 @@
+/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/shared/st/Traits.hpp"
+#include "alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp"
+#include "alpaka/core/AlignedAlloc.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Vectorize.hpp"
+
+#include <functional>
+#include <memory>
+#include <utility>
+#include <vector>
+
+namespace alpaka
+{
+    template<std::size_t TDataAlignBytes = core::vectorization::defaultAlignment>
+    class BlockSharedMemStMemberMasterSync // static shared memory provider carrying sync and master-thread callbacks
+        : public detail::BlockSharedMemStMemberImpl<TDataAlignBytes>
+        , public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStMemberMasterSync<TDataAlignBytes>>
+    {
+    public:
+        BlockSharedMemStMemberMasterSync(
+            std::uint8_t* mem, // start of the externally provided memory, forwarded to the base class
+            std::size_t capacity, // forwarded to the base class together with mem
+            std::function<void()> fnSync, // invoked by DeclareSharedVar before and after the allocation step
+            std::function<bool()> fnIsMasterThread) // decides which caller performs the allocation
+            : detail::BlockSharedMemStMemberImpl<TDataAlignBytes>(mem, capacity)
+            , m_syncFn(std::move(fnSync))
+            , m_isMasterThreadFn(std::move(fnIsMasterThread))
+        {
+        }
+        // The callbacks are public because the trait::DeclareSharedVar specialization invokes them directly.
+        std::function<void()> m_syncFn;
+        std::function<bool()> m_isMasterThreadFn;
+    };
+
+    namespace trait
+    {
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
+#endif
+        template<typename T, std::size_t TDataAlignBytes, std::size_t TuniqueId>
+        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStMemberMasterSync<TDataAlignBytes>>
+        {
+            ALPAKA_FN_HOST static auto declareVar(
+                BlockSharedMemStMemberMasterSync<TDataAlignBytes> const& blockSharedMemSt) -> T&
+            {
+                auto* data = blockSharedMemSt.template getVarPtr<T>(TuniqueId);
+                // Only when no variable with this id exists yet, run the sync / allocate / sync sequence below.
+                if(!data)
+                {
+                    // Ensure that all threads have returned from the previous shared variable declaration (if
+                    // there was one before).
+                    blockSharedMemSt.m_syncFn();
+                    if(blockSharedMemSt.m_isMasterThreadFn())
+                    {
+                        blockSharedMemSt.template alloc<T>(TuniqueId);
+                    }
+                    // Every thread calls the sync function again before reading the allocation result.
+                    blockSharedMemSt.m_syncFn();
+                    // Look up the data chunk allocated by the master thread.
+                    data = blockSharedMemSt.template getLatestVarPtr<T>();
+                }
+                ALPAKA_ASSERT(data != nullptr);
+                return *data;
+            }
+        };
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+        template<std::size_t TDataAlignBytes>
+        struct FreeSharedVars<BlockSharedMemStMemberMasterSync<TDataAlignBytes>>
+        {
+            ALPAKA_FN_HOST static auto freeVars(BlockSharedMemStMemberMasterSync<TDataAlignBytes> const&) -> void
+            {
+                // Intentionally empty: nothing is released here, the shared memory block data will be reused.
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp b/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..9f4ed0c
--- /dev/null
+++ b/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,60 @@
+/* Copyright 2022 Benjamin Worpitz, Erik Zenker, René Widera, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/shared/st/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#include <cstdint>
+#include <type_traits>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    //! The GPU CUDA/HIP block shared memory allocator. Empty tag class: it carries no data members.
+    class BlockSharedMemStUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStUniformCudaHipBuiltIn>
+    {
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+    namespace trait
+    {
+        template<typename T, std::size_t TuniqueId>
+        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStUniformCudaHipBuiltIn>
+        {
+            __device__ static auto declareVar(BlockSharedMemStUniformCudaHipBuiltIn const&) -> T&
+            {
+                __shared__ std::uint8_t shMem alignas(alignof(T))[sizeof(T)]; // raw bytes sized and aligned for one T
+                return *(reinterpret_cast<T*>(shMem)); // reinterpreted as T; no constructor is run here
+            }
+        };
+
+        template<>
+        struct FreeSharedVars<BlockSharedMemStUniformCudaHipBuiltIn>
+        {
+            __device__ static auto freeVars(BlockSharedMemStUniformCudaHipBuiltIn const&) -> void
+            {
+                // Nothing to do. CUDA/HIP block shared memory is automatically freed once all threads left the block.
+            }
+        };
+    } // namespace trait
+
+#    endif
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/block/shared/st/Traits.hpp b/include/alpaka/block/shared/st/Traits.hpp
new file mode 100644
index 0000000..3cc7ab2
--- /dev/null
+++ b/include/alpaka/block/shared/st/Traits.hpp
@@ -0,0 +1,59 @@
+/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    struct ConceptBlockSharedSt // concept tag passed to concepts::Implements / concepts::ImplementationBase
+    {
+    };
+
+    //! The block shared static memory operation traits.
+    namespace trait
+    {
+        //! The block shared static memory variable allocation operation trait (declaration only).
+        template<typename T, std::size_t TuniqueId, typename TBlockSharedMemSt, typename TSfinae = void>
+        struct DeclareSharedVar;
+        //! The block shared static memory free operation trait (declaration only).
+        template<typename TBlockSharedMemSt, typename TSfinae = void>
+        struct FreeSharedVars;
+    } // namespace trait
+
+    //! Declares a variable in block shared memory.
+    //!
+    //! The variable is uninitialized and not default constructed!
+    //! All threads within a block can access the variable.
+    //! Accesses to the variable are not thread safe.
+    //!
+    //! \tparam T The element type.
+    //! \tparam TuniqueId An id that is unique inside a kernel.
+    //! \tparam TBlockSharedMemSt The block shared allocator implementation type.
+    //! \param blockSharedMemSt The block shared allocator implementation.
+    //! \return Reference to the uninitialized variable stored in shared memory.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, std::size_t TuniqueId, typename TBlockSharedMemSt>
+    ALPAKA_FN_ACC auto declareSharedVar(TBlockSharedMemSt const& blockSharedMemSt) -> T&
+    {
+        using Base = concepts::ImplementationBase<ConceptBlockSharedSt, TBlockSharedMemSt>;
+        return trait::DeclareSharedVar<T, TuniqueId, Base>::declareVar(blockSharedMemSt);
+    }
+
+    //! Releases all memory used by block shared variables.
+    //!
+    //! \tparam TBlockSharedMemSt The block shared allocator implementation type.
+    //! \param blockSharedMemSt The block shared allocator implementation whose trait implementation is invoked.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TBlockSharedMemSt>
+    ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt& blockSharedMemSt) -> void
+    {
+        using Base = concepts::ImplementationBase<ConceptBlockSharedSt, TBlockSharedMemSt>;
+        trait::FreeSharedVars<Base>::freeVars(blockSharedMemSt);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp b/include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp
new file mode 100644
index 0000000..eb09790
--- /dev/null
+++ b/include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp
@@ -0,0 +1,145 @@
+/* Copyright 2022 Jeffrey Kelling, Rene Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/shared/st/Traits.hpp"
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Vectorize.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <type_traits>
+
+namespace alpaka::detail
+{
+    //! Implementation of the static block shared memory provider.
+    //!
+    //! Works on externally allocated fixed-size memory, likely provided by BlockSharedMemDynMember.
+    template<std::size_t TMinDataAlignBytes = core::vectorization::defaultAlignment>
+    class BlockSharedMemStMemberImpl
+    {
+        struct MetaData
+        {
+            //! Unique id of the next data chunk.
+            std::uint32_t id = std::numeric_limits<std::uint32_t>::max();
+            //! Offset to the next meta data header, relative to m_mem.
+            //! To access the meta data header the offset must be aligned first.
+            std::uint32_t offset = 0;
+        };
+
+        static constexpr std::uint32_t metaDataSize = sizeof(MetaData);
+
+    public:
+#ifndef NDEBUG
+        BlockSharedMemStMemberImpl(std::uint8_t* mem, std::size_t capacity)
+            : m_mem(mem)
+            , m_capacity(static_cast<std::uint32_t>(capacity)) // the capacity is only stored in debug builds
+        {
+            ALPAKA_ASSERT_ACC((m_mem == nullptr) == (m_capacity == 0u));
+        }
+#else
+        BlockSharedMemStMemberImpl(std::uint8_t* mem, std::size_t) : m_mem(mem) // capacity is ignored here
+        {
+        }
+#endif
+        //! Append a meta data header followed by a variable of type T that is tagged with the given id.
+        template<typename T>
+        void alloc(std::uint32_t id) const
+        {
+            // Add meta data chunk in front of the user data
+            m_allocdBytes = varChunkEnd<MetaData>(m_allocdBytes);
+            ALPAKA_ASSERT_ACC(m_allocdBytes <= m_capacity);
+            auto* meta = getLatestVarPtr<MetaData>();
+
+            // Allocate variable
+            m_allocdBytes = varChunkEnd<T>(m_allocdBytes);
+            ALPAKA_ASSERT_ACC(m_allocdBytes <= m_capacity);
+
+            // Update meta data with id and offset for the allocated variable.
+            meta->id = id;
+            meta->offset = m_allocdBytes;
+        }
+
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored                                                                                    \
+        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
+#endif
+
+        //! Give the pointer to an existing variable
+        //!
+        //! @tparam T type of the variable
+        //! @param id unique id of the variable
+        //! @return nullptr if no variable with this id exists
+        template<typename T>
+        auto getVarPtr(std::uint32_t id) const -> T*
+        {
+            // Offset in bytes to the next unaligned meta data header behind the variable.
+            std::uint32_t off = 0;
+
+            // Iterate over allocated data only
+            while(off < m_allocdBytes)
+            {
+                // Adjust offset to be aligned:
+                // varChunkEnd yields the end of the aligned header, so stepping back by the header size
+                // gives the offset at which the header starts.
+                std::uint32_t const alignedMetaDataOffset = varChunkEnd<MetaData>(off) - metaDataSize;
+                ALPAKA_ASSERT_ACC((alignedMetaDataOffset + metaDataSize) <= m_allocdBytes);
+                auto* metaDataPtr = reinterpret_cast<MetaData*>(m_mem + alignedMetaDataOffset);
+                off = metaDataPtr->offset;
+
+                if(metaDataPtr->id == id)
+                    return reinterpret_cast<T*>(&m_mem[off - sizeof(T)]);
+            }
+
+            // Variable not found.
+            return nullptr;
+        }
+
+        //! Get the last allocated variable, i.e. the sizeof(T) bytes that end at the current allocation offset.
+        template<typename T>
+        auto getLatestVarPtr() const -> T*
+        {
+            return reinterpret_cast<T*>(&m_mem[m_allocdBytes - sizeof(T)]);
+        }
+
+    private:
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+
+        //! Byte offset to the end of the memory chunk
+        //!
+        //! Calculate bytes required to store a type with an aligned starting address in m_mem.
+        //! Start offset to the origin of the user data chunk can be calculated with `result - sizeof(T)`.
+        //! The padding is always before the origin of the user data chunk and can be zero byte.
+        //!
+        //! \tparam T type should fit into the chunk
+        //! \param byteOffset Current byte offset.
+        //! \result Byte offset to the end of the data chunk, relative to m_mem.
+        template<typename T>
+        auto varChunkEnd(std::uint32_t byteOffset) const -> std::uint32_t
+        {
+            auto const ptr = reinterpret_cast<std::size_t>(m_mem + byteOffset);
+            constexpr std::size_t align = std::max(TMinDataAlignBytes, alignof(T));
+            std::size_t const newPtrAddress = ((ptr + align - 1u) / align) * align + sizeof(T);
+            return static_cast<std::uint32_t>(newPtrAddress - reinterpret_cast<std::size_t>(m_mem));
+        }
+
+        //! Offset in bytes relative to m_mem to next free data area.
+        //! The next meta data header is placed at this offset once the offset has been aligned.
+        mutable std::uint32_t m_allocdBytes = 0u;
+
+        //! Memory layout
+        //! |Header|Padding|Variable|Padding|Header|....uninitialized Data ....
+        //! Size of padding can be zero if data after padding is already aligned.
+        std::uint8_t* const m_mem;
+#ifndef NDEBUG
+        const std::uint32_t m_capacity;
+#endif
+    };
+} // namespace alpaka::detail
diff --git a/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp b/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
new file mode 100644
index 0000000..c8d9ace
--- /dev/null
+++ b/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
@@ -0,0 +1,109 @@
+/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/sync/Traits.hpp"
+#include "alpaka/core/Common.hpp"
+
+#include <cstdint>
+
+#ifdef _OPENMP
+
+namespace alpaka
+{
+    //! The OpenMP barrier block synchronization.
+    class BlockSyncBarrierOmp : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierOmp>
+    {
+    public:
+        std::uint8_t mutable m_generation = 0u; //!< Incremented per predicate sync; its parity selects the slot.
+        int mutable m_result[2]; //!< Two result slots; the selected one is reset to TOp::InitialValue before use.
+    };
+
+    namespace trait
+    {
+        template<>
+        struct SyncBlockThreads<BlockSyncBarrierOmp> // the blockSync argument is unused, the barrier is a pragma
+        {
+            ALPAKA_FN_HOST static auto syncBlockThreads(BlockSyncBarrierOmp const& /* blockSync */) -> void
+            {
+// NOTE: This waits for all threads in all blocks.
+// If multiple blocks are executed in parallel this is not optimal.
+#    pragma omp barrier
+            }
+        };
+
+        namespace detail
+        {
+            template<typename TOp>
+            struct AtomicOp; // primary template is only declared; specialized per predicate operation below
+
+            template<>
+            struct AtomicOp<BlockCount> // atomically adds the predicate (0 or 1) to result
+            {
+                void operator()(int& result, bool value)
+                {
+#    pragma omp atomic
+                    result += static_cast<int>(value);
+                }
+            };
+
+            template<>
+            struct AtomicOp<BlockAnd> // atomically bitwise-ANDs the predicate (0 or 1) into result
+            {
+                void operator()(int& result, bool value)
+                {
+#    pragma omp atomic
+                    result &= static_cast<int>(value);
+                }
+            };
+
+            template<>
+            struct AtomicOp<BlockOr> // atomically bitwise-ORs the predicate (0 or 1) into result
+            {
+                void operator()(int& result, bool value)
+                {
+#    pragma omp atomic
+                    result |= static_cast<int>(value);
+                }
+            };
+        } // namespace detail
+
+        template<typename TOp>
+        struct SyncBlockThreadsPredicate<TOp, BlockSyncBarrierOmp>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+
+            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(BlockSyncBarrierOmp const& blockSync, int predicate)
+                -> int
+            {
+// One thread advances the generation and resets the result slot selected by its parity.
+// There is an implicit barrier at the end of omp single.
+// NOTE: This code is executed only once for all OpenMP threads.
+// If multiple blocks with multiple threads are executed in parallel
+// this reduction is executed only for one block!
+#    pragma omp single
+                {
+                    ++blockSync.m_generation;
+                    blockSync.m_result[blockSync.m_generation % 2u] = TOp::InitialValue;
+                }
+
+                auto const generationMod2(blockSync.m_generation % 2u);
+                int& result(blockSync.m_result[generationMod2]);
+                bool const predicateBool(predicate != 0);
+
+                detail::AtomicOp<TOp>()(result, predicateBool);
+
+// Wait for all threads to write their predicate into the result slot.
+// NOTE: This waits for all threads in all blocks.
+// If multiple blocks are executed in parallel this is not optimal.
+#    pragma omp barrier
+
+                return blockSync.m_result[generationMod2];
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/block/sync/BlockSyncBarrierThread.hpp b/include/alpaka/block/sync/BlockSyncBarrierThread.hpp
new file mode 100644
index 0000000..61cb6b9
--- /dev/null
+++ b/include/alpaka/block/sync/BlockSyncBarrierThread.hpp
@@ -0,0 +1,62 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/sync/Traits.hpp"
+#include "alpaka/core/BarrierThread.hpp"
+#include "alpaka/core/Common.hpp"
+
+#include <map>
+#include <mutex>
+#include <thread>
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+
+namespace alpaka
+{
+    //! The thread id map barrier block synchronization.
+    template<typename TIdx>
+    class BlockSyncBarrierThread : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierThread<TIdx>>
+    {
+    public:
+        using Barrier = core::threads::BarrierThread<TIdx>;
+        using BarrierWithPredicate = core::threads::BarrierThreadWithPredicate<TIdx>;
+
+        ALPAKA_FN_HOST BlockSyncBarrierThread(TIdx const& blockThreadCount) // both barriers get the same count
+            : m_barrier(blockThreadCount)
+            , m_barrierWithPredicate(blockThreadCount)
+        {
+        }
+
+        Barrier mutable m_barrier; //!< Waited on by SyncBlockThreads.
+        BarrierWithPredicate mutable m_barrierWithPredicate; //!< Waited on by SyncBlockThreadsPredicate.
+    };
+
+    namespace trait
+    {
+        template<typename TIdx>
+        struct SyncBlockThreads<BlockSyncBarrierThread<TIdx>>
+        {
+            ALPAKA_FN_HOST static auto syncBlockThreads(BlockSyncBarrierThread<TIdx> const& blockSync) -> void
+            {
+                blockSync.m_barrier.wait(); // m_barrier is declared mutable, so it can be used through a const ref
+            }
+        };
+
+        template<typename TOp, typename TIdx>
+        struct SyncBlockThreadsPredicate<TOp, BlockSyncBarrierThread<TIdx>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
+                BlockSyncBarrierThread<TIdx> const& blockSync,
+                int predicate) -> int
+            {
+                return blockSync.m_barrierWithPredicate.template wait<TOp>(predicate); // result comes from the barrier
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp
new file mode 100644
index 0000000..67e9749
--- /dev/null
+++ b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp
@@ -0,0 +1,79 @@
+/* Copyright 2022 Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/sync/Traits.hpp"
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    //! The SYCL block synchronization. Holds the sycl::nd_item on which the sync traits operate.
+    template<typename TDim>
+    class BlockSyncGenericSycl : public concepts::Implements<ConceptBlockSync, BlockSyncGenericSycl<TDim>>
+    {
+    public:
+        using BlockSyncBase = BlockSyncGenericSycl<TDim>;
+
+        BlockSyncGenericSycl(sycl::nd_item<TDim::value> work_item) : my_item{work_item}
+        {
+        }
+
+        sycl::nd_item<TDim::value> my_item; //!< Copy of the nd_item passed to the constructor.
+    };
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    template<typename TDim>
+    struct SyncBlockThreads<BlockSyncGenericSycl<TDim>>
+    {
+        static auto syncBlockThreads(BlockSyncGenericSycl<TDim> const& blockSync) -> void
+        {
+            blockSync.my_item.barrier(); // barrier on the stored nd_item
+        }
+    };
+
+    template<typename TDim>
+    struct SyncBlockThreadsPredicate<BlockCount, BlockSyncGenericSycl<TDim>>
+    {
+        static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const& blockSync, int predicate) -> int
+        {
+            // Barrier first, then sum one contribution (0 or 1) per work-item over the group.
+            blockSync.my_item.barrier();
+
+            int const contribution = (predicate != 0) ? 1 : 0;
+            return sycl::reduce_over_group(blockSync.my_item.get_group(), contribution, sycl::plus<>{});
+        }
+    };
+
+    template<typename TDim>
+    struct SyncBlockThreadsPredicate<BlockAnd, BlockSyncGenericSycl<TDim>>
+    {
+        static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const& blockSync, int predicate) -> int
+        {
+            // Barrier first, then test whether the predicate is non-zero for every work-item of the group.
+            blockSync.my_item.barrier();
+
+            return static_cast<int>(sycl::all_of_group(blockSync.my_item.get_group(), predicate != 0));
+        }
+    };
+
+    template<typename TDim>
+    struct SyncBlockThreadsPredicate<BlockOr, BlockSyncGenericSycl<TDim>>
+    {
+        static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const& blockSync, int predicate) -> int
+        {
+            // Barrier first, then test whether the predicate is non-zero for any work-item of the group.
+            blockSync.my_item.barrier();
+
+            return static_cast<int>(sycl::any_of_group(blockSync.my_item.get_group(), predicate != 0));
+        }
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/block/sync/BlockSyncNoOp.hpp b/include/alpaka/block/sync/BlockSyncNoOp.hpp
new file mode 100644
index 0000000..57aae90
--- /dev/null
+++ b/include/alpaka/block/sync/BlockSyncNoOp.hpp
@@ -0,0 +1,40 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/sync/Traits.hpp"
+#include "alpaka/core/Common.hpp"
+
+namespace alpaka
+{
+    //! The no op block synchronization. Empty tag class: it carries no data members.
+    class BlockSyncNoOp : public concepts::Implements<ConceptBlockSync, BlockSyncNoOp>
+    {
+    };
+
+    namespace trait
+    {
+        template<>
+        struct SyncBlockThreads<BlockSyncNoOp>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_ACC static auto syncBlockThreads(BlockSyncNoOp const& /* blockSync */) -> void
+            {
+                // Nothing to do: this implementation performs no synchronization at all.
+            }
+        };
+
+        template<typename TOp>
+        struct SyncBlockThreadsPredicate<TOp, BlockSyncNoOp>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(BlockSyncNoOp const& /* blockSync */, int predicate)
+                -> int
+            {
+                return predicate; // returned unchanged for every TOp; no reduction and no synchronization happen
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp b/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..ddc369d
--- /dev/null
+++ b/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,122 @@
+/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/block/sync/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    //! The GPU CUDA/HIP block synchronization. Empty tag class: it carries no data members.
+    class BlockSyncUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptBlockSync, BlockSyncUniformCudaHipBuiltIn>
+    {
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+    namespace trait
+    {
+        template<>
+        struct SyncBlockThreads<BlockSyncUniformCudaHipBuiltIn>
+        {
+            __device__ static auto syncBlockThreads(BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/) -> void
+            {
+                __syncthreads(); // the blockSync argument is unused; the built-in barrier is called directly
+            }
+        };
+
+        template<>
+        struct SyncBlockThreadsPredicate<BlockCount, BlockSyncUniformCudaHipBuiltIn>
+        {
+            __device__ static auto syncBlockThreadsPredicate(
+                BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
+                int predicate) -> int
+            {
+#        if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
+                // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
+                __shared__ int tmp;
+                __syncthreads(); // presumably keeps the reset below from racing with readers of an earlier call
+                if(threadIdx.x == 0)
+                    tmp = 0; // start value of the count
+                __syncthreads(); // the reset is done before any thread adds to tmp
+                if(predicate)
+                    ::atomicAdd(&tmp, 1); // one increment per thread with a non-zero predicate
+                __syncthreads(); // all increments are done before tmp is read
+
+                return tmp;
+#        else
+                return __syncthreads_count(predicate);
+#        endif
+            }
+        };
+
+        template<>
+        struct SyncBlockThreadsPredicate<BlockAnd, BlockSyncUniformCudaHipBuiltIn>
+        {
+            __device__ static auto syncBlockThreadsPredicate(
+                BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
+                int predicate) -> int
+            {
+#        if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
+                // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
+                __shared__ int tmp;
+                __syncthreads(); // presumably keeps the reset below from racing with readers of an earlier call
+                if(threadIdx.x == 0)
+                    tmp = 1; // start value: "all predicates are non-zero"
+                __syncthreads(); // the reset is done before any thread clears tmp
+                if(!predicate)
+                    ::atomicAnd(&tmp, 0); // a single zero predicate clears the result
+                __syncthreads(); // all updates are done before tmp is read
+
+                return tmp;
+#        else
+                return __syncthreads_and(predicate);
+#        endif
+            }
+        };
+
+        template<>
+        struct SyncBlockThreadsPredicate<BlockOr, BlockSyncUniformCudaHipBuiltIn>
+        {
+            __device__ static auto syncBlockThreadsPredicate(
+                BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
+                int predicate) -> int
+            {
+#        if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
+                // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
+                __shared__ int tmp;
+                __syncthreads(); // presumably keeps the reset below from racing with readers of an earlier call
+                if(threadIdx.x == 0)
+                    tmp = 0; // start value: "no predicate is non-zero"
+                __syncthreads(); // the reset is done before any thread sets tmp
+                if(predicate)
+                    ::atomicOr(&tmp, 1); // a single non-zero predicate sets the result
+                __syncthreads(); // all updates are done before tmp is read
+
+                return tmp;
+#        else
+                return __syncthreads_or(predicate);
+#        endif
+            }
+        };
+    } // namespace trait
+
+#    endif
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/block/sync/Traits.hpp b/include/alpaka/block/sync/Traits.hpp
new file mode 100644
index 0000000..f6c6563
--- /dev/null
+++ b/include/alpaka/block/sync/Traits.hpp
@@ -0,0 +1,107 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    struct ConceptBlockSync // empty tag passed to concepts::ImplementationBase by the functions in this file
+    {
+    };
+
+    //! The block synchronization traits (only declared here; this file provides no definition).
+    namespace trait
+    {
+        //! The block synchronization operation trait; used through its static syncBlockThreads member.
+        template<typename TBlockSync, typename TSfinae = void>
+        struct SyncBlockThreads;
+
+        //! The block synchronization and predicate operation trait; TOp selects how predicates are combined.
+        template<typename TOp, typename TBlockSync, typename TSfinae = void>
+        struct SyncBlockThreadsPredicate;
+    } // namespace trait
+
+    //! Synchronizes all threads within the current block (independently for all blocks).
+    //!
+    //! \tparam TBlockSync The block synchronization implementation type.
+    //! \param blockSync The block synchronization implementation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TBlockSync>
+    ALPAKA_FN_ACC auto syncBlockThreads(TBlockSync const& blockSync) -> void
+    { // Forward to the trait selected through concepts::ImplementationBase<ConceptBlockSync, TBlockSync>.
+        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSync, TBlockSync>;
+        trait::SyncBlockThreads<ImplementationBase>::syncBlockThreads(blockSync);
+    }
+
+    //! The counting function object: each non-zero value adds one to the running result.
+    struct BlockCount
+    {
+        enum
+        {
+            InitialValue = 0u // presumably the value a reduction with this operator starts from
+        };
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T const& currentResult, T const& value) const -> T
+        { // The comparison result is converted to T, so the increment is T(1) or T(0).
+            return currentResult + static_cast<T>(value != static_cast<T>(0));
+        }
+    };
+
+    //! The logical and function object: true only if the running result and the value are both non-zero.
+    struct BlockAnd
+    {
+        enum
+        {
+            InitialValue = 1u // presumably the value a reduction with this operator starts from
+        };
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T const& currentResult, T const& value) const -> T
+        { // The boolean outcome is converted back to T.
+            return static_cast<T>(currentResult && (value != static_cast<T>(0)));
+        }
+    };
+
+    //! The logical or function object: true if the running result or the value is non-zero.
+    struct BlockOr
+    {
+        enum
+        {
+            InitialValue = 0u // presumably the value a reduction with this operator starts from
+        };
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        ALPAKA_FN_HOST_ACC auto operator()(T const& currentResult, T const& value) const -> T
+        { // The boolean outcome is converted back to T.
+            return static_cast<T>(currentResult || (value != static_cast<T>(0)));
+        }
+    };
+
+    //! Synchronizes all threads within the current block (independently for all blocks),
+    //! evaluates the predicate for all threads and returns the combination of all the results
+    //! computed via TOp.
+    //!
+    //! \tparam TOp The operation used to combine the predicate values of all threads.
+    //! \tparam TBlockSync The block synchronization implementation type.
+    //! \param blockSync The block synchronization implementation.
+    //! \param predicate The predicate value of the current thread.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOp, typename TBlockSync>
+    ALPAKA_FN_ACC auto syncBlockThreadsPredicate(TBlockSync const& blockSync, int predicate) -> int
+    { // Forward to the trait selected by TOp and concepts::ImplementationBase<ConceptBlockSync, TBlockSync>.
+        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSync, TBlockSync>;
+        return trait::SyncBlockThreadsPredicate<TOp, ImplementationBase>::syncBlockThreadsPredicate(
+            blockSync,
+            predicate);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/core/Align.hpp b/include/alpaka/core/Align.hpp
new file mode 100644
index 0000000..d2be014
--- /dev/null
+++ b/include/alpaka/core/Align.hpp
@@ -0,0 +1,65 @@
+/* Copyright 2022 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+#include <cstddef>
+#include <type_traits>
+
+namespace alpaka::core
+{
+    //! Compile-time constant: N rounded up to the next power of two (N itself if it already is one).
+    // Adapted from llvm/ADT/SmallPtrSet.h
+    template<std::size_t N>
+    struct RoundUpToPowerOfTwo;
+
+    //! Defines implementation details that should not be used directly by the user.
+    namespace detail
+    {
+        //! Base case: N is already a power of two, so the value is N.
+        template<std::size_t N, bool TisPowerTwo>
+        struct RoundUpToPowerOfTwoHelper : std::integral_constant<std::size_t, N>
+        {
+        };
+
+        //! Recursive case: N is not a power of two, so continue with (N | (N - 1)) + 1.
+        // We could just use NextVal = N+1, but this converges faster.  N|(N-1) sets
+        // the right-most zero bits to one all at once, e.g. 0b0011000 -> 0b0011111.
+        template<std::size_t N>
+        struct RoundUpToPowerOfTwoHelper<N, false>
+            : std::integral_constant<std::size_t, RoundUpToPowerOfTwo<(N | (N - 1)) + 1>::value>
+        {
+        };
+    } // namespace detail
+
+    template<std::size_t N>
+    struct RoundUpToPowerOfTwo
+        : std::integral_constant<std::size_t, detail::RoundUpToPowerOfTwoHelper<N, (N & (N - 1)) == 0>::value>
+    { // (N & (N - 1)) == 0 selects the base case; it also holds for N == 0, which therefore yields 0.
+    };
+
+    //! The alignment specifics.
+    namespace align
+    {
+        //! Calculates the optimal alignment for data of the given size: the size rounded up to a power of two.
+        template<std::size_t TsizeBytes>
+        struct OptimalAlignment
+            : std::integral_constant<
+                  std::size_t,
+#if BOOST_COMP_GNUC
+                  // GCC does not support alignments larger than 128: "warning: requested alignment 256 is larger
+                  // than 128[-Wattributes]". Sizes above 64 bytes therefore all map to 128.
+                  (TsizeBytes > 64) ? 128 :
+#endif
+                                    (RoundUpToPowerOfTwo<TsizeBytes>::value)>
+        {
+        };
+    } // namespace align
+} // namespace alpaka::core
+
+// Optimal alignment of a type: OptimalAlignment applied to the sizeof of the cv-unqualified type argument.
+#define ALPAKA_OPTIMAL_ALIGNMENT(...)                                                                                 \
+    ::alpaka::core::align::OptimalAlignment<sizeof(std::remove_cv_t<__VA_ARGS__>)>::value
diff --git a/include/alpaka/core/AlignedAlloc.hpp b/include/alpaka/core/AlignedAlloc.hpp
new file mode 100644
index 0000000..2dca319
--- /dev/null
+++ b/include/alpaka/core/AlignedAlloc.hpp
@@ -0,0 +1,23 @@
+/* Copyright 2022 René Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Common.hpp"
+
+#include <new>
+
+namespace alpaka::core
+{
+    ALPAKA_FN_INLINE ALPAKA_FN_HOST auto alignedAlloc(std::size_t alignment, std::size_t size) -> void*
+    { // Allocates size bytes aligned to alignment through the aligned global operator new.
+        return ::operator new(size, std::align_val_t{alignment});
+    }
+
+    ALPAKA_FN_INLINE ALPAKA_FN_HOST void alignedFree(std::size_t alignment, void* ptr)
+    { // Releases memory obtained from alignedAlloc; alignment must be the value used for the allocation.
+        ::operator delete(ptr, std::align_val_t{alignment});
+    }
+} // namespace alpaka::core
diff --git a/include/alpaka/core/ApiCudaRt.hpp b/include/alpaka/core/ApiCudaRt.hpp
new file mode 100644
index 0000000..ee2cdb2
--- /dev/null
+++ b/include/alpaka/core/ApiCudaRt.hpp
@@ -0,0 +1,402 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <boost/predef.h>
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#    include <cuda_runtime_api.h>
+
+namespace alpaka
+{
+    struct ApiCudaRt
+    {
+        // Names
+        static constexpr char name[] = "Cuda";
+        static constexpr auto version = BOOST_PREDEF_MAKE_10_VVRRP(CUDART_VERSION);
+
+        // Types
+        using DeviceAttr_t = ::cudaDeviceAttr;
+        using DeviceProp_t = ::cudaDeviceProp;
+        using Error_t = ::cudaError_t;
+        using Event_t = ::cudaEvent_t;
+        using Extent_t = ::cudaExtent;
+        using Flag_t = unsigned int;
+        using FuncAttributes_t = ::cudaFuncAttributes;
+        using HostFn_t = void (*)(void* data); // same as cudaHostFn_t, without the CUDART_CB calling convention
+        using Limit_t = ::cudaLimit;
+        using Memcpy3DParms_t = ::cudaMemcpy3DParms;
+        using MemcpyKind_t = ::cudaMemcpyKind;
+        using PitchedPtr_t = ::cudaPitchedPtr;
+        using Pos_t = ::cudaPos;
+        using Stream_t = ::cudaStream_t;
+
+        // Constants
+        static constexpr Error_t success = ::cudaSuccess;
+        static constexpr Error_t errorNotReady = ::cudaErrorNotReady;
+        static constexpr Error_t errorHostMemoryAlreadyRegistered = ::cudaErrorHostMemoryAlreadyRegistered;
+        static constexpr Error_t errorHostMemoryNotRegistered = ::cudaErrorHostMemoryNotRegistered;
+        static constexpr Error_t errorUnsupportedLimit = ::cudaErrorUnsupportedLimit;
+        static constexpr Error_t errorUnknown = ::cudaErrorUnknown;
+
+        static constexpr Flag_t eventDefault = cudaEventDefault;
+        static constexpr Flag_t eventBlockingSync = cudaEventBlockingSync;
+        static constexpr Flag_t eventDisableTiming = cudaEventDisableTiming;
+        static constexpr Flag_t eventInterprocess = cudaEventInterprocess;
+
+        static constexpr Flag_t hostMallocDefault = cudaHostAllocDefault;
+        static constexpr Flag_t hostMallocMapped = cudaHostAllocMapped;
+        static constexpr Flag_t hostMallocPortable = cudaHostAllocPortable;
+        static constexpr Flag_t hostMallocWriteCombined = cudaHostAllocWriteCombined;
+        static constexpr Flag_t hostMallocCoherent = cudaHostAllocDefault; // Not supported.
+        static constexpr Flag_t hostMallocNonCoherent = cudaHostAllocDefault; // Not supported.
+
+        static constexpr Flag_t hostRegisterDefault = cudaHostRegisterDefault;
+        static constexpr Flag_t hostRegisterPortable = cudaHostRegisterPortable;
+        static constexpr Flag_t hostRegisterMapped = cudaHostRegisterMapped;
+        static constexpr Flag_t hostRegisterIoMemory = cudaHostRegisterIoMemory;
+
+        static constexpr MemcpyKind_t memcpyDefault = ::cudaMemcpyDefault;
+        static constexpr MemcpyKind_t memcpyDeviceToDevice = ::cudaMemcpyDeviceToDevice;
+        static constexpr MemcpyKind_t memcpyDeviceToHost = ::cudaMemcpyDeviceToHost;
+        static constexpr MemcpyKind_t memcpyHostToDevice = ::cudaMemcpyHostToDevice;
+
+        static constexpr Flag_t streamDefault = cudaStreamDefault;
+        static constexpr Flag_t streamNonBlocking = cudaStreamNonBlocking;
+
+        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimX = ::cudaDevAttrMaxBlockDimX;
+        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimY = ::cudaDevAttrMaxBlockDimY;
+        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimZ = ::cudaDevAttrMaxBlockDimZ;
+        static constexpr DeviceAttr_t deviceAttributeMaxGridDimX = ::cudaDevAttrMaxGridDimX;
+        static constexpr DeviceAttr_t deviceAttributeMaxGridDimY = ::cudaDevAttrMaxGridDimY;
+        static constexpr DeviceAttr_t deviceAttributeMaxGridDimZ = ::cudaDevAttrMaxGridDimZ;
+        static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock = ::cudaDevAttrMaxSharedMemoryPerBlock;
+        static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock;
+        static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount;
+        static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize;
+
+        static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize;
+        static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize;
+
+        // Host function helper
+        // Encapsulates the different function signatures used by cudaStreamAddCallback and cudaLaunchHostFunc, and
+        // the different calling conventions used by CUDA (__stdcall on Win32) and HIP (standard).
+        struct HostFnAdaptor
+        {
+            HostFn_t func_; // host function to invoke
+            void* data_; // argument passed to func_
+
+            static void CUDART_CB hostFunction(void* data)
+            {
+                auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
+                ptr->func_(ptr->data_);
+                delete ptr; // the adaptor is allocated with new in launchHostFunc
+            }
+
+            static void CUDART_CB streamCallback(Stream_t, Error_t, void* data)
+            {
+                auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
+                ptr->func_(ptr->data_); // the stream and status arguments are ignored
+                delete ptr; // the adaptor is allocated with new in launchHostFunc
+            }
+        };
+
+        // Runtime API
+        static inline Error_t deviceGetAttribute(int* value, DeviceAttr_t attr, int device)
+        {
+            return ::cudaDeviceGetAttribute(value, attr, device);
+        }
+
+        static inline Error_t deviceGetLimit(size_t* pValue, Limit_t limit)
+        {
+            return ::cudaDeviceGetLimit(pValue, limit);
+        }
+
+        static inline Error_t deviceReset()
+        {
+            return ::cudaDeviceReset();
+        }
+
+        static inline Error_t deviceSetLimit(Limit_t limit, size_t value)
+        {
+            return ::cudaDeviceSetLimit(limit, value);
+        }
+
+        static inline Error_t deviceSynchronize()
+        {
+            return ::cudaDeviceSynchronize();
+        }
+
+        static inline Error_t eventCreate(Event_t* event)
+        {
+            return ::cudaEventCreate(event);
+        }
+
+        static inline Error_t eventCreateWithFlags(Event_t* event, Flag_t flags)
+        {
+            return ::cudaEventCreateWithFlags(event, flags);
+        }
+
+        static inline Error_t eventDestroy(Event_t event)
+        {
+            return ::cudaEventDestroy(event);
+        }
+
+        static inline Error_t eventQuery(Event_t event)
+        {
+            return ::cudaEventQuery(event);
+        }
+
+        static inline Error_t eventRecord(Event_t event, Stream_t stream)
+        {
+            return ::cudaEventRecord(event, stream);
+        }
+
+        static inline Error_t eventSynchronize(Event_t event)
+        {
+            return ::cudaEventSynchronize(event);
+        }
+
+        static inline Error_t free(void* devPtr)
+        {
+            return ::cudaFree(devPtr);
+        }
+
+        static inline Error_t freeAsync([[maybe_unused]] void* devPtr, [[maybe_unused]] Stream_t stream)
+        {
+#    if CUDART_VERSION >= 11020
+            return ::cudaFreeAsync(devPtr, stream);
+#    else
+            // Not implemented below CUDART_VERSION 11020: nothing is freed and errorUnknown is returned.
+            return errorUnknown;
+#    endif
+        }
+
+        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, void const* func)
+        {
+            return ::cudaFuncGetAttributes(attr, func);
+        }
+
+        template<typename T>
+        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
+        { // Overload for typed pointers: func is cast to void const*, with GCC's -Wconditionally-supported silenced.
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic push
+#        pragma GCC diagnostic ignored "-Wconditionally-supported"
+#    endif
+            return ::cudaFuncGetAttributes(attr, reinterpret_cast<void const*>(func));
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic pop
+#    endif
+        }
+
+        static inline Error_t getDeviceCount(int* count)
+        {
+            return ::cudaGetDeviceCount(count);
+        }
+
+        static inline Error_t getDeviceProperties(DeviceProp_t* prop, int device)
+        {
+            return ::cudaGetDeviceProperties(prop, device);
+        }
+
+        static inline char const* getErrorName(Error_t error)
+        {
+            return ::cudaGetErrorName(error);
+        }
+
+        static inline char const* getErrorString(Error_t error)
+        {
+            return ::cudaGetErrorString(error);
+        }
+
+        static inline Error_t getLastError()
+        {
+            return ::cudaGetLastError();
+        }
+
+        static inline Error_t getSymbolAddress(void** devPtr, void const* symbol)
+        {
+            return ::cudaGetSymbolAddress(devPtr, symbol);
+        }
+
+        template<class T>
+        static inline Error_t getSymbolAddress(void** devPtr, T const& symbol)
+        {
+            return ::cudaGetSymbolAddress(devPtr, symbol);
+        }
+
+        static inline Error_t hostGetDevicePointer(void** pDevice, void* pHost, Flag_t flags)
+        {
+            return ::cudaHostGetDevicePointer(pDevice, pHost, flags);
+        }
+
+        static inline Error_t hostFree(void* ptr)
+        {
+            return ::cudaFreeHost(ptr);
+        }
+
+        static inline Error_t hostMalloc(void** ptr, size_t size, Flag_t flags)
+        {
+            return ::cudaHostAlloc(ptr, size, flags);
+        }
+
+        static inline Error_t hostRegister(void* ptr, size_t size, Flag_t flags)
+        {
+            return ::cudaHostRegister(ptr, size, flags);
+        }
+
+        static inline Error_t hostUnregister(void* ptr)
+        {
+            return ::cudaHostUnregister(ptr);
+        }
+
+        static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
+        { // NOTE(review): the adaptor is only freed by the callback; confirm it cannot leak if enqueueing fails.
+#    if CUDART_VERSION >= 10000
+            // Wrap the host function using the proper calling convention; hostFunction deletes the adaptor.
+            return ::cudaLaunchHostFunc(stream, HostFnAdaptor::hostFunction, new HostFnAdaptor{fn, userData});
+#    else
+            // Emulate cudaLaunchHostFunc using cudaStreamAddCallback with a callback adaptor.
+            return ::cudaStreamAddCallback(stream, HostFnAdaptor::streamCallback, new HostFnAdaptor{fn, userData}, 0);
+#    endif
+        }
+
+        static inline Error_t malloc(void** devPtr, size_t size)
+        {
+            return ::cudaMalloc(devPtr, size);
+        }
+
+        static inline Error_t malloc3D(PitchedPtr_t* pitchedDevPtr, Extent_t extent)
+        {
+            return ::cudaMalloc3D(pitchedDevPtr, extent);
+        }
+
+        static inline Error_t mallocAsync(
+            [[maybe_unused]] void** devPtr,
+            [[maybe_unused]] size_t size,
+            [[maybe_unused]] Stream_t stream)
+        {
+#    if CUDART_VERSION >= 11020
+            return ::cudaMallocAsync(devPtr, size, stream);
+#    else
+            // Not implemented below CUDART_VERSION 11020: *devPtr is left untouched and errorUnknown is returned.
+            return errorUnknown;
+#    endif
+        }
+
+        static inline Error_t mallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height)
+        {
+            return ::cudaMallocPitch(devPtr, pitch, width, height);
+        }
+
+        static inline Error_t memGetInfo(size_t* free, size_t* total)
+        {
+            return ::cudaMemGetInfo(free, total);
+        }
+
+        static inline Error_t memcpy(void* dst, void const* src, size_t count, MemcpyKind_t kind)
+        {
+            return ::cudaMemcpy(dst, src, count, kind);
+        }
+
+        static inline Error_t memcpy2DAsync(
+            void* dst,
+            size_t dpitch,
+            void const* src,
+            size_t spitch,
+            size_t width,
+            size_t height,
+            MemcpyKind_t kind,
+            Stream_t stream)
+        {
+            return ::cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
+        }
+
+        static inline Error_t memcpy3DAsync(Memcpy3DParms_t const* p, Stream_t stream)
+        {
+            return ::cudaMemcpy3DAsync(p, stream);
+        }
+
+        static inline Error_t memcpyAsync(void* dst, void const* src, size_t count, MemcpyKind_t kind, Stream_t stream)
+        {
+            return ::cudaMemcpyAsync(dst, src, count, kind, stream);
+        }
+
+        static inline Error_t memset2DAsync(
+            void* devPtr,
+            size_t pitch,
+            int value,
+            size_t width,
+            size_t height,
+            Stream_t stream)
+        {
+            return ::cudaMemset2DAsync(devPtr, pitch, value, width, height, stream);
+        }
+
+        static inline Error_t memset3DAsync(PitchedPtr_t pitchedDevPtr, int value, Extent_t extent, Stream_t stream)
+        {
+            return ::cudaMemset3DAsync(pitchedDevPtr, value, extent, stream);
+        }
+
+        static inline Error_t memsetAsync(void* devPtr, int value, size_t count, Stream_t stream)
+        {
+            return ::cudaMemsetAsync(devPtr, value, count, stream);
+        }
+
+        static inline Error_t setDevice(int device)
+        {
+            return ::cudaSetDevice(device);
+        }
+
+        static inline Error_t streamCreate(Stream_t* pStream)
+        {
+            return ::cudaStreamCreate(pStream);
+        }
+
+        static inline Error_t streamCreateWithFlags(Stream_t* pStream, Flag_t flags)
+        {
+            return ::cudaStreamCreateWithFlags(pStream, flags);
+        }
+
+        static inline Error_t streamDestroy(Stream_t stream)
+        {
+            return ::cudaStreamDestroy(stream);
+        }
+
+        static inline Error_t streamQuery(Stream_t stream)
+        {
+            return ::cudaStreamQuery(stream);
+        }
+
+        static inline Error_t streamSynchronize(Stream_t stream)
+        {
+            return ::cudaStreamSynchronize(stream);
+        }
+
+        static inline Error_t streamWaitEvent(Stream_t stream, Event_t event, Flag_t flags)
+        {
+            return ::cudaStreamWaitEvent(stream, event, flags);
+        }
+
+        static inline PitchedPtr_t makePitchedPtr(void* d, size_t p, size_t xsz, size_t ysz)
+        {
+            return ::make_cudaPitchedPtr(d, p, xsz, ysz);
+        }
+
+        static inline Pos_t makePos(size_t x, size_t y, size_t z)
+        {
+            return ::make_cudaPos(x, y, z);
+        }
+
+        static inline Extent_t makeExtent(size_t w, size_t h, size_t d)
+        {
+            return ::make_cudaExtent(w, h, d);
+        }
+    };
+
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/core/ApiHipRt.hpp b/include/alpaka/core/ApiHipRt.hpp
new file mode 100644
index 0000000..d765246
--- /dev/null
+++ b/include/alpaka/core/ApiHipRt.hpp
@@ -0,0 +1,441 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <boost/predef.h>
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+#    include <hip/hip_runtime_api.h>
+#    include <hip/hip_version.h>
+
+namespace alpaka
+{
+    struct ApiHipRt
+    {
+        // Names
+        static constexpr char name[] = "Hip";
+        static constexpr auto version = BOOST_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0);
+
+        // Types
+        using DeviceAttr_t = ::hipDeviceAttribute_t;
+        using DeviceProp_t = ::hipDeviceProp_t;
+        using Error_t = ::hipError_t;
+        using Event_t = ::hipEvent_t;
+        using Extent_t = ::hipExtent;
+        using Flag_t = unsigned int;
+        using FuncAttributes_t = ::hipFuncAttributes;
+        using HostFn_t = void (*)(void* data); // same as hipHostFn_t
+        using Limit_t = ::hipLimit_t;
+        using Memcpy3DParms_t = ::hipMemcpy3DParms;
+        using MemcpyKind_t = ::hipMemcpyKind;
+        using PitchedPtr_t = ::hipPitchedPtr;
+        using Pos_t = ::hipPos;
+        using Stream_t = ::hipStream_t;
+
+        // Constants
+        static constexpr Error_t success = ::hipSuccess;
+        static constexpr Error_t errorNotReady = ::hipErrorNotReady;
+        static constexpr Error_t errorHostMemoryAlreadyRegistered = ::hipErrorHostMemoryAlreadyRegistered;
+        static constexpr Error_t errorHostMemoryNotRegistered = ::hipErrorHostMemoryNotRegistered;
+        static constexpr Error_t errorUnsupportedLimit = ::hipErrorUnsupportedLimit;
+        static constexpr Error_t errorUnknown = ::hipErrorUnknown;
+
+        static constexpr Flag_t eventDefault = hipEventDefault;
+        static constexpr Flag_t eventBlockingSync = hipEventBlockingSync;
+        static constexpr Flag_t eventDisableTiming = hipEventDisableTiming;
+        static constexpr Flag_t eventInterprocess = hipEventInterprocess;
+
+        static constexpr Flag_t hostMallocDefault = hipHostMallocDefault;
+        static constexpr Flag_t hostMallocMapped = hipHostMallocMapped;
+        static constexpr Flag_t hostMallocPortable = hipHostMallocPortable;
+        static constexpr Flag_t hostMallocWriteCombined = hipHostMallocWriteCombined;
+        static constexpr Flag_t hostMallocCoherent = hipHostMallocCoherent;
+        static constexpr Flag_t hostMallocNonCoherent = hipHostMallocNonCoherent;
+
+        static constexpr Flag_t hostRegisterDefault = hipHostRegisterDefault;
+        static constexpr Flag_t hostRegisterPortable = hipHostRegisterPortable;
+        static constexpr Flag_t hostRegisterMapped = hipHostRegisterMapped;
+        static constexpr Flag_t hostRegisterIoMemory = hipHostRegisterIoMemory;
+
+        static constexpr MemcpyKind_t memcpyDefault = ::hipMemcpyDefault;
+        static constexpr MemcpyKind_t memcpyDeviceToDevice = ::hipMemcpyDeviceToDevice;
+        static constexpr MemcpyKind_t memcpyDeviceToHost = ::hipMemcpyDeviceToHost;
+        static constexpr MemcpyKind_t memcpyHostToDevice = ::hipMemcpyHostToDevice;
+
+        static constexpr Flag_t streamDefault = hipStreamDefault;
+        static constexpr Flag_t streamNonBlocking = hipStreamNonBlocking;
+
+        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimX = ::hipDeviceAttributeMaxBlockDimX;
+        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimY = ::hipDeviceAttributeMaxBlockDimY;
+        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimZ = ::hipDeviceAttributeMaxBlockDimZ;
+        static constexpr DeviceAttr_t deviceAttributeMaxGridDimX = ::hipDeviceAttributeMaxGridDimX;
+        static constexpr DeviceAttr_t deviceAttributeMaxGridDimY = ::hipDeviceAttributeMaxGridDimY;
+        static constexpr DeviceAttr_t deviceAttributeMaxGridDimZ = ::hipDeviceAttributeMaxGridDimZ;
+        static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock
+            = ::hipDeviceAttributeMaxSharedMemoryPerBlock;
+        static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::hipDeviceAttributeMaxThreadsPerBlock;
+        static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::hipDeviceAttributeMultiprocessorCount;
+        static constexpr DeviceAttr_t deviceAttributeWarpSize = ::hipDeviceAttributeWarpSize;
+
+#    if HIP_VERSION >= 40'500'000
+        static constexpr Limit_t limitPrintfFifoSize = ::hipLimitPrintfFifoSize;
+#    else
+        static constexpr Limit_t limitPrintfFifoSize
+            = static_cast<Limit_t>(0x01); // Implemented only in ROCm 4.5.0 and later.
+#    endif
+        static constexpr Limit_t limitMallocHeapSize = ::hipLimitMallocHeapSize;
+
+        // Host function helper
+        // Encapsulates the different function signatures used by hipStreamAddCallback and hipLaunchHostFunc, and the
+        // different calling conventions used by CUDA (__stdcall on Win32) and HIP (standard).
+        struct HostFnAdaptor
+        {
+            HostFn_t func_; // host function to invoke
+            void* data_; // argument passed to func_
+
+            static void hostFunction(void* data)
+            {
+                auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
+                ptr->func_(ptr->data_);
+                delete ptr; // the adaptor is allocated with new in launchHostFunc
+            }
+
+            static void streamCallback(Stream_t, Error_t, void* data)
+            {
+                auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
+                ptr->func_(ptr->data_); // the stream and status arguments are ignored
+                delete ptr; // the adaptor is allocated with new in launchHostFunc
+            }
+        };
+
+        // Runtime API
+        static inline Error_t deviceGetAttribute(int* value, DeviceAttr_t attr, int device)
+        {
+            return ::hipDeviceGetAttribute(value, attr, device);
+        }
+
+        static inline Error_t deviceGetLimit(size_t* pValue, Limit_t limit)
+        { // Forwards to hipDeviceGetLimit, except for the printf FIFO limit on HIP versions before 4.5.
+#    if HIP_VERSION < 40'500'000
+            if(limit == limitPrintfFifoSize)
+            {
+                // Implemented only in ROCm 4.5.0 and later; pValue is left untouched.
+                return errorUnsupportedLimit;
+            }
+#    endif
+            return ::hipDeviceGetLimit(pValue, limit);
+        }
+
+        static inline Error_t deviceReset()
+        {
+            return ::hipDeviceReset();
+        }
+
+        static inline Error_t deviceSetLimit(Limit_t /* limit */, size_t /* value */)
+        {
+            // Not implemented: both arguments are ignored and every limit is reported as unsupported.
+            return errorUnsupportedLimit;
+        }
+
+        static inline Error_t deviceSynchronize()
+        {
+            return ::hipDeviceSynchronize();
+        }
+
+        static inline Error_t eventCreate(Event_t* event)
+        {
+            return ::hipEventCreate(event);
+        }
+
+        static inline Error_t eventCreateWithFlags(Event_t* event, Flag_t flags)
+        {
+            return ::hipEventCreateWithFlags(event, flags);
+        }
+
+        static inline Error_t eventDestroy(Event_t event)
+        {
+            return ::hipEventDestroy(event);
+        }
+
+        static inline Error_t eventQuery(Event_t event)
+        {
+            return ::hipEventQuery(event);
+        }
+
+        static inline Error_t eventRecord(Event_t event, Stream_t stream)
+        {
+            return ::hipEventRecord(event, stream);
+        }
+
+        static inline Error_t eventSynchronize(Event_t event)
+        {
+            return ::hipEventSynchronize(event);
+        }
+
+        static inline Error_t free(void* devPtr)
+        {
+            return ::hipFree(devPtr);
+        }
+
+        static inline Error_t freeAsync([[maybe_unused]] void* devPtr, [[maybe_unused]] Stream_t stream)
+        {
+            // stream-ordered memory operations are fully implemented only in ROCm 5.3.0 and later.
+#    if HIP_VERSION >= 50'300'000
+            // hipFreeAsync fails on a null pointer deallocation, so a null devPtr is reported as success without a call
+            if(devPtr)
+            {
+                return ::hipFreeAsync(devPtr, stream);
+            }
+            else
+            {
+                return ::hipSuccess;
+            }
+#    else
+            // Not implemented for older HIP versions: nothing is freed and errorUnknown is returned.
+            return errorUnknown;
+#    endif
+        }
+
+        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, void const* func)
+        {
+            return ::hipFuncGetAttributes(attr, func);
+        }
+
+        template<typename T>
+        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
+        {
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic push
+#        pragma GCC diagnostic ignored "-Wconditionally-supported"
+#    endif
+            return ::hipFuncGetAttributes(attr, reinterpret_cast<void const*>(func));
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic pop
+#    endif
+        }
+
+        static inline Error_t getDeviceCount(int* count)
+        {
+            return ::hipGetDeviceCount(count);
+        }
+
+        static inline Error_t getDeviceProperties(DeviceProp_t* prop, int device)
+        {
+            return ::hipGetDeviceProperties(prop, device);
+        }
+
+        static inline char const* getErrorName(Error_t error)
+        {
+            return ::hipGetErrorName(error);
+        }
+
+        static inline char const* getErrorString(Error_t error)
+        {
+            return ::hipGetErrorString(error);
+        }
+
+        static inline Error_t getLastError()
+        {
+            return ::hipGetLastError();
+        }
+
+        static inline Error_t getSymbolAddress(void** devPtr, void const* symbol)
+        {
+            return ::hipGetSymbolAddress(devPtr, symbol);
+        }
+
+        template<class T>
+        static inline Error_t getSymbolAddress(void** devPtr, T const& symbol)
+        {
+            return ::hipGetSymbolAddress(devPtr, symbol);
+        }
+
+        static inline Error_t hostGetDevicePointer(void** pDevice, void* pHost, Flag_t flags)
+        {
+            return ::hipHostGetDevicePointer(pDevice, pHost, flags);
+        }
+
+        static inline Error_t hostFree(void* ptr)
+        {
+            return ::hipHostFree(ptr);
+        }
+
+        static inline Error_t hostMalloc(void** ptr, size_t size, Flag_t flags)
+        {
+            return ::hipHostMalloc(ptr, size, flags);
+        }
+
+        static inline Error_t hostRegister(void* ptr, size_t size, Flag_t flags)
+        {
+            return ::hipHostRegister(ptr, size, flags);
+        }
+
+        static inline Error_t hostUnregister(void* ptr)
+        {
+            return ::hipHostUnregister(ptr);
+        }
+
+        static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
+        {
+            // NOTE(review): the adaptor is allocated with new; confirm it is freed, also when enqueueing fails.
+#    if HIP_VERSION >= 50'400'000
+            // ROCm >= 5.4.0: hipLaunchHostFunc exists; wrap the host function using the proper calling convention.
+            return ::hipLaunchHostFunc(stream, HostFnAdaptor::hostFunction, new HostFnAdaptor{fn, userData});
+#    else
+            // Older ROCm: emulate hipLaunchHostFunc using hipStreamAddCallback with a callback adaptor.
+            return ::hipStreamAddCallback(stream, HostFnAdaptor::streamCallback, new HostFnAdaptor{fn, userData}, 0);
+#    endif
+        }
+
+        static inline Error_t malloc(void** devPtr, size_t size)
+        {
+            return ::hipMalloc(devPtr, size);
+        }
+
+        static inline Error_t malloc3D(PitchedPtr_t* pitchedDevPtr, Extent_t extent)
+        {
+            return ::hipMalloc3D(pitchedDevPtr, extent);
+        }
+
+        static inline Error_t mallocAsync(
+            [[maybe_unused]] void** devPtr,
+            [[maybe_unused]] size_t size,
+            [[maybe_unused]] Stream_t stream)
+        {
+            // Stream-ordered memory operations are fully implemented only in ROCm 5.3.0 and later.
+#    if HIP_VERSION >= 50'600'000
+            return ::hipMallocAsync(devPtr, size, stream);
+#    elif HIP_VERSION >= 50'300'000
+            // Before ROCm 5.6.0, hipMallocAsync fails for an allocation of 0 bytes, so only forward non-empty requests.
+            if(size > 0)
+            {
+                return ::hipMallocAsync(devPtr, size, stream);
+            }
+            else
+            {
+                // Zero-byte request: skip the HIP call, but make sure the pointer can safely be passed to hipFreeAsync.
+                *devPtr = nullptr;
+                return ::hipSuccess;
+            }
+#    else
+            // Not implemented before ROCm 5.3.0: the parameters are unused and errorUnknown is returned.
+            return errorUnknown;
+#    endif
+        }
+
+        static inline Error_t mallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height)
+        {
+            return ::hipMallocPitch(devPtr, pitch, width, height);
+        }
+
+        static inline Error_t memGetInfo(size_t* free, size_t* total)
+        {
+            return ::hipMemGetInfo(free, total);
+        }
+
+        static inline Error_t memcpy(void* dst, void const* src, size_t count, MemcpyKind_t kind)
+        {
+            return ::hipMemcpy(dst, src, count, kind);
+        }
+
+        static inline Error_t memcpy2DAsync(
+            void* dst,
+            size_t dpitch,
+            void const* src,
+            size_t spitch,
+            size_t width,
+            size_t height,
+            MemcpyKind_t kind,
+            Stream_t stream)
+        {
+            return ::hipMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
+        }
+
+        static inline Error_t memcpy3DAsync(Memcpy3DParms_t const* p, Stream_t stream)
+        {
+            return ::hipMemcpy3DAsync(p, stream);
+        }
+
+        static inline Error_t memcpyAsync(void* dst, void const* src, size_t count, MemcpyKind_t kind, Stream_t stream)
+        {
+            return ::hipMemcpyAsync(dst, src, count, kind, stream);
+        }
+
+        static inline Error_t memset2DAsync(
+            void* devPtr,
+            size_t pitch,
+            int value,
+            size_t width,
+            size_t height,
+            Stream_t stream)
+        {
+            return ::hipMemset2DAsync(devPtr, pitch, value, width, height, stream);
+        }
+
+        static inline Error_t memset3DAsync(PitchedPtr_t pitchedDevPtr, int value, Extent_t extent, Stream_t stream)
+        {
+            return ::hipMemset3DAsync(pitchedDevPtr, value, extent, stream);
+        }
+
+        static inline Error_t memsetAsync(void* devPtr, int value, size_t count, Stream_t stream)
+        {
+            return ::hipMemsetAsync(devPtr, value, count, stream);
+        }
+
+        static inline Error_t setDevice(int device)
+        {
+            return ::hipSetDevice(device);
+        }
+
+        static inline Error_t streamCreate(Stream_t* pStream)
+        {
+            return ::hipStreamCreate(pStream);
+        }
+
+        static inline Error_t streamCreateWithFlags(Stream_t* pStream, Flag_t flags)
+        {
+            return ::hipStreamCreateWithFlags(pStream, flags);
+        }
+
+        static inline Error_t streamDestroy(Stream_t stream)
+        {
+            return ::hipStreamDestroy(stream);
+        }
+
+        static inline Error_t streamQuery(Stream_t stream)
+        {
+            return ::hipStreamQuery(stream);
+        }
+
+        static inline Error_t streamSynchronize(Stream_t stream)
+        {
+            return ::hipStreamSynchronize(stream);
+        }
+
+        static inline Error_t streamWaitEvent(Stream_t stream, Event_t event, Flag_t flags)
+        {
+            return ::hipStreamWaitEvent(stream, event, flags);
+        }
+
+        static inline PitchedPtr_t makePitchedPtr(void* d, size_t p, size_t xsz, size_t ysz)
+        {
+            return ::make_hipPitchedPtr(d, p, xsz, ysz);
+        }
+
+        static inline Pos_t makePos(size_t x, size_t y, size_t z)
+        {
+            return ::make_hipPos(x, y, z);
+        }
+
+        static inline Extent_t makeExtent(size_t w, size_t h, size_t d)
+        {
+            return ::make_hipExtent(w, h, d);
+        }
+    };
+
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/core/Assert.hpp b/include/alpaka/core/Assert.hpp
new file mode 100644
index 0000000..7ad2a2b
--- /dev/null
+++ b/include/alpaka/core/Assert.hpp
@@ -0,0 +1,105 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+#include <cassert>
+#include <type_traits>
+
+//! The assert can be explicitly disabled by defining NDEBUG
+#define ALPAKA_ASSERT(...) assert(__VA_ARGS__)
+
+//! Macro which expands to a noop.
+//! Macro enforces a semicolon after the call.
+#define ALPAKA_NOOP(...)                                                                                              \
+    do                                                                                                                \
+    {                                                                                                                 \
+    } while(false)
+
+//! ALPAKA_ASSERT_ACC_IMPL is an assert-like macro.
+//! It can be disabled by defining the ALPAKA_DISABLE_ASSERT_ACC or the NDEBUG preprocessor symbol.
+#if !defined(ALPAKA_DISABLE_ASSERT_ACC)
+#    define ALPAKA_ASSERT_ACC_IMPL(...) ALPAKA_ASSERT(__VA_ARGS__)
+#else
+#    define ALPAKA_ASSERT_ACC_IMPL(...) ALPAKA_NOOP(__VA_ARGS__)
+#endif
+
+//! ALPAKA_ASSERT_ACC is an assert-like macro.
+//!
+//! In device code for a GPU or SYCL backend it can be disabled by defining the ALPAKA_DISABLE_ASSERT_ACC or the
+//! NDEBUG preprocessor symbol. In device code for a native C++ CPU backend and in host code, it is equivalent to
+//! ALPAKA_ASSERT, and can be disabled by defining the NDEBUG preprocessor symbol.
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)
+// CUDA device code
+#    define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)
+// HIP/ROCm device code
+#    define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
+#elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(__SYCL_DEVICE_ONLY__)
+// SYCL/oneAPI device code
+#    if defined(SYCL_EXT_ONEAPI_ASSERT)
+#        define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
+#    else
+#        define ALPAKA_ASSERT_ACC(...) ALPAKA_NOOP(__VA_ARGS__)
+#    endif
+// add here any other #elif conditions for non-CPU backends
+// ...
+#else
+// CPU backend, or host code
+#    define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT(__VA_ARGS__)
+#endif
+
+namespace alpaka::core
+{
+    namespace detail
+    {
+        template<typename TArg>
+        struct AssertValueUnsigned
+        {
+            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static constexpr auto assertValueUnsigned(
+                [[maybe_unused]] TArg const& arg)
+            {
+                if constexpr(std::is_signed_v<TArg>)
+                    ALPAKA_ASSERT_ACC(arg >= 0);
+
+                // Nothing to do for unsigned types.
+            }
+        };
+    } // namespace detail
+
+    //! This function asserts that an integral value is greater than or equal to zero.
+    //! The implementation prevents warnings for checking this for unsigned types.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TArg>
+    ALPAKA_FN_HOST_ACC constexpr auto assertValueUnsigned(TArg const& arg) -> void
+    {
+        detail::AssertValueUnsigned<TArg>::assertValueUnsigned(arg);
+    }
+
+    namespace detail
+    {
+        template<typename TLhs, typename TRhs>
+        struct AssertGreaterThan
+        {
+            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static constexpr auto assertGreaterThan(
+                [[maybe_unused]] TRhs const& rhs)
+            {
+                if constexpr(std::is_signed_v<TRhs> || (TLhs::value != 0u))
+                    ALPAKA_ASSERT_ACC(TLhs::value > rhs);
+
+                // Nothing to do for unsigned types comparing to zero.
+            }
+        };
+    } // namespace detail
+
+    //! This function asserts that the integral constant TLhs::value is greater than the value rhs.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TLhs, typename TRhs>
+    ALPAKA_FN_HOST_ACC constexpr auto assertGreaterThan(TRhs const& rhs) -> void
+    {
+        detail::AssertGreaterThan<TLhs, TRhs>::assertGreaterThan(rhs);
+    }
+} // namespace alpaka::core
diff --git a/include/alpaka/core/BarrierThread.hpp b/include/alpaka/core/BarrierThread.hpp
new file mode 100644
index 0000000..ff38eb3
--- /dev/null
+++ b/include/alpaka/core/BarrierThread.hpp
@@ -0,0 +1,168 @@
+/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Uncomment this to disable the standard spinlock behaviour of the threads
+// #define ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
+
+#include "alpaka/block/sync/Traits.hpp"
+#include "alpaka/core/Common.hpp"
+
+#include <condition_variable>
+#include <mutex>
+#ifndef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
+#    include <atomic>
+#    include <thread>
+#endif
+
+namespace alpaka::core
+{
+    namespace threads
+    {
+        //! A self-resetting barrier.
+        template<typename TIdx>
+        class BarrierThread final
+        {
+        public:
+            explicit BarrierThread(TIdx const& threadCount)
+                : m_threadCount(threadCount)
+                , m_curThreadCount(threadCount)
+                , m_generation(0)
+            {
+            }
+
+            //! Waits for all the other threads to reach the barrier.
+            auto wait() -> void
+            {
+                TIdx const generationWhenEnteredTheWait = m_generation;
+#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
+                std::unique_lock<std::mutex> lock(m_mtxBarrier);
+#endif
+                if(--m_curThreadCount == 0)
+                {
+                    m_curThreadCount = m_threadCount;
+                    ++m_generation;
+#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
+                    m_cvAllThreadsReachedBarrier.notify_all();
+#endif
+                }
+                else
+                {
+#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
+                    m_cvAllThreadsReachedBarrier.wait(
+                        lock,
+                        [this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
+#else
+                    while(generationWhenEnteredTheWait == m_generation)
+                    {
+                        std::this_thread::yield();
+                    }
+#endif
+                }
+            }
+
+        private:
+#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
+            std::mutex m_mtxBarrier;
+            std::condition_variable m_cvAllThreadsReachedBarrier;
+#endif
+            const TIdx m_threadCount;
+#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
+            TIdx m_curThreadCount;
+            TIdx m_generation;
+#else
+            std::atomic<TIdx> m_curThreadCount;
+            std::atomic<TIdx> m_generation;
+#endif
+        };
+
+        namespace detail
+        {
+            template<typename TOp>
+            struct AtomicOp;
+
+            template<>
+            struct AtomicOp<BlockCount>
+            {
+                void operator()(std::atomic<int>& result, bool value)
+                {
+                    result += static_cast<int>(value);
+                }
+            };
+
+            template<>
+            struct AtomicOp<BlockAnd>
+            {
+                void operator()(std::atomic<int>& result, bool value)
+                {
+                    result &= static_cast<int>(value);
+                }
+            };
+
+            template<>
+            struct AtomicOp<BlockOr>
+            {
+                void operator()(std::atomic<int>& result, bool value)
+                {
+                    result |= static_cast<int>(value);
+                }
+            };
+        } // namespace detail
+
+        //! A self-resetting barrier with predicate.
+        template<typename TIdx>
+        class BarrierThreadWithPredicate final
+        {
+        public:
+            explicit BarrierThreadWithPredicate(TIdx const& threadCount)
+                : m_threadCount(threadCount)
+                , m_curThreadCount(threadCount)
+                , m_generation(0)
+            {
+            }
+
+            //! Waits for all the other threads to reach the barrier; returns the predicates combined with TOp.
+            template<typename TOp>
+            ALPAKA_FN_HOST auto wait(int predicate) -> int
+            {
+                TIdx const generationWhenEnteredTheWait = m_generation;
+                std::unique_lock<std::mutex> lock(m_mtxBarrier);
+
+                auto const generationMod2 = m_generation % static_cast<TIdx>(2u);
+                if(m_curThreadCount == m_threadCount) // first thread to arrive in this generation resets the slot
+                {
+                    m_result[generationMod2] = TOp::InitialValue;
+                }
+
+                std::atomic<int>& result(m_result[generationMod2]);
+                bool const predicateBool(predicate != 0);
+
+                detail::AtomicOp<TOp>()(result, predicateBool);
+
+                if(--m_curThreadCount == 0)
+                {
+                    m_curThreadCount = m_threadCount;
+                    ++m_generation;
+                    m_cvAllThreadsReachedBarrier.notify_all();
+                }
+                else
+                {
+                    m_cvAllThreadsReachedBarrier.wait(
+                        lock,
+                        [this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
+                }
+                return m_result[generationMod2];
+            }
+
+        private:
+            std::mutex m_mtxBarrier;
+            std::condition_variable m_cvAllThreadsReachedBarrier;
+            const TIdx m_threadCount;
+            TIdx m_curThreadCount;
+            TIdx m_generation;
+            std::atomic<int> m_result[2];
+        };
+    } // namespace threads
+} // namespace alpaka::core
diff --git a/include/alpaka/core/BoostPredef.hpp b/include/alpaka/core/BoostPredef.hpp
new file mode 100644
index 0000000..bcd2d35
--- /dev/null
+++ b/include/alpaka/core/BoostPredef.hpp
@@ -0,0 +1,79 @@
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Jeffrey Kelling,
+ *                Bernhard Manfred Gruber, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <boost/predef.h>
+
+#ifdef __INTEL_COMPILER
+#    warning                                                                                                          \
+        "The Intel Classic compiler (icpc) is no longer supported. Please upgrade to the Intel LLVM compiler (icpx)."
+#endif
+
+//---------------------------------------HIP-----------------------------------
+// __HIP__ is defined by both hip-clang and vanilla clang in HIP mode.
+// https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#compiler-defines-summary
+#if !defined(BOOST_LANG_HIP)
+#    if defined(__HIP__)
+/* BOOST_LANG_CUDA is enabled when either __CUDACC__ (nvcc) or __CUDA__ (clang) are defined. This occurs when
+   nvcc / clang encounter a CUDA source file. Since there are no HIP source files we treat every source file
+   as HIP when we are using a HIP-capable compiler. */
+#        include <hip/hip_version.h>
+// HIP doesn't give us a patch level for the last entry, just a gitdate
+#        define BOOST_LANG_HIP BOOST_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0)
+#    else
+#        define BOOST_LANG_HIP BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#    endif
+#endif
+
+// HSA device architecture detection (HSA generated via HIP(clang))
+#if !defined(BOOST_ARCH_HSA)
+#    if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1 && defined(__HIP__)
+// __HIP_DEVICE_COMPILE__ does not represent feature capability of target device like CUDA_ARCH.
+// For feature detection there are special macros, see ROCm's HIP porting guide.
+#        define BOOST_ARCH_HSA BOOST_VERSION_NUMBER_AVAILABLE
+#    else
+#        define BOOST_ARCH_HSA BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#    endif
+#endif
+
+// HIP compiler detection
+#if !defined(BOOST_COMP_HIP)
+#    if defined(__HIP__) // Defined by hip-clang and vanilla clang in HIP mode.
+#        include <hip/hip_version.h>
+// HIP doesn't give us a patch level for the last entry, just a gitdate
+#        define BOOST_COMP_HIP BOOST_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0)
+#    else
+#        define BOOST_COMP_HIP BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#    endif
+#endif
+
+// clang CUDA compiler detection
+// Currently __CUDA__ is only defined by clang when compiling CUDA code.
+#if defined(__clang__) && defined(__CUDA__)
+#    define BOOST_COMP_CLANG_CUDA BOOST_COMP_CLANG
+#else
+#    define BOOST_COMP_CLANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
+#endif
+
+// PGI and NV HPC SDK compiler detection
+// As of Boost 1.74, Boost.Predef's compiler detection is a bit weird. Recent PGI compilers will be identified as
+// BOOST_COMP_PGI_EMULATED. Boost.Predef has lackluster front-end support and mistakes the EDG front-end
+// for an actual compiler.
+// TODO: Whenever you look at this code please check whether https://github.com/boostorg/predef/issues/28 and
+// https://github.com/boostorg/predef/issues/51 have been resolved.
+// BOOST_COMP_PGI_EMULATED is defined by boost instead of BOOST_COMP_PGI
+#if defined(BOOST_COMP_PGI) && defined(BOOST_COMP_PGI_EMULATED)
+#    undef BOOST_COMP_PGI
+#    define BOOST_COMP_PGI BOOST_COMP_PGI_EMULATED
+#endif
+
+// Intel LLVM compiler detection
+#if !defined(BOOST_COMP_ICPX)
+#    if defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER)
+// The version string for icpx 2023.1.0 is 20230100. In Boost.Predef this becomes (53,1,0).
+#        define BOOST_COMP_ICPX BOOST_PREDEF_MAKE_YYYYMMDD(__INTEL_LLVM_COMPILER)
+#    endif
+#endif
diff --git a/include/alpaka/core/CallbackThread.hpp b/include/alpaka/core/CallbackThread.hpp
new file mode 100644
index 0000000..91ecf78
--- /dev/null
+++ b/include/alpaka/core/CallbackThread.hpp
@@ -0,0 +1,171 @@
+/* Copyright 2022 Antonio Di Pilato
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+#include <cassert>
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <thread>
+
+namespace alpaka::core
+{
+    class CallbackThread
+    {
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+        // A custom class is used because std::function<F> requires F to be copyable, and std::packaged_task provides a
+        // std::future which will keep the task alive and we cannot control the moment the future is set.
+        //! \todo with C++23 std::move_only_function should be used
+        struct Task
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+        {
+            virtual ~Task() = default;
+            virtual void run() = 0;
+        };
+
+        template<typename Function>
+        struct FunctionHolder : Task
+        {
+            Function m_func;
+
+            template<typename FunctionFwd>
+            explicit FunctionHolder(FunctionFwd&& func) : m_func{std::forward<FunctionFwd>(func)}
+            {
+            }
+
+            void run() override
+            {
+                // if m_func throws, let it propagate
+                m_func();
+            }
+        };
+
+        using TaskPackage = std::pair<std::unique_ptr<Task>, std::promise<void>>;
+
+    public:
+        ~CallbackThread()
+        {
+            {
+                std::unique_lock<std::mutex> lock{m_mutex};
+                m_stop = true;
+                m_cond.notify_one();
+            }
+
+            if(m_thread.joinable())
+            {
+                if(std::this_thread::get_id() == m_thread.get_id())
+                {
+                    std::cerr << "ERROR in ~CallbackThread: thread joins itself" << std::endl;
+                    std::abort();
+                }
+                m_thread.join();
+            }
+        }
+
+        //! It is guaranteed that the task is fully destroyed before the future's result is set.
+        //! @{
+        template<typename NullaryFunction>
+        auto submit(NullaryFunction&& nf) -> std::future<void>
+        {
+            using DecayedFunction = std::decay_t<NullaryFunction>;
+            static_assert(
+                std::is_void_v<std::invoke_result_t<DecayedFunction>>,
+                "Submitted function must not have any arguments and return void.");
+
+            // FunctionHolder stores a copy of the user's task, but may be constructed from an expiring value to avoid
+            // the copy. We do NOT store a reference to the user's task, which could dangle if the user isn't careful.
+            auto tp = std::pair(
+                std::unique_ptr<Task>(new FunctionHolder<DecayedFunction>{std::forward<NullaryFunction>(nf)}),
+                std::promise<void>{});
+            auto f = tp.second.get_future();
+            {
+                std::unique_lock<std::mutex> lock{m_mutex};
+                m_tasks.emplace(std::move(tp));
+                if(!m_thread.joinable())
+                    startWorkerThread();
+                m_cond.notify_one();
+            }
+
+            return f;
+        }
+
+        //! @}
+
+        //! @return True if the queue is empty and no task is being executed, false otherwise.
+        //! If only one task is enqueued and that task is being executed, the task will see the queue as not empty.
+        //! During the destruction of this single enqueued task the queue will already be accounted as empty.
+        [[nodiscard]] auto empty()
+        {
+            std::unique_lock<std::mutex> lock{m_mutex};
+            return m_tasks.empty();
+        }
+
+    private:
+        std::thread m_thread;
+        std::condition_variable m_cond;
+        std::mutex m_mutex;
+        bool m_stop{false};
+        std::queue<TaskPackage> m_tasks;
+
+        auto startWorkerThread() -> void
+        {
+            m_thread = std::thread(
+                [this]
+                {
+                    while(true)
+                    {
+                        std::promise<void> taskPromise;
+                        std::exception_ptr eptr;
+                        {
+                            // Task is destroyed before promise is updated but after the queue state is up to date.
+                            std::unique_ptr<Task> task = nullptr;
+                            {
+                                std::unique_lock<std::mutex> lock{m_mutex};
+                                m_cond.wait(lock, [this] { return m_stop || !m_tasks.empty(); });
+
+                                if(m_stop && m_tasks.empty())
+                                    break;
+
+                                task = std::move(m_tasks.front().first);
+                                taskPromise = std::move(m_tasks.front().second);
+                            }
+                            assert(task);
+                            try
+                            {
+                                task->run();
+                            }
+                            catch(...)
+                            {
+                                eptr = std::current_exception();
+                            }
+                            {
+                                std::unique_lock<std::mutex> lock{m_mutex};
+                                // Pop empty data from the queue, task and promise will be destroyed later in a
+                                // well-defined order.
+                                m_tasks.pop();
+                            }
+                            // Task will be destroyed here, the queue status is already updated.
+                        }
+                        // In case the executed task is the last task in the queue the waiting threads will see the
+                        // queue as empty.
+                        if(eptr)
+                            taskPromise.set_exception(std::move(eptr));
+                        else
+                            taskPromise.set_value();
+                    }
+                });
+        }
+    };
+} // namespace alpaka::core
diff --git a/include/alpaka/core/ClipCast.hpp b/include/alpaka/core/ClipCast.hpp
new file mode 100644
index 0000000..aa8c712
--- /dev/null
+++ b/include/alpaka/core/ClipCast.hpp
@@ -0,0 +1,27 @@
+/* Copyright 2022 Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/meta/Integral.hpp"
+
+#include <algorithm>
+#include <limits>
+
+namespace alpaka::core
+{
+    //! \return The input clamped between the two limits computed below and then cast to T.
+    template<typename T, typename V>
+    auto clipCast(V const& val) -> T
+    {
+        static_assert(
+            std::is_integral_v<T> && std::is_integral_v<V>,
+            "clipCast can not be called with non-integral types!");
+
+        constexpr auto upperLimit = static_cast<V>(std::numeric_limits<alpaka::meta::LowerMax<T, V>>::max());
+        constexpr auto lowerLimit = static_cast<V>(std::numeric_limits<alpaka::meta::HigherMin<T, V>>::min());
+
+        return static_cast<T>(std::clamp(val, lowerLimit, upperLimit));
+    }
+} // namespace alpaka::core
diff --git a/include/alpaka/core/Common.hpp b/include/alpaka/core/Common.hpp
new file mode 100644
index 0000000..3b181ee
--- /dev/null
+++ b/include/alpaka/core/Common.hpp
@@ -0,0 +1,221 @@
+/* Copyright 2024 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, René Widera, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Debug.hpp"
+
+// Boost.Uuid errors with VS2017 when intrin.h is not included
+#if defined(_MSC_VER) && _MSC_VER >= 1910
+#    include <intrin.h>
+#endif
+
+#if BOOST_LANG_HIP
+// HIP defines some keywords like __forceinline__ in header files.
+#    include <hip/hip_runtime.h>
+#endif
+
+//! All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC.
+//!
+//! \code{.cpp}
+//! Usage:
+//! ALPAKA_FN_ACC
+//! auto add(std::int32_t a, std::int32_t b)
+//! -> std::int32_t;
+//! \endcode
+//! @{
+#if BOOST_LANG_CUDA || BOOST_LANG_HIP
+#    if defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) || defined(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
+#        define ALPAKA_FN_ACC __device__
+#    else
+#        define ALPAKA_FN_ACC __device__ __host__
+#    endif
+#    define ALPAKA_FN_HOST_ACC __device__ __host__
+#    define ALPAKA_FN_HOST __host__
+#else
+#    define ALPAKA_FN_ACC
+#    define ALPAKA_FN_HOST_ACC
+#    define ALPAKA_FN_HOST
+#endif
+//! @}
+
+//! All functions marked with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC that are exported to / imported from different
+//! translation units have to be attributed with ALPAKA_FN_EXTERN. Note that this needs to be applied to both the
+//! declaration and the definition.
+//!
+//! Usage:
+//! ALPAKA_FN_ACC ALPAKA_FN_EXTERN auto add(std::int32_t a, std::int32_t b) -> std::int32_t;
+//!
+//! Warning: If this is used together with the SYCL back-end make sure that your SYCL runtime supports generic
+//! address spaces. Otherwise it is forbidden to use pointers as parameter or return type for functions marked
+//! with ALPAKA_FN_EXTERN.
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+/*
+   This is required by the SYCL standard, section 5.10.1 "SYCL functions and member functions linkage":
+
+   The default behavior in SYCL applications is that all the definitions and declarations of the functions and member
+   functions are available to the SYCL compiler, in the same translation unit. When this is not the case, all the
+   symbols that need to be exported to a SYCL library or from a C++ library to a SYCL application need to be defined
+   using the macro: SYCL_EXTERNAL.
+*/
+#    define ALPAKA_FN_EXTERN SYCL_EXTERNAL
+#else
+#    define ALPAKA_FN_EXTERN
+#endif
+
+//! Disable nvcc warning:
+//! 'calling a __host__ function from __host__ __device__ function.'
+//! Usage:
+//! ALPAKA_NO_HOST_ACC_WARNING
+//! ALPAKA_FN_HOST_ACC function_declaration()
+//! WARNING: Only use this method if there is no other way.
+//! Most cases can be solved by #if BOOST_ARCH_PTX or #if BOOST_LANG_CUDA.
+#if(BOOST_LANG_CUDA && !BOOST_COMP_CLANG_CUDA)
+#    if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#        define ALPAKA_NO_HOST_ACC_WARNING __pragma(hd_warning_disable)
+#    else
+#        define ALPAKA_NO_HOST_ACC_WARNING _Pragma("hd_warning_disable")
+#    endif
+#else
+#    define ALPAKA_NO_HOST_ACC_WARNING
+#endif
+
+//! Macro defining the inline function attribute.
+//!
+//! The macro should stay on the left hand side of keywords, e.g. 'static', 'constexpr', 'explicit' or the return type.
+#if BOOST_LANG_CUDA || BOOST_LANG_HIP
+#    define ALPAKA_FN_INLINE __forceinline__
+#elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+// TODO: With C++20 [[msvc::forceinline]] can be used.
+#    define ALPAKA_FN_INLINE __forceinline
+#else
+// For gcc, clang, and clang-based compilers like Intel icpx
+#    define ALPAKA_FN_INLINE [[gnu::always_inline]] inline
+#endif
+
+//! This macro defines a variable lying in global accelerator device memory.
+//!
+//! Example:
+//!   ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, int> variable;
+//!
+//! Those variables behave like ordinary variables when used in file-scope,
+//! but inside kernels the get() method must be used to access the variable.
+//! They are declared inline to resolve to a single instance across multiple
+//! translation units.
+//! Like ordinary variables, only one definition is allowed (ODR).
+//! Violating this rule might lead to linker errors.
+//!
+//! In contrast to ordinary variables, you can not define such variables
+//! as static compilation unit local variables with internal linkage
+//! because this is forbidden by CUDA.
+//!
+//! \attention It is not allowed to initialize the variable together with the declaration.
+//!            To initialize the variable alpaka::memcpy must be used.
+//! \code{.cpp}
+//! ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, int> foo;
+//!
+//! struct DeviceMemoryKernel
+//! {
+//!    ALPAKA_NO_HOST_ACC_WARNING
+//!    template<typename TAcc>
+//!    ALPAKA_FN_ACC void operator()(TAcc const& acc) const
+//!    {
+//!      auto a = foo<TAcc>.get();
+//!    }
+//!  }
+//!
+//! void initFoo() {
+//!     auto extent = alpaka::Vec<alpaka::DimInt<1u>, size_t>{1};
+//!     int initialValue = 42;
+//!     alpaka::ViewPlainPtr<DevHost, int, alpaka::DimInt<1u>, size_t> bufHost(&initialValue, devHost, extent);
+//!     alpaka::memcpy(queue, foo<Acc>, bufHost, extent);
+//! }
+//! \endcode
+#if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX)              \
+    || BOOST_LANG_HIP)
+#    if defined(__CUDACC_RDC__) || defined(__CLANG_RDC__)
+#        define ALPAKA_STATIC_ACC_MEM_GLOBAL                                                                          \
+            template<typename TAcc>                                                                                   \
+            __device__ inline
+#    else
+#        define ALPAKA_STATIC_ACC_MEM_GLOBAL                                                                          \
+            template<typename TAcc>                                                                                   \
+            __device__ static
+#    endif
+#else
+#    define ALPAKA_STATIC_ACC_MEM_GLOBAL                                                                              \
+        template<typename TAcc>                                                                                       \
+        inline
+#endif
+
+//! This macro defines a variable template (parameterized on TAcc) lying in constant accelerator device memory.
+//!
+//! Example:
+//!   ALPAKA_STATIC_ACC_MEM_CONSTANT alpaka::DevGlobal<TAcc, const int> variable;
+//!
+//! Those variables behave like ordinary variables when used in file-scope,
+//! but inside kernels the get() method must be used to access the variable.
+//! They are declared inline to resolve to a single instance across multiple
+//! translation units.
+//! Like ordinary variables, only one definition is allowed (ODR).
+//! Violating this rule might lead to linker errors.
+//!
+//! In contrast to ordinary variables, you can not define such variables
+//! as static compilation unit local variables with internal linkage
+//! because this is forbidden by CUDA.
+//!
+//! \attention It is not allowed to initialize the variable together with the declaration.
+//!            To initialize the variable alpaka::memcpy must be used.
+//! \code{.cpp}
+//! ALPAKA_STATIC_ACC_MEM_CONSTANT alpaka::DevGlobal<TAcc, const int> foo;
+//!
+//! struct DeviceMemoryKernel
+//! {
+//!    ALPAKA_NO_HOST_ACC_WARNING
+//!    template<typename TAcc>
+//!    ALPAKA_FN_ACC void operator()(TAcc const& acc) const
+//!    {
+//!      auto a = foo<TAcc>.get();
+//!    }
+//! };
+//!
+//! void initFoo() {
+//!     auto extent = alpaka::Vec<alpaka::DimInt<1u>, size_t>{1};
+//!     int initialValue = 42;
+//!     alpaka::ViewPlainPtr<DevHost, int, alpaka::DimInt<1u>, size_t> bufHost(&initialValue, devHost, extent);
+//!     alpaka::memcpy(queue, foo<Acc>, bufHost, extent);
+//! }
+//! \endcode
+#if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX)              \
+    || BOOST_LANG_HIP)
+#    if defined(__CUDACC_RDC__) || defined(__CLANG_RDC__)
+#        define ALPAKA_STATIC_ACC_MEM_CONSTANT                                                                        \
+            template<typename TAcc>                                                                                   \
+            __constant__ inline
+#    else // NOTE(review): this branch declares the variable static although the text above forbids static - confirm
+#        define ALPAKA_STATIC_ACC_MEM_CONSTANT                                                                        \
+            template<typename TAcc>                                                                                   \
+            __constant__ static
+#    endif
+#else // all other compilation modes: a plain inline variable template
+#    define ALPAKA_STATIC_ACC_MEM_CONSTANT                                                                            \
+        template<typename TAcc>                                                                                       \
+        inline
+#endif
+
+//! This macro disables memory optimizations for annotated device memory by expanding to 'volatile' in device code.
+//!
+//! Example:
+//!   ALPAKA_DEVICE_VOLATILE float* ptr;
+//!
+//! This is useful for pointers, (shared) variables and shared memory which are used in combination with
+//! the alpaka::mem_fence() function. It ensures that memory annotated with this macro will always be written directly
+//! to memory (and not to a register or cache because of compiler optimizations).
+#if(BOOST_LANG_CUDA && BOOST_ARCH_PTX)                                                                                \
+    || (BOOST_LANG_HIP && defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1)
+#    define ALPAKA_DEVICE_VOLATILE volatile
+#else // anything but CUDA PTX / HIP device compilation: the macro expands to nothing
+#    define ALPAKA_DEVICE_VOLATILE
+#endif
diff --git a/include/alpaka/core/Concepts.hpp b/include/alpaka/core/Concepts.hpp
new file mode 100644
index 0000000..443f347
--- /dev/null
+++ b/include/alpaka/core/Concepts.hpp
@@ -0,0 +1,67 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <type_traits>
+
+namespace alpaka::concepts
+{
+    //! Empty tag base class: deriving from Implements<TConcept, TBase> states that TBase is the class in the
+    //! inheritance hierarchy which implements the concept TConcept.
+    template<class TConcept, class TBase>
+    struct Implements
+    {
+    };
+
+    //! Reports through `value` whether TDerived& binds to some Implements<TConcept, ...>&.
+    template<class TConcept, class TDerived>
+    struct ImplementsConcept
+    {
+        template<class TBase>
+        static std::true_type implements(Implements<TConcept, TBase>&);
+        static std::false_type implements(...);
+
+        static constexpr bool value = decltype(implements(std::declval<TDerived&>()))::value;
+    };
+
+    namespace detail
+    {
+        //! Maps (TConcept, TDerived) to the type that implements TConcept for TDerived.
+        template<class TConcept, class TDerived, class TEnable = void>
+        struct ImplementationBaseType;
+
+        //! TDerived has no "Implements<TConcept, ...>" base: the implementing type is TDerived itself.
+        template<class TConcept, class TDerived>
+        struct ImplementationBaseType<
+            TConcept,
+            TDerived,
+            std::enable_if_t<!ImplementsConcept<TConcept, TDerived>::value>>
+        {
+            using type = TDerived;
+        };
+
+        //! TDerived has an "Implements<TConcept, TBase>" base: the implementing type is the TBase named
+        //! by that tag.
+        template<class TConcept, class TDerived>
+        struct ImplementationBaseType<
+            TConcept,
+            TDerived,
+            std::enable_if_t<ImplementsConcept<TConcept, TDerived>::value>>
+        {
+            template<class TBase>
+            static TBase implementer(Implements<TConcept, TBase>&);
+
+            using type = decltype(implementer(std::declval<TDerived&>()));
+
+            static_assert(
+                std::is_base_of_v<type, TDerived>,
+                "The type implementing the concept has to be a publicly accessible base class!");
+        };
+    } // namespace detail
+
+    //! Alias for the type that implements TConcept within the inheritance hierarchy of TDerived.
+    template<class TConcept, class TDerived>
+    using ImplementationBase = typename detail::ImplementationBaseType<TConcept, TDerived>::type;
+} // namespace alpaka::concepts
diff --git a/include/alpaka/core/Cuda.hpp b/include/alpaka/core/Cuda.hpp
new file mode 100644
index 0000000..8332ad3
--- /dev/null
+++ b/include/alpaka/core/Cuda.hpp
@@ -0,0 +1,58 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/CudaHipCommon.hpp"
+
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+namespace alpaka::cuda::detail
+{
+    //! CUDA driver API error checking with log and exception.
+    //!
+    //! Returns immediately for CUDA_SUCCESS. Otherwise composes "<file>(<line>) <desc> : ...", prints it to std::cerr
+    //! if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL, expands ALPAKA_DEBUG_BREAK and throws std::runtime_error.
+    ALPAKA_FN_HOST inline auto cudaDrvCheck(CUresult const& error, char const* desc, char const* file, int const& line)
+        -> void
+    {
+        if(error == CUDA_SUCCESS)
+            return;
+
+        char const* cu_err_name = nullptr;
+        char const* cu_err_string = nullptr;
+        CUresult cu_result_name = cuGetErrorName(error, &cu_err_name);
+        CUresult cu_result_string = cuGetErrorString(error, &cu_err_string);
+        std::string sError = std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '";
+        if(cu_result_name == CUDA_SUCCESS && cu_result_string == CUDA_SUCCESS)
+        {
+            sError += std::string(cu_err_name) + "': '" + std::string(cu_err_string) + "'!";
+        }
+        else
+        {
+            // cuGetError*() failed: keep the raw code in the message and close the quote opened above.
+            sError += "CUresult " + std::to_string(static_cast<int>(error)) + "'!";
+            if(cu_result_name == CUDA_ERROR_INVALID_VALUE)
+                sError += " cuGetErrorName: 'Invalid Value'!";
+            if(cu_result_string == CUDA_ERROR_INVALID_VALUE)
+                sError += " cuGetErrorString: 'Invalid Value'!";
+        }
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+        std::cerr << sError << std::endl;
+#    endif
+        ALPAKA_DEBUG_BREAK;
+        throw std::runtime_error(sError);
+    }
+} // namespace alpaka::cuda::detail
+
+//! Passes the result of cmd to cudaDrvCheck together with its source text (#cmd), __FILE__ and __LINE__.
+#    define ALPAKA_CUDA_DRV_CHECK(cmd) ::alpaka::cuda::detail::cudaDrvCheck(cmd, #cmd, __FILE__, __LINE__)
+
+#    include "alpaka/core/UniformCudaHip.hpp"
+
+#endif
diff --git a/include/alpaka/core/CudaHipCommon.hpp b/include/alpaka/core/CudaHipCommon.hpp
new file mode 100644
index 0000000..b3fdd7d
--- /dev/null
+++ b/include/alpaka/core/CudaHipCommon.hpp
@@ -0,0 +1,161 @@
+/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
+                  Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/elem/Traits.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/meta/Concatenate.hpp"
+#include "alpaka/meta/TypeListOps.hpp"
+#include "alpaka/offset/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <tuple>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#        include <cuda.h>
+#        include <cuda_runtime.h>
+#    endif
+
+#    ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+#        include <hip/hip_runtime.h>
+#    endif
+
+namespace alpaka
+{
+    namespace detail // type lists of the CUDA/HIP built-in vector types and a membership test over them
+    {
+        using CudaHipBuiltinTypes1 = std:: // built-in types with the suffix 1
+            tuple<char1, double1, float1, int1, long1, longlong1, short1, uchar1, uint1, ulong1, ulonglong1, ushort1>;
+        using CudaHipBuiltinTypes2 = std:: // built-in types with the suffix 2
+            tuple<char2, double2, float2, int2, long2, longlong2, short2, uchar2, uint2, ulong2, ulonglong2, ushort2>;
+        using CudaHipBuiltinTypes3 = std::tuple< // built-in types with the suffix 3, including dim3
+            char3,
+            dim3,
+            double3,
+            float3,
+            int3,
+            long3,
+            longlong3,
+            short3,
+            uchar3,
+            uint3,
+            ulong3,
+            ulonglong3,
+            ushort3
+// CUDA built-in variables have special types in clang native CUDA compilation
+// defined in cuda_builtin_vars.h
+#    if BOOST_COMP_CLANG_CUDA
+            ,
+            __cuda_builtin_threadIdx_t,
+            __cuda_builtin_blockIdx_t,
+            __cuda_builtin_blockDim_t,
+            __cuda_builtin_gridDim_t
+#    endif
+            >;
+        using CudaHipBuiltinTypes4 = std:: // built-in types with the suffix 4
+            tuple<char4, double4, float4, int4, long4, longlong4, short4, uchar4, uint4, ulong4, ulonglong4, ushort4>;
+        using CudaHipBuiltinTypes = meta:: // concatenation of the four lists above
+            Concatenate<CudaHipBuiltinTypes1, CudaHipBuiltinTypes2, CudaHipBuiltinTypes3, CudaHipBuiltinTypes4>;
+
+        template<typename T> // value of meta::Contains for T in the concatenated list CudaHipBuiltinTypes
+        inline constexpr auto isCudaHipBuiltInType = meta::Contains<CudaHipBuiltinTypes, T>::value;
+    } // namespace detail
+
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    namespace cuda::trait
+    {
+        template<typename T> // CUDA-named alias of alpaka::detail::isCudaHipBuiltInType<T>
+        inline constexpr auto isCudaBuiltInType = alpaka::detail::isCudaHipBuiltInType<T>;
+    } // namespace cuda::trait
+#    endif
+
+#    ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    namespace hip::trait
+    {
+        template<typename T> // HIP-named alias of alpaka::detail::isCudaHipBuiltInType<T>
+        inline constexpr auto isHipBuiltInType = alpaka::detail::isCudaHipBuiltInType<T>;
+    } // namespace hip::trait
+#    endif
+
+    namespace trait
+    {
+        //! Dimension trait for the types listed in CudaHipBuiltinTypes1: DimInt<1u>.
+        template<typename T>
+        struct DimType<T, std::enable_if_t<meta::Contains<alpaka::detail::CudaHipBuiltinTypes1, T>::value>>
+        {
+            using type = DimInt<1u>;
+        };
+
+        //! Dimension trait for the types listed in CudaHipBuiltinTypes2: DimInt<2u>.
+        template<typename T>
+        struct DimType<T, std::enable_if_t<meta::Contains<alpaka::detail::CudaHipBuiltinTypes2, T>::value>>
+        {
+            using type = DimInt<2u>;
+        };
+
+        //! Dimension trait for the types listed in CudaHipBuiltinTypes3: DimInt<3u>.
+        template<typename T>
+        struct DimType<T, std::enable_if_t<meta::Contains<alpaka::detail::CudaHipBuiltinTypes3, T>::value>>
+        {
+            using type = DimInt<3u>;
+        };
+
+        //! Dimension trait for the types listed in CudaHipBuiltinTypes4: DimInt<4u>.
+        template<typename T>
+        struct DimType<T, std::enable_if_t<meta::Contains<alpaka::detail::CudaHipBuiltinTypes4, T>::value>>
+        {
+            using type = DimInt<4u>;
+        };
+
+        //! Element type trait for CUDA/HIP built-in types: the declared type of the member x.
+        template<typename T>
+        struct ElemType<T, std::enable_if_t<alpaka::detail::isCudaHipBuiltInType<T>>>
+        {
+            using type = decltype(std::declval<T>().x);
+        };
+
+        template<typename TCudaHipBuiltin> // extent trait for CUDA/HIP built-in types
+        struct GetExtents<TCudaHipBuiltin, std::enable_if_t<alpaka::detail::isCudaHipBuiltInType<TCudaHipBuiltin>>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC auto operator()(TCudaHipBuiltin const& value) const
+                -> Vec<Dim<TCudaHipBuiltin>, Idx<TCudaHipBuiltin>>
+            {
+                constexpr auto dim = Dim<TCudaHipBuiltin>::value; // selects the branch at compile time
+                if constexpr(dim == 1)
+                    return {value.x};
+                else if constexpr(dim == 2)
+                    return {value.y, value.x}; // components are listed in reverse order: x is always last
+                else if constexpr(dim == 3)
+                    return {value.z, value.y, value.x};
+                else if constexpr(dim == 4)
+                    return {value.w, value.z, value.y, value.x};
+                else
+                    static_assert(sizeof(value) == 0, "Not implemented"); // any other dim is a compile-time error
+
+                ALPAKA_UNREACHABLE({});
+            }
+        };
+
+        template<typename TCudaHipBuiltin> // offset trait: inherits operator() from GetExtents unchanged
+        struct GetOffsets<TCudaHipBuiltin, std::enable_if_t<alpaka::detail::isCudaHipBuiltInType<TCudaHipBuiltin>>>
+            : GetExtents<TCudaHipBuiltin>
+        {
+        };
+
+        //! Index type trait for CUDA/HIP built-in types: always std::size_t.
+        template<typename TIdx>
+        struct IdxType<TIdx, std::enable_if_t<alpaka::detail::isCudaHipBuiltInType<TIdx>>>
+        {
+            using type = std::size_t;
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/core/Debug.hpp b/include/alpaka/core/Debug.hpp
new file mode 100644
index 0000000..dc70ed5
--- /dev/null
+++ b/include/alpaka/core/Debug.hpp
@@ -0,0 +1,77 @@
+/* Copyright 2022 Alexander Matthes, Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+#include <iostream>
+#include <string>
+#include <utility>
+
+//! Debug level 0: debugging support is disabled.
+#define ALPAKA_DEBUG_DISABLED 0
+//! Debug level 1: the minimal debug level.
+#define ALPAKA_DEBUG_MINIMAL 1
+//! Debug level 2: the full debug level.
+#define ALPAKA_DEBUG_FULL 2
+
+#ifndef ALPAKA_DEBUG
+//! Defaults to ALPAKA_DEBUG_DISABLED when ALPAKA_DEBUG was not defined before this header is included.
+#    define ALPAKA_DEBUG ALPAKA_DEBUG_DISABLED
+#endif
+
+namespace alpaka::core::detail
+{
+    //! Scope logger: prints "[+] <scope>" to std::cout when constructed and "[-] <scope>" when destroyed.
+    class ScopeLogStdOut final
+    {
+    public:
+        explicit ScopeLogStdOut(std::string sScope) : m_scopeName(std::move(sScope))
+        {
+            std::cout << "[+] " << m_scopeName << '\n' << std::flush;
+        }
+
+        ~ScopeLogStdOut()
+        {
+            std::cout << "[-] " << m_scopeName << '\n' << std::flush;
+        }
+
+        ScopeLogStdOut(ScopeLogStdOut const&) = delete;
+        ScopeLogStdOut(ScopeLogStdOut&&) = delete;
+        auto operator=(ScopeLogStdOut const&) -> ScopeLogStdOut& = delete;
+        auto operator=(ScopeLogStdOut&&) -> ScopeLogStdOut& = delete;
+
+    private:
+        std::string const m_scopeName;
+    };
+} // namespace alpaka::core::detail
+
+// ALPAKA_DEBUG_MINIMAL_LOG_SCOPE: from debug level MINIMAL on, declares a ScopeLogStdOut named after __func__.
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE ::alpaka::core::detail::ScopeLogStdOut const scopeLogStdOut(__func__)
+#else
+#    define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
+#endif
+
+// ALPAKA_DEBUG_FULL_LOG_SCOPE: same declaration (same variable name scopeLogStdOut), only at debug level FULL.
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+#    define ALPAKA_DEBUG_FULL_LOG_SCOPE ::alpaka::core::detail::ScopeLogStdOut const scopeLogStdOut(__func__)
+#else
+#    define ALPAKA_DEBUG_FULL_LOG_SCOPE
+#endif
+
+// ALPAKA_DEBUG_BREAK: a compiler-specific trap/break call from debug level MINIMAL on, otherwise empty.
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG
+#        define ALPAKA_DEBUG_BREAK ::__builtin_trap()
+#    elif BOOST_COMP_MSVC
+#        define ALPAKA_DEBUG_BREAK ::__debugbreak()
+#    else
+#        define ALPAKA_DEBUG_BREAK
+  // Other compilers get an empty macro; the #error that would reject them is intentionally left disabled.
+#    endif
+#else
+#    define ALPAKA_DEBUG_BREAK
+#endif
diff --git a/include/alpaka/core/Decay.hpp b/include/alpaka/core/Decay.hpp
new file mode 100644
index 0000000..6b978f5
--- /dev/null
+++ b/include/alpaka/core/Decay.hpp
@@ -0,0 +1,16 @@
+/* Copyright 2023 Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    //! True when T and U are the same type after std::decay. Example: is_decayed_v<volatile float, float> is true.
+    template<class T, class U>
+    inline constexpr bool is_decayed_v = std::is_same<std::decay_t<T>, std::decay_t<U>>::value;
+} // namespace alpaka
diff --git a/include/alpaka/core/DemangleTypeNames.hpp b/include/alpaka/core/DemangleTypeNames.hpp
new file mode 100644
index 0000000..5650054
--- /dev/null
+++ b/include/alpaka/core/DemangleTypeNames.hpp
@@ -0,0 +1,23 @@
+/* Copyright 2022 Andrea Bocci, Antonio Di Pilato
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+#include <boost/core/demangle.hpp>
+
+namespace alpaka::core
+{
+#if BOOST_COMP_CLANG // suppress two clang warnings for the variable template declared below
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wexit-time-destructors"
+#    pragma clang diagnostic ignored "-Wmissing-variable-declarations"
+#endif
+    template<typename T> // demangled name of T: boost::core::demangle applied to typeid(T).name()
+    inline const std::string demangled = boost::core::demangle(typeid(T).name());
+#if BOOST_COMP_CLANG // NOTE(review): <string>/<typeinfo> are not included directly; presumably transitive - confirm
+#    pragma clang diagnostic pop
+#endif
+} // namespace alpaka::core
diff --git a/include/alpaka/core/Hip.hpp b/include/alpaka/core/Hip.hpp
new file mode 100644
index 0000000..2c2e425
--- /dev/null
+++ b/include/alpaka/core/Hip.hpp
@@ -0,0 +1,14 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/CudaHipCommon.hpp"
+#include "alpaka/core/UniformCudaHip.hpp"
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED // sanity check: the HIP back-end needs a compiler in HIP language mode
+#    if !BOOST_LANG_HIP && !defined(ALPAKA_HOST_ONLY) // defining ALPAKA_HOST_ONLY skips this check
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+#endif
diff --git a/include/alpaka/core/OmpSchedule.hpp b/include/alpaka/core/OmpSchedule.hpp
new file mode 100644
index 0000000..722b77b
--- /dev/null
+++ b/include/alpaka/core/OmpSchedule.hpp
@@ -0,0 +1,88 @@
+/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+#ifdef _OPENMP
+#    include <omp.h>
+#endif
+
+#include <cstdint>
+
+namespace alpaka::omp
+{
+    //! Representation of OpenMP schedule information: kind and chunk size. This class can be used regardless of
+    //! whether OpenMP is enabled (only the Auto enumerator depends on the OpenMP version, see below).
+    struct Schedule
+    {
+        //! Schedule kinds corresponding to arguments of OpenMP schedule clause
+        //!
+        //! Kinds also present in omp_sched_t enum have the same integer values.
+        //! It is enum, not enum class, for shorter usage as omp::Schedule::[kind] and to keep interface of 0.6.0.
+        enum Kind
+        {
+            // Corresponds to not setting a schedule (implicit enumerator value 0)
+            NoSchedule,
+            Static = 1u,
+            Dynamic = 2u,
+            Guided = 3u,
+            // Auto is only declared when compiling with OpenMP 3.0 (_OPENMP >= 200805) or newer
+#if defined _OPENMP && _OPENMP >= 200805
+            Auto = 4u,
+#endif
+            Runtime = 5u
+        };
+
+        //! Schedule kind.
+        Kind kind;
+
+        //! Chunk size. Same as in OpenMP, value 0 corresponds to default chunk size. Using int and not a
+        //! fixed-width type to match OpenMP API.
+        int chunkSize;
+
+        //! Create a schedule with the given kind and chunk size; the defaults are NoSchedule and chunk size 0.
+        ALPAKA_FN_HOST constexpr Schedule(Kind myKind = NoSchedule, int myChunkSize = 0)
+            : kind(myKind)
+            , chunkSize(myChunkSize)
+        {
+        }
+    };
+
+    //! Query the OpenMP schedule that is applied when the runtime schedule is used.
+    //!
+    //! With OpenMP >= 3.0 the result mirrors the internal control variable run-sched-var.
+    //! Without OpenMP, or with OpenMP < 3.0, a default-constructed schedule is returned.
+    //!
+    //! \return Schedule object.
+    ALPAKA_FN_HOST inline auto getSchedule()
+    {
+        Schedule current{}; // default schedule, kept when the runtime schedule cannot be queried
+#if defined _OPENMP && _OPENMP >= 200805
+        // omp_get_schedule requires OpenMP 3.0 or newer
+        omp_sched_t runtimeKind;
+        int runtimeChunkSize = 0;
+        omp_get_schedule(&runtimeKind, &runtimeChunkSize);
+        current = Schedule{static_cast<Schedule::Kind>(runtimeKind), runtimeChunkSize};
+#endif
+        return current;
+    }
+
+    //! Set the OpenMP schedule applied to future parallel regions that use the runtime schedule.
+    //!
+    //! With OpenMP >= 3.0 the internal control variable run-sched-var is set from the given schedule, unless its
+    //! kind is NoSchedule or Runtime. Without OpenMP, or with OpenMP < 3.0, nothing happens.
+    //!
+    //! Note that calling from inside a parallel region does not have an immediate effect.
+    ALPAKA_FN_HOST inline void setSchedule(Schedule schedule)
+    {
+        bool const isSettable = (schedule.kind != Schedule::NoSchedule) && (schedule.kind != Schedule::Runtime);
+        if(!isSettable)
+            return;
+#if defined _OPENMP && _OPENMP >= 200805
+        omp_set_schedule(static_cast<omp_sched_t>(schedule.kind), schedule.chunkSize);
+#endif
+    }
+} // namespace alpaka::omp
diff --git a/include/alpaka/core/Positioning.hpp b/include/alpaka/core/Positioning.hpp
new file mode 100644
index 0000000..8f3d9b8
--- /dev/null
+++ b/include/alpaka/core/Positioning.hpp
@@ -0,0 +1,49 @@
+/* Copyright 2019 Benjamin Worpitz, René Widera
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+namespace alpaka
+{
+    //! Defines the parallelism hierarchy levels of alpaka
+    namespace hierarchy
+    {
+        struct Grids // empty tag type for the grid level
+        {
+        };
+
+        struct Blocks // empty tag type for the block level
+        {
+        };
+
+        struct Threads // empty tag type for the thread level
+        {
+        };
+    } // namespace hierarchy
+
+    //! Defines the origins available for getting extent and indices of kernel executions.
+    namespace origin
+    {
+        //! This type is used to get the extents/indices relative to the grid.
+        struct Grid;
+        //! This type is used to get the extent/indices relative to a/the current block.
+        struct Block;
+        //! This type is used to get the extents relative to the thread.
+        struct Thread;
+    } // namespace origin
+
+    //! Defines the units available for getting extent and indices of kernel executions.
+    namespace unit
+    {
+        //! This type is used to get the extent/indices in units of blocks.
+        struct Blocks;
+        //! This type is used to get the extent/indices in units of threads.
+        struct Threads;
+        //! This type is used to get the extents/indices in units of elements.
+        struct Elems;
+    } // namespace unit
+
+    using namespace origin; // makes the declared-only tags Grid, Block and Thread nameable directly in alpaka
+    using namespace unit; // likewise Blocks, Threads and Elems; namespace hierarchy is not pulled in here
+} // namespace alpaka
diff --git a/include/alpaka/core/RemoveRestrict.hpp b/include/alpaka/core/RemoveRestrict.hpp
new file mode 100644
index 0000000..316630f
--- /dev/null
+++ b/include/alpaka/core/RemoveRestrict.hpp
@@ -0,0 +1,35 @@
+/* Copyright 2021 Rene Widera
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+namespace alpaka
+{
+    //! Removes __restrict__ from a type. The primary template leaves T unchanged.
+    template<typename T>
+    struct remove_restrict
+    {
+        using type = T;
+    };
+
+#if BOOST_COMP_MSVC // with MSVC the specialization is spelled with __restrict
+    template<typename T>
+    struct remove_restrict<T* __restrict>
+    {
+        using type = T*;
+    };
+#else // all other compilers: the specialization is spelled with __restrict__
+    template<typename T>
+    struct remove_restrict<T* __restrict__>
+    {
+        using type = T*;
+    };
+#endif
+
+    //! Helper alias for remove_restrict<T>::type.
+    template<typename T>
+    using remove_restrict_t = typename remove_restrict<T>::type;
+} // namespace alpaka
diff --git a/include/alpaka/core/RuntimeMacros.hpp b/include/alpaka/core/RuntimeMacros.hpp
new file mode 100644
index 0000000..80faa33
--- /dev/null
+++ b/include/alpaka/core/RuntimeMacros.hpp
@@ -0,0 +1,52 @@
+/* Copyright 2022  Andrea Bocci, Mehmet Yusufoglu, René Widera, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Implementation details
+#include "alpaka/core/Sycl.hpp"
+
+//! ALPAKA_THROW_ACC(MSG) prints MSG, then aborts (terminating the program and creating a core dump) or throws
+//! std::runtime_error (which can be caught in a catch block), depending on the Acc. It expands to a single
+//! do { ... } while(false) statement: terminate it with ';'. This makes it safe as an unbraced if/else body.
+//!
+//! For CUDA the __trap function is used, which triggers std::runtime_error that can be caught during wait, not exec.
+//! For HIP the abort() function is used, which calls __builtin_trap().
+//! For Sycl assert(false) is not used since it can be disabled by the -DNDEBUG compile option. abort() is used
+//! although it generates a runtime error instead of aborting in GPUs: "Caught synchronous SYCL exception:
+//! Unresolved Symbol <abort> -999 (Unknown PI error)."
+//! The OpenMP specification mandates that exceptions thrown by some thread must be handled by the same thread.
+//! Therefore std::runtime_error thrown by ALPAKA_THROW_ACC aborts the program for OpenMP backends. If needed
+//! the SIGABRT signal can be caught by a signal handler.
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)
+#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
+        do {                                                                                                          \
+            printf(                                                                                                   \
+                "alpaka encountered a user-defined error condition while running on the CUDA back-end:\n%s",          \
+                (MSG));                                                                                               \
+            __trap();                                                                                                 \
+        } while(false)
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)
+#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
+        do {                                                                                                          \
+            printf(                                                                                                   \
+                "alpaka encountered a user-defined error condition while running on the HIP back-end:\n%s",           \
+                (MSG));                                                                                               \
+            abort();                                                                                                  \
+        } while(false)
+#elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(__SYCL_DEVICE_ONLY__)
+#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
+        do {                                                                                                          \
+            printf(                                                                                                   \
+                "alpaka encountered a user-defined error condition while running on the SYCL back-end:\n%s",          \
+                (MSG));                                                                                               \
+            abort();                                                                                                  \
+        } while(false)
+#else
+#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
+        do {                                                                                                          \
+            printf("alpaka encountered a user-defined error condition:\n%s", (MSG));                                  \
+            throw std::runtime_error(MSG);                                                                            \
+        } while(false)
+#endif
diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp
new file mode 100644
index 0000000..c29fccd
--- /dev/null
+++ b/include/alpaka/core/Sycl.hpp
@@ -0,0 +1,199 @@
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/elem/Traits.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/meta/IntegerSequence.hpp"
+#include "alpaka/offset/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <array>
+#include <cstddef>
+#include <cstdio> // the #define printf(...) breaks <cstdio> if it is included afterwards
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+// When compiling for AMD GPUs (__AMDGCN__) printf is defined away entirely because of missing compiler support.
+#    ifdef __AMDGCN__
+#        define printf(...)
+#    else
+
+#        ifdef __SYCL_DEVICE_ONLY__
+using AlpakaFormat = char const* [[clang::opencl_constant]];
+#        else
+using AlpakaFormat = char const*;
+#        endif
+// The ", ##__VA_ARGS__" form used by the macro below is a GNU extension; clang's warning for it is disabled here.
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic push
+#            pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
+#        endif
+// Route printf to the SYCL experimental printf, passing the format string through a static AlpakaFormat object.
+#        define printf(FORMAT, ...)                                                                                   \
+            do                                                                                                        \
+            {                                                                                                         \
+                static auto const format = AlpakaFormat{FORMAT};                                                      \
+                sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__);                                       \
+            } while(false)
+
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic pop
+#        endif
+
+#    endif
+
+// SYCL vector types trait specializations.
+namespace alpaka
+{
+    namespace detail
+    {
+        // True when T is exactly one of Ts (false for an empty Ts); saves spelling out std::is_same chains.
+        template<typename T, typename... Ts>
+        struct is_any : std::disjunction<std::is_same<T, Ts>...>
+        {
+        };
+    } // namespace detail
+
+    //! In contrast to CUDA SYCL doesn't know 1D vectors. It does
+    //! support OpenCL's data types which have additional requirements
+    //! on top of those in the C++ standard. Note that SYCL's equivalent
+    //! to CUDA's dim3 type is a different class type and thus not used
+    //! here. The trait is true only for the exact types listed below.
+    template<typename T>
+    struct IsSyclBuiltInType
+        : detail::is_any<
+              T,
+              // built-in scalar types - the standard C++ built-in types, std::size_t and std::byte are not listed
+              // here; only sycl::half is
+              sycl::half,
+
+              // 2 component vector types
+              sycl::char2,
+              sycl::uchar2,
+              sycl::short2,
+              sycl::ushort2,
+              sycl::int2,
+              sycl::uint2,
+              sycl::long2,
+              sycl::ulong2,
+              sycl::float2,
+              sycl::double2,
+              sycl::half2,
+
+              // 3 component vector types
+              sycl::char3,
+              sycl::uchar3,
+              sycl::short3,
+              sycl::ushort3,
+              sycl::int3,
+              sycl::uint3,
+              sycl::long3,
+              sycl::ulong3,
+              sycl::float3,
+              sycl::double3,
+              sycl::half3,
+
+              // 4 component vector types
+              sycl::char4,
+              sycl::uchar4,
+              sycl::short4,
+              sycl::ushort4,
+              sycl::int4,
+              sycl::uint4,
+              sycl::long4,
+              sycl::ulong4,
+              sycl::float4,
+              sycl::double4,
+              sycl::half4,
+
+              // 8 component vector types
+              sycl::char8,
+              sycl::uchar8,
+              sycl::short8,
+              sycl::ushort8,
+              sycl::int8,
+              sycl::uint8,
+              sycl::long8,
+              sycl::ulong8,
+              sycl::float8,
+              sycl::double8,
+              sycl::half8,
+
+              // 16 component vector types
+              sycl::char16,
+              sycl::uchar16,
+              sycl::short16,
+              sycl::ushort16,
+              sycl::int16,
+              sycl::uint16,
+              sycl::long16,
+              sycl::ulong16,
+              sycl::float16,
+              sycl::double16,
+              sycl::half16>
+    {
+    };
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    //! The SYCL vectors' dim type trait specialization: 1 for a scalar T, otherwise T::size().
+    template<typename T>
+    struct DimType<T, std::enable_if_t<IsSyclBuiltInType<T>::value>>
+    {
+        using type = std::conditional_t<std::is_scalar_v<T>, DimInt<std::size_t{1}>, DimInt<T::size()>>;
+    };
+
+    //! The SYCL vectors' elem type trait specialization: T itself for a scalar T, otherwise T::element_type.
+    template<typename T>
+    struct ElemType<T, std::enable_if_t<IsSyclBuiltInType<T>::value>>
+    {
+        using type = std::conditional_t<std::is_scalar_v<T>, T, typename T::element_type>;
+    };
+
+    //! The SYCL vectors' extent get trait specialization: a scalar is returned as is, a vector as a Vec.
+    template<typename T>
+    struct GetExtents<T, std::enable_if_t<IsSyclBuiltInType<T>::value>>
+    {
+        auto operator()(T const& value) const
+        {
+            if constexpr(std::is_scalar_v<T>)
+                return value;
+            else
+                return impl(value, std::make_index_sequence<Dim<T>::value>{});
+        }
+
+    private:
+        template<std::size_t... Is>
+        auto impl(T const& value, std::index_sequence<Is...>) const
+        {
+            return Vec{value.template swizzle<Is>()...};
+        }
+    };
+
+    //! The SYCL vectors' offset get trait specialization; it reuses the GetExtents implementation unchanged.
+    template<typename T>
+    struct GetOffsets<T, std::enable_if_t<IsSyclBuiltInType<T>::value>> : GetExtents<T>
+    {
+    };
+
+    //! The SYCL vectors' idx type trait specialization; the index type is always std::size_t.
+    template<typename TIdx>
+    struct IdxType<TIdx, std::enable_if_t<IsSyclBuiltInType<TIdx>::value>>
+    {
+        using type = std::size_t;
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/core/ThreadPool.hpp b/include/alpaka/core/ThreadPool.hpp
new file mode 100644
index 0000000..b59555a
--- /dev/null
+++ b/include/alpaka/core/ThreadPool.hpp
@@ -0,0 +1,104 @@
+/* Copyright 2023 Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Jeffrey Kelling
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+#include <atomic>
+#include <future>
+#include <mutex>
+#include <optional>
+#include <queue>
+#include <vector>
+
+namespace alpaka::core::detail
+{
+    //! A thread pool yielding when there is not enough work to be done.
+    struct ThreadPool final
+    {
+        using Task = std::packaged_task<void()>;
+
+        //! Creates a thread pool with threadCount worker threads; throws std::invalid_argument if threadCount is 0.
+        explicit ThreadPool(std::size_t threadCount)
+        {
+            if(threadCount < 1)
+                throw std::invalid_argument("The argument 'threadCount' has to be greater or equal to one!");
+            m_threads.reserve(threadCount);
+            for(std::size_t i = 0; i < threadCount; ++i)
+                m_threads.emplace_back([this] { threadFunc(); });
+        }
+
+        //! Destroys the pool: workers finish their current task and are joined; tasks still queued may never run.
+        ~ThreadPool()
+        {
+            m_stop = true; // Signal that concurrent executors should not perform any new work
+            for(auto& t : m_threads)
+            {
+                if(std::this_thread::get_id() == t.get_id()) // a worker joining itself would never return
+                {
+                    std::cerr << "ERROR in ThreadPool joins itself" << std::endl;
+                    std::abort();
+                }
+                t.join();
+            }
+        }
+
+        //! Runs the given function on one of the pool's threads in First In First Out (FIFO) order.
+        //!
+        //! \param task Function object to be called on the pool. Takes an arbitrary number of arguments. Must return
+        //!             void.
+        //! \param args Arguments for task, cannot be moved. If such parameters must be used, use a lambda and capture
+        //!             via move then move the lambda.
+        //! \return     A future to the created task.
+        template<typename TFnObj, typename... TArgs>
+        auto enqueueTask(TFnObj&& task, TArgs&&... args) -> std::future<void>
+        {
+#if BOOST_COMP_MSVC
+// MSVC 14.39.33519 is throwing an error because the noexcept type deduction is not defined in original C++17
+// error C2065: 'task': undeclared identifier
+// see: https://stackoverflow.com/a/72467726
+#    define ALPAKA_NOEXCEPT(...)
+#else
+#    define ALPAKA_NOEXCEPT(...) noexcept(__VA_ARGS__)
+#endif
+            auto ptask
+                = Task{[=, t = std::forward<TFnObj>(task)]() ALPAKA_NOEXCEPT(noexcept(task(args...))) { t(args...); }};
+#undef ALPAKA_NOEXCEPT
+
+            auto future = ptask.get_future(); // must be taken before ptask is moved into the queue
+            {
+                std::lock_guard<std::mutex> lock{m_mutex};
+                m_tasks.push(std::move(ptask));
+            }
+            return future;
+        }
+
+    private:
+        void threadFunc() // worker loop executed by every pool thread
+        {
+            while(!m_stop.load(std::memory_order_relaxed)) // poll until the destructor requests a stop
+            {
+                std::optional<Task> task;
+                {
+                    std::lock_guard<std::mutex> lock{m_mutex}; // held only while taking a task, not while running it
+                    if(!m_tasks.empty())
+                    {
+                        task = std::move(m_tasks.front());
+                        m_tasks.pop();
+                    }
+                }
+                if(task)
+                    (*task)();
+                else
+                    std::this_thread::yield();
+            }
+        }
+
+        std::vector<std::thread> m_threads;
+        std::queue<Task> m_tasks; // TODO(bgruber): we could consider a lock-free queue here
+        std::mutex m_mutex;
+        std::atomic<bool> m_stop = false;
+    };
+} // namespace alpaka::core::detail
diff --git a/include/alpaka/core/UniformCudaHip.hpp b/include/alpaka/core/UniformCudaHip.hpp
new file mode 100644
index 0000000..0896f9d
--- /dev/null
+++ b/include/alpaka/core/UniformCudaHip.hpp
@@ -0,0 +1,113 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
+ * Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+
+#include <initializer_list>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <type_traits>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka::uniform_cuda_hip::detail
+{
+    //! CUDA/HIP runtime API error check: logs a failure (always if !TThrow) and throws std::runtime_error if TThrow.
+    template<typename TApi, bool TThrow>
+    ALPAKA_FN_HOST inline void rtCheck(
+        typename TApi::Error_t const& error,
+        char const* desc,
+        char const* file,
+        int const& line) noexcept(!TThrow)
+    {
+        if(error != TApi::success)
+        {
+            auto const sError = std::string{
+                std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '"
+                + TApi::getErrorName(error) + "': '" + std::string(TApi::getErrorString(error)) + "'!"};
+
+            if constexpr(!TThrow || ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
+                std::cerr << sError << std::endl;
+
+            ALPAKA_DEBUG_BREAK;
+            // reset the last error to allow user side error handling. Using std::ignore to discard unneeded
+            // return values is suggested by the C++ core guidelines.
+            std::ignore = TApi::getLastError();
+
+            if constexpr(TThrow)
+                throw std::runtime_error(sError);
+        }
+    }
+
+    //! CUDA/HIP runtime API error checking with log and exception, ignoring the error values in ignoredErrorCodes.
+    template<typename TApi, bool TThrow>
+    ALPAKA_FN_HOST inline void rtCheckIgnore(
+        typename TApi::Error_t const& error,
+        char const* cmd,
+        char const* file,
+        int const& line,
+        std::initializer_list<typename TApi::Error_t> ignoredErrorCodes) noexcept(!TThrow)
+    {
+        if(error != TApi::success)
+        {
+            // If the error code is not one of the ignored ones.
+            if(std::find(std::cbegin(ignoredErrorCodes), std::cend(ignoredErrorCodes), error)
+               == std::cend(ignoredErrorCodes))
+            {
+                using namespace std::literals;
+                rtCheck<TApi, TThrow>(error, ("'"s + std::string(cmd) + "' returned error "s).c_str(), file, line);
+            }
+            else
+            {
+                // reset the last error to avoid propagation to the next CUDA/HIP API call. Using std::ignore
+                // to discard unneeded return values is recommended by the C++ core guidelines.
+                std::ignore = TApi::getLastError();
+            }
+        }
+    }
+
+    //! Fetches the last error reported by the CUDA/HIP runtime API via TApi::getLastError() and forwards it to
+    //! rtCheck, i.e. it is checked with log and exception.
+    template<typename TApi, bool TThrow>
+    ALPAKA_FN_HOST inline void rtCheckLastError(char const* desc, char const* file, int const& line) noexcept(!TThrow)
+    {
+        rtCheck<TApi, TThrow>(typename TApi::Error_t(TApi::getLastError()), desc, file, line);
+    }
+} // namespace alpaka::uniform_cuda_hip::detail
+
+#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(cmd, throw, ...)                                                    \
+        do                                                                                                            \
+        {                                                                                                             \
+            ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, throw>(                                        \
+                "'" #cmd "' A previous API call (not this one) set the error ",                                       \
+                __FILE__,                                                                                             \
+                __LINE__);                                                                                            \
+            ::alpaka::uniform_cuda_hip::detail::rtCheckIgnore<TApi, throw>(                                           \
+                cmd,                                                                                                  \
+                #cmd,                                                                                                 \
+                __FILE__,                                                                                             \
+                __LINE__,                                                                                             \
+                {__VA_ARGS__});                                                                                       \
+        } while(0)
+
+//! CUDA/HIP runtime error checking with log and exception, ignoring the error values given after cmd
+#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd, ...)                                                         \
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(cmd, true, __VA_ARGS__)
+
+//! CUDA/HIP runtime error checking with log and exception. Like all macros here it needs a type TApi in scope.
+#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd) ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(cmd, true, )
+
+//! CUDA/HIP runtime error checking with log only (no exception), ignoring the error values given after cmd
+#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE_NOEXCEPT(cmd, ...)                                                \
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(cmd, false, __VA_ARGS__)
+
+//! CUDA/HIP runtime error checking with log.
+#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cmd) ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(cmd, false, )
+#endif
diff --git a/include/alpaka/core/Unreachable.hpp b/include/alpaka/core/Unreachable.hpp
new file mode 100644
index 0000000..7b1b9ff
--- /dev/null
+++ b/include/alpaka/core/Unreachable.hpp
@@ -0,0 +1,25 @@
+/* Copyright 2022 Jan Stephan, Jeffrey Kelling
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+//! Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches. It will issue
+//! a false warning about a missing return statement unless it is told that the following code section is unreachable.
+//!
+//! \param ... A dummy value for the expected return type of the calling function; only the `return` fallback uses it.
+#if(BOOST_COMP_NVCC && BOOST_ARCH_PTX)
+#    if BOOST_LANG_CUDA >= BOOST_VERSION_NUMBER(11, 3, 0)
+#        define ALPAKA_UNREACHABLE(...) __builtin_unreachable()
+#    else
+#        define ALPAKA_UNREACHABLE(...) return __VA_ARGS__
+#    endif
+#elif BOOST_COMP_MSVC
+#    define ALPAKA_UNREACHABLE(...) __assume(false)
+#elif BOOST_COMP_GNUC || BOOST_COMP_CLANG
+#    define ALPAKA_UNREACHABLE(...) __builtin_unreachable()
+#else
+#    define ALPAKA_UNREACHABLE(...)
+#endif
diff --git a/include/alpaka/core/Unroll.hpp b/include/alpaka/core/Unroll.hpp
new file mode 100644
index 0000000..10794e6
--- /dev/null
+++ b/include/alpaka/core/Unroll.hpp
@@ -0,0 +1,25 @@
+/* Copyright 2021 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+//! Suggests unrolling of the directly following loop to the compiler.
+//!
+//! Usage:
+//!  `ALPAKA_UNROLL
+//!  for(...){...}`
+// \TODO: Implement for other compilers (for them the macro currently expands to nothing).
+#if BOOST_ARCH_PTX
+#    define ALPAKA_UNROLL_STRINGIFY(x) #x
+#    define ALPAKA_UNROLL(...) _Pragma(ALPAKA_UNROLL_STRINGIFY(unroll __VA_ARGS__))
+#elif BOOST_COMP_IBM || BOOST_COMP_SUNPRO || BOOST_COMP_HPACC
+#    define ALPAKA_UNROLL_STRINGIFY(x) #x
+#    define ALPAKA_UNROLL(...) _Pragma(ALPAKA_UNROLL_STRINGIFY(unroll(__VA_ARGS__)))
+#elif BOOST_COMP_PGI
+#    define ALPAKA_UNROLL(...) _Pragma("unroll")
+#else
+#    define ALPAKA_UNROLL(...)
+#endif
diff --git a/include/alpaka/core/Utility.hpp b/include/alpaka/core/Utility.hpp
new file mode 100644
index 0000000..2610027
--- /dev/null
+++ b/include/alpaka/core/Utility.hpp
@@ -0,0 +1,62 @@
+/* Copyright 2024 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+#include <type_traits>
+#include <utility>
+
+namespace alpaka::core
+{
+    //! convert any type to a reference type
+    //
+    // This function is equivalent to std::declval() but can be used
+    // within an alpaka accelerator kernel too.
+    // The overload declared here has no definition, so use it in unevaluated contexts such as decltype() only.
+#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
+    template<class T>
+    ALPAKA_FN_HOST_ACC std::add_rvalue_reference_t<T> declval();
+#else
+    using std::declval;
+#endif
+
+    /// Integer division of a by b with the quotient rounded up, evaluated as (b + a - 1) / b.
+    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
+    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
+    {
+        return (b + a - 1) / b;
+    }
+
+    /// Raises base to the nth power using integer arithmetic only.
+    /// An exponent of zero yields 1; any other exponent starts from base and multiplies n - 1 more times.
+    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
+    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto intPow(Integral base, Integral n) -> Integral
+    {
+        Integral result = (n == 0) ? Integral{1} : base;
+        for(Integral remaining = n; remaining > 1; --remaining)
+            result *= base;
+        return result;
+    }
+
+
+    /// Computes the floor of the nth root of value, in integers. Never forms value + 1 or mid^n, so it cannot overflow.
+    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
+    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto nthRootFloor(Integral value, Integral n) -> Integral
+    {
+        Integral lo = 0;
+        Integral hi = value;
+        while(lo < hi) // bisection; for n >= 1 the invariant is lo^n <= value and the root is <= hi
+        {
+            Integral const mid = static_cast<Integral>(hi - (hi - lo) / 2); // upper midpoint, always >= 1
+            Integral rem = n; // multiplications by mid still missing to reach mid^n
+            for(Integral power = 1; rem > 0 && power <= value / mid; --rem) // guard keeps power * mid <= value
+                power *= mid;
+            lo = (rem > 0) ? lo : mid; // loop completed: mid^n <= value, the root is at least mid
+            hi = (rem > 0) ? static_cast<Integral>(mid - 1) : hi; // loop stopped early: mid^n > value
+        }
+        return lo;
+    }
+
+} // namespace alpaka::core
diff --git a/include/alpaka/core/Vectorize.hpp b/include/alpaka/core/Vectorize.hpp
new file mode 100644
index 0000000..55f0e6f
--- /dev/null
+++ b/include/alpaka/core/Vectorize.hpp
@@ -0,0 +1,358 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+//! Suggests vectorization of the directly following loop to the compiler.
+//!
+//! Usage:
+//!  `ALPAKA_VECTORIZE_HINT
+//!  for(...){...}`
+// \TODO: Implement for other compilers.
+// See: http://stackoverflow.com/questions/2706286/pragmas-swp-ivdep-prefetch-support-in-various-compilers
+/*#if BOOST_COMP_HPACC
+    #define ALPAKA_VECTORIZE_HINT(...)  _Pragma("ivdep")
+#elif BOOST_COMP_PGI
+    #define ALPAKA_VECTORIZE_HINT(...)  _Pragma("vector")
+#elif BOOST_COMP_MSVC
+    #define ALPAKA_VECTORIZE_HINT(...)  __pragma(loop(ivdep))
+#elif BOOST_COMP_GNUC
+    #define ALPAKA_VECTORIZE_HINT(...)  _Pragma("GCC ivdep")
+#else
+    #define ALPAKA_VECTORIZE_HINT(...)
+#endif*/
+
+namespace alpaka::core::vectorization
+{
+    // The alignment (in bytes) required to enable optimal performance, dependent on the target architecture.
+    constexpr std::size_t defaultAlignment =
+#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__MIC__)
+        64u
+#elif defined(__AVX__) || defined(__AVX2__)
+        32u
+#else
+        16u
+#endif
+        ;
+
+    // Generic case of the trait giving the number of TElem elements processed in parallel in a vector register:
+    // one element at a time, i.e. no vectorization.
+    template<typename TElem>
+    struct GetVectorizationSizeElems
+    {
+        static constexpr std::size_t value{1u};
+    };
+
+    // Number of double elements processed in parallel in a vector register, selected by the instruction set macros.
+    template<>
+    struct GetVectorizationSizeElems<double>
+    {
+        static constexpr std::size_t value =
+#if defined(__AVX512F__) || defined(__MIC__)
+            // addition (AVX512F,KNC): vaddpd / _mm512_add_pd
+            // subtraction (AVX512F,KNC): vsubpd / _mm512_sub_pd
+            // multiplication (AVX512F,KNC): vmulpd / _mm512_mul_pd
+            8u;
+#elif defined(__AVX__)
+            // addition (AVX): vaddpd / _mm256_add_pd
+            // subtraction (AVX): vsubpd / _mm256_sub_pd
+            // multiplication (AVX): vmulpd / _mm256_mul_pd
+            4u;
+#elif defined(__SSE2__)
+            // addition (SSE2): addpd / _mm_add_pd
+            // subtraction (SSE2): subpd / _mm_sub_pd
+            // multiplication (SSE2): mulpd / _mm_mul_pd
+            2u;
+#elif defined(__ARM_NEON__)
+            // No support for double precision vectorization!
+            1u;
+#elif defined(__ALTIVEC__)
+            2u;
+#else
+            1u;
+#endif
+    };
+
+    // Number of float elements processed in parallel in a vector register, selected by the instruction set macros.
+    template<>
+    struct GetVectorizationSizeElems<float>
+    {
+        static constexpr std::size_t value =
+#if defined(__AVX512F__) || defined(__MIC__)
+            // addition (AVX512F,KNC): vaddps / _mm512_add_ps
+            // subtraction (AVX512F,KNC): vsubps / _mm512_sub_ps
+            // multiplication (AVX512F,KNC): vmulps / _mm512_mul_ps
+            16u;
+#elif defined(__AVX__)
+            // addition (AVX): vaddps / _mm256_add_ps
+            // subtraction (AVX): vsubps / _mm256_sub_ps
+            // multiplication (AVX): vmulps / _mm256_mul_ps
+            8u;
+#elif defined(__SSE__)
+            // addition (SSE): addps / _mm_add_ps
+            // subtraction (SSE): subps / _mm_sub_ps
+            // multiplication (SSE): mulps / _mm_mul_ps
+            4u;
+#elif defined(__ARM_NEON__)
+            4u;
+#elif defined(__ALTIVEC__)
+            4u;
+#else
+            1u;
+#endif
+    };
+
+    // Number of std::int8_t elements processed in parallel in a vector register, selected by target macros.
+    template<>
+    struct GetVectorizationSizeElems<std::int8_t>
+    {
+        static constexpr std::size_t value =
+#if defined(__AVX512BW__)
+            // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
+            // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
+            // multiplication: -
+            64u;
+#elif defined(__AVX2__)
+            // addition (AVX2): vpaddb / _mm256_add_epi8
+            // subtraction (AVX2): vpsubb / _mm256_sub_epi8
+            // multiplication: -
+            32u;
+#elif defined(__SSE2__)
+            // addition (SSE2): paddb / _mm_add_epi8
+            // subtraction (SSE2): psubb / _mm_sub_epi8
+            // multiplication: -
+            16u;
+#elif defined(__ARM_NEON__)
+            16u;
+#elif defined(__ALTIVEC__)
+            16u;
+#elif defined(__CUDA_ARCH__)
+            // addition: __vadd4
+            // subtraction: __vsub4
+            // multiplication: -
+            4u;
+#else
+            1u;
+#endif
+    };
+
+    // Number of std::uint8_t elements processed in parallel in a vector register, selected by target macros.
+    template<>
+    struct GetVectorizationSizeElems<std::uint8_t>
+    {
+        static constexpr std::size_t value =
+#if defined(__AVX512BW__)
+            // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
+            // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
+            // multiplication: -
+            64u;
+#elif defined(__AVX2__)
+            // addition (AVX2): vpaddb / _mm256_add_epi8
+            // subtraction (AVX2): vpsubb / _mm256_sub_epi8
+            // multiplication: -
+            32u;
+#elif defined(__SSE2__)
+            // addition (SSE2): paddb / _mm_add_epi8
+            // subtraction (SSE2): psubb / _mm_sub_epi8
+            // multiplication: -
+            16u;
+#elif defined(__ARM_NEON__)
+            16u;
+#elif defined(__ALTIVEC__)
+            16u;
+#elif defined(__CUDA_ARCH__)
+            // addition: __vadd4
+            // subtraction: __vsub4
+            // multiplication: -
+            4u;
+#else
+            1u;
+#endif
+    };
+
+    // Number of std::int16_t elements processed in parallel in a vector register, selected by target macros.
+    template<>
+    struct GetVectorizationSizeElems<std::int16_t>
+    {
+        static constexpr std::size_t value =
+#if defined(__AVX512BW__)
+            // addition (AVX512BW): vpaddw / _mm512_mask_add_epi16
+            // subtraction (AVX512BW): vpsubw / _mm512_mask_sub_epi16
+            // multiplication (AVX512BW): vpmullw / _mm512_mask_mullo_epi16
+            32u;
+#elif defined(__AVX2__)
+            // addition (AVX2): vpaddw / _mm256_add_epi16
+            // subtraction (AVX2): vpsubw / _mm256_sub_epi16
+            // multiplication (AVX2): vpmullw / _mm256_mullo_epi16
+            16u;
+#elif defined(__SSE2__)
+            // addition (SSE2): paddw / _mm_add_epi16
+            // subtraction (SSE2): psubw / _mm_sub_epi16
+            // multiplication (SSE2): pmullw / _mm_mullo_epi16
+            8u;
+#elif defined(__ARM_NEON__)
+            8u;
+#elif defined(__ALTIVEC__)
+            8u;
+#elif defined(__CUDA_ARCH__)
+            // addition: __vadd2
+            // subtraction: __vsub2
+            // multiplication: -
+            2u;
+#else
+            1u;
+#endif
+    };
+
+    // Number of std::uint16_t elements processed in parallel in a vector register, selected by target macros.
+    template<>
+    struct GetVectorizationSizeElems<std::uint16_t>
+    {
+        static constexpr std::size_t value =
+#if defined(__AVX512BW__)
+            // addition (AVX512BW): vpaddusw / _mm512_mask_adds_epu16
+            // subtraction (AVX512BW): vpsubusw / _mm512_subs_epu16
+            // multiplication: ?
+            32u;
+#elif defined(__AVX2__)
+            // addition (AVX2): vpaddusw / _mm256_adds_epu16
+            // subtraction (AVX2): vpsubusw / _mm256_subs_epu16
+            // multiplication: ?
+            16u;
+#elif defined(__SSE2__)
+            // addition (SSE2): paddusw / _mm_adds_epu16
+            // subtraction (SSE2): psubusw / _mm_subs_epu16
+            // multiplication: ?
+            8u;
+#elif defined(__ARM_NEON__)
+            8u;
+#elif defined(__ALTIVEC__)
+            8u;
+#elif defined(__CUDA_ARCH__)
+            // addition: __vadd2
+            // subtraction: __vsub2
+            // multiplication: -
+            2u;
+#else
+            1u;
+#endif
+    };
+
+    // Number of std::int32_t elements processed in parallel in a vector register, selected by target macros.
+    template<>
+    struct GetVectorizationSizeElems<std::int32_t>
+    {
+        static constexpr std::size_t value =
+#if defined(__AVX512F__) || defined(__MIC__)
+            // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
+            // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
+            // multiplication (AVX512F,KNC): vpmulld / _mm512_mask_mullo_epi32
+            16u;
+#elif defined(__AVX2__)
+            // addition (AVX2): vpaddd / _mm256_add_epi32
+            // subtraction (AVX2): vpsubd / _mm256_sub_epi32
+            // multiplication (AVX2): vpmulld / _mm256_mullo_epi32
+            8u;
+#elif defined(__SSE2__)
+            // addition (SSE2): paddd / _mm_add_epi32
+            // subtraction (SSE2): psubd / _mm_sub_epi32
+            // multiplication (SSE4.1): pmulld / _mm_mullo_epi32
+            4u;
+#elif defined(__ARM_NEON__)
+            4u;
+#elif defined(__ALTIVEC__)
+            4u;
+#else
+            1u;
+#endif
+    };
+
+    // Number of std::uint32_t elements processed in parallel in a vector register, selected by target macros.
+    template<>
+    struct GetVectorizationSizeElems<std::uint32_t>
+    {
+        static constexpr std::size_t value =
+#if defined(__AVX512F__) || defined(__MIC__)
+            // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
+            // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
+            // multiplication: ?
+            16u;
+#elif defined(__AVX2__)
+            // addition (AVX2): vpaddd / _mm256_add_epi32
+            // subtraction (AVX2): vpsubd / _mm256_sub_epi32
+            // multiplication: ?
+            8u;
+#elif defined(__SSE2__)
+            // addition (SSE2): paddd / _mm_add_epi32
+            // subtraction (SSE2): psubd / _mm_sub_epi32
+            // multiplication: ?
+            4u;
+#elif defined(__ARM_NEON__)
+            4u;
+#elif defined(__ALTIVEC__)
+            4u;
+#else
+            1u;
+#endif
+    };
+
+    // Number of std::int64_t elements processed in parallel in a vector register, selected by target macros.
+    template<>
+    struct GetVectorizationSizeElems<std::int64_t>
+    {
+        static constexpr std::size_t value =
+#if defined(__AVX512F__)
+            // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
+            // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
+            // multiplication (AVX512DQ): vpmullq / _mm512_mask_mullo_epi64
+            8u;
+#elif defined(__AVX2__)
+            // addition (AVX2): vpaddq / _mm256_add_epi64
+            // subtraction (AVX2): vpsubq / _mm256_sub_epi64
+            // multiplication: -
+            4u;
+#elif defined(__SSE2__)
+            // addition (SSE2): paddq / _mm_add_epi64
+            // subtraction (SSE2): psubq / _mm_sub_epi64
+            // multiplication: -
+            2u;
+#elif defined(__ARM_NEON__)
+            2u;
+#else
+            1u;
+#endif
+    };
+
+    // Number of std::uint64_t elements processed in parallel in a vector register, selected by target macros.
+    template<>
+    struct GetVectorizationSizeElems<std::uint64_t>
+    {
+        static constexpr std::size_t value =
+#if defined(__AVX512F__)
+            // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
+            // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
+            // multiplication: ?
+            8u;
+#elif defined(__AVX2__)
+            // addition (AVX2): vpaddq / _mm256_add_epi64
+            // subtraction (AVX2): vpsubq / _mm256_sub_epi64
+            // multiplication: ?
+            4u;
+#elif defined(__SSE2__)
+            // addition (SSE2): paddq / _mm_add_epi64
+            // subtraction (SSE2): psubq / _mm_sub_epi64
+            // multiplication: ?
+            2u;
+#elif defined(__ARM_NEON__)
+            2u;
+#else
+            1u;
+#endif
+    };
+} // namespace alpaka::core::vectorization
diff --git a/include/alpaka/dev/DevCpu.hpp b/include/alpaka/dev/DevCpu.hpp
new file mode 100644
index 0000000..e36c263
--- /dev/null
+++ b/include/alpaka/dev/DevCpu.hpp
@@ -0,0 +1,207 @@
+/* Copyright 2024 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber,
+ *                Antonio Di Pilato, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dev/common/QueueRegistry.hpp"
+#include "alpaka/dev/cpu/SysInfo.hpp"
+#include "alpaka/mem/buf/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+#include "alpaka/queue/Properties.hpp"
+#include "alpaka/queue/QueueGenericThreadsBlocking.hpp"
+#include "alpaka/queue/QueueGenericThreadsNonBlocking.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/queue/cpu/IGenericThreadsQueue.hpp"
+#include "alpaka/traits/Traits.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+namespace alpaka
+{
+    class DevCpu;
+
+    namespace cpu
+    {
+        using ICpuQueue = IGenericThreadsQueue<DevCpu>;
+    } // namespace cpu
+
+    namespace trait
+    {
+        template<typename TPlatform, typename TSfinae>
+        struct GetDevByIdx;
+    } // namespace trait
+    struct PlatformCpu;
+
+    //! Implementation details of the CPU device.
+    namespace cpu::detail
+    {
+        //! The CPU device implementation: a registry of the queues registered on the device.
+        using DevCpuImpl = alpaka::detail::QueueRegistry<cpu::ICpuQueue>;
+    } // namespace cpu::detail
+
+    //! The CPU device handle. Copies share one queue registry and all handles compare equal.
+    class DevCpu
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevCpu>
+        , public concepts::Implements<ConceptDev, DevCpu>
+    {
+        friend struct trait::GetDevByIdx<PlatformCpu>;
+
+    protected:
+        DevCpu() : m_spDevCpuImpl(std::make_shared<cpu::detail::DevCpuImpl>())
+        {
+        }
+
+    public:
+        auto operator==(DevCpu const&) const -> bool
+        {
+            return true;
+        }
+
+        auto operator!=(DevCpu const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+
+        [[nodiscard]] ALPAKA_FN_HOST auto getAllQueues() const -> std::vector<std::shared_ptr<cpu::ICpuQueue>>
+        {
+            return m_spDevCpuImpl->getAllExistingQueues();
+        }
+
+        //! Registers the given queue in the registry shared by all copies of this handle.
+        //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
+        ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<cpu::ICpuQueue> spQueue) const -> void
+        {
+            m_spDevCpuImpl->registerQueue(spQueue);
+        }
+
+        [[nodiscard]] auto getNativeHandle() const noexcept
+        {
+            return 0;
+        }
+
+    private:
+        std::shared_ptr<cpu::detail::DevCpuImpl> m_spDevCpuImpl;
+    };
+
+    namespace trait
+    {
+        //! The CPU device name get trait specialization: forwards to cpu::detail::getCpuName().
+        template<>
+        struct GetName<DevCpu>
+        {
+            ALPAKA_FN_HOST static auto getName(DevCpu const& /* dev */) -> std::string
+            {
+                return cpu::detail::getCpuName();
+            }
+        };
+
+        //! The CPU device total memory get trait specialization (value of getTotalGlobalMemSizeBytes()).
+        template<>
+        struct GetMemBytes<DevCpu>
+        {
+            ALPAKA_FN_HOST static auto getMemBytes(DevCpu const& /* dev */) -> std::size_t
+            {
+                return cpu::detail::getTotalGlobalMemSizeBytes();
+            }
+        };
+
+        //! The CPU device free memory get trait specialization (value of getFreeGlobalMemSizeBytes()).
+        template<>
+        struct GetFreeMemBytes<DevCpu>
+        {
+            ALPAKA_FN_HOST static auto getFreeMemBytes(DevCpu const& /* dev */) -> std::size_t
+            {
+                return cpu::detail::getFreeGlobalMemSizeBytes();
+            }
+        };
+
+        //! The CPU device warp size get trait specialization: the only reported warp size is 1.
+        template<>
+        struct GetWarpSizes<DevCpu>
+        {
+            ALPAKA_FN_HOST static auto getWarpSizes(DevCpu const& /* dev */) -> std::vector<std::size_t>
+            {
+                return {1u};
+            }
+        };
+
+        //! The CPU device preferred warp size get trait specialization: always 1, the device argument is unused.
+        template<>
+        struct GetPreferredWarpSize<DevCpu>
+        {
+            ALPAKA_FN_HOST static constexpr auto getPreferredWarpSize(DevCpu const& /* dev */) -> std::size_t
+            {
+                return 1u;
+            }
+        };
+
+        //! The CPU device reset trait specialization: nothing is reset, only the debug log scope is entered.
+        template<>
+        struct Reset<DevCpu>
+        {
+            ALPAKA_FN_HOST static auto reset(DevCpu const& /* dev */) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                // The CPU does nothing on reset.
+            }
+        };
+
+        //! The CPU device native handle trait specialization: forwards to DevCpu::getNativeHandle().
+        template<>
+        struct NativeHandle<DevCpu>
+        {
+            [[nodiscard]] static auto getNativeHandle(DevCpu const& dev)
+            {
+                return dev.getNativeHandle();
+            }
+        };
+    } // namespace trait
+
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufCpu;
+
+    namespace trait
+    {
+        //! The CPU device memory buffer type trait specialization.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct BufType<DevCpu, TElem, TDim, TIdx>
+        {
+            using type = BufCpu<TElem, TDim, TIdx>;
+        };
+
+        //! The CPU device platform type trait specialization.
+        template<>
+        struct PlatformType<DevCpu>
+        {
+            using type = PlatformCpu;
+        };
+    } // namespace trait
+
+    using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking<DevCpu>;
+    using QueueCpuBlocking = QueueGenericThreadsBlocking<DevCpu>;
+
+    namespace trait
+    {
+        template<>
+        struct QueueType<DevCpu, Blocking>
+        {
+            using type = QueueCpuBlocking;
+        };
+
+        template<>
+        struct QueueType<DevCpu, NonBlocking>
+        {
+            using type = QueueCpuNonBlocking;
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/dev/DevCpuSycl.hpp b/include/alpaka/dev/DevCpuSycl.hpp
new file mode 100644
index 0000000..bc88ce9
--- /dev/null
+++ b/include/alpaka/dev/DevCpuSycl.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
+
+namespace alpaka
+{
+    using DevCpuSycl = DevGenericSycl<TagCpuSycl>;
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/dev/DevCudaRt.hpp b/include/alpaka/dev/DevCudaRt.hpp
new file mode 100644
index 0000000..92dcba3
--- /dev/null
+++ b/include/alpaka/dev/DevCudaRt.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiCudaRt.hpp"
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+namespace alpaka
+{
+    //! The CUDA RT device handle.
+    using DevCudaRt = DevUniformCudaHipRt<ApiCudaRt>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/dev/DevFpgaSyclIntel.hpp b/include/alpaka/dev/DevFpgaSyclIntel.hpp
new file mode 100644
index 0000000..c0c66ef
--- /dev/null
+++ b/include/alpaka/dev/DevFpgaSyclIntel.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
+
+namespace alpaka
+{
+    using DevFpgaSyclIntel = DevGenericSycl<TagFpgaSyclIntel>;
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp
new file mode 100644
index 0000000..efbcad9
--- /dev/null
+++ b/include/alpaka/dev/DevGenericSycl.hpp
@@ -0,0 +1,282 @@
+/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Sycl.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/mem/buf/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+#include "alpaka/queue/Properties.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/queue/sycl/QueueGenericSyclBase.hpp"
+#include "alpaka/traits/Traits.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <shared_mutex>
+#include <string>
+#include <utility>
+#include <vector>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    namespace trait
+    {
+        template<typename TPlatform, typename TSfinae>
+        struct GetDevByIdx;
+    } // namespace trait
+
+    template<typename TTag>
+    using QueueGenericSyclBlocking = detail::QueueGenericSyclBase<TTag, true>;
+
+    template<typename TTag>
+    using QueueGenericSyclNonBlocking = detail::QueueGenericSyclBase<TTag, false>;
+
+    template<typename TTag>
+    struct PlatformGenericSycl;
+
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    class BufGenericSycl;
+
+    namespace detail
+    {
+        class DevGenericSyclImpl
+        {
+        public:
+            DevGenericSyclImpl(sycl::device device, sycl::context context)
+                : m_device{std::move(device)}
+                , m_context{std::move(context)}
+            {
+            }
+
+            // Drops the entries of queues that no longer exist. Don't call this without locking first!
+            auto clean_queues() -> void
+            {
+                // The predicate takes the weak_ptr by reference; by value it would be copied for every element.
+                auto const is_dead = [](auto const& q_ptr) { return q_ptr.expired(); };
+                auto const old_end = std::end(m_queues);
+                m_queues.erase(std::remove_if(std::begin(m_queues), old_end, is_dead), old_end);
+            }
+
+            // Starts tracking the queue through a weak reference, after pruning the entries of dead queues.
+            auto register_queue(std::shared_ptr<QueueGenericSyclImpl> const& queue) -> void
+            {
+                std::lock_guard<std::shared_mutex> lock{m_mutex};
+
+                clean_queues();
+                m_queues.emplace_back(queue);
+            }
+
+            auto register_dependency(sycl::event event) -> void
+            {
+                std::shared_lock<std::shared_mutex> lock{m_mutex};
+
+                for(auto const& q_ptr : m_queues)
+                {
+                    if(auto ptr = q_ptr.lock(); ptr != nullptr)
+                        ptr->register_dependency(event);
+                }
+            }
+
+            auto wait() -> void
+            {
+                std::shared_lock<std::shared_mutex> lock{m_mutex};
+
+                for(auto const& q_ptr : m_queues)
+                {
+                    if(auto ptr = q_ptr.lock(); ptr != nullptr)
+                        ptr->wait();
+                }
+            }
+
+            auto get_device() const -> sycl::device
+            {
+                return m_device;
+            }
+
+            auto get_context() const -> sycl::context
+            {
+                return m_context;
+            }
+
+        private:
+            sycl::device m_device;
+            sycl::context m_context;
+            std::vector<std::weak_ptr<QueueGenericSyclImpl>> m_queues;
+            std::shared_mutex mutable m_mutex;
+        };
+    } // namespace detail
+
+    //! The SYCL device handle. Copies share one implementation object; handles are equal only if they share it.
+    template<typename TTag>
+    class DevGenericSycl
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevGenericSycl<TTag>>
+        , public concepts::Implements<ConceptDev, DevGenericSycl<TTag>>
+    {
+        friend struct trait::GetDevByIdx<PlatformGenericSycl<TTag>>;
+
+    public:
+        DevGenericSycl(sycl::device device, sycl::context context)
+            : m_impl{std::make_shared<detail::DevGenericSyclImpl>(std::move(device), std::move(context))}
+        {
+        }
+
+        friend auto operator==(DevGenericSycl const& lhs, DevGenericSycl const& rhs) -> bool
+        {
+            return (lhs.m_impl == rhs.m_impl);
+        }
+
+        friend auto operator!=(DevGenericSycl const& lhs, DevGenericSycl const& rhs) -> bool
+        {
+            return !(lhs == rhs);
+        }
+
+        [[nodiscard]] auto getNativeHandle() const -> std::pair<sycl::device, sycl::context>
+        {
+            return std::make_pair(m_impl->get_device(), m_impl->get_context());
+        }
+
+        std::shared_ptr<detail::DevGenericSyclImpl> m_impl;
+    };
+
+    namespace trait
+    {
+        //! The SYCL device name get trait specialization: the sycl::info::device::name of the native device.
+        template<typename TTag>
+        struct GetName<DevGenericSycl<TTag>>
+        {
+            static auto getName(DevGenericSycl<TTag> const& dev) -> std::string
+            {
+                auto const device = dev.getNativeHandle().first;
+                return device.template get_info<sycl::info::device::name>();
+            }
+        };
+
+        //! The SYCL device total memory get trait specialization (the device's global_mem_size).
+        template<typename TTag>
+        struct GetMemBytes<DevGenericSycl<TTag>>
+        {
+            static auto getMemBytes(DevGenericSycl<TTag> const& dev) -> std::size_t
+            {
+                auto const device = dev.getNativeHandle().first;
+                return device.template get_info<sycl::info::device::global_mem_size>();
+            }
+        };
+
+        //! The SYCL device free memory get trait specialization: unsupported, the static_assert rejects any use.
+        template<typename TTag>
+        struct GetFreeMemBytes<DevGenericSycl<TTag>>
+        {
+            static auto getFreeMemBytes(DevGenericSycl<TTag> const& /* dev */) -> std::size_t
+            {
+                static_assert(
+                    !sizeof(PlatformGenericSycl<TTag>),
+                    "Querying free device memory not supported for SYCL devices.");
+                return std::size_t{};
+            }
+        };
+
+        //! The SYCL device warp size get trait specialization: the device's sub-group sizes, largest first.
+        template<typename TTag>
+        struct GetWarpSizes<DevGenericSycl<TTag>>
+        {
+            static auto getWarpSizes(DevGenericSycl<TTag> const& dev) -> std::vector<std::size_t>
+            {
+                auto const syclDevice = dev.getNativeHandle().first;
+                std::vector<std::size_t> sizes = syclDevice.template get_info<sycl::info::device::sub_group_sizes>();
+                // The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently does not
+                // (only the first occurrence of 64 is removed).
+                if(auto const pos = std::find(std::begin(sizes), std::end(sizes), 64); pos != std::end(sizes))
+                    sizes.erase(pos);
+                // Order the remaining sizes from largest to smallest
+                std::sort(std::begin(sizes), std::end(sizes), std::greater<>{});
+                return sizes;
+            }
+        };
+
+        //! The SYCL device preferred warp size: first entry of getWarpSizes(); front() assumes the list is not empty.
+        template<typename TTag>
+        struct GetPreferredWarpSize<DevGenericSycl<TTag>>
+        {
+            static auto getPreferredWarpSize(DevGenericSycl<TTag> const& dev) -> std::size_t
+            {
+                return GetWarpSizes<DevGenericSycl<TTag>>::getWarpSizes(dev).front();
+            }
+        };
+
+        //! The SYCL device reset trait specialization: unsupported, the static_assert rejects any use.
+        template<typename TTag>
+        struct Reset<DevGenericSycl<TTag>>
+        {
+            static auto reset(DevGenericSycl<TTag> const&) -> void
+            {
+                static_assert(
+                    !sizeof(PlatformGenericSycl<TTag>),
+                    "Explicit device reset not supported for SYCL devices");
+            }
+        };
+
+        //! The SYCL device native handle trait specialization: yields the (sycl::device, sycl::context) pair.
+        template<typename TTag>
+        struct NativeHandle<DevGenericSycl<TTag>>
+        {
+            [[nodiscard]] static auto getNativeHandle(DevGenericSycl<TTag> const& dev)
+            {
+                return dev.getNativeHandle();
+            }
+        };
+
+        //! The SYCL device memory buffer type trait specialization.
+        template<typename TElem, typename TDim, typename TIdx, typename TTag>
+        struct BufType<DevGenericSycl<TTag>, TElem, TDim, TIdx>
+        {
+            using type = BufGenericSycl<TElem, TDim, TIdx, TTag>;
+        };
+
+        //! The SYCL device platform type trait specialization.
+        template<typename TTag>
+        struct PlatformType<DevGenericSycl<TTag>>
+        {
+            using type = PlatformGenericSycl<TTag>;
+        };
+
+        //! The thread SYCL device wait specialization: forwards to the wait() of the shared device implementation.
+        template<typename TTag>
+        struct CurrentThreadWaitFor<DevGenericSycl<TTag>>
+        {
+            static auto currentThreadWaitFor(DevGenericSycl<TTag> const& dev) -> void
+            {
+                dev.m_impl->wait();
+            }
+        };
+
+        //! The SYCL blocking queue trait specialization.
+        template<typename TTag>
+        struct QueueType<DevGenericSycl<TTag>, Blocking>
+        {
+            using type = QueueGenericSyclBlocking<TTag>;
+        };
+
+        //! The SYCL non-blocking queue trait specialization.
+        template<typename TTag>
+        struct QueueType<DevGenericSycl<TTag>, NonBlocking>
+        {
+            using type = QueueGenericSyclNonBlocking<TTag>;
+        };
+
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/dev/DevGpuSyclIntel.hpp b/include/alpaka/dev/DevGpuSyclIntel.hpp
new file mode 100644
index 0000000..2850126
--- /dev/null
+++ b/include/alpaka/dev/DevGpuSyclIntel.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
+
+namespace alpaka
+{
+    using DevGpuSyclIntel = DevGenericSycl<TagGpuSyclIntel>;
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/dev/DevHipRt.hpp b/include/alpaka/dev/DevHipRt.hpp
new file mode 100644
index 0000000..819c2f5
--- /dev/null
+++ b/include/alpaka/dev/DevHipRt.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiHipRt.hpp"
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+namespace alpaka
+{
+    //! The HIP RT device handle.
+    using DevHipRt = DevUniformCudaHipRt<ApiHipRt>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/dev/DevUniformCudaHipRt.hpp b/include/alpaka/dev/DevUniformCudaHipRt.hpp
new file mode 100644
index 0000000..876d8ca
--- /dev/null
+++ b/include/alpaka/dev/DevUniformCudaHipRt.hpp
@@ -0,0 +1,269 @@
+/* Copyright 2024 Benjamin Worpitz, Jakob Krude, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
+ *                Antonio Di Pilato, Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiCudaRt.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dev/common/QueueRegistry.hpp"
+#include "alpaka/mem/buf/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+#include "alpaka/queue/Properties.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
+#include "alpaka/traits/Traits.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    namespace trait
+    {
+        template<typename TPlatform, typename TSfinae>
+        struct GetDevByIdx;
+    } // namespace trait
+
+    namespace uniform_cuda_hip::detail
+    {
+        template<typename TApi, bool TBlocking>
+        class QueueUniformCudaHipRt;
+    } // namespace uniform_cuda_hip::detail
+
+    template<typename TApi>
+    using QueueUniformCudaHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, true>;
+
+    template<typename TApi>
+    using QueueUniformCudaHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, false>;
+
+    template<typename TApi>
+    struct PlatformUniformCudaHipRt;
+
+    template<typename TApi, typename TElem, typename TDim, typename TIdx>
+    struct BufUniformCudaHipRt;
+
+    //! The CUDA/HIP RT device handle. Handles compare equal when they hold the same device index.
+    template<typename TApi>
+    class DevUniformCudaHipRt
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevUniformCudaHipRt<TApi>>
+        , public concepts::Implements<ConceptDev, DevUniformCudaHipRt<TApi>>
+    {
+        friend struct trait::GetDevByIdx<PlatformUniformCudaHipRt<TApi>>;
+
+        using IDeviceQueue = uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl<TApi>;
+
+    protected:
+        DevUniformCudaHipRt() : m_QueueRegistry{std::make_shared<alpaka::detail::QueueRegistry<IDeviceQueue>>()}
+        {
+        }
+
+    public:
+        ALPAKA_FN_HOST auto operator==(DevUniformCudaHipRt const& rhs) const -> bool
+        {
+            return m_iDevice == rhs.m_iDevice;
+        }
+
+        ALPAKA_FN_HOST auto operator!=(DevUniformCudaHipRt const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+
+        [[nodiscard]] auto getNativeHandle() const noexcept -> int
+        {
+            return m_iDevice;
+        }
+
+        [[nodiscard]] ALPAKA_FN_HOST auto getAllQueues() const -> std::vector<std::shared_ptr<IDeviceQueue>>
+        {
+            return m_QueueRegistry->getAllExistingQueues();
+        }
+
+        //! Registers the given queue on this device.
+        //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
+        ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<IDeviceQueue> spQueue) const -> void
+        {
+            m_QueueRegistry->registerQueue(spQueue);
+        }
+
+    private:
+        DevUniformCudaHipRt(int iDevice)
+            : m_iDevice(iDevice)
+            , m_QueueRegistry(std::make_shared<alpaka::detail::QueueRegistry<IDeviceQueue>>())
+        {
+        }
+
+        // Device index; initialized here because the default constructor does not set it (was indeterminate).
+        int m_iDevice{0};
+        std::shared_ptr<alpaka::detail::QueueRegistry<IDeviceQueue>> m_QueueRegistry;
+    };
+
+    namespace trait
+    {
+        //! The CUDA/HIP RT device name get trait specialization: the name field of the device properties.
+        template<typename TApi>
+        struct GetName<DevUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto getName(DevUniformCudaHipRt<TApi> const& dev) -> std::string
+            {
+                // There is cuda/hip-DeviceGetAttribute as faster alternative to cuda/hip-GetDeviceProperties to get a
+                // single device property but it has no option to get the name
+                typename TApi::DeviceProp_t devProp;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, dev.getNativeHandle()));
+
+                return std::string(devProp.name);
+            }
+        };
+
+        //! The CUDA/HIP RT device total memory get trait specialization.
+        template<typename TApi>
+        struct GetMemBytes<DevUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto getMemBytes(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
+            {
+                // Make the device to query current: memGetInfo is called without a device argument.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
+
+                std::size_t freeInternal(0u);
+                std::size_t totalInternal(0u);
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memGetInfo(&freeInternal, &totalInternal));
+
+                return totalInternal;
+            }
+        };
+
+        //! The CUDA/HIP RT device free memory get trait specialization.
+        template<typename TApi>
+        struct GetFreeMemBytes<DevUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto getFreeMemBytes(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
+            {
+                // Make the device to query current: memGetInfo is called without a device argument.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
+
+                std::size_t freeInternal(0u);
+                std::size_t totalInternal(0u);
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memGetInfo(&freeInternal, &totalInternal));
+
+                return freeInternal;
+            }
+        };
+
+        //! The CUDA/HIP RT device warp size get trait specialization: one entry, the preferred warp size.
+        template<typename TApi>
+        struct GetWarpSizes<DevUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto getWarpSizes(DevUniformCudaHipRt<TApi> const& dev) -> std::vector<std::size_t>
+            {
+                return {GetPreferredWarpSize<DevUniformCudaHipRt<TApi>>::getPreferredWarpSize(dev)};
+            }
+        };
+
+        //! The CUDA/HIP RT preferred device warp size get trait specialization: queries the warp size attribute.
+        template<typename TApi>
+        struct GetPreferredWarpSize<DevUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto getPreferredWarpSize(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
+            {
+                int warpSize = 0;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    TApi::deviceGetAttribute(&warpSize, TApi::deviceAttributeWarpSize, dev.getNativeHandle()));
+                return static_cast<std::size_t>(warpSize);
+            }
+        };
+
+#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+        //! The CUDA RT preferred device warp size get trait specialization: a constant, no runtime query is made.
+        template<>
+        struct GetPreferredWarpSize<DevUniformCudaHipRt<ApiCudaRt>>
+        {
+            ALPAKA_FN_HOST static constexpr auto getPreferredWarpSize(DevUniformCudaHipRt<ApiCudaRt> const& /* dev */)
+                -> std::size_t
+            {
+                // All CUDA GPUs to date have a warp size of 32 threads.
+                return 32u;
+            }
+        };
+#    endif // ALPAKA_ACC_GPU_CUDA_ENABLED
+
+        //! The CUDA/HIP RT device reset trait specialization.
+        template<typename TApi>
+        struct Reset<DevUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto reset(DevUniformCudaHipRt<TApi> const& dev) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // Make the device to reset current: deviceReset is called without a device argument.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceReset());
+            }
+        };
+
+        //! The CUDA/HIP RT device native handle trait specialization.
+        template<typename TApi>
+        struct NativeHandle<DevUniformCudaHipRt<TApi>>
+        {
+            [[nodiscard]] static auto getNativeHandle(DevUniformCudaHipRt<TApi> const& dev)
+            {
+                return dev.getNativeHandle();
+            }
+        };
+
+        //! The CUDA/HIP RT device memory buffer type trait specialization.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct BufType<DevUniformCudaHipRt<TApi>, TElem, TDim, TIdx>
+        {
+            using type = BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>;
+        };
+
+        //! The CUDA/HIP RT device platform type trait specialization.
+        template<typename TApi>
+        struct PlatformType<DevUniformCudaHipRt<TApi>>
+        {
+            using type = PlatformUniformCudaHipRt<TApi>;
+        };
+
+        //! The thread CUDA/HIP device wait specialization.
+        //!
+        //! Blocks until the device has completed all preceding requested tasks.
+        //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
+        template<typename TApi>
+        struct CurrentThreadWaitFor<DevUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(DevUniformCudaHipRt<TApi> const& dev) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // Set the current device to wait for.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceSynchronize());
+            }
+        };
+
+        template<typename TApi>
+        struct QueueType<DevUniformCudaHipRt<TApi>, Blocking>
+        {
+            using type = QueueUniformCudaHipRtBlocking<TApi>;
+        };
+
+        template<typename TApi>
+        struct QueueType<DevUniformCudaHipRt<TApi>, NonBlocking>
+        {
+            using type = QueueUniformCudaHipRtNonBlocking<TApi>;
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/dev/Traits.hpp b/include/alpaka/dev/Traits.hpp
new file mode 100644
index 0000000..a3954f2
--- /dev/null
+++ b/include/alpaka/dev/Traits.hpp
@@ -0,0 +1,140 @@
+/* Copyright 2024 Benjamin Worpitz, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <string>
+#include <vector>
+
+namespace alpaka
+{
+    //! The device traits.
+    namespace trait
+    {
+        //! The device type trait.
+        template<typename T, typename TSfinae = void>
+        struct DevType;
+
+        //! The device get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetDev;
+
+        //! The device name get trait.
+        template<typename TDev, typename TSfinae = void>
+        struct GetName;
+
+        //! The device memory size get trait.
+        template<typename TDev, typename TSfinae = void>
+        struct GetMemBytes;
+
+        //! The device free memory size get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetFreeMemBytes;
+
+        //! The device warp size get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetWarpSizes;
+
+        //! The device preferred warp size get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetPreferredWarpSize;
+
+        //! The device reset trait.
+        template<typename T, typename TSfinae = void>
+        struct Reset;
+    } // namespace trait
+
+    //! The device type trait alias template to remove the ::type.
+    template<typename T>
+    using Dev = typename trait::DevType<T>::type;
+
+    struct ConceptGetDev;
+
+    struct ConceptDev;
+
+    //! True if TDev is a device, i.e. if it implements the ConceptDev concept.
+    template<typename TDev>
+    inline constexpr bool isDevice = concepts::ImplementsConcept<ConceptDev, std::decay_t<TDev>>::value;
+
+    //! \return The device this object is bound to.
+    template<typename T>
+    ALPAKA_FN_HOST auto getDev(T const& t)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptGetDev, T>;
+        return trait::GetDev<ImplementationBase>::getDev(t);
+    }
+
+    namespace detail
+    {
+        inline auto trim(std::string s) -> std::string
+        {
+            auto const pred = [](unsigned char c) { return !std::isspace(c); }; // a negative char is UB for isspace
+            s.erase(std::find_if(rbegin(s), rend(s), pred).base(), end(s));
+            s.erase(begin(s), std::find_if(begin(s), end(s), pred));
+            return s;
+        }
+    } // namespace detail
+
+    //! \return The device name with leading/trailing space characters trimmed off.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto getName(TDev const& dev) -> std::string
+    {
+        return detail::trim(trait::GetName<TDev>::getName(dev));
+    }
+
+    //! \return The memory on the device in Bytes. Returns 0 if querying memory
+    //!  is not supported.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto getMemBytes(TDev const& dev) -> std::size_t
+    {
+        return trait::GetMemBytes<TDev>::getMemBytes(dev);
+    }
+
+    //! \return The free memory on the device in Bytes.
+    //!
+    //! \note Do not use this query if getMemBytes returned 0.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto getFreeMemBytes(TDev const& dev) -> std::size_t
+    {
+        return trait::GetFreeMemBytes<TDev>::getFreeMemBytes(dev);
+    }
+
+    //! \return The supported warp sizes on the device in number of threads.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto getWarpSizes(TDev const& dev) -> std::vector<std::size_t>
+    {
+        return trait::GetWarpSizes<TDev>::getWarpSizes(dev);
+    }
+
+    //! \return The preferred warp size on the device in number of threads.
+    template<typename TDev>
+    ALPAKA_FN_HOST constexpr auto getPreferredWarpSize(TDev const& dev) -> std::size_t
+    {
+        return trait::GetPreferredWarpSize<TDev>::getPreferredWarpSize(dev);
+    }
+
+    //! Resets the device.
+    //! What this method does is dependent on the accelerator.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto reset(TDev const& dev) -> void
+    {
+        trait::Reset<TDev>::reset(dev);
+    }
+
+    namespace trait
+    {
+        //! Get device type
+        template<typename TDev>
+        struct DevType<TDev, std::enable_if_t<concepts::ImplementsConcept<ConceptDev, TDev>::value>>
+        {
+            using type = typename concepts::ImplementationBase<ConceptDev, TDev>;
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/dev/common/QueueRegistry.hpp b/include/alpaka/dev/common/QueueRegistry.hpp
new file mode 100644
index 0000000..62055fc
--- /dev/null
+++ b/include/alpaka/dev/common/QueueRegistry.hpp
@@ -0,0 +1,59 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jeffrey Kelling
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+#include <deque>
+#include <functional>
+#include <memory>
+#include <mutex>
+
+namespace alpaka::detail
+{
+    //! The CPU/GPU device queue registry implementation.
+    //! Queues are tracked through weak references; every access is serialized by an internal mutex.
+    //! @tparam TQueue queue implementation
+    template<typename TQueue>
+    struct QueueRegistry
+    {
+        //! \return Owning pointers to all registered queues that are still alive.
+        //! Entries whose queue has already been destroyed are removed from the registry along the way.
+        ALPAKA_FN_HOST auto getAllExistingQueues() const -> std::vector<std::shared_ptr<TQueue>>
+        {
+            std::vector<std::shared_ptr<TQueue>> aliveQueues;
+
+            std::lock_guard<std::mutex> guard(m_Mutex);
+            aliveQueues.reserve(m_queues.size());
+
+            auto pos = m_queues.begin();
+            while(pos != m_queues.end())
+            {
+                if(auto alive = pos->lock())
+                {
+                    aliveQueues.emplace_back(std::move(alive));
+                    ++pos;
+                    continue;
+                }
+                // The queue is gone: forget it. erase() returns the position following the removed entry.
+                pos = m_queues.erase(pos);
+            }
+            return aliveQueues;
+        }
+
+        //! Registers the given queue on this device.
+        //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
+        ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<TQueue> const& spQueue) const -> void
+        {
+            std::lock_guard<std::mutex> guard(m_Mutex);
+            // Only a weak reference is kept, so the registry itself does not keep the queue alive.
+            m_queues.emplace_back(spQueue);
+        }
+
+    private:
+        std::mutex mutable m_Mutex; //!< Guards m_queues.
+        std::deque<std::weak_ptr<TQueue>> mutable m_queues; //!< Weak references to the registered queues.
+    };
+} // namespace alpaka::detail
diff --git a/include/alpaka/dev/cpu/SysInfo.hpp b/include/alpaka/dev/cpu/SysInfo.hpp
new file mode 100644
index 0000000..1dc989f
--- /dev/null
+++ b/include/alpaka/dev/cpu/SysInfo.hpp
@@ -0,0 +1,237 @@
+/* Copyright 2022 Benjamin Worpitz, Daniel Vollmer, Erik Zenker, René Widera, Bernhard Manfred Gruber, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+#if BOOST_OS_WINDOWS || BOOST_OS_CYGWIN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    ifndef WIN32_LEAN_AND_MEAN
+#        define WIN32_LEAN_AND_MEAN
+#    endif
+// We could use some more macros to reduce the number of sub-headers included, but this would restrict user code.
+#    include <windows.h>
+#elif BOOST_OS_UNIX || BOOST_OS_MACOS
+#    include <sys/param.h>
+#    include <sys/types.h>
+#    include <unistd.h>
+
+#    include <cstdint>
+#    if BOOST_OS_BSD || BOOST_OS_MACOS
+#        include <sys/sysctl.h>
+#    endif
+#endif
+
+#if BOOST_OS_LINUX
+#    include <fstream>
+#endif
+
+#include <cstring>
+#include <stdexcept>
+#include <string>
+
+#if BOOST_ARCH_X86
+#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_PGI
+#        include <cpuid.h>
+#    elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#        include <intrin.h>
+#    endif
+#endif
+
+namespace alpaka::cpu::detail
+{
+    constexpr int NO_CPUID = 0; // Filler stored in ex[0], ex[2] and ex[3] by the cpuid() stand-ins below.
+    constexpr int UNKNOWN_CPU = 0; // ex[1] marker written by the non-x86 stand-in.
+    constexpr int UNKNOWN_COMPILER = 1; // ex[1] marker written by the x86 stand-in for unsupported compilers.
+#if BOOST_ARCH_X86
+#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_PGI
+    inline auto cpuid(std::uint32_t level, std::uint32_t subfunction, std::uint32_t ex[4]) -> void
+    {
+        __cpuid_count(level, subfunction, ex[0], ex[1], ex[2], ex[3]); // provided by <cpuid.h>
+    }
+
+#    elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+    inline auto cpuid(std::uint32_t level, std::uint32_t subfunction, std::uint32_t ex[4]) -> void
+    {
+        __cpuidex(reinterpret_cast<int*>(ex), level, subfunction); // provided by <intrin.h>
+    }
+#    else // x86, but none of the compilers handled above
+    inline auto cpuid(std::uint32_t, std::uint32_t, std::uint32_t ex[4]) -> void
+    {
+        ex[0] = ex[2] = ex[3] = NO_CPUID;
+        ex[1] = UNKNOWN_COMPILER; // getCpuName() turns this into "<unknown: compiler>"
+    }
+#    endif
+#else // not an x86 architecture
+    inline auto cpuid(std::uint32_t, std::uint32_t, std::uint32_t ex[4]) -> void
+    {
+        ex[0] = ex[2] = ex[3] = NO_CPUID;
+        ex[1] = UNKNOWN_CPU; // getCpuName() turns this into "<unknown: CPU>"
+    }
+#endif
+    //! \return The name of the CPU the code is running on.
+    //! A placeholder in angle brackets is returned when cpuid() reports no extended ids.
+    inline auto getCpuName() -> std::string
+    {
+        // The first word returned for id 0x8000'0000 is used as the highest extended id to query.
+        std::uint32_t regs[4] = {0};
+        cpuid(0x8000'0000, 0, regs);
+        std::uint32_t const maxExtendedId = regs[0];
+
+        if(maxExtendedId == 0)
+        {
+            // The cpuid() stand-ins report in the second word why no information is available.
+            switch(regs[1])
+            {
+            case UNKNOWN_COMPILER:
+                return "<unknown: compiler>";
+            case UNKNOWN_CPU:
+                return "<unknown: CPU>";
+            default:
+                return "<unknown>";
+            }
+        }
+#if BOOST_ARCH_X86
+        constexpr std::uint32_t firstBrandId = 0x8000'0002;
+        constexpr std::uint32_t lastBrandId = 0x8000'0004;
+
+        // Zero-initialized and larger than the 48 bytes copied below, hence always terminated.
+        char brand[0x40] = {0};
+
+        // Query every extended id in ascending order; only the brand ids contribute to the result.
+        for(std::uint32_t id = 0x8000'0000; id <= maxExtendedId; ++id)
+        {
+            cpuid(id, 0, regs);
+
+            // Each brand id delivers sizeof(regs) == 16 consecutive characters of the brand string.
+            if(id >= firstBrandId && id <= lastBrandId)
+            {
+                std::memcpy(brand + sizeof(regs) * (id - firstBrandId), regs, sizeof(regs));
+            }
+        }
+
+        return std::string(brand);
+#else
+        return std::string("unknown");
+#endif
+    }
+
+    //! \return Pagesize in bytes used by the system.
+    inline size_t getPageSize()
+    {
+#if BOOST_OS_WINDOWS || BOOST_OS_CYGWIN
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return si.dwPageSize;
+#elif BOOST_OS_UNIX || BOOST_OS_MACOS
+#    if defined(_SC_PAGESIZE)
+        return static_cast<std::size_t>(sysconf(_SC_PAGESIZE));
+#    else
+        // this is legacy and only used as fallback
+        return = static_cast<size_t>(getpagesize());
+#    endif
+#else
+#    error "getPageSize not implemented for this system!"
+        return 0;
+#endif
+    }
+
+    //! \return The total number of bytes of global memory.
+    //! Adapted from David Robert Nadeau:
+    //! http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system
+    inline auto getTotalGlobalMemSizeBytes() -> std::size_t
+    {
+#if BOOST_OS_WINDOWS
+        MEMORYSTATUSEX status;
+        status.dwLength = sizeof(status);
+        GlobalMemoryStatusEx(&status);
+        return static_cast<std::size_t>(status.ullTotalPhys);
+
+#elif BOOST_OS_CYGWIN
+        // New 64-bit MEMORYSTATUSEX isn't available.
+        MEMORYSTATUS status;
+        status.dwLength = sizeof(status);
+        GlobalMemoryStatus(&status);
+        return static_cast<std::size_t>(status.dwTotalPhys);
+
+#elif BOOST_OS_UNIX || BOOST_OS_MACOS
+        // Unix : Prefer sysctl() over sysconf() except sysctl() with HW_REALMEM and HW_PHYSMEM which are not
+        // always reliable
+#    if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
+        int mib[2]
+            = { CTL_HW,
+#        if defined(HW_MEMSIZE) // OSX
+                HW_MEMSIZE
+#        elif defined(HW_PHYSMEM64) // NetBSD, OpenBSD.
+                HW_PHYSMEM64
+#        endif
+              };
+        std::uint64_t size(0);
+        std::size_t sizeLen{sizeof(size)};
+        if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
+            throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
+        return static_cast<std::size_t>(size);
+
+#    elif defined(_SC_AIX_REALMEM) // AIX.
+        return static_cast<std::size_t>(sysconf(_SC_AIX_REALMEM)) * static_cast<std::size_t>(1024);
+
+#    elif defined(_SC_PHYS_PAGES) // Linux, FreeBSD, OpenBSD, Solaris.
+        return static_cast<std::size_t>(sysconf(_SC_PHYS_PAGES)) * getPageSize();
+
+#    elif defined(CTL_HW)                                                                                             \
+        && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) // FreeBSD, DragonFly BSD, NetBSD, OpenBSD, and OSX.
+        int mib[2]
+            = { CTL_HW,
+#        if defined(HW_REALMEM) // FreeBSD.
+                HW_REALMEM
+#        elif defined(HW_PYSMEM) // Others.
+                HW_PHYSMEM
+#        endif
+              };
+        std::uint32_t size(0);
+        std::size_t const sizeLen{sizeof(size)};
+        if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
+            throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
+        return static_cast<std::size_t>(size);
+#    endif
+
+#else
+#    error "getTotalGlobalMemSizeBytes not implemented for this system!"
+#endif
+    }
+
+    //! \return The free number of bytes of global memory.
+    //! \throws std::logic_error if the sysctlbyname() query fails (macOS branch). Unsupported systems fail to compile.
+    inline auto getFreeGlobalMemSizeBytes() -> std::size_t
+    {
+#if BOOST_OS_WINDOWS
+        MEMORYSTATUSEX status;
+        status.dwLength = sizeof(status);
+        GlobalMemoryStatusEx(&status);
+        return static_cast<std::size_t>(status.ullAvailPhys);
+#elif BOOST_OS_LINUX
+#    if defined(_SC_AVPHYS_PAGES)
+        return static_cast<std::size_t>(sysconf(_SC_AVPHYS_PAGES)) * getPageSize();
+#    else
+        // Legacy fallback, only used when _SC_AVPHYS_PAGES is not available.
+        return static_cast<std::size_t>(get_avphys_pages()) * getPageSize();
+#    endif
+#elif BOOST_OS_MACOS
+        int free_pages = 0;
+        std::size_t len = sizeof(free_pages);
+        if(sysctlbyname("vm.page_free_count", &free_pages, &len, nullptr, 0) < 0)
+        {
+            throw std::logic_error("getFreeGlobalMemSizeBytes failed calling sysctl(vm.page_free_count)!");
+        }
+
+        return static_cast<std::size_t>(free_pages) * getPageSize(); // free page count times page size
+#else
+#    error "getFreeGlobalMemSizeBytes not implemented for this system!"
+#endif
+    }
+
+} // namespace alpaka::cpu::detail
diff --git a/include/alpaka/dev/cpu/Wait.hpp b/include/alpaka/dev/cpu/Wait.hpp
new file mode 100644
index 0000000..1983674
--- /dev/null
+++ b/include/alpaka/dev/cpu/Wait.hpp
@@ -0,0 +1,27 @@
+/* Copyright 2022 Benjamin Worpitz, Rene Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/event/EventCpu.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+namespace alpaka::trait
+{
+    //! The CPU device thread wait specialization.
+    //!
+    //! Blocks until the device has completed all preceding requested tasks.
+    //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
+    template<>
+    struct CurrentThreadWaitFor<DevCpu>
+    {
+        ALPAKA_FN_HOST static auto currentThreadWaitFor(DevCpu const& dev) -> void
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+            generic::currentThreadWaitForDevice(dev); // helper templated on the device type
+        }
+    };
+} // namespace alpaka::trait
diff --git a/include/alpaka/dim/DimArithmetic.hpp b/include/alpaka/dim/DimArithmetic.hpp
new file mode 100644
index 0000000..f0b0edc
--- /dev/null
+++ b/include/alpaka/dim/DimArithmetic.hpp
@@ -0,0 +1,19 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dim/DimIntegralConst.hpp"
+
+#include <type_traits>
+
+namespace alpaka::trait
+{
+    //! The arithmetic type dimension getter trait specialization: every arithmetic type is one-dimensional.
+    template<typename T>
+    struct DimType<T, std::enable_if_t<std::is_arithmetic_v<T>>>
+    {
+        using type = DimInt<1u>;
+    };
+} // namespace alpaka::trait
diff --git a/include/alpaka/dim/DimIntegralConst.hpp b/include/alpaka/dim/DimIntegralConst.hpp
new file mode 100644
index 0000000..69c85b5
--- /dev/null
+++ b/include/alpaka/dim/DimIntegralConst.hpp
@@ -0,0 +1,16 @@
+/* Copyright 2020 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dim/Traits.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    //! A dimension count N represented as a type (std::integral_constant over std::size_t).
+    template<std::size_t N>
+    using DimInt = std::integral_constant<std::size_t, N>;
+} // namespace alpaka
diff --git a/include/alpaka/dim/Traits.hpp b/include/alpaka/dim/Traits.hpp
new file mode 100644
index 0000000..706b0a7
--- /dev/null
+++ b/include/alpaka/dim/Traits.hpp
@@ -0,0 +1,20 @@
+/* Copyright 2020 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+namespace alpaka
+{
+    //! The dimension trait.
+    namespace trait
+    {
+        //! The dimension getter type trait. Only declared here; specializations provide the nested ::type.
+        template<typename T, typename TSfinae = void>
+        struct DimType;
+    } // namespace trait
+
+    //! The dimension type trait alias template to remove the ::type.
+    template<typename T>
+    using Dim = typename trait::DimType<T>::type;
+} // namespace alpaka
diff --git a/include/alpaka/elem/Traits.hpp b/include/alpaka/elem/Traits.hpp
new file mode 100644
index 0000000..690ce76
--- /dev/null
+++ b/include/alpaka/elem/Traits.hpp
@@ -0,0 +1,33 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <type_traits>
+
+namespace alpaka
+{
+    //! The element trait.
+    namespace trait
+    {
+        //! The element type trait. Only declared here; specializations provide the nested ::type.
+        template<typename TView, typename TSfinae = void>
+        struct ElemType;
+    } // namespace trait
+
+    //! The element type trait alias template to remove the ::type (a volatile qualifier is stripped as well).
+    template<typename TView>
+    using Elem = std::remove_volatile_t<typename trait::ElemType<TView>::type>;
+
+    // Trait specializations for fundamental types.
+    namespace trait
+    {
+        //! The fundamental type elem type trait specialization: a fundamental type is its own element type.
+        template<typename T>
+        struct ElemType<T, std::enable_if_t<std::is_fundamental_v<T>>>
+        {
+            using type = T;
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/event/EventCpu.hpp b/include/alpaka/event/EventCpu.hpp
new file mode 100644
index 0000000..d883621
--- /dev/null
+++ b/include/alpaka/event/EventCpu.hpp
@@ -0,0 +1,13 @@
+/* Copyright 2020 Jeffrey Kelling, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/event/EventGenericThreads.hpp"
+
+namespace alpaka
+{
+    using EventCpu = EventGenericThreads<DevCpu>; //!< The CPU device event.
+} // namespace alpaka
diff --git a/include/alpaka/event/EventCpuSycl.hpp b/include/alpaka/event/EventCpuSycl.hpp
new file mode 100644
index 0000000..91a9517
--- /dev/null
+++ b/include/alpaka/event/EventCpuSycl.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/event/EventGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
+
+namespace alpaka
+{
+    using EventCpuSycl = EventGenericSycl<TagCpuSycl>; //!< The SYCL device event for the TagCpuSycl back-end.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/event/EventCudaRt.hpp b/include/alpaka/event/EventCudaRt.hpp
new file mode 100644
index 0000000..4dfba7c
--- /dev/null
+++ b/include/alpaka/event/EventCudaRt.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiCudaRt.hpp"
+#include "alpaka/event/EventUniformCudaHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+namespace alpaka
+{
+    //! The CUDA RT device event (the uniform CUDA/HIP event instantiated for ApiCudaRt).
+    using EventCudaRt = EventUniformCudaHipRt<ApiCudaRt>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/event/EventFpgaSyclIntel.hpp b/include/alpaka/event/EventFpgaSyclIntel.hpp
new file mode 100644
index 0000000..3646fe7
--- /dev/null
+++ b/include/alpaka/event/EventFpgaSyclIntel.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/event/EventGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
+
+namespace alpaka
+{
+    using EventFpgaSyclIntel = EventGenericSycl<TagFpgaSyclIntel>; //!< The SYCL device event for TagFpgaSyclIntel.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/event/EventGenericSycl.hpp b/include/alpaka/event/EventGenericSycl.hpp
new file mode 100644
index 0000000..7ea8538
--- /dev/null
+++ b/include/alpaka/event/EventGenericSycl.hpp
@@ -0,0 +1,161 @@
+/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Sycl.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/event/Traits.hpp"
+#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
+#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <functional>
+#include <memory>
+#include <stdexcept>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    //! The SYCL device event: pairs a sycl::event with the device it was created for.
+    template<typename TTag>
+    class EventGenericSycl final
+    {
+    public:
+        explicit EventGenericSycl(DevGenericSycl<TTag> const& dev) : m_dev{dev}
+        {
+        }
+
+        friend auto operator==(EventGenericSycl const& lhs, EventGenericSycl const& rhs) -> bool
+        {
+            return (lhs.m_event == rhs.m_event); // equality is decided by the wrapped sycl::event only
+        }
+
+        friend auto operator!=(EventGenericSycl const& lhs, EventGenericSycl const& rhs) -> bool
+        {
+            return !(lhs == rhs);
+        }
+
+        [[nodiscard]] auto getNativeHandle() const
+        {
+            return m_event; // returned by value, i.e. a copy of the wrapped sycl::event
+        }
+
+        void setEvent(sycl::event const& event)
+        {
+            m_event = event;
+        }
+
+        DevGenericSycl<TTag> m_dev; //!< The device passed to the constructor.
+
+    private:
+        sycl::event m_event{}; //!< Default-constructed until setEvent() is called.
+    };
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    //! The SYCL device event device get trait specialization.
+    template<typename TTag>
+    struct GetDev<EventGenericSycl<TTag>>
+    {
+        static auto getDev(EventGenericSycl<TTag> const& event) -> DevGenericSycl<TTag>
+        {
+            return event.m_dev;
+        }
+    };
+
+    //! The SYCL device event test trait specialization.
+    template<typename TTag>
+    struct IsComplete<EventGenericSycl<TTag>>
+    {
+        static auto isComplete(EventGenericSycl<TTag> const& event)
+        {
+            auto const status
+                = event.getNativeHandle().template get_info<sycl::info::event::command_execution_status>();
+            return (status == sycl::info::event_command_status::complete); // true only for the "complete" status
+        }
+    };
+
+    //! The SYCL non-blocking queue enqueue trait specialization: stores the queue's last event in the given event.
+    template<typename TTag>
+    struct Enqueue<QueueGenericSyclNonBlocking<TTag>, EventGenericSycl<TTag>>
+    {
+        static auto enqueue(QueueGenericSyclNonBlocking<TTag>& queue, EventGenericSycl<TTag>& event)
+        {
+            event.setEvent(queue.m_spQueueImpl->get_last_event());
+        }
+    };
+
+    //! The SYCL blocking queue enqueue trait specialization: stores the queue's last event in the given event.
+    template<typename TTag>
+    struct Enqueue<QueueGenericSyclBlocking<TTag>, EventGenericSycl<TTag>>
+    {
+        static auto enqueue(QueueGenericSyclBlocking<TTag>& queue, EventGenericSycl<TTag>& event)
+        {
+            event.setEvent(queue.m_spQueueImpl->get_last_event());
+        }
+    };
+
+    //! The SYCL device event thread wait trait specialization.
+    //!
+    //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
+    //! completed. If the event is not enqueued to a queue the method returns immediately.
+    template<typename TTag>
+    struct CurrentThreadWaitFor<EventGenericSycl<TTag>>
+    {
+        static auto currentThreadWaitFor(EventGenericSycl<TTag> const& event)
+        {
+            event.getNativeHandle().wait_and_throw();
+        }
+    };
+
+    //! The SYCL non-blocking queue event wait trait specialization: registers the event as a queue dependency.
+    template<typename TTag>
+    struct WaiterWaitFor<QueueGenericSyclNonBlocking<TTag>, EventGenericSycl<TTag>>
+    {
+        static auto waiterWaitFor(QueueGenericSyclNonBlocking<TTag>& queue, EventGenericSycl<TTag> const& event)
+        {
+            queue.m_spQueueImpl->register_dependency(event.getNativeHandle());
+        }
+    };
+
+    //! The SYCL blocking queue event wait trait specialization: registers the event as a queue dependency.
+    template<typename TTag>
+    struct WaiterWaitFor<QueueGenericSyclBlocking<TTag>, EventGenericSycl<TTag>>
+    {
+        static auto waiterWaitFor(QueueGenericSyclBlocking<TTag>& queue, EventGenericSycl<TTag> const& event)
+        {
+            queue.m_spQueueImpl->register_dependency(event.getNativeHandle());
+        }
+    };
+
+    //! The SYCL device event wait trait specialization.
+    //!
+    //! Any future work submitted in any queue of this device will wait for event to complete before beginning
+    //! execution.
+    template<typename TTag>
+    struct WaiterWaitFor<DevGenericSycl<TTag>, EventGenericSycl<TTag>>
+    {
+        static auto waiterWaitFor(DevGenericSycl<TTag>& dev, EventGenericSycl<TTag> const& event)
+        {
+            dev.m_impl->register_dependency(event.getNativeHandle());
+        }
+    };
+
+    //! The SYCL device event native handle trait specialization.
+    template<typename TTag>
+    struct NativeHandle<EventGenericSycl<TTag>>
+    {
+        [[nodiscard]] static auto getNativeHandle(EventGenericSycl<TTag> const& event)
+        {
+            return event.getNativeHandle();
+        }
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/event/EventGenericThreads.hpp b/include/alpaka/event/EventGenericThreads.hpp
new file mode 100644
index 0000000..b588839
--- /dev/null
+++ b/include/alpaka/event/EventGenericThreads.hpp
@@ -0,0 +1,395 @@
+/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Utility.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/event/Traits.hpp"
+#include "alpaka/queue/QueueGenericThreadsBlocking.hpp"
+#include "alpaka/queue/QueueGenericThreadsNonBlocking.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <condition_variable>
+#include <future>
+#include <mutex>
+#include <utility>
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    include <iostream>
+#endif
+
+namespace alpaka
+{
+    namespace generic::detail
+    {
+        //! The CPU device event implementation. Non-copyable; its state is protected by m_mutex.
+        template<typename TDev>
+        class EventGenericThreadsImpl final
+            : public concepts::Implements<ConceptCurrentThreadWaitFor, EventGenericThreadsImpl<TDev>>
+        {
+        public:
+            EventGenericThreadsImpl(TDev dev) noexcept : m_dev(std::move(dev))
+            {
+            }
+
+            EventGenericThreadsImpl(EventGenericThreadsImpl<TDev> const&) = delete;
+            auto operator=(EventGenericThreadsImpl<TDev> const&) -> EventGenericThreadsImpl<TDev>& = delete;
+
+            auto isReady() noexcept -> bool // takes no lock itself; the visible caller holds m_mutex
+            {
+                return (m_LastReadyEnqueueCount == m_enqueueCount);
+            }
+
+            auto wait(std::size_t const& enqueueCount, std::unique_lock<std::mutex>& lk) const noexcept -> void
+            {
+                ALPAKA_ASSERT(enqueueCount <= m_enqueueCount);
+
+                while(enqueueCount > m_LastReadyEnqueueCount) // until the requested enqueue has become ready
+                {
+                    auto future = m_future; // copy made before lk is released (lk presumably held on entry)
+                    lk.unlock(); // do not hold the caller's lock while blocking on the future
+                    future.get();
+                    lk.lock(); // re-acquire before the counters are read again
+                }
+            }
+
+            TDev const m_dev; //!< The device this event is bound to.
+
+            std::mutex mutable m_mutex; //!< The mutex used to synchronize access to the event.
+            std::shared_future<void> m_future; //!< The future signaling the event completion.
+            std::size_t m_enqueueCount = 0u; //!< The number of times this event has been enqueued.
+            std::size_t m_LastReadyEnqueueCount = 0u; //!< The enqueue count at which this event was last ready.
+                                                      //!< Ready means that the event was not waiting within a queue
+                                                      //!< (not enqueued or already completed). If m_enqueueCount ==
+                                                      //!< m_LastReadyEnqueueCount, the event is currently not enqueued
+        };
+    } // namespace generic::detail
+
+    //! The CPU device event: a handle sharing one EventGenericThreadsImpl through a std::shared_ptr.
+    template<typename TDev>
+    class EventGenericThreads final
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, EventGenericThreads<TDev>>
+        , public concepts::Implements<ConceptGetDev, EventGenericThreads<TDev>>
+    {
+    public:
+        //! \param bBusyWaiting Unused. EventGenericThreads never does busy waiting.
+        EventGenericThreads(TDev const& dev, [[maybe_unused]] bool bBusyWaiting = true)
+            : m_spEventImpl(std::make_shared<generic::detail::EventGenericThreadsImpl<TDev>>(dev))
+        {
+        }
+
+        auto operator==(EventGenericThreads<TDev> const& rhs) const -> bool
+        {
+            return (m_spEventImpl == rhs.m_spEventImpl); // equal if both refer to the same implementation object
+        }
+
+        auto operator!=(EventGenericThreads<TDev> const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+
+    public:
+        std::shared_ptr<generic::detail::EventGenericThreadsImpl<TDev>> m_spEventImpl; //!< The shared event state.
+    };
+
+    namespace trait
+    {
+        //! The CPU device event device type trait specialization: the event's device type is its TDev parameter.
+        template<typename TDev>
+        struct DevType<EventGenericThreads<TDev>>
+        {
+            using type = TDev;
+        };
+
+        //! The CPU device event device get trait specialization.
+        template<typename TDev>
+        struct GetDev<EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto getDev(EventGenericThreads<TDev> const& event) -> TDev
+            {
+                return event.m_spEventImpl->m_dev; // returned by value: a copy of the device stored in the event
+            }
+        };
+
+        //! The CPU device event test trait specialization.
+        template<typename TDev>
+        struct IsComplete<EventGenericThreads<TDev>>
+        {
+            //! \return If the event is not waiting within a queue (not enqueued or already handled).
+            ALPAKA_FN_HOST static auto isComplete(EventGenericThreads<TDev> const& event) -> bool
+            {
+                std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex); // isReady() reads the counters unlocked
+
+                return event.m_spEventImpl->isReady();
+            }
+        };
+
+        //! The CPU non-blocking device queue implementation enqueue trait specialization.
+        template<typename TDev>
+        struct Enqueue<alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>, EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                [[maybe_unused]] alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>& queueImpl,
+                EventGenericThreads<TDev>& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Copy the shared pointer of the event implementation.
+                // This is forwarded to the lambda that is enqueued into the queue to ensure that the event
+                // implementation is alive as long as it is enqueued.
+                auto spEventImpl = event.m_spEventImpl;
+
+                // Setting the event state and enqueuing it has to be atomic.
+                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+
+                ++spEventImpl->m_enqueueCount;
+
+                auto const enqueueCount = spEventImpl->m_enqueueCount; // identifies this particular enqueue
+
+                // Enqueue a task that marks the event as ready, unless it has been re-enqueued in the meantime.
+                spEventImpl->m_future = queueImpl.m_workerThread.submit(
+                    [spEventImpl, enqueueCount]() mutable
+                    {
+                        std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
+
+                        // Nothing to do if it has been re-enqueued to a later position in the queue.
+                        if(enqueueCount == spEventImpl->m_enqueueCount)
+                        {
+                            spEventImpl->m_LastReadyEnqueueCount
+                                = std::max(enqueueCount, spEventImpl->m_LastReadyEnqueueCount); // never decreases
+                        }
+                    });
+            }
+        };
+
+        //! The CPU non-blocking device queue enqueue trait specialization (forwards to the queue implementation).
+        template<typename TDev>
+        struct Enqueue<QueueGenericThreadsNonBlocking<TDev>, EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueGenericThreadsNonBlocking<TDev>& queue,
+                EventGenericThreads<TDev>& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                alpaka::enqueue(*queue.m_spQueueImpl, event);
+            }
+        };
+
+        //! The CPU blocking device queue implementation enqueue trait specialization.
+        template<typename TDev>
+        struct Enqueue<alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>, EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>& queueImpl,
+                EventGenericThreads<TDev>& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                std::promise<void> promise;
+                {
+                    std::lock_guard<std::mutex> lk(queueImpl.m_mutex); // queue lock first, event lock second
+
+                    queueImpl.m_bCurrentlyExecutingTask = true;
+
+                    auto& eventImpl(*event.m_spEventImpl);
+
+                    {
+                        // Setting the event state and enqueuing it has to be atomic.
+                        std::lock_guard<std::mutex> evLk(eventImpl.m_mutex);
+
+                        ++eventImpl.m_enqueueCount;
+                        // NOTE: Difference to non-blocking version: directly set the event state instead of enqueuing.
+                        eventImpl.m_LastReadyEnqueueCount = eventImpl.m_enqueueCount;
+
+                        eventImpl.m_future = promise.get_future();
+                    }
+
+                    queueImpl.m_bCurrentlyExecutingTask = false;
+                }
+                promise.set_value(); // fulfils the event's future after both locks have been released
+            }
+        };
+
+        //! The CPU blocking device queue enqueue trait specialization (forwards to the queue implementation).
+        template<typename TDev>
+        struct Enqueue<QueueGenericThreadsBlocking<TDev>, EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueGenericThreadsBlocking<TDev>& queue,
+                EventGenericThreads<TDev>& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                alpaka::enqueue(*queue.m_spQueueImpl, event);
+            }
+        };
+    } // namespace trait
+
+    namespace trait
+    {
+        namespace generic
+        {
+            //! Blocks the calling thread until every queue that exists on \p dev at the time of the call has
+            //! processed a marker event enqueued here. Queues added afterwards are ignored.
+            template<typename TDev>
+            ALPAKA_FN_HOST auto currentThreadWaitForDevice(TDev const& dev) -> void
+            {
+                auto queuesSnapshot = dev.getAllQueues();
+
+                // One marker event per queue. NOTE: ideally there should not even be a chance to enqueue
+                // something between taking the snapshot and adding these markers.
+                std::vector<EventGenericThreads<TDev>> markers;
+                for(auto&& queue : queuesSnapshot)
+                {
+                    queue->enqueue(markers.emplace_back(dev));
+                }
+
+                // Now wait for all the markers, in the order they were enqueued.
+                for(auto& marker : markers)
+                {
+                    wait(marker);
+                }
+            }
+        } // namespace generic
+
+        //! The CPU device event thread wait trait specialization.
+        //!
+        //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
+        //! completed. If the event is not enqueued to a queue the method returns immediately.
+        template<typename TDev>
+        struct CurrentThreadWaitFor<EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(EventGenericThreads<TDev> const& event) -> void
+            {
+                wait(*event.m_spEventImpl); // Delegates to the wait on the shared event implementation.
+            }
+        };
+
+        //! The CPU device event implementation thread wait trait specialization.
+        //!
+        //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
+        //! completed. If the event is not enqueued to a queue the method returns immediately.
+        //!
+        //! NOTE: This method is for internal usage only.
+        template<typename TDev>
+        struct CurrentThreadWaitFor<alpaka::generic::detail::EventGenericThreadsImpl<TDev>>
+        {
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(
+                alpaka::generic::detail::EventGenericThreadsImpl<TDev> const& eventImpl) -> void
+            {
+                std::unique_lock<std::mutex> lk(eventImpl.m_mutex);
+
+                auto const enqueueCount = eventImpl.m_enqueueCount; // Read while holding the event mutex.
+                eventImpl.wait(enqueueCount, lk); // The held lock is handed to the implementation's wait().
+            }
+        };
+
+        //! The CPU non-blocking device queue implementation event wait trait specialization.
+        template<typename TDev>
+        struct WaiterWaitFor<
+            alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>,
+            EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>& queueImpl,
+                EventGenericThreads<TDev> const& event) -> void
+            {
+                // Take a local copy of the shared pointer so the event implementation stays alive for the
+                // duration of this call. The task submitted below only captures a copy of the event's future,
+                // not the implementation itself.
+                auto spImpl = event.m_spEventImpl;
+
+                std::lock_guard<std::mutex> guard(spImpl->m_mutex);
+
+                if(!spImpl->isReady())
+                {
+                    auto pendingFuture = spImpl->m_future;
+
+                    // Submit a task to the queue's worker thread that blocks on that future.
+                    queueImpl.m_workerThread.submit([pendingFuture]() { pendingFuture.get(); });
+                }
+            }
+        };
+
+        //! The CPU non-blocking device queue event wait trait specialization (forwards to the queue implementation).
+        template<typename TDev>
+        struct WaiterWaitFor<QueueGenericThreadsNonBlocking<TDev>, EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                QueueGenericThreadsNonBlocking<TDev>& queue,
+                EventGenericThreads<TDev> const& event) -> void
+            {
+                wait(*queue.m_spQueueImpl, event);
+            }
+        };
+
+        //! The CPU blocking device queue implementation event wait trait specialization.
+        template<typename TDev>
+        struct WaiterWaitFor<alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>, EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>& /* queueImpl */,
+                EventGenericThreads<TDev> const& event) -> void
+            {
+                // NOTE: Difference to non-blocking version: the queue is not used, the event is waited for directly.
+                wait(*event.m_spEventImpl);
+            }
+        };
+
+        //! The CPU blocking device queue event wait trait specialization (forwards to the queue implementation).
+        template<typename TDev>
+        struct WaiterWaitFor<QueueGenericThreadsBlocking<TDev>, EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                QueueGenericThreadsBlocking<TDev>& queue,
+                EventGenericThreads<TDev> const& event) -> void
+            {
+                wait(*queue.m_spQueueImpl, event);
+            }
+        };
+
+        //! The CPU device event wait trait specialization.
+        //!
+        //! Any future work submitted in any queue of this device will wait for event to complete before beginning
+        //! execution.
+        template<typename TDev>
+        struct WaiterWaitFor<TDev, EventGenericThreads<TDev>>
+        {
+            ALPAKA_FN_HOST static auto waiterWaitFor(TDev& dev, EventGenericThreads<TDev> const& event) -> void
+            {
+                // Get all the queues on the device at the time of invocation.
+                // All queues added afterwards are ignored.
+                auto vspQueues(dev.getAllQueues());
+
+                // Let all the queues wait for this event.
+                // Furthermore there should not even be a chance to enqueue something between getting the queues and
+                // adding our wait events!
+                for(auto&& spQueue : vspQueues)
+                {
+                    spQueue->wait(event);
+                }
+            }
+        };
+
+        //! The CPU non-blocking device queue thread wait trait specialization.
+        //!
+        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
+        //! tasks (kernels, data copies, ...)
+        template<typename TDev>
+        struct CurrentThreadWaitFor<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(QueueGenericThreadsNonBlocking<TDev> const& queue) -> void
+            {
+                // Enqueuing a dummy task into the worker thread of the queue provides a future we can wait for.
+                // Previously we enqueued an event into the queue but this does not guarantee that the queue is
+                // empty after the event is finished because the event handling can be finished before the event
+                // task is fully removed from the queue.
+                auto f = queue.m_spQueueImpl->m_workerThread.submit([]() noexcept {});
+                f.wait();
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/event/EventGpuSyclIntel.hpp b/include/alpaka/event/EventGpuSyclIntel.hpp
new file mode 100644
index 0000000..508fb57
--- /dev/null
+++ b/include/alpaka/event/EventGpuSyclIntel.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/event/EventGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
+
+namespace alpaka
+{
+    using EventGpuSyclIntel = EventGenericSycl<TagGpuSyclIntel>; //!< The Intel GPU SYCL device event.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/event/EventHipRt.hpp b/include/alpaka/event/EventHipRt.hpp
new file mode 100644
index 0000000..06c9bd1
--- /dev/null
+++ b/include/alpaka/event/EventHipRt.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiHipRt.hpp"
+#include "alpaka/event/EventUniformCudaHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+namespace alpaka
+{
+    //! The HIP RT device event: the uniform CUDA/HIP event instantiated with the HIP API.
+    using EventHipRt = EventUniformCudaHipRt<ApiHipRt>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/event/EventUniformCudaHipRt.hpp b/include/alpaka/event/EventUniformCudaHipRt.hpp
new file mode 100644
index 0000000..63f1f2f
--- /dev/null
+++ b/include/alpaka/event/EventUniformCudaHipRt.hpp
@@ -0,0 +1,263 @@
+/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/event/Traits.hpp"
+#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
+#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <functional>
+#include <memory>
+#include <stdexcept>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    namespace uniform_cuda_hip::detail
+    {
+        //! The CUDA/HIP RT device event implementation (non-copyable owner of the native event handle).
+        template<typename TApi>
+        class EventUniformCudaHipImpl final
+        {
+        public:
+            ALPAKA_FN_HOST EventUniformCudaHipImpl(DevUniformCudaHipRt<TApi> const& dev, bool bBusyWait)
+                : m_dev(dev)
+                , m_UniformCudaHipEvent()
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_dev.getNativeHandle()));
+
+                // Create the event on the current device with the specified flags. Valid flags include:
+                // - cuda/hip-EventDefault: Default event creation flag.
+                // - cuda/hip-EventBlockingSync: Specifies that the event should use blocking synchronization.
+                //   A host thread that uses cuda/hip-EventSynchronize() to wait on an event created with this flag
+                //   will block until the event actually completes.
+                // - cuda/hip-EventDisableTiming: Specifies that the created event does not need to record timing
+                //   data. Events created with this flag specified and the cuda/hip-EventBlockingSync flag not
+                //   specified will provide the best performance when used with cuda/hip-StreamWaitEvent() and
+                //   cuda/hip-EventQuery().
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::eventCreateWithFlags(
+                    &m_UniformCudaHipEvent,
+                    (bBusyWait ? TApi::eventDefault : TApi::eventBlockingSync) | TApi::eventDisableTiming));
+            }
+
+            EventUniformCudaHipImpl(EventUniformCudaHipImpl const&) = delete;
+            auto operator=(EventUniformCudaHipImpl const&) -> EventUniformCudaHipImpl& = delete;
+
+            ALPAKA_FN_HOST ~EventUniformCudaHipImpl()
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // In case the event has been recorded but has not yet been completed when cuda/hip-EventDestroy()
+                // is called, the function will return immediately and the resources associated with the event will
+                // be released automatically once the device has completed the event.
+                // -> No need to synchronize here.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::eventDestroy(m_UniformCudaHipEvent));
+            }
+
+            [[nodiscard]] auto getNativeHandle() const noexcept
+            {
+                return m_UniformCudaHipEvent;
+            }
+
+        public:
+            DevUniformCudaHipRt<TApi> const m_dev; //!< The device this event is bound to.
+
+        private:
+            typename TApi::Event_t m_UniformCudaHipEvent; //!< The native event handle created in the constructor.
+        };
+    } // namespace uniform_cuda_hip::detail
+
+    //! The CUDA/HIP RT device event: a handle that shares one event implementation through a std::shared_ptr.
+    template<typename TApi>
+    class EventUniformCudaHipRt final
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, EventUniformCudaHipRt<TApi>>
+        , public concepts::Implements<ConceptGetDev, EventUniformCudaHipRt<TApi>>
+    {
+    public:
+        ALPAKA_FN_HOST EventUniformCudaHipRt(DevUniformCudaHipRt<TApi> const& dev, bool bBusyWait = true)
+            : m_spEventImpl(std::make_shared<uniform_cuda_hip::detail::EventUniformCudaHipImpl<TApi>>(dev, bBusyWait))
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+        }
+
+        ALPAKA_FN_HOST auto operator==(EventUniformCudaHipRt<TApi> const& rhs) const -> bool
+        {
+            return (m_spEventImpl == rhs.m_spEventImpl); // Equal iff both share the same implementation object.
+        }
+
+        ALPAKA_FN_HOST auto operator!=(EventUniformCudaHipRt<TApi> const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+
+        [[nodiscard]] auto getNativeHandle() const noexcept
+        {
+            return m_spEventImpl->getNativeHandle();
+        }
+
+    public:
+        std::shared_ptr<uniform_cuda_hip::detail::EventUniformCudaHipImpl<TApi>> m_spEventImpl;
+    };
+
+    namespace trait
+    {
+        //! The CUDA/HIP RT device event device type trait specialization.
+        template<typename TApi>
+        struct DevType<EventUniformCudaHipRt<TApi>>
+        {
+            using type = DevUniformCudaHipRt<TApi>; //!< The device type associated with the event.
+        };
+
+        //! The CUDA/HIP RT device event device get trait specialization.
+        template<typename TApi>
+        struct GetDev<EventUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto getDev(EventUniformCudaHipRt<TApi> const& event) -> DevUniformCudaHipRt<TApi>
+            {
+                return event.m_spEventImpl->m_dev; // Returned by value: a copy of the device stored in the impl.
+            }
+        };
+
+        //! The CUDA/HIP RT device event test trait specialization.
+        template<typename TApi>
+        struct IsComplete<EventUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto isComplete(EventUniformCudaHipRt<TApi> const& event) -> bool
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Query is allowed even for events on a non-current device.
+                typename TApi::Error_t ret = TApi::success;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
+                    ret = TApi::eventQuery(event.getNativeHandle()),
+                    TApi::errorNotReady);
+                return (ret == TApi::success); // Any other result, including errorNotReady, yields false.
+            }
+        };
+
+        //! The CUDA/HIP RT non-blocking queue enqueue trait specialization.
+        template<typename TApi>
+        struct Enqueue<QueueUniformCudaHipRtNonBlocking<TApi>, EventUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                EventUniformCudaHipRt<TApi>& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::eventRecord(event.getNativeHandle(), queue.getNativeHandle()));
+            }
+        };
+
+        //! The CUDA/HIP RT blocking queue enqueue trait specialization.
+        template<typename TApi>
+        struct Enqueue<QueueUniformCudaHipRtBlocking<TApi>, EventUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking<TApi>& queue,
+                EventUniformCudaHipRt<TApi>& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::eventRecord(event.getNativeHandle(), queue.getNativeHandle()));
+            }
+        };
+
+        //! The CUDA/HIP RT device event thread wait trait specialization.
+        //!
+        //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
+        //! completed. If the event is not enqueued to a queue the method returns immediately.
+        template<typename TApi>
+        struct CurrentThreadWaitFor<EventUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(EventUniformCudaHipRt<TApi> const& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Sync is allowed even for events on a non-current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::eventSynchronize(event.getNativeHandle()));
+            }
+        };
+
+        //! The CUDA/HIP RT non-blocking queue event wait trait specialization.
+        template<typename TApi>
+        struct WaiterWaitFor<QueueUniformCudaHipRtNonBlocking<TApi>, EventUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                EventUniformCudaHipRt<TApi> const& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    TApi::streamWaitEvent(queue.getNativeHandle(), event.getNativeHandle(), 0));
+            }
+        };
+
+        //! The CUDA/HIP RT blocking queue event wait trait specialization.
+        template<typename TApi>
+        struct WaiterWaitFor<QueueUniformCudaHipRtBlocking<TApi>, EventUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                QueueUniformCudaHipRtBlocking<TApi>& queue,
+                EventUniformCudaHipRt<TApi> const& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    TApi::streamWaitEvent(queue.getNativeHandle(), event.getNativeHandle(), 0));
+            }
+        };
+
+        //! The CUDA/HIP RT device event wait trait specialization.
+        //!
+        //! Any future work submitted in any queue of this device will wait for event to complete before beginning
+        //! execution.
+        template<typename TApi>
+        struct WaiterWaitFor<DevUniformCudaHipRt<TApi>, EventUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto waiterWaitFor(
+                DevUniformCudaHipRt<TApi>& dev,
+                EventUniformCudaHipRt<TApi> const& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Make the given device the current device before issuing the stream waits.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
+
+                // Get all the queues on the device at the time of invocation and let each of them wait for the
+                // event. All queues added afterwards are ignored.
+                auto vQueues = dev.getAllQueues();
+                for(auto&& spQueue : vQueues)
+                {
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                        TApi::streamWaitEvent(spQueue->getNativeHandle(), event.getNativeHandle(), 0));
+                }
+            }
+        };
+
+        //! The CUDA/HIP RT event native handle trait specialization.
+        template<typename TApi>
+        struct NativeHandle<EventUniformCudaHipRt<TApi>>
+        {
+            [[nodiscard]] static auto getNativeHandle(EventUniformCudaHipRt<TApi> const& event)
+            {
+                return event.getNativeHandle(); // Forwards to the event's own accessor.
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/event/Traits.hpp b/include/alpaka/event/Traits.hpp
new file mode 100644
index 0000000..7acb7ab
--- /dev/null
+++ b/include/alpaka/event/Traits.hpp
@@ -0,0 +1,38 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/dev/Traits.hpp"
+
+namespace alpaka
+{
+    //! The event management traits.
+    namespace trait
+    {
+        //! The event type trait; specializations provide the event type as the nested `type`.
+        template<typename T, typename TSfinae = void>
+        struct EventType;
+
+        //! The event tester trait; specializations provide a static `isComplete(event)` returning bool.
+        template<typename TEvent, typename TSfinae = void>
+        struct IsComplete;
+    } // namespace trait
+
+    //! Alias template for trait::EventType<T>::type, removing the need to spell out the ::type.
+    template<typename T>
+    using Event = typename trait::EventType<T>::type;
+
+    //! Tests if the given event has already been completed.
+    //!
+    //! \warning This function is allowed to return false negatives. An already completed event can be reported as
+    //! uncompleted because the status information is not fully propagated by the used alpaka backend.
+    //! \return true if the event is finished/complete, else false.
+    template<typename TEvent>
+    ALPAKA_FN_HOST auto isComplete(TEvent const& event) -> bool
+    {
+        return trait::IsComplete<TEvent>::isComplete(event);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/example/ExampleDefaultAcc.hpp b/include/alpaka/example/ExampleDefaultAcc.hpp
new file mode 100644
index 0000000..22f77f9
--- /dev/null
+++ b/include/alpaka/example/ExampleDefaultAcc.hpp
@@ -0,0 +1,41 @@
+/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include "alpaka/alpaka.hpp"
+
+#pragma once
+
+namespace alpaka
+{
+    //! Alias for the default accelerator used by the examples: the first enabled backend in the
+    //! list below is chosen, with AccCpuSerial checked last. If none is enabled, ExampleDefaultAcc
+    //! is only declared as a class template and a preprocessor warning is emitted.
+    template<class TDim, class TIdx>
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccGpuCudaRt<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccGpuHipRt<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuOmp2Blocks<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuTbbBlocks<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuOmp2Threads<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuThreads<TDim, TIdx>;
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+#    if defined(ALPAKA_SYCL_ONEAPI_CPU)
+    using ExampleDefaultAcc = alpaka::AccCpuSycl<TDim, TIdx>;
+#    elif defined(ALPAKA_SYCL_ONEAPI_FPGA)
+    using ExampleDefaultAcc = alpaka::AccFpgaSyclIntel<TDim, TIdx>;
+#    elif defined(ALPAKA_SYCL_ONEAPI_GPU)
+    using ExampleDefaultAcc = alpaka::AccGpuSyclIntel<TDim, TIdx>;
+#    endif
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuSerial<TDim, TIdx>;
+#else
+    class ExampleDefaultAcc;
+#    warning "No supported backend selected."
+#endif
+} // namespace alpaka
diff --git a/include/alpaka/example/ExecuteForEachAccTag.hpp b/include/alpaka/example/ExecuteForEachAccTag.hpp
new file mode 100644
index 0000000..1eae3d8
--- /dev/null
+++ b/include/alpaka/example/ExecuteForEachAccTag.hpp
@@ -0,0 +1,27 @@
+/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include "alpaka/alpaka.hpp"
+
+#include <functional>
+#include <tuple>
+#include <utility>
+
+#pragma once
+
+namespace alpaka
+{
+    //! Executes a callable once for each enabled accelerator tag.
+    //!
+    //! \param callable Callable invoked as `callable(tag)`; it is only borrowed for the duration of the call.
+    //! \return Disjunction (`||` fold) of the invocation results; with the built-in `||` the evaluation stops at
+    //! the first invocation whose result converts to true.
+    template<typename TCallable>
+    inline auto executeForEachAccTag(TCallable&& callable)
+    {
+        // std::apply runs synchronously, so capturing by reference cannot dangle and avoids copying the callable
+        // (this also admits move-only callables and callables with a non-const call operator).
+        return std::apply([&](auto const&... tags) { return (callable(tags) || ...); }, alpaka::EnabledAccTags{});
+    }
+} // namespace alpaka
diff --git a/include/alpaka/exec/ElementIndex.hpp b/include/alpaka/exec/ElementIndex.hpp
new file mode 100644
index 0000000..061c597
--- /dev/null
+++ b/include/alpaka/exec/ElementIndex.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+namespace alpaka
+{
+
+    /* ElementIndex
+     *
+     * An aggregate that contains the `.global` and `.local` indices of an element along a given dimension.
+     */
+
+    template<typename TIdx>
+    struct ElementIndex
+    {
+        TIdx global; // Index of the element along a given dimension, relative to the whole problem space.
+        TIdx local; // Index of the element along a given dimension, relative to the current group.
+    };
+
+} // namespace alpaka
diff --git a/include/alpaka/exec/IndependentElements.hpp b/include/alpaka/exec/IndependentElements.hpp
new file mode 100644
index 0000000..447fa7e
--- /dev/null
+++ b/include/alpaka/exec/IndependentElements.hpp
@@ -0,0 +1,454 @@
+#pragma once
+
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/idx/Accessors.hpp"
+
+#include <algorithm>
+#include <ciso646> // workaround for MSVC in c++17 mode - TODO: remove once we move to c++20
+#include <cstddef>
+#include <type_traits>
+
+namespace alpaka
+{
+
+    namespace detail
+    {
+
+        /* IndependentGroupsAlong
+         *
+         * `IndependentGroupsAlong<TAcc, Dim>(acc, groups)` returns a one-dimensional iterable range that spans the
+         * group indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If
+         * `groups` is not specified, it defaults to the number of blocks along the `Dim` dimension.
+         *
+         * `independentGroupsAlong<Dim>(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc, Dim>(acc, ...)`
+         * that can infer the accelerator type from the argument.
+         *
+         * In a 1-dimensional kernel, `independentGroups(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc,
+         * 0>(acc, ...)`.
+         *
+         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
+         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
+         * when converting CUDA or HIP code, `independentGroupsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
+         * `IndependentGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+         *
+         * `independentGroupsAlong<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
+         * threads in a block see the same loop iterations, while threads in different blocks may see a different
+         * number of iterations.
+         * If the work division has more blocks than the required number of groups, the first blocks will perform one
+         * iteration of the loop, while the other blocks will exit the loop immediately.
+         * If the work division has fewer blocks than the required number of groups, some of the blocks will perform
+         * more than one iteration, in order to cover the whole problem space.
+         *
+         * For example,
+         *
+         *   for (auto group: independentGroupsAlong<Dim>(acc, 7))
+         *
+         * will return the group range from 0 to 6, distributed across all blocks in the work division.
+         * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
+         * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from
+         * 0 to 6 will process one group while block 7 will not process any.
+         * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
+         * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0
+         * will process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6,
+         * and block 3 will process group 3.
+         */
+
+        template<
+            typename TAcc,
+            std::size_t Dim,
+            // `Dim` is used to index the per-dimension vectors, so it must be smaller than the number of dimensions
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > Dim)>>
+        class IndependentGroupsAlong
+        {
+        public:
+            using Idx = alpaka::Idx<TAcc>;
+
+            // iterate over as many groups as there are blocks along `Dim`, i.e. one group per block
+            ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc)
+                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , extent_{stride_}
+            {
+            }
+
+            // iterate over `groups` groups, starting at this block's index and advancing by the number of blocks
+            ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc, Idx groups)
+                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , extent_{groups}
+            {
+            }
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                return const_iterator(stride_, extent_, first_);
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                return const_iterator(stride_, extent_, extent_);
+            }
+
+            class const_iterator
+            {
+                friend class IndependentGroupsAlong;
+
+                ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
+                    : stride_{stride}
+                    , extent_{extent}
+                    , first_{std::min(first, extent)}
+                {
+                }
+
+            public:
+                ALPAKA_FN_ACC inline Idx operator*() const
+                {
+                    return first_;
+                }
+
+                // pre-increment the iterator: advance the group index by the stride, and clamp it to the extent
+                // so that an iterator that has reached or passed the end compares equal to `end()`
+                ALPAKA_FN_ACC inline const_iterator& operator++()
+                {
+                    first_ += stride_;
+                    if(first_ > extent_)
+                        first_ = extent_;
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    ++(*this);
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
+                {
+                    return (first_ == other.first_);
+                }
+
+                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // non-const to support iterator copy and assignment
+                Idx stride_;
+                Idx extent_;
+                // current group index, modified by the pre/post-increment operator
+                Idx first_;
+            };
+
+        private:
+            Idx const first_;
+            Idx const stride_;
+            Idx const extent_;
+        };
+
+    } // namespace detail
+
+    /* independentGroups
+     *
+     * `independentGroups(acc, groups)` returns a one-dimensional iterable range that spans the group indices from 0
+     * to `groups`. If `groups` is not specified, it defaults to the number of blocks.
+     *
+     * `independentGroups(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
+     *
+     * `independentGroups(acc, ...)` should be called consistently by all the threads in a block. All threads in a
+     * block see the same loop iterations, while threads in different blocks may see a different number of iterations.
+     * If the work division has more blocks than the required number of groups, the first blocks will perform one
+     * iteration of the loop, while the other blocks will exit the loop immediately.
+     * If the work division has fewer blocks than the required number of groups, some of the blocks will perform more
+     * than one iteration, in order to cover the whole problem space.
+     *
+     * For example,
+     *
+     *   for (auto group: independentGroups(acc, 7))
+     *
+     * will return the group range from 0 to 6, distributed across all blocks in the work division.
+     * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
+     * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
+     * will process one group while block 7 will not process any.
+     * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
+     * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
+     * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block
+     * 3 will process group 3.
+     *
+     * Note that `independentGroups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
+     * use
+     *   - `independentGroupsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
+     *   - `independentGroupsAlongX(acc, ...)`, `independentGroupsAlongY(acc, ...)`, or `independentGroupsAlongZ(acc,
+     *     ...)` to loop along the fastest, second-fastest, or third-fastest dimension.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+    ALPAKA_FN_ACC inline auto independentGroups(TAcc const& acc, TArgs... args)
+    {
+        using Range = detail::IndependentGroupsAlong<TAcc, 0>; // a 1D kernel only has dimension 0
+        return Range(acc, static_cast<alpaka::Idx<TAcc>>(args)...); // arguments are converted to the index type
+    }
+
+    /* independentGroupsAlong<Dim>
+     *
+     * `independentGroupsAlong<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, Dim>(acc, ...)`
+     * that can infer the accelerator type from the argument.
+     */
+
+    template<
+        std::size_t Dim,
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > Dim)>>
+    ALPAKA_FN_ACC inline auto independentGroupsAlong(TAcc const& acc, TArgs... args)
+    {
+        // the constraint rejects `Dim` values that are not a valid dimension index of the accelerator
+        return detail::IndependentGroupsAlong<TAcc, Dim>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    /* independentGroupsAlongX, Y, Z
+     *
+     * Like `independentGroups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
+     * dimensions.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto independentGroupsAlongX(TAcc const& acc, TArgs... args)
+    {
+        using Range = detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>; // X: last dimension
+        return Range(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
+    ALPAKA_FN_ACC inline auto independentGroupsAlongY(TAcc const& acc, TArgs... args)
+    {
+        using Range = detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>; // Y: second-to-last
+        return Range(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
+    ALPAKA_FN_ACC inline auto independentGroupsAlongZ(TAcc const& acc, TArgs... args)
+    {
+        using Range = detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>; // Z: third-to-last
+        return Range(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    namespace detail
+    {
+
+        /* IndependentGroupElementsAlong
+         *
+         * `independentGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `IndependentGroupElementsAlong<TAcc,
+         * Dim>(acc, ...)` that can infer the accelerator type from the argument.
+         */
+
+        template<
+            typename TAcc,
+            std::size_t Dim,
+            // `Dim` is used to index the per-dimension vectors, so it must be smaller than the number of dimensions
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > Dim)>>
+        class IndependentGroupElementsAlong
+        {
+        public:
+            using Idx = alpaka::Idx<TAcc>;
+
+            // when omitted, `first` is 0 and `extent` is the block size in elements (threads x elements per thread)
+            ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{stride_}
+            {
+            }
+
+            ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx extent)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{extent}
+            {
+            }
+
+            ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx first, Idx extent)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first}
+                , stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{extent}
+            {
+            }
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                return const_iterator(elements_, stride_, extent_, thread_);
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                return const_iterator(elements_, stride_, extent_, extent_);
+            }
+
+            class const_iterator
+            {
+                friend class IndependentGroupElementsAlong;
+
+                ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
+                    : elements_{elements}
+                    // the stride is reduced by one element range, because index_ has already been advanced by
+                    // `elements_` single steps when the stride is applied in operator++
+                    , stride_{stride - elements}
+                    , extent_{extent}
+                    , index_{std::min(first, extent)}
+                {
+                }
+
+            public:
+                ALPAKA_FN_ACC inline Idx operator*() const
+                {
+                    return index_;
+                }
+
+                // pre-increment: step to the next element of this thread, jump by the stride after `elements_` steps
+                ALPAKA_FN_ACC inline const_iterator& operator++()
+                {
+                    ++indexElem_;
+                    ++index_;
+                    if(indexElem_ >= elements_)
+                    {
+                        indexElem_ = 0;
+                        index_ += stride_;
+                    }
+                    if(index_ >= extent_)
+                        index_ = extent_; // clamp, so that an exhausted iterator compares equal to `end()`
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    ++(*this);
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
+                {
+                    return (*(*this) == *other);
+                }
+
+                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // non-const to support iterator copy and assignment
+                Idx elements_;
+                Idx stride_;
+                Idx extent_;
+                // modified by the pre/post-increment operator
+                Idx index_;
+                Idx indexElem_ = 0;
+            };
+
+        private:
+            Idx const elements_;
+            Idx const thread_;
+            Idx const stride_;
+            Idx const extent_;
+        };
+
+    } // namespace detail
+
+    /* independentGroupElements: 1D shorthand for `detail::IndependentGroupElementsAlong<TAcc, 0>(acc, ...)`.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+    ALPAKA_FN_ACC inline auto independentGroupElements(TAcc const& acc, TArgs... args)
+    {
+        using Range = detail::IndependentGroupElementsAlong<TAcc, 0>; // a 1D kernel only has dimension 0
+        return Range(acc, static_cast<alpaka::Idx<TAcc>>(args)...); // arguments are converted to the index type
+    }
+
+    /* independentGroupElementsAlong<Dim>
+     *
+     * `independentGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupElementsAlong<TAcc,
+     * Dim>(acc, ...)` that can infer the accelerator type from the argument.
+     */
+
+    template<
+        std::size_t Dim,
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > Dim)>>
+    ALPAKA_FN_ACC inline auto independentGroupElementsAlong(TAcc const& acc, TArgs... args)
+    {
+        // the constraint rejects `Dim` values that are not a valid dimension index of the accelerator
+        return detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    /* independentGroupElementsAlongX, Y, Z
+     *
+     * Like `independentGroupElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
+     * dimensions.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto independentGroupElementsAlongX(TAcc const& acc, TArgs... args)
+    {
+        // X is the fastest dimension, i.e. the last one
+        using Range = detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>;
+        // every optional argument is converted to the accelerator's index type
+        return Range(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
+    ALPAKA_FN_ACC inline auto independentGroupElementsAlongY(TAcc const& acc, TArgs... args)
+    {
+        // Y is the second-fastest dimension, i.e. the second-to-last one
+        using Range = detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>;
+        // every optional argument is converted to the accelerator's index type
+        return Range(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
+    ALPAKA_FN_ACC inline auto independentGroupElementsAlongZ(TAcc const& acc, TArgs... args)
+    {
+        // Z is the third-fastest dimension, i.e. the third-to-last one
+        using Range = detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>;
+        // every optional argument is converted to the accelerator's index type
+        return Range(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+} // namespace alpaka
diff --git a/include/alpaka/exec/Once.hpp b/include/alpaka/exec/Once.hpp
new file mode 100644
index 0000000..8a2f2cb
--- /dev/null
+++ b/include/alpaka/exec/Once.hpp
@@ -0,0 +1,56 @@
+/* Copyright 2024 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/idx/Accessors.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+
+    /* oncePerGrid
+     *
+     * `oncePerGrid(acc)` returns true for a single thread within the kernel execution grid.
+     *
+     * Usually the condition is true for block 0 and thread 0, but these indices should not be relied upon.
+     */
+
+    template<typename TAcc, typename = std::enable_if_t<isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC inline constexpr bool oncePerGrid(TAcc const& acc)
+    {
+        // index vector type matching the accelerator's dimensionality and index type
+        using Origin = alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>>;
+
+        // Workaround for a weird bug in oneAPI 2024.x targeting the CPU backend and FPGA emulator: for these
+        // accelerators, test the global linear id of the accelerator's `m_item_workdiv` member instead.
+        if constexpr(accMatchesTags<TAcc, TagCpuSycl, TagFpgaSyclIntel>)
+        {
+            return acc.m_item_workdiv.get_global_linear_id() == 0;
+        }
+
+        // all other accelerators: test the thread index within the grid against the all-zero vector
+        return getIdx<Grid, Threads>(acc) == Origin::zeros();
+    }
+
+    /* oncePerBlock
+     *
+     * `oncePerBlock(acc)` returns true for a single thread within the block: the test compares the thread index
+     * within the block against the all-zero vector, but callers should not rely on which thread is selected.
+     */
+
+    template<typename TAcc, typename = std::enable_if_t<isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC inline constexpr bool oncePerBlock(TAcc const& acc)
+    {
+        using Origin = Vec<Dim<TAcc>, Idx<TAcc>>;
+        return getIdx<Block, Threads>(acc) == Origin::zeros();
+    }
+
+} // namespace alpaka
diff --git a/include/alpaka/exec/UniformElements.hpp b/include/alpaka/exec/UniformElements.hpp
new file mode 100644
index 0000000..2bfbc94
--- /dev/null
+++ b/include/alpaka/exec/UniformElements.hpp
@@ -0,0 +1,1145 @@
+#pragma once
+
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/core/Utility.hpp"
+#include "alpaka/exec/ElementIndex.hpp"
+#include "alpaka/idx/Accessors.hpp"
+
+#include <algorithm>
+#include <ciso646> // workaround for MSVC in c++17 mode - TODO: remove once we move to c++20
+#include <cstddef>
+#include <type_traits>
+
+namespace alpaka
+{
+
+    namespace detail
+    {
+
+        /* UniformElementsAlong
+         *
+         * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iterable range that
+         * spans the element indices from `first` (inclusive) to `extent` (exclusive) along the `Dim` dimension. If
+         * `first` is not specified, it defaults to 0. If `extent` is not specified, it defaults to the kernel grid
+         * size along the `Dim` dimension.
+         *
+         * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that
+         * can infer the accelerator type from the argument.
+         *
+         * In a 1-dimensional kernel, `uniformElements(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc,
+         * 0>(acc, ...)`.
+         *
+         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
+         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
+         * when converting CUDA or HIP code, `uniformElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
+         * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+         *
+         * To cover the problem space, different threads may execute a different number of iterations. As a result, it
+         * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
+         * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
+         * loop over each group's elements, and synchronise only in the outer loop:
+         *
+         *  for (auto group : uniformGroupsAlong<Dim>(acc, extent)) {
+         *    for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
+         *       // first part of the computation
+         *       // no synchronisations here
+         *       ...
+         *    }
+         *    // wait for all threads to complete the first part
+         *    alpaka::syncBlockThreads();
+         *    for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
+         *       // second part of the computation
+         *       // no synchronisations here
+         *       ...
+         *    }
+         *    // wait for all threads to complete the second part
+         *    alpaka::syncBlockThreads();
+         *    ...
+         *  }
+         *
+         * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
+         * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
+         * example, the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and
+         * check the element index explicitly inside the loop:
+         *
+         *  for (auto element : uniformElementsAlong<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
+         *    bool flag = false;
+         *    if (element < extent) {
+         *      // do some work and compute a result flag only for the valid elements
+         *      flag = do_some_work();
+         *    }
+         *    // check if any valid element had a positive result
+         *    if (alpaka::warp::any(acc, flag)) {
+         *      // ...
+         *    }
+         *  }
+         *
+         * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
+         * `N-1`.
+         */
+
+        template<
+            typename TAcc,
+            std::size_t Dim,
+            // `Dim` is used to index the per-dimension vectors, so it must be smaller than the number of dimensions
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > Dim)>>
+        class UniformElementsAlong
+        {
+        public:
+            using Idx = alpaka::Idx<TAcc>;
+
+            // when omitted, `first` is 0 and `extent` is the grid size in elements (threads x elements per thread)
+            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{stride_}
+            {
+            }
+
+            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{extent}
+            {
+            }
+
+            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
+                , extent_{extent}
+            {
+            }
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                return const_iterator(elements_, stride_, extent_, first_);
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                return const_iterator(elements_, stride_, extent_, extent_);
+            }
+
+            class const_iterator
+            {
+                friend class UniformElementsAlong;
+
+                ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
+                    : elements_{elements}
+                    // the stride is reduced by one element range, because index_ has already been advanced by
+                    // `elements_` single steps when the stride is applied in operator++
+                    , stride_{stride - elements}
+                    , extent_{extent}
+                    , index_{std::min(first, extent)}
+                {
+                }
+
+            public:
+                ALPAKA_FN_ACC inline Idx operator*() const
+                {
+                    return index_;
+                }
+
+                // pre-increment: step to the next element of this thread, jump by the stride after `elements_` steps
+                ALPAKA_FN_ACC inline const_iterator& operator++()
+                {
+                    // increment the index along the elements processed by the current thread
+                    ++indexElem_;
+                    ++index_;
+                    if(indexElem_ >= elements_)
+                    {
+                        indexElem_ = 0;
+                        index_ += stride_;
+                    }
+                    if(index_ >= extent_)
+                        index_ = extent_; // clamp, so that an exhausted iterator compares equal to `end()`
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    ++(*this);
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
+                {
+                    return (*(*this) == *other);
+                }
+
+                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // non-const to support iterator copy and assignment
+                Idx elements_;
+                Idx stride_;
+                Idx extent_;
+                // modified by the pre/post-increment operator
+                Idx index_;
+                Idx indexElem_ = 0;
+            };
+
+        private:
+            Idx const elements_;
+            Idx const first_;
+            Idx const stride_;
+            Idx const extent_;
+        };
+
+    } // namespace detail
+
+    /* uniformElements
+     *
+     * `uniformElements(acc [, first], extent)` returns a one-dimensional iterable range that spans the element
+     * indices from `first` (inclusive) to `extent` (exclusive). If `first` is not specified, it defaults to 0. If
+     * `extent` is not specified, it defaults to the kernel grid size.
+     *
+     * `uniformElements(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, 0>(acc, ...)`.
+     *
+     * To cover the problem space, different threads may execute a different number of iterations. As a result, it is
+     * not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If a
+     * block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner loop
+     * over each group's elements, and synchronise only in the outer loop:
+     *
+     *  for (auto group : uniformGroups(acc, extent)) {
+     *    for (auto element : uniformGroupElements(acc, group, extent)) {
+     *       // first part of the computation
+     *       // no synchronisations here
+     *       ...
+     *    }
+     *    // wait for all threads to complete the first part
+     *    alpaka::syncBlockThreads();
+     *    for (auto element : uniformGroupElements(acc, group, extent)) {
+     *       // second part of the computation
+     *       // no synchronisations here
+     *       ...
+     *    }
+     *    // wait for all threads to complete the second part
+     *    alpaka::syncBlockThreads();
+     *    ...
+     *  }
+     *
+     * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
+     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
+     * the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the
+     * element index explicitly inside the loop:
+     *
+     *  for (auto element : uniformElements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
+     *    bool flag = false;
+     *    if (element < extent) {
+     *      // do some work and compute a result flag only for elements up to extent
+     *      flag = do_some_work();
+     *    }
+     *    // check if any valid element had a positive result
+     *    if (alpaka::warp::any(acc, flag)) {
+     *      // ...
+     *    }
+     *  }
+     *
+     * Note that `uniformElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
+     * use
+     *   - `uniformElementsND(acc, ...)` to cover an N-dimensional problem space with a single loop;
+     *   - `uniformElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
+     *   - `uniformElementsAlongX(acc, ...)`, `uniformElementsAlongY(acc, ...)`, or `uniformElementsAlongZ(acc, ...)`
+     *     to loop along the fastest, second-fastest, or third-fastest dimension.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value == 1)>>
+    ALPAKA_FN_ACC inline auto uniformElements(TAcc const& acc, TArgs... args)
+    {
+        // forward to the range along dimension 0, converting every argument to the accelerator's index type
+        return detail::UniformElementsAlong<TAcc, 0>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    /* uniformElementsAlong<Dim>
+     *
+     * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, Dim>(acc, ...)`
+     * that can infer the accelerator type from the argument.
+     */
+
+    template<
+        std::size_t Dim,
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > Dim)>>
+    ALPAKA_FN_ACC inline auto uniformElementsAlong(TAcc const& acc, TArgs... args)
+    {
+        // `Dim` indexes the accelerator's dimensions, so it must be strictly smaller than their number
+        return detail::UniformElementsAlong<TAcc, Dim>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    /* uniformElementsAlongX, Y, Z
+     *
+     * Like `uniformElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
+     * dimensions.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto uniformElementsAlongX(TAcc const& acc, TArgs... args)
+    {
+        constexpr auto dimX = alpaka::Dim<TAcc>::value - 1; // X is the last (fastest) dimension
+        return detail::UniformElementsAlong<TAcc, dimX>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
+    ALPAKA_FN_ACC inline auto uniformElementsAlongY(TAcc const& acc, TArgs... args)
+    {
+        constexpr auto dimY = alpaka::Dim<TAcc>::value - 2; // Y is the second-to-last (second-fastest) dimension
+        return detail::UniformElementsAlong<TAcc, dimY>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
+    ALPAKA_FN_ACC inline auto uniformElementsAlongZ(TAcc const& acc, TArgs... args)
+    {
+        constexpr auto dimZ = alpaka::Dim<TAcc>::value - 3; // Z is the third-to-last (third-fastest) dimension
+        return detail::UniformElementsAlong<TAcc, dimZ>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    namespace detail
+    {
+
+        /* UniformElementsND
+         *
+         * `UniformElementsND(acc, extent)` returns an N-dimensional iterable range that spans the element indices
+         * required to cover the given problem size, indicated by `extent`.
+         *
+         * `uniformElementsND(acc, ...)` is an alias for `UniformElementsND<TAcc>(acc, ...)`.
+         *
+         * To cover the problem space, different threads may execute a different number of iterations. As a result, it
+         * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
+         * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
+         * loop over each group's elements, and synchronise only in the outer loop:
+         *
+         *  for (auto group0 : uniformGroupsAlong<0>(acc, extent[0])) {
+         *    for (auto group1 : uniformGroupsAlong<1>(acc, extent[1])) {
+         *      for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
+         *        for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
+         *           // first part of the computation
+         *           // no synchronisations here
+         *           ...
+         *        }
+         *      }
+         *      // wait for all threads to complete the first part
+         *      alpaka::syncBlockThreads(acc);
+         *      for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
+         *        for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
+         *           // second part of the computation
+         *           // no synchronisations here
+         *           ...
+         *        }
+         *      }
+         *      // wait for all threads to complete the second part
+         *      alpaka::syncBlockThreads(acc);
+         *      ...
+         *    }
+         *  }
+         *
+         * For more details, see `UniformElementsAlong<TAcc, Dim>(acc, ...)`.
+         */
+
+        template<
+            typename TAcc,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+        class UniformElementsND
+        {
+        public:
+            using Dim = alpaka::Dim<TAcc>;
+            using Idx = alpaka::Idx<TAcc>;
+            using Vec = alpaka::Vec<Dim, Idx>;
+
+            ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
+                , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
+                , extent_{stride_}
+            {
+            }
+
+            ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent)
+                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
+                , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
+                , extent_{extent}
+            {
+            }
+
+            // tag used to construct an end iterator
+            struct at_end_t
+            {
+            };
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                // check that all dimensions of the current thread index are within the extent
+                if((thread_ < extent_).all())
+                {
+                    // construct an iterator pointing to the first element to be processed by the current thread
+                    return const_iterator{this, thread_};
+                }
+                else
+                {
+                    // construct an end iterator, pointing past the end of the extent
+                    return const_iterator{this, at_end_t{}};
+                }
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                // construct an end iterator, pointing past the end of the extent
+                return const_iterator{this, at_end_t{}};
+            }
+
+            class const_iterator
+            {
+                friend class UniformElementsND;
+
+            public:
+                ALPAKA_FN_ACC inline Vec operator*() const
+                {
+                    return index_;
+                }
+
+                // pre-increment the iterator; returns a reference to it, so that chained increments act on it
+                ALPAKA_FN_ACC inline constexpr const_iterator& operator++()
+                {
+                    increment();
+                    return *this;
+                }
+
+                // post-increment the iterator; returns a copy of its previous state
+                ALPAKA_FN_ACC inline constexpr const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    increment();
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline constexpr bool operator==(const_iterator const& other) const
+                {
+                    return (index_ == other.index_);
+                }
+
+                ALPAKA_FN_ACC inline constexpr bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // construct an iterator pointing to the first element to be processed by the current thread
+                ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first)
+                    : loop_{loop}
+                    , first_{alpaka::elementwise_min(first, loop->extent_)}
+                    , range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}
+                    , index_{first_}
+                {
+                }
+
+                // construct an end iterator, pointing past the end of the extent
+                ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&)
+                    : loop_{loop}
+                    , first_{loop_->extent_}
+                    , range_{loop_->extent_}
+                    , index_{loop_->extent_}
+                {
+                }
+
+                template<size_t I>
+                ALPAKA_FN_ACC inline constexpr bool nth_elements_loop()
+                {
+                    bool overflow = false;
+                    ++index_[I];
+                    if(index_[I] >= range_[I])
+                    {
+                        index_[I] = first_[I];
+                        overflow = true;
+                    }
+                    return overflow;
+                }
+
+                template<size_t N>
+                ALPAKA_FN_ACC inline constexpr bool do_elements_loops()
+                {
+                    if constexpr(N == 0)
+                    {
+                        // overflow
+                        return true;
+                    }
+                    else
+                    {
+                        if(not nth_elements_loop<N - 1>())
+                        {
+                            return false;
+                        }
+                        else
+                        {
+                            return do_elements_loops<N - 1>();
+                        }
+                    }
+                    ALPAKA_UNREACHABLE(false);
+                }
+
+                template<size_t I>
+                ALPAKA_FN_ACC inline constexpr bool nth_strided_loop()
+                {
+                    bool overflow = false;
+                    first_[I] += loop_->stride_[I];
+                    if(first_[I] >= loop_->extent_[I])
+                    {
+                        first_[I] = loop_->thread_[I];
+                        overflow = true;
+                    }
+                    index_[I] = first_[I];
+                    range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
+                    return overflow;
+                }
+
+                template<size_t N>
+                ALPAKA_FN_ACC inline constexpr bool do_strided_loops()
+                {
+                    if constexpr(N == 0)
+                    {
+                        // overflow
+                        return true;
+                    }
+                    else
+                    {
+                        if(not nth_strided_loop<N - 1>())
+                        {
+                            return false;
+                        }
+                        else
+                        {
+                            return do_strided_loops<N - 1>();
+                        }
+                    }
+                    ALPAKA_UNREACHABLE(false);
+                }
+
+                // increment the iterator
+                ALPAKA_FN_ACC inline constexpr void increment()
+                {
+                    // linear N-dimensional loops over the elements associated to the thread;
+                    // do_elements_loops<>() returns true only if all of those loops overflow
+                    if(not do_elements_loops<Dim::value>())
+                    {
+                        // the elements loops did not overflow, return the next index
+                        return;
+                    }
+
+                    // strided N-dimensional loop over the threads in the kernel launch grid;
+                    // do_strided_loops<>() returns true only if all of those loops overflow
+                    if(not do_strided_loops<Dim::value>())
+                    {
+                        // the strided loops did not overflow, return the next index
+                        return;
+                    }
+
+                    // the iterator has reached or passed the end of the extent, clamp it to the extent
+                    first_ = loop_->extent_;
+                    range_ = loop_->extent_;
+                    index_ = loop_->extent_;
+                }
+
+                // const pointer to the UniformElementsND that the iterator refers to
+                UniformElementsND const* loop_;
+
+                // modified by the pre/post-increment operator
+                Vec first_; // first element processed by this thread
+                Vec range_; // end (exclusive) of the elements processed by this thread
+                Vec index_; // current element processed by this thread
+            };
+
+        private:
+            Vec const elements_; // number of elements per thread, in each dimension
+            Vec const thread_; // index of the first element of the current thread, in each dimension
+            Vec const stride_; // number of elements covered by the whole grid, in each dimension
+            Vec const extent_; // problem size (exclusive upper bound of the indices), in each dimension
+        };
+
+    } // namespace detail
+
+    /* uniformElementsND
+     *
+     * `uniformElementsND(acc, ...)` is a shorthand for `detail::UniformElementsND<TAcc>(acc, ...)`.
+     */
+
+    template<
+        typename TAcc,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto uniformElementsND(TAcc const& acc)
+    {
+        // no extent given: the range constructor uses the number of elements covered by the grid as the extent
+        return detail::UniformElementsND<TAcc>(acc);
+    }
+
+    template<
+        typename TAcc,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto uniformElementsND(
+        TAcc const& acc,
+        alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> extent) // problem size, one entry per dimension
+    {
+        return detail::UniformElementsND<TAcc>(acc, extent);
+    }
+
+    namespace detail
+    {
+
+        /* UniformGroupsAlong
+         *
+         * `UniformGroupsAlong<Dim>(acc, elements)` returns a one-dimensional iterable range that spans the group
+         * indices required to cover the given problem size along the `Dim` dimension, in units of the block size.
+         * `elements` indicates the total number of elements, across all groups; if not specified, it defaults to the
+         * kernel grid size along the `Dim` dimension.
+         *
+         * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can
+         * infer the accelerator type from the argument.
+         *
+         * In a 1-dimensional kernel, `uniformGroups(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, 0>(acc,
+         * ...)`.
+         *
+         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
+         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
+         * when converting CUDA or HIP code, `uniformGroupsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
+         * `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+         *
+         * `uniformGroupsAlong<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
+         * threads in a block see the same loop iterations, while threads in different blocks may see a different
+         * number of iterations. If the work division has more blocks than the required number of groups, the first
+         * blocks will perform one iteration of the loop, while the other blocks will exit the loop immediately. If the
+         * work division has less blocks than the required number of groups, some of the blocks will perform more than
+         * one iteration, in order to cover the whole problem space.
+         *
+         * If the problem size is not a multiple of the block size, the last group will process a number of elements
+         * smaller than the block size. However, also in this case all threads in the block will execute the same
+         * number of iterations of this loop: this makes it safe to use block-level synchronisations in the loop body.
+         * It is left to the inner loop (or the user) to ensure that only the correct number of threads process any
+         * data; this logic is implemented by `uniformGroupElementsAlong<Dim>(acc, group, elements)`.
+         *
+         * For example, if the block size is 64 and there are 400 elements
+         *
+         *   for (auto group: uniformGroupsAlong<Dim>(acc, 400))
+         *
+         * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
+         * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last
+         * group, group 6, should cover the elements from 384 to 399. All the threads of the block will process this
+         * last group; it is up to the inner loop to not process the non-existing elements after 399.
+         *
+         * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
+         * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from
+         * 0 to 6 will process one group while block 7 will not process any.
+         *
+         * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
+         * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0
+         * will process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6,
+         * and block 3 will process group 3.
+         *
+         * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
+         * `uniformGroupElementsAlong<Dim>`.
+         */
+
+        template<
+            typename TAcc,
+            std::size_t Dim,
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > Dim)>>
+        class UniformGroupsAlong
+        {
+        public:
+            using Idx = alpaka::Idx<TAcc>;
+
+            ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc)
+                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , extent_{stride_}
+            {
+            }
+
+            // extent is the total number of elements (not blocks); it is converted to a number of groups, rounding up
+            ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent)
+                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
+                , extent_{alpaka::core::divCeil(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])}
+            {
+            }
+
+            class const_iterator;
+            using iterator = const_iterator;
+
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                return const_iterator(stride_, extent_, first_);
+            }
+
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                return const_iterator(stride_, extent_, extent_);
+            }
+
+            class const_iterator
+            {
+                friend class UniformGroupsAlong;
+
+                ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
+                    : stride_{stride}
+                    , extent_{extent}
+                    , first_{std::min(first, extent)}
+                {
+                }
+
+            public:
+                ALPAKA_FN_ACC inline Idx operator*() const
+                {
+                    return first_;
+                }
+
+                // pre-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator& operator++()
+                {
+                    // advance the group index by the number of blocks in the grid along this dimension
+                    first_ += stride_;
+                    if(first_ < extent_)
+                        return *this;
+
+                    // the iterator has reached or passed the end of the extent, clamp it to the extent
+                    first_ = extent_;
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    ++(*this);
+                    return old;
+                }
+
+                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
+                {
+                    return (first_ == other.first_);
+                }
+
+                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // non-const to support iterator copy and assignment
+                Idx stride_;
+                Idx extent_;
+                // modified by the pre/post-increment operator
+                Idx first_;
+            };
+
+        private:
+            Idx const first_; // index of the current block in the grid, along Dim
+            Idx const stride_; // number of blocks in the grid, along Dim
+            Idx const extent_; // number of groups to iterate over (exclusive upper bound of the group index)
+        };
+
+    } // namespace detail
+
+    /* uniformGroups
+     *
+     * `uniformGroups(acc, elements)` returns a one-dimensional iterable range that spans the group indices required
+     * to cover the given problem size, in units of the block size. `elements` indicates the total number of elements,
+     * across all groups; if not specified, it defaults to the kernel grid size.
+     *
+     * `uniformGroups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
+     *
+     * `uniformGroups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
+     * see the same loop iterations, while threads in different blocks may see a different number of iterations. If the
+     * work division has more blocks than the required number of groups, the first blocks will perform one iteration of
+     * the loop, while the other blocks will exit the loop immediately. If the work division has less blocks than the
+     * required number of groups, some of the blocks will perform more than one iteration, in order to cover the whole
+     * problem space.
+     *
+     * If the problem size is not a multiple of the block size, the last group will process a number of elements
+     * smaller than the block size. However, also in this case all threads in the block will execute the same number of
+     * iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to
+     * the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic is
+     * implemented by `uniformGroupElements(acc, group, elements)`.
+     *
+     * For example, if the block size is 64 and there are 400 elements
+     *
+     *   for (auto group: uniformGroups(acc, 400))
+     *
+     * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
+     * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group,
+     * group 6, should cover the elements from 384 to 399. All the threads of the block will process this last group;
+     * it is up to the inner loop to not process the non-existing elements after 399.
+     *
+     * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
+     * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
+     * will process one group while block 7 will not process any.
+     *
+     * If the work division has fewer than 7 blocks, some of the blocks will perform more than one iteration of the
+     * loop, in order to cover the whole problem space. For example if the work division has 4 blocks, block 0 will
+     * process the groups 0 and 4, block 1 will process groups 1 and 5, block 2 will process groups 2 and 6, and block
+     * 3 will process group 3.
+     *
+     * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
+     *
+     * Note that `uniformGroups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
+     * use
+     *   - `uniformGroupsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
+     *   - `uniformGroupsAlongX(acc, ...)`, `uniformGroupsAlongY(acc, ...)`, or `uniformGroupsAlongZ(acc, ...)` to loop
+     *     along the fastest, second-fastest, or third-fastest dimension.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value == 1)>>
+    ALPAKA_FN_ACC inline auto uniformGroups(TAcc const& acc, TArgs... args)
+    {
+        // forward to the range along dimension 0, converting every argument to the accelerator's index type
+        return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    /* uniformGroupsAlong<Dim>
+     *
+     * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that
+     * can infer the accelerator type from the argument.
+     */
+
+    template<
+        std::size_t Dim,
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > Dim)>>
+    ALPAKA_FN_ACC inline auto uniformGroupsAlong(TAcc const& acc, TArgs... args)
+    {
+        // `Dim` indexes the accelerator's dimensions, so it must be strictly smaller than their number
+        return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    /* uniformGroupsAlongX, Y, Z
+     *
+     * Like `uniformGroups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
+     * dimensions.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto uniformGroupsAlongX(TAcc const& acc, TArgs... args)
+    {
+        constexpr auto dimX = alpaka::Dim<TAcc>::value - 1; // X is the last (fastest) dimension
+        return detail::UniformGroupsAlong<TAcc, dimX>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
+    ALPAKA_FN_ACC inline auto uniformGroupsAlongY(TAcc const& acc, TArgs... args)
+    {
+        constexpr auto dimY = alpaka::Dim<TAcc>::value - 2; // Y is the second-to-last (second-fastest) dimension
+        return detail::UniformGroupsAlong<TAcc, dimY>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
+    ALPAKA_FN_ACC inline auto uniformGroupsAlongZ(TAcc const& acc, TArgs... args)
+    {
+        constexpr auto dimZ = alpaka::Dim<TAcc>::value - 3; // Z is the third-to-last (third-fastest) dimension
+        return detail::UniformGroupsAlong<TAcc, dimZ>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    namespace detail
+    {
+
+        /* UniformGroupElementsAlong
+         *
+         * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iterable range that
+         * spans all the elements within the given `group` along dimension `Dim`, as obtained from
+         * `UniformGroupsAlong<Dim>`, up to `elements` (exclusive). `elements` indicates the total number of elements
+         * across all groups; if not specified, it defaults to the kernel grid size.
+         *
+         * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc,
+         * ...)` that can infer the accelerator type from the argument.
+         *
+         * In a 1-dimensional kernel, `uniformGroupElements(acc, ...)` is a shorthand for
+         * `UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
+         *
+         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
+         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
+         * when converting CUDA or HIP code, `uniformGroupElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
+         * `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
+         *
+         * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local`
+         * indices of the corresponding element. The global index spans a subset of the range from 0 to `elements`
+         * (excluded), while the local index spans the range from 0 to the block size (excluded).
+         *
+         * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if
+         * the global element index reaches `elements`.
+         *
+         * If the problem size is not a multiple of the block size, different threads may execute a different number of
+         * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
+         * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
+         * See `UniformElementsAlong<Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
+         * `uniformGroupElementsAlong<Dim>`.
+         *
+         * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
+         * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
+         * example, the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and
+         * check the element index explicitly inside the loop:
+         *
+         *  for (auto element : uniformGroupElementsAlong<N-1>(acc, group, round_up_by(elements,
+         * alpaka::warp::getSize(acc)))) { bool flag = false; if (element < elements) {
+         *      // do some work and compute a result flag only for the valid elements
+         *      flag = do_some_work();
+         *    }
+         *    // check if any valid element had a positive result
+         *    if (alpaka::warp::any(acc, flag)) {
+         *      // ...
+         *    }
+         *  }
+         *
+         * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
+         * `N-1`.
+         */
+
+        template<
+            typename TAcc,
+            std::size_t Dim, // dimension to iterate along; indexes vectors, so must be < alpaka::Dim<TAcc>::value
+            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > Dim)>>
+        class UniformGroupElementsAlong
+        {
+        public:
+            using Idx = alpaka::Idx<TAcc>;
+            // Range over the elements assigned to the current thread within group `block`, with no upper bound.
+            ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
+                : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
+                , local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+                , range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
+            {
+            }
+            // As above, truncated at `extent`. NOTE(review): `extent - first_` assumes first_ <= extent - confirm.
+            ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
+                : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
+                , local_{std::min(
+                      extent - first_,
+                      alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim]
+                          * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
+                , range_{
+                      std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
+            {
+            }
+
+            class const_iterator;
+            using iterator = const_iterator;
+            // First element of the current thread; compares equal to end() when the range is empty.
+            ALPAKA_FN_ACC inline const_iterator begin() const
+            {
+                return const_iterator(local_, first_, range_);
+            }
+            // One past the last element of the current thread.
+            ALPAKA_FN_ACC inline const_iterator end() const
+            {
+                return const_iterator(range_, first_, range_);
+            }
+
+            class const_iterator
+            {
+                friend class UniformGroupElementsAlong;
+                // Private constructor: only the enclosing range creates iterators.
+                ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
+                    : index_{local}
+                    , first_{first}
+                    , range_{range}
+                {
+                }
+
+            public:
+                ALPAKA_FN_ACC inline ElementIndex<Idx> operator*() const
+                {
+                    return ElementIndex<Idx>{index_ + first_, index_}; // problem-wide index, then index in the block
+                }
+
+                // pre-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator& operator++()
+                {
+                    // increment the index along the elements processed by the current thread
+                    ++index_;
+                    if(index_ < range_)
+                        return *this;
+
+                    // the iterator has reached or passed the end of the extent, clamp it to the extent
+                    index_ = range_;
+                    return *this;
+                }
+
+                // post-increment the iterator
+                ALPAKA_FN_ACC inline const_iterator operator++(int)
+                {
+                    const_iterator old = *this;
+                    ++(*this);
+                    return old;
+                }
+                // Only the position is compared; first_ and range_ are not taken into account.
+                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
+                {
+                    return (index_ == other.index_);
+                }
+
+                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
+                {
+                    return not(*this == other);
+                }
+
+            private:
+                // current position within the block; modified by the pre/post-increment operator
+                Idx index_;
+                // offset of the block and end of the range; non-const to support iterator copy and assignment
+                Idx first_;
+                Idx range_;
+            };
+
+        private:
+            Idx const first_; // index of the first element of the block: block * elements per block
+            Idx const local_; // in-block index of the first element of this thread
+            Idx const range_; // in-block index one past the last element of this thread
+        };
+
+    } // namespace detail
+
+    /* uniformGroupElements
+     *
+     * `uniformGroupElements(acc, group, elements)` returns a one-dimensional iterable range that spans all the
+     * elements within the given `group`, as obtained from `uniformGroups`, up to `elements` (exclusive). `elements`
+     * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
+     *
+     * `uniformGroupElements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc, 0>(acc, ...)`.
+     *
+     * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices
+     * of the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded),
+     * while the local index spans the range from 0 to the block size (excluded).
+     *
+     * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
+     * global element index reaches `elements`.
+     *
+     * If the problem size is not a multiple of the block size, different threads may execute a different number of
+     * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
+     * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
+     * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
+     *
+     * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
+     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
+     * the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the
+     * element index explicitly inside the loop:
+     *
+     *  for (auto element : uniformGroupElements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
+     *    bool flag = false;
+     *    if (element.global < elements) {
+     *      // do some work and compute a result flag only for the valid elements
+     *      flag = do_some_work();
+     *    }
+     *    // check if any valid element had a positive result
+     *    if (alpaka::warp::any(acc, flag)) {
+     *      // ...
+     *    }
+     *  }
+     *
+     * Note that `uniformGroupElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
+     * kernels, use
+     *   - `detail::UniformGroupElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension
+     *     `Dim`;
+     *   - `uniformGroupElementsAlongX(acc, ...)`, `uniformGroupElementsAlongY(acc, ...)`, or
+     *     `uniformGroupElementsAlongZ(acc, ...)` to loop along the fastest, second-fastest, or third-fastest
+     *     dimension.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
+    ALPAKA_FN_ACC inline auto uniformGroupElements(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>; // all the extra arguments are converted to the accelerator index type
+        return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...); // only dimension: 0
+    }
+
+    /* uniformGroupElementsAlong<Dim>
+     *
+     * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc,
+     * Dim>(acc, ...)` that can infer the accelerator type from the argument.
+     */
+
+    template<
+        std::size_t Dim,
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > Dim)>>
+    ALPAKA_FN_ACC inline auto uniformGroupElementsAlong(TAcc const& acc, TArgs... args)
+    {
+        // Dim is used as a vector index, so it must be strictly smaller than the kernel dimensionality.
+        return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<alpaka::Idx<TAcc>>(args)...);
+    }
+
+    /* uniformGroupElementsAlongX, Y, Z
+     *
+     * Like `uniformGroupElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
+     * dimensions.
+     */
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
+    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongX(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>; // X maps to the last dimension (Dim - 1); needs at least a 1D kernel
+        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
+    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongY(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>; // Y maps to the second-to-last dimension (Dim - 2); needs at least 2D
+        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
+    }
+
+    template<
+        typename TAcc,
+        typename... TArgs,
+        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
+    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongZ(TAcc const& acc, TArgs... args)
+    {
+        using Idx = alpaka::Idx<TAcc>; // Z maps to the third-to-last dimension (Dim - 3); needs at least 3D
+        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
+    }
+
+} // namespace alpaka
diff --git a/include/alpaka/extent/Traits.hpp b/include/alpaka/extent/Traits.hpp
new file mode 100644
index 0000000..460269f
--- /dev/null
+++ b/include/alpaka/extent/Traits.hpp
@@ -0,0 +1,162 @@
+/* Copyright 2023 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Unreachable.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/meta/Fold.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <functional>
+#include <type_traits>
+#include <utility>
+
+namespace alpaka
+{
+    //! The extent traits.
+    namespace trait
+    {
+        //! The per-dimension extent get trait (deprecated: specialize GetExtents instead).
+        //!
+        //! If not specialized explicitly it returns 1.
+        template<typename TIdxIntegralConst, typename TExtent, typename TSfinae = void>
+        struct [[deprecated("Specialize GetExtents instead")]] GetExtent
+        {
+            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const&) -> Idx<TExtent>
+            {
+                return static_cast<Idx<TExtent>>(1);
+            }
+        };
+
+        //! The GetExtents trait for getting the extents of an object as an alpaka::Vec.
+        template<typename TExtent, typename TSfinae = void>
+        struct GetExtents;
+    } // namespace trait
+
+    //! \return The extent in dimension Tidx, as provided by the (deprecated) trait::GetExtent.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<std::size_t Tidx, typename TExtent>
+    [[deprecated("use getExtents(extent)[Tidx] instead")]] ALPAKA_FN_HOST_ACC auto getExtent(
+        TExtent const& extent = TExtent()) -> Idx<TExtent>
+    {
+#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+        return trait::GetExtent<DimInt<Tidx>, TExtent>::getExtent(extent);
+#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+    }
+
+    //! \return The extents of the given object, as computed by the trait::GetExtents specialization for T.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T>
+    ALPAKA_FN_HOST_ACC auto getExtents(T const& object) -> Vec<Dim<T>, Idx<T>>
+    {
+        return trait::GetExtents<T>{}(object);
+    }
+
+    //! \tparam T has to specialize GetExtents.
+    //! \return The extents of the given object (forwards to getExtents).
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T>
+    [[deprecated("use getExtents() instead")]] ALPAKA_FN_HOST_ACC constexpr auto getExtentVec(T const& object = {})
+        -> Vec<Dim<T>, Idx<T>>
+    {
+        return getExtents(object);
+    }
+
+    //! \tparam T has to specialize GetExtents.
+    //! \return The last TDim::value components of the extents of the given object.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename T>
+    ALPAKA_FN_HOST_ACC constexpr auto getExtentVecEnd(T const& object = {}) -> Vec<TDim, Idx<T>>
+    {
+        static_assert(TDim::value <= Dim<T>::value, "Cannot get more items than the extent holds");
+
+        [[maybe_unused]] auto const e = getExtents(object);
+        Vec<TDim, Idx<T>> v{};
+        if constexpr(TDim::value > 0)
+        {
+            for(unsigned i = 0; i < TDim::value; i++)
+                v[i] = e[(Dim<T>::value - TDim::value) + i]; // skip the leading Dim<T> - TDim components
+        }
+        return v;
+    }
+
+    //! \return The width: the extent of the last dimension, or 1 if TExtent has no dimensions.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TExtent>
+    ALPAKA_FN_HOST_ACC auto getWidth(TExtent const& extent = TExtent()) -> Idx<TExtent>
+    {
+        if constexpr(Dim<TExtent>::value >= 1)
+            return getExtents(extent)[Dim<TExtent>::value - 1u];
+        else
+            return 1;
+
+        ALPAKA_UNREACHABLE({});
+    }
+
+    //! \return The height: the extent of the second-to-last dimension, or 1 if TExtent has fewer than 2 dimensions.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TExtent>
+    ALPAKA_FN_HOST_ACC auto getHeight(TExtent const& extent = TExtent()) -> Idx<TExtent>
+    {
+        if constexpr(Dim<TExtent>::value >= 2)
+            return getExtents(extent)[Dim<TExtent>::value - 2u];
+        else
+            return 1;
+
+        ALPAKA_UNREACHABLE({});
+    }
+
+    //! \return The depth: the extent of the third-to-last dimension, or 1 if TExtent has fewer than 3 dimensions.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TExtent>
+    ALPAKA_FN_HOST_ACC auto getDepth(TExtent const& extent = TExtent()) -> Idx<TExtent>
+    {
+        if constexpr(Dim<TExtent>::value >= 3)
+            return getExtents(extent)[Dim<TExtent>::value - 3u];
+        else
+            return 1;
+
+        ALPAKA_UNREACHABLE({});
+    }
+
+    //! \return The product of the extents of the given object, i.e. `getExtents(object).prod()`.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T>
+    ALPAKA_FN_HOST_ACC auto getExtentProduct(T const& object) -> Idx<T>
+    {
+        return getExtents(object).prod();
+    }
+
+    namespace trait
+    {
+        //! The Vec extent get trait specialization: a Vec is its own extent and is returned unchanged.
+        template<typename TDim, typename TVal>
+        struct GetExtents<Vec<TDim, TVal>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC constexpr auto operator()(Vec<TDim, TVal> const& extent) const -> Vec<TDim, TVal>
+            {
+                return extent;
+            }
+        };
+
+        template<typename Integral> // extent get trait specialization for plain integral values
+        struct GetExtents<Integral, std::enable_if_t<std::is_integral_v<Integral>>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC auto operator()(Integral i) const
+            {
+                return Vec{i}; // a scalar extent becomes a Vec holding that single value
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/idx/Accessors.hpp b/include/alpaka/idx/Accessors.hpp
new file mode 100644
index 0000000..f329728
--- /dev/null
+++ b/include/alpaka/idx/Accessors.hpp
@@ -0,0 +1,116 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+#include "alpaka/workdiv/Traits.hpp"
+
+#include <utility>
+
+namespace alpaka
+{
+    //! Get the indices selected by TOrigin and TUnit, using separate index and work division objects.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOrigin, typename TUnit, typename TIdx, typename TWorkDiv>
+    ALPAKA_FN_HOST_ACC auto getIdx(TIdx const& idx, TWorkDiv const& workDiv) -> Vec<Dim<TWorkDiv>, Idx<TIdx>>
+    {
+        return trait::GetIdx<TIdx, TOrigin, TUnit>::getIdx(idx, workDiv);
+    }
+
+    //! Get the indices selected by TOrigin and TUnit from one object used as both index and work division.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOrigin, typename TUnit, typename TIdxWorkDiv>
+    ALPAKA_FN_HOST_ACC auto getIdx(TIdxWorkDiv const& idxWorkDiv) -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
+    {
+        return trait::GetIdx<TIdxWorkDiv, TOrigin, TUnit>::getIdx(idxWorkDiv, idxWorkDiv);
+    }
+
+    namespace trait
+    {
+        //! The grid block index get trait specialization; forwards to the ConceptIdxGb implementation base of TIdxGb.
+        template<typename TIdxGb>
+        struct GetIdx<TIdxGb, origin::Grid, unit::Blocks>
+        {
+            using ImplementationBase = concepts::ImplementationBase<ConceptIdxGb, TIdxGb>;
+
+            //! \return The index of the current block in the grid.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST_ACC static auto getIdx(TIdxGb const& idx, TWorkDiv const& workDiv)
+                -> Vec<Dim<ImplementationBase>, Idx<ImplementationBase>>
+            {
+                return trait::GetIdx<ImplementationBase, origin::Grid, unit::Blocks>::getIdx(idx, workDiv);
+            }
+        };
+
+        //! The block thread index get trait specialization; forwards to the ConceptIdxBt implementation base of TIdxBt.
+        template<typename TIdxBt>
+        struct GetIdx<TIdxBt, origin::Block, unit::Threads>
+        {
+            using ImplementationBase = concepts::ImplementationBase<ConceptIdxBt, TIdxBt>;
+
+            //! \return The index of the current thread in the block.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST_ACC static auto getIdx(TIdxBt const& idx, TWorkDiv const& workDiv)
+                -> Vec<Dim<ImplementationBase>, Idx<ImplementationBase>>
+            {
+                return trait::GetIdx<ImplementationBase, origin::Block, unit::Threads>::getIdx(idx, workDiv);
+            }
+        };
+
+        //! The grid thread index get trait specialization, composed from the grid-block and block-thread indices.
+        template<typename TIdx>
+        struct GetIdx<TIdx, origin::Grid, unit::Threads>
+        {
+            //! \return The index of the current thread in the grid: block index * block thread extent + thread index.
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST_ACC static auto getIdx(TIdx const& idx, TWorkDiv const& workDiv)
+            {
+                return alpaka::getIdx<origin::Grid, unit::Blocks>(idx, workDiv)
+                           * getWorkDiv<origin::Block, unit::Threads>(workDiv)
+                       + alpaka::getIdx<origin::Block, unit::Threads>(idx, workDiv);
+            }
+        };
+    } // namespace trait
+
+    //! Get the index of the first element this thread computes: the grid thread index times the thread extent.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIdxWorkDiv, typename TGridThreadIdx, typename TThreadElemExtent>
+    ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(
+        [[maybe_unused]] TIdxWorkDiv const& idxWorkDiv,
+        TGridThreadIdx const& gridThreadIdx,
+        TThreadElemExtent const& threadElemExtent) -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
+    {
+        return gridThreadIdx * threadElemExtent; // idxWorkDiv only contributes its type to the return type
+    }
+
+    //! Get the index of the first element this thread computes, reading the thread element extent from idxWorkDiv.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIdxWorkDiv, typename TGridThreadIdx>
+    ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(TIdxWorkDiv const& idxWorkDiv, TGridThreadIdx const& gridThreadIdx)
+        -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
+    {
+        auto const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(idxWorkDiv));
+        return getIdxThreadFirstElem(idxWorkDiv, gridThreadIdx, threadElemExtent);
+    }
+
+    //! Get the index of the first element this thread computes, reading the grid thread index from idxWorkDiv.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIdxWorkDiv>
+    ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(TIdxWorkDiv const& idxWorkDiv)
+        -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
+    {
+        auto const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(idxWorkDiv));
+        return getIdxThreadFirstElem(idxWorkDiv, gridThreadIdx);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/idx/MapIdx.hpp b/include/alpaka/idx/MapIdx.hpp
new file mode 100644
index 0000000..f081252
--- /dev/null
+++ b/include/alpaka/idx/MapIdx.hpp
@@ -0,0 +1,98 @@
+/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Jan Stephan, Jeffrey Kelling, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Unreachable.hpp"
+#include "alpaka/vec/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    //! Maps an N-dimensional index to an N-dimensional position. At least one dimension must always be 1 or zero.
+    //! A zero-dimensional input or output yields zeros; equal dimensions return the input unchanged.
+    //! \tparam TDimOut Dimension of the index vector to map to.
+    //! \param in The index vector to map from.
+    //! \param extent The extents of the input or output space, whichever has more than 1 dimensions.
+    ALPAKA_NO_HOST_ACC_WARNING template<
+        std::size_t TDimOut,
+        std::size_t TDimIn,
+        std::size_t TDimExtents,
+        typename TElem>
+    ALPAKA_FN_HOST_ACC auto mapIdx(Vec<DimInt<TDimIn>, TElem> const& in, Vec<DimInt<TDimExtents>, TElem> const& extent)
+        -> Vec<DimInt<TDimOut>, TElem>
+    {
+        if constexpr(TDimOut == 0 || TDimIn == 0)
+            return Vec<DimInt<TDimOut>, TElem>::zeros();
+        else if constexpr(TDimOut == TDimIn)
+            return in;
+        else if constexpr(TDimOut == 1) // linearize: the last input dimension varies fastest
+        {
+            TElem out = in[0];
+            for(std::size_t d = 1; d < TDimIn; ++d)
+                out = static_cast<TElem>(out * extent[d] + in[d]);
+            return {out};
+        }
+        else if constexpr(TDimIn == 1) // delinearize: split off the dimensions from the last one to the first
+        {
+            auto flat = in.front();
+            Vec<DimInt<TDimOut>, TElem> out;
+            for(std::size_t d = TDimOut - 1u; d > 0; d--)
+            {
+                out[d] = static_cast<TElem>(flat % extent[d]);
+                flat /= extent[d];
+            }
+            out.front() = static_cast<TElem>(flat); // whatever is left belongs to the first dimension
+            return out;
+        }
+        else
+            static_assert(!sizeof(TElem), "Not implemented"); // N-D to M-D with N, M > 1 and N != M
+
+        ALPAKA_UNREACHABLE({});
+    }
+
+    //! Maps an N dimensional index to a N dimensional position based on the pitches of a view without padding or a
+    //! byte view. At least one dimension must always be 1 or zero.
+    //! A zero-dimensional input or output yields zeros; equal dimensions return the input unchanged.
+    //! \tparam TDimOut Dimension of the index vector to map to.
+    //! \param in The index vector to map from.
+    //! \param pitches The pitches of the input or output space, whichever has more than 1 dimensions.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<std::size_t TDimOut, std::size_t TDimIn, std::size_t TidxDimPitch, typename TElem>
+    ALPAKA_FN_HOST_ACC auto mapIdxPitchBytes(
+        Vec<DimInt<TDimIn>, TElem> const& in,
+        Vec<DimInt<TidxDimPitch>, TElem> const& pitches) -> Vec<DimInt<TDimOut>, TElem>
+    {
+        if constexpr(TDimOut == 0 || TDimIn == 0)
+            return Vec<DimInt<TDimOut>, TElem>::zeros();
+        else if constexpr(TDimOut == TDimIn)
+            return in;
+        else if constexpr(TDimOut == 1)
+        {
+            using DimMinusOne = DimInt<TDimIn - 1>; // all the dimensions but the last one
+            return {in.back() + (subVecBegin<DimMinusOne>(pitches) * subVecBegin<DimMinusOne>(in)).sum()};
+        }
+        else if constexpr(TDimIn == 1)
+        {
+            auto result = Vec<DimInt<TDimOut>, TElem>::zeros();
+
+            TElem out = in.front();
+            for(std::size_t d = 0; d < TDimOut - 1u; ++d) // divide by each pitch, carrying the remainder over
+            {
+                result[d] = static_cast<TElem>(out / pitches[d]);
+                out %= pitches[d];
+            }
+            result.back() = out;
+
+            return result;
+        }
+        else
+            static_assert(!sizeof(TElem), "Not implemented"); // N-D to M-D with N, M > 1 and N != M
+
+        ALPAKA_UNREACHABLE({});
+    }
+} // namespace alpaka
diff --git a/include/alpaka/idx/Traits.hpp b/include/alpaka/idx/Traits.hpp
new file mode 100644
index 0000000..88e2365
--- /dev/null
+++ b/include/alpaka/idx/Traits.hpp
@@ -0,0 +1,44 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <type_traits>
+#include <utility>
+
+namespace alpaka
+{
+    struct ConceptIdxBt // empty tag type naming the block-thread index concept
+    {
+    };
+
+    struct ConceptIdxGb // empty tag type naming the grid-block index concept
+    {
+    };
+
+    //! The idx trait.
+    namespace trait
+    {
+        //! The idx type trait.
+        template<typename T, typename TSfinae = void>
+        struct IdxType;
+    } // namespace trait
+
+    template<typename T>
+    using Idx = typename trait::IdxType<T>::type; // shorthand for the index type that trait::IdxType maps T to
+
+    namespace trait
+    {
+        //! The arithmetic idx type trait specialization: the index type of an arithmetic type is its decayed self.
+        template<typename T>
+        struct IdxType<T, std::enable_if_t<std::is_arithmetic_v<T>>>
+        {
+            using type = std::decay_t<T>;
+        };
+
+        //! The index get trait.
+        template<typename TIdx, typename TOrigin, typename TUnit, typename TSfinae = void>
+        struct GetIdx;
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
new file mode 100644
index 0000000..54ef780
--- /dev/null
+++ b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
@@ -0,0 +1,77 @@
+/* Copyright 2023 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/core/Sycl.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka::bt
+{
+    //! The SYCL accelerator ND index provider; wraps the sycl::nd_item of the current work item.
+    template<typename TDim, typename TIdx>
+    class IdxBtGenericSycl : public concepts::Implements<ConceptIdxBt, IdxBtGenericSycl<TDim, TIdx>>
+    {
+    public:
+        using IdxBtBase = IdxBtGenericSycl;
+
+        explicit IdxBtGenericSycl(sycl::nd_item<TDim::value> work_item) : m_item_bt{work_item}
+        {
+        }
+
+        sycl::nd_item<TDim::value> m_item_bt; //!< The work item the local ids are read from.
+    };
+} // namespace alpaka::bt
+
+namespace alpaka::trait
+{
+    //! The SYCL accelerator index dimension get trait specialization: the dimension is the TDim argument.
+    template<typename TDim, typename TIdx>
+    struct DimType<bt::IdxBtGenericSycl<TDim, TIdx>>
+    {
+        using type = TDim;
+    };
+
+    //! The SYCL accelerator block thread index get trait specialization.
+    template<typename TDim, typename TIdx>
+    struct GetIdx<bt::IdxBtGenericSycl<TDim, TIdx>, origin::Block, unit::Threads>
+    {
+        //! \return The index of the current thread in the block; SYCL local id 0 ends up in the last component.
+        template<typename TWorkDiv>
+        static auto getIdx(bt::IdxBtGenericSycl<TDim, TIdx> const& idx, TWorkDiv const&) -> Vec<TDim, TIdx>
+        {
+            if constexpr(TDim::value == 1)
+                return Vec<TDim, TIdx>{static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
+            else if constexpr(TDim::value == 2)
+            {
+                return Vec<TDim, TIdx>{
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(1)),
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
+            }
+            else // every other dimensionality is handled as 3D
+            {
+                return Vec<TDim, TIdx>{
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(2)),
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(1)),
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
+            }
+        }
+    };
+
+    //! The SYCL accelerator block thread index idx type trait specialization: the index type is the TIdx argument.
+    template<typename TDim, typename TIdx>
+    struct IdxType<bt::IdxBtGenericSycl<TDim, TIdx>>
+    {
+        using type = TIdx;
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/idx/bt/IdxBtLinear.hpp b/include/alpaka/idx/bt/IdxBtLinear.hpp
new file mode 100644
index 0000000..53f876c
--- /dev/null
+++ b/include/alpaka/idx/bt/IdxBtLinear.hpp
@@ -0,0 +1,72 @@
+/* Copyright 2022 Axel Huebl, Jeffrey Kelling, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/idx/MapIdx.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+#include "alpaka/workdiv/Traits.hpp"
+
+namespace alpaka
+{
+    namespace bt
+    {
+        //! General ND bt index provider based on a linear index.
+        template<typename TDim, typename TIdx>
+        class IdxBtLinear : public concepts::Implements<ConceptIdxBt, IdxBtLinear<TDim, TIdx>>
+        {
+        public:
+            IdxBtLinear(TIdx blockThreadIdx) : m_blockThreadIdx(blockThreadIdx)
+            {
+            }
+
+            const TIdx m_blockThreadIdx; //!< The linear index that the GetIdx trait maps to an ND index.
+        };
+    } // namespace bt
+
+    namespace trait
+    {
+        //! The IdxBtLinear index dimension get trait specialization: the dimension is the TDim argument.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtLinear<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! The IdxBtLinear block thread index get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtLinear<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //! \return The index of the current thread in the block, mapped from the linear index via mapIdx.
+            template<typename TWorkDiv>
+            static auto getIdx(bt::IdxBtLinear<TDim, TIdx> const& idx, TWorkDiv const& workDiv) -> Vec<TDim, TIdx>
+            {
+                return mapIdx<TDim::value>(
+                    Vec<DimInt<1u>, TIdx>(idx.m_blockThreadIdx),
+                    getWorkDiv<Block, Threads>(workDiv));
+            }
+        };
+
+        template<typename TIdx> // one-dimensional case: no mapping and no work division needed
+        struct GetIdx<bt::IdxBtLinear<DimInt<1u>, TIdx>, origin::Block, unit::Threads>
+        {
+            //! \return The index of the current thread in the block.
+            template<typename TWorkDiv>
+            static auto getIdx(bt::IdxBtLinear<DimInt<1u>, TIdx> const& idx, TWorkDiv const&) -> Vec<DimInt<1u>, TIdx>
+            {
+                return idx.m_blockThreadIdx;
+            }
+        };
+
+        //! The IdxBtLinear block thread index idx type trait specialization: the index type is the TIdx argument.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtLinear<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/idx/bt/IdxBtOmp.hpp b/include/alpaka/idx/bt/IdxBtOmp.hpp
new file mode 100644
index 0000000..df5a96a
--- /dev/null
+++ b/include/alpaka/idx/bt/IdxBtOmp.hpp
@@ -0,0 +1,77 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/idx/MapIdx.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+#include "alpaka/workdiv/Traits.hpp"
+
+#ifdef _OPENMP
+
+#    include <omp.h>
+
+namespace alpaka
+{
+    namespace bt
+    {
+        //! The OpenMP accelerator index provider; stateless, the GetIdx trait queries omp_get_thread_num().
+        template<typename TDim, typename TIdx>
+        class IdxBtOmp : public concepts::Implements<ConceptIdxBt, IdxBtOmp<TDim, TIdx>>
+        {
+        };
+    } // namespace bt
+
+    namespace trait
+    {
+        //! The OpenMP accelerator index dimension get trait specialization: the dimension is the TDim argument.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtOmp<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! The OpenMP accelerator block thread index get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtOmp<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //! \return The index of the current thread in the block, mapped from the OpenMP thread number.
+            template<typename TWorkDiv>
+            static auto getIdx(bt::IdxBtOmp<TDim, TIdx> const& /* idx */, TWorkDiv const& workDiv) -> Vec<TDim, TIdx>
+            {
+                // We assume that the thread id is non-negative.
+                ALPAKA_ASSERT_ACC(::omp_get_thread_num() >= 0);
+                // \TODO: Would it be faster to precompute the index and cache it inside an array?
+                return mapIdx<TDim::value>(
+                    Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(::omp_get_thread_num())),
+                    getWorkDiv<Block, Threads>(workDiv));
+            }
+        };
+
+        template<typename TIdx> // one-dimensional case: the OpenMP thread number is used directly
+        struct GetIdx<bt::IdxBtOmp<DimInt<1u>, TIdx>, origin::Block, unit::Threads>
+        {
+            //! \return The index of the current thread in the block.
+            template<typename TWorkDiv>
+            static auto getIdx(bt::IdxBtOmp<DimInt<1u>, TIdx> const& /* idx */, TWorkDiv const&)
+                -> Vec<DimInt<1u>, TIdx>
+            {
+                return Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(::omp_get_thread_num()));
+            }
+        };
+
+        //! The OpenMP accelerator block thread index idx type trait specialization: the index type is TIdx.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtOmp<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp b/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
new file mode 100644
index 0000000..4d94d0f
--- /dev/null
+++ b/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
@@ -0,0 +1,77 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <map>
+#include <thread>
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+
+namespace alpaka
+{
+    namespace bt
+    {
+        //! Block thread index provider that looks indices up in a std::thread::id -> Vec map it only references.
+        template<typename TDim, typename TIdx>
+        class IdxBtRefThreadIdMap : public concepts::Implements<ConceptIdxBt, IdxBtRefThreadIdMap<TDim, TIdx>>
+        {
+        public:
+            using ThreadIdToIdxMap = std::map<std::thread::id, Vec<TDim, TIdx>>; //!< Key: thread id, value: index.
+
+            ALPAKA_FN_HOST IdxBtRefThreadIdMap(ThreadIdToIdxMap const& mThreadToIndices)
+                : m_threadToIndexMap(mThreadToIndices) // Only a reference is kept; the map must outlive this object.
+            {
+            }
+
+            ALPAKA_FN_HOST IdxBtRefThreadIdMap(IdxBtRefThreadIdMap const&) = delete; // Not copy-constructible.
+            ALPAKA_FN_HOST auto operator=(IdxBtRefThreadIdMap const&) -> IdxBtRefThreadIdMap& = delete;
+
+        public:
+            ThreadIdToIdxMap const& m_threadToIndexMap; //!< Non-owning reference to the thread id -> index map.
+        };
+    } // namespace bt
+
+    namespace trait
+    {
+        //! DimType trait specialization: bt::IdxBtRefThreadIdMap<TDim, TIdx> reports TDim as its dimension.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtRefThreadIdMap<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! GetIdx specialization for bt::IdxBtRefThreadIdMap: index looked up by the calling thread's id.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtRefThreadIdMap<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //! \return The index stored in the provider's map for the calling std::thread.
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST static auto getIdx(
+                bt::IdxBtRefThreadIdMap<TDim, TIdx> const& idx,
+                TWorkDiv const& /* workDiv */) -> Vec<TDim, TIdx>
+            {
+                auto const& indexByThread = idx.m_threadToIndexMap;
+                auto const found = indexByThread.find(std::this_thread::get_id());
+                ALPAKA_ASSERT(found != indexByThread.end());
+                return found->second;
+            }
+        };
+
+        //! IdxType trait specialization: bt::IdxBtRefThreadIdMap<TDim, TIdx> reports TIdx as its index type.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtRefThreadIdMap<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp b/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..ff0366f
--- /dev/null
+++ b/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,81 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
+ * Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    namespace bt
+    {
+        //! The CUDA/HIP accelerator ND block thread index provider. It has no members of its own.
+        template<typename TDim, typename TIdx>
+        class IdxBtUniformCudaHipBuiltIn
+            : public concepts::Implements<ConceptIdxBt, IdxBtUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+        };
+    } // namespace bt
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+    namespace trait
+    {
+        //! DimType trait specialization: bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx> reports TDim as its dimension.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! GetIdx trait specialization: block thread index read from the CUDA/HIP built-in thread index.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //! \return The index of the current thread in the block; neither argument is read.
+            template<typename TWorkDiv>
+            __device__ static auto getIdx(bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx> const& /* idx */, TWorkDiv const&)
+                -> Vec<TDim, TIdx>
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // CUDA: pass threadIdx directly, then cast elements to TIdx.
+                return castVec<TIdx>(getOffsetVecEnd<TDim>(threadIdx));
+#        else // HIP: first build a 3-component vector in (z, y, x) order from the built-ins.
+                return getOffsetVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
+                    static_cast<TIdx>(hipThreadIdx_z),
+                    static_cast<TIdx>(hipThreadIdx_y),
+                    static_cast<TIdx>(hipThreadIdx_x)));
+#        endif
+            }
+        };
+
+        //! IdxType trait specialization: bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx> reports TIdx as its index type.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+
+#    endif
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/idx/bt/IdxBtZero.hpp b/include/alpaka/idx/bt/IdxBtZero.hpp
new file mode 100644
index 0000000..be90326
--- /dev/null
+++ b/include/alpaka/idx/bt/IdxBtZero.hpp
@@ -0,0 +1,53 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+namespace alpaka
+{
+    namespace bt
+    {
+        //! A block thread index provider without members; its GetIdx trait always yields the all-zero index.
+        template<typename TDim, typename TIdx>
+        class IdxBtZero : public concepts::Implements<ConceptIdxBt, IdxBtZero<TDim, TIdx>>
+        {
+        };
+    } // namespace bt
+
+    namespace trait
+    {
+        //! DimType trait specialization: bt::IdxBtZero<TDim, TIdx> reports TDim as its dimension.
+        template<typename TDim, typename TIdx>
+        struct DimType<bt::IdxBtZero<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! GetIdx trait specialization for bt::IdxBtZero: the block thread index is constant.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<bt::IdxBtZero<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //! \return Vec<TDim, TIdx>::zeros(), independent of both arguments.
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST static auto getIdx(
+                bt::IdxBtZero<TDim, TIdx> const& /* idx */,
+                TWorkDiv const& /* workDiv */) -> Vec<TDim, TIdx>
+            {
+                return Vec<TDim, TIdx>::zeros();
+            }
+        };
+
+        //! IdxType trait specialization: bt::IdxBtZero<TDim, TIdx> reports TIdx as its index type.
+        template<typename TDim, typename TIdx>
+        struct IdxType<bt::IdxBtZero<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
new file mode 100644
index 0000000..42547ef
--- /dev/null
+++ b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
@@ -0,0 +1,77 @@
+/* Copyright 2023 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/core/Sycl.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka::gb
+{
+    //! The SYCL accelerator ND grid block index provider; it stores a copy of the sycl::nd_item it is built from.
+    template<typename TDim, typename TIdx>
+    class IdxGbGenericSycl : public concepts::Implements<ConceptIdxGb, IdxGbGenericSycl<TDim, TIdx>>
+    {
+    public:
+        using IdxGbBase = IdxGbGenericSycl; //!< Alias for this class itself.
+
+        explicit IdxGbGenericSycl(sycl::nd_item<TDim::value> work_item) : m_item_gb{work_item}
+        {
+        }
+
+        sycl::nd_item<TDim::value> m_item_gb; //!< The nd_item passed to the constructor, held by value.
+    };
+} // namespace alpaka::gb
+
+namespace alpaka::trait
+{
+    //! DimType trait specialization: gb::IdxGbGenericSycl<TDim, TIdx> reports TDim as its dimension.
+    template<typename TDim, typename TIdx>
+    struct DimType<gb::IdxGbGenericSycl<TDim, TIdx>>
+    {
+        using type = TDim;
+    };
+
+    //! GetIdx specialization for gb::IdxGbGenericSycl: index of the current block in the grid.
+    template<typename TDim, typename TIdx>
+    struct GetIdx<gb::IdxGbGenericSycl<TDim, TIdx>, origin::Grid, unit::Blocks>
+    {
+        //! \return The group ids of the stored nd_item, listed from the highest SYCL dimension down to 0.
+        template<typename TWorkDiv>
+        static auto getIdx(gb::IdxGbGenericSycl<TDim, TIdx> const& idx, TWorkDiv const&)
+        {
+            // Reads one group id of the stored nd_item and converts it to TIdx.
+            auto const groupId = [&idx](int const dim) { return static_cast<TIdx>(idx.m_item_gb.get_group(dim)); };
+
+            if constexpr(TDim::value == 1)
+            {
+                return Vec<TDim, TIdx>(groupId(0));
+            }
+            else if constexpr(TDim::value == 2)
+            {
+                return Vec<TDim, TIdx>(groupId(1), groupId(0));
+            }
+            else
+            {
+                return Vec<TDim, TIdx>(groupId(2), groupId(1), groupId(0));
+            }
+        }
+    };
+
+    //! IdxType trait specialization: gb::IdxGbGenericSycl<TDim, TIdx> reports TIdx as its index type.
+    template<typename TDim, typename TIdx>
+    struct IdxType<gb::IdxGbGenericSycl<TDim, TIdx>>
+    {
+        using type = TIdx;
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/idx/gb/IdxGbLinear.hpp b/include/alpaka/idx/gb/IdxGbLinear.hpp
new file mode 100644
index 0000000..d35eb50
--- /dev/null
+++ b/include/alpaka/idx/gb/IdxGbLinear.hpp
@@ -0,0 +1,73 @@
+/* Copyright 2022 Jeffrey Kelling, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/idx/MapIdx.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+#include "alpaka/workdiv/Traits.hpp"
+
+namespace alpaka
+{
+    namespace gb
+    {
+        //! General ND grid block index provider that stores a single linear block index by value.
+        template<typename TDim, typename TIdx>
+        class IdxGbLinear : public concepts::Implements<ConceptIdxGb, IdxGbLinear<TDim, TIdx>>
+        {
+        public:
+            IdxGbLinear(TIdx const& teamOffset = static_cast<TIdx>(0u)) : m_gridBlockIdx(teamOffset)
+            {
+            }
+
+            TIdx const m_gridBlockIdx; //!< The linear block index given at construction (0 by default).
+        };
+    } // namespace gb
+
+    namespace trait
+    {
+        //! DimType trait specialization: gb::IdxGbLinear<TDim, TIdx> reports TDim as its dimension.
+        template<typename TDim, typename TIdx>
+        struct DimType<gb::IdxGbLinear<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! GetIdx specialization for gb::IdxGbLinear: expands the stored linear block index to TDim dimensions.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<gb::IdxGbLinear<TDim, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //! \return The stored linear index mapped onto the grid block extent taken from workDiv.
+            template<typename TWorkDiv>
+            static auto getIdx(gb::IdxGbLinear<TDim, TIdx> const& idx, TWorkDiv const& workDiv) -> Vec<TDim, TIdx>
+            {
+                // \TODO: Would it be faster to precompute the index and cache it inside an array?
+                auto const linearBlockIdx = Vec<DimInt<1u>, TIdx>(idx.m_gridBlockIdx);
+                auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
+                return mapIdx<TDim::value>(linearBlockIdx, gridBlockExtent);
+            }
+        };
+
+        template<typename TIdx>
+        struct GetIdx<gb::IdxGbLinear<DimInt<1u>, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //! \return The stored linear block index, converted to a one-component Vec; the work division is unused.
+            template<typename TWorkDiv>
+            static auto getIdx(gb::IdxGbLinear<DimInt<1u>, TIdx> const& idx, TWorkDiv const&) -> Vec<DimInt<1u>, TIdx>
+            {
+                return idx.m_gridBlockIdx;
+            }
+        };
+
+        //! IdxType trait specialization: gb::IdxGbLinear<TDim, TIdx> reports TIdx as its index type.
+        template<typename TDim, typename TIdx>
+        struct IdxType<gb::IdxGbLinear<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/idx/gb/IdxGbRef.hpp b/include/alpaka/idx/gb/IdxGbRef.hpp
new file mode 100644
index 0000000..6e3d9a6
--- /dev/null
+++ b/include/alpaka/idx/gb/IdxGbRef.hpp
@@ -0,0 +1,59 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+namespace alpaka
+{
+    namespace gb
+    {
+        //! A grid block index provider that refers to an index vector stored elsewhere.
+        template<typename TDim, typename TIdx>
+        class IdxGbRef : public concepts::Implements<ConceptIdxGb, IdxGbRef<TDim, TIdx>>
+        {
+        public:
+            IdxGbRef(Vec<TDim, TIdx> const& gridBlockIdx) : m_gridBlockIdx(gridBlockIdx)
+            {
+            }
+
+            Vec<TDim, TIdx> const& m_gridBlockIdx; //!< Non-owning reference; the Vec must outlive this object.
+        };
+    } // namespace gb
+
+    namespace trait
+    {
+        //! DimType trait specialization: gb::IdxGbRef<TDim, TIdx> reports TDim as its dimension.
+        template<typename TDim, typename TIdx>
+        struct DimType<gb::IdxGbRef<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! GetIdx trait specialization for gb::IdxGbRef: the grid block index is the referenced vector.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<gb::IdxGbRef<TDim, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //! \return A copy of the Vec referenced by idx.m_gridBlockIdx; the work division is unused.
+            template<typename TWorkDiv>
+            ALPAKA_FN_HOST static auto getIdx(gb::IdxGbRef<TDim, TIdx> const& idx, TWorkDiv const& /* workDiv */)
+                -> Vec<TDim, TIdx>
+            {
+                return idx.m_gridBlockIdx;
+            }
+        };
+
+        //! IdxType trait specialization: gb::IdxGbRef<TDim, TIdx> reports TIdx as its index type.
+        template<typename TDim, typename TIdx>
+        struct IdxType<gb::IdxGbRef<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp b/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..a643533
--- /dev/null
+++ b/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,81 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Matthias Werner, Jan Stephan, Andrea Bocci, Bernhard
+ * Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    namespace gb
+    {
+        //! The CUDA/HIP accelerator ND grid block index provider. It has no members of its own.
+        template<typename TDim, typename TIdx>
+        class IdxGbUniformCudaHipBuiltIn
+            : public concepts::Implements<ConceptIdxGb, IdxGbUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+        };
+    } // namespace gb
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+    namespace trait
+    {
+        //! DimType trait specialization: gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx> reports TDim as its dimension.
+        template<typename TDim, typename TIdx>
+        struct DimType<gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! GetIdx trait specialization: grid block index read from the CUDA/HIP built-in block index.
+        template<typename TDim, typename TIdx>
+        struct GetIdx<gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //! \return The index of the current block in the grid; neither argument is read.
+            template<typename TWorkDiv>
+            __device__ static auto getIdx(gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx> const& /* idx */, TWorkDiv const&)
+                -> Vec<TDim, TIdx>
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) // CUDA: pass blockIdx directly, then cast elements to TIdx.
+                return castVec<TIdx>(getOffsetVecEnd<TDim>(blockIdx));
+#        else // HIP: first build a 3-component vector in (z, y, x) order from the built-ins.
+                return getOffsetVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
+                    static_cast<TIdx>(hipBlockIdx_z),
+                    static_cast<TIdx>(hipBlockIdx_y),
+                    static_cast<TIdx>(hipBlockIdx_x)));
+#        endif
+            }
+        };
+
+        //! IdxType trait specialization: gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx> reports TIdx as its index type.
+        template<typename TDim, typename TIdx>
+        struct IdxType<gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+
+#    endif
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/intrinsic/IntrinsicCpu.hpp b/include/alpaka/intrinsic/IntrinsicCpu.hpp
new file mode 100644
index 0000000..5db927b
--- /dev/null
+++ b/include/alpaka/intrinsic/IntrinsicCpu.hpp
@@ -0,0 +1,88 @@
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Unreachable.hpp"
+#include "alpaka/intrinsic/IntrinsicFallback.hpp"
+#include "alpaka/intrinsic/Traits.hpp"
+
+#include <bitset>
+#include <climits>
+#if __has_include(<version>) // Not part of the C++17 standard but all major standard libraries include this
+#    include <version>
+#endif
+#ifdef __cpp_lib_bitops
+#    include <bit>
+#endif
+
+#if BOOST_COMP_MSVC
+#    include <intrin.h>
+#endif
+
+namespace alpaka
+{
+    //! The CPU intrinsic: an empty tag type whose Popcount and Ffs traits are specialized in this header.
+    class IntrinsicCpu : public concepts::Implements<ConceptIntrinsic, IntrinsicCpu>
+    {
+    };
+
+    namespace trait
+    {
+        template<>
+        struct Popcount<IntrinsicCpu> // Host popcount; the implementation is selected at preprocessing time.
+        {
+            template<typename UnsignedIntegral>
+            static auto popcount(IntrinsicCpu const& /*intrinsic*/, UnsignedIntegral value) -> std::int32_t
+            {
+#ifdef __cpp_lib_bitops // <bit> is available: use the standard function.
+                return std::popcount(value);
+#elif BOOST_COMP_GNUC || BOOST_COMP_CLANG // GCC/Clang builtins, chosen by operand size.
+                if constexpr(sizeof(UnsignedIntegral) == 8)
+                    return __builtin_popcountll(value);
+                else
+                    return __builtin_popcount(value);
+#elif BOOST_COMP_MSVC // MSVC intrinsics, chosen by operand size.
+                if constexpr(sizeof(UnsignedIntegral) == 8)
+                    return static_cast<std::int32_t>(__popcnt64(value));
+                else
+                    return __popcnt(value);
+#else
+                // No known compiler intrinsic: count the set bits through std::bitset.
+                return static_cast<std::int32_t>(std::bitset<sizeof(UnsignedIntegral) * CHAR_BIT>(value).count());
+#endif
+                ALPAKA_UNREACHABLE(0); // Not reached: every branch above returns.
+            }
+        };
+
+        template<>
+        struct Ffs<IntrinsicCpu> // Host find-first-set; the implementation is selected at preprocessing time.
+        {
+            template<typename Integral>
+            static auto ffs(IntrinsicCpu const& /*intrinsic*/, Integral value) -> std::int32_t
+            {
+#ifdef __cpp_lib_bitops // <bit> is available: trailing-zero count plus one, with 0 handled explicitly.
+                return value == 0 ? 0 : std::countr_zero(static_cast<std::make_unsigned_t<Integral>>(value)) + 1;
+#elif BOOST_COMP_GNUC || BOOST_COMP_CLANG // GCC/Clang builtins, chosen by operand size.
+                if constexpr(sizeof(Integral) == 8)
+                    return __builtin_ffsll(value);
+                else
+                    return __builtin_ffs(value);
+#elif BOOST_COMP_MSVC
+                // Implementation based on the cairo commit below; a zero return from the bit scan maps to result 0.
+                // https://gitlab.freedesktop.org/cairo/cairo/commit/f5167dc2e1a13d8c4e5d66d7178a24b9b5e7ac7a
+                unsigned long index = 0u;
+                if constexpr(sizeof(Integral) == 8)
+                    return _BitScanForward64(&index, value) == 0 ? 0 : static_cast<std::int32_t>(index + 1u);
+                else
+                    return _BitScanForward(&index, value) == 0 ? 0 : static_cast<std::int32_t>(index + 1u);
+#else // No known compiler intrinsic: use the portable loop.
+                return alpaka::detail::ffsFallback(value);
+#endif
+                ALPAKA_UNREACHABLE(0); // Not reached: every branch above returns.
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/intrinsic/IntrinsicFallback.hpp b/include/alpaka/intrinsic/IntrinsicFallback.hpp
new file mode 100644
index 0000000..1e9f3a4
--- /dev/null
+++ b/include/alpaka/intrinsic/IntrinsicFallback.hpp
@@ -0,0 +1,77 @@
+/* Copyright 2022 Sergei Bastrakov, Jeffrey Kelling, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/intrinsic/Traits.hpp"
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //! Fallback implementation of popcount. Counts on the unsigned bit pattern so negative inputs terminate.
+        template<typename TValue>
+        static auto popcountFallback(TValue value) -> std::int32_t
+        {
+            auto bits = static_cast<std::make_unsigned_t<TValue>>(value);
+            std::int32_t count = 0;
+            for(; bits != 0; bits >>= 1u)
+            {
+                count += static_cast<std::int32_t>(bits & 1u);
+            }
+            return count;
+        }
+
+        //! Portable find-first-set: 1-based position of the lowest set bit, or 0 if no bit is set.
+        template<typename TValue>
+        static auto ffsFallback(TValue value) -> std::int32_t
+        {
+            // Zero has no set bit; it is answered up front because the scan below would never find one.
+            if(value == 0)
+                return 0;
+            std::int32_t position = 1;
+            for(; (value & 1) == 0; value >>= 1)
+            {
+                ++position;
+            }
+            return position;
+        }
+    } // namespace detail
+
+    //! The Fallback intrinsic: an empty tag type whose traits use the portable loops in alpaka::detail.
+    class IntrinsicFallback : public concepts::Implements<ConceptIntrinsic, IntrinsicFallback>
+    {
+    };
+
+    namespace trait
+    {
+        template<>
+        struct Popcount<IntrinsicFallback> // Both overloads forward to alpaka::detail::popcountFallback.
+        {
+            static auto popcount(IntrinsicFallback const& /*intrinsic*/, std::uint32_t value) -> std::int32_t
+            {
+                return alpaka::detail::popcountFallback(value);
+            }
+
+            static auto popcount(IntrinsicFallback const& /*intrinsic*/, std::uint64_t value) -> std::int32_t
+            {
+                return alpaka::detail::popcountFallback(value);
+            }
+        };
+
+        template<>
+        struct Ffs<IntrinsicFallback> // Both overloads forward to alpaka::detail::ffsFallback.
+        {
+            static auto ffs(IntrinsicFallback const& /*intrinsic*/, std::int32_t value) -> std::int32_t
+            {
+                return alpaka::detail::ffsFallback(value);
+            }
+
+            static auto ffs(IntrinsicFallback const& /*intrinsic*/, std::int64_t value) -> std::int32_t
+            {
+                return alpaka::detail::ffsFallback(value);
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
new file mode 100644
index 0000000..395043a
--- /dev/null
+++ b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
@@ -0,0 +1,57 @@
+/* Copyright 2022 Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/intrinsic/IntrinsicFallback.hpp"
+#include "alpaka/intrinsic/Traits.hpp"
+
+#include <cstdint>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    //! The SYCL intrinsic: an empty tag type whose Popcount and Ffs traits are specialized in this header.
+    class IntrinsicGenericSycl : public concepts::Implements<ConceptIntrinsic, IntrinsicGenericSycl>
+    {
+    };
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    template<>
+    struct Popcount<IntrinsicGenericSycl> // Both overloads call sycl::popcount and cast the result to std::int32_t.
+    {
+        static auto popcount(IntrinsicGenericSycl const&, std::uint32_t value) -> std::int32_t
+        {
+            return static_cast<std::int32_t>(sycl::popcount(value));
+        }
+
+        static auto popcount(IntrinsicGenericSycl const&, std::uint64_t value) -> std::int32_t
+        {
+            return static_cast<std::int32_t>(sycl::popcount(value));
+        }
+    };
+
+    template<>
+    struct Ffs<IntrinsicGenericSycl> // SYCL has no FFS: popcount(x ^ (x - 1)) counts the lowest set bit and below.
+    {
+        static auto ffs(IntrinsicGenericSycl const&, std::int32_t value) -> std::int32_t
+        {
+            auto const bits = static_cast<std::uint32_t>(value); // unsigned: negating INT32_MIN would be UB
+            return (bits == 0u) ? 0 : static_cast<std::int32_t>(sycl::popcount(bits ^ (bits - 1u)));
+        }
+
+        static auto ffs(IntrinsicGenericSycl const&, std::int64_t value) -> std::int32_t
+        {
+            auto const bits = static_cast<std::uint64_t>(value); // unsigned: negating INT64_MIN would be UB
+            return (bits == 0u) ? 0 : static_cast<std::int32_t>(sycl::popcount(bits ^ (bits - 1u)));
+        }
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp b/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..c73f973
--- /dev/null
+++ b/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,78 @@
+/* Copyright 2022 Sergei Bastrakov, Andrea Bocci, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/intrinsic/Traits.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    //! The GPU CUDA/HIP intrinsic: an empty tag type; its traits are specialized below unless ALPAKA_HOST_ONLY.
+    class IntrinsicUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptIntrinsic, IntrinsicUniformCudaHipBuiltIn>
+    {
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+    namespace trait
+    {
+        template<>
+        struct Popcount<IntrinsicUniformCudaHipBuiltIn> // Device popcount through __popc / __popcll.
+        {
+            __device__ static auto popcount(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::uint32_t value)
+                -> std::int32_t
+            {
+#        if BOOST_COMP_CLANG && BOOST_LANG_CUDA // Clang compiling CUDA: the argument is passed as a signed int.
+                return __popc(static_cast<int>(value));
+#        else // Otherwise: unsigned argument, result cast to std::int32_t.
+                return static_cast<std::int32_t>(__popc(static_cast<unsigned int>(value)));
+#        endif
+            }
+
+            __device__ static auto popcount(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::uint64_t value)
+                -> std::int32_t
+            {
+#        if BOOST_COMP_CLANG && BOOST_LANG_CUDA // Clang compiling CUDA: the argument is passed as a signed long long.
+                return __popcll(static_cast<long long>(value));
+#        else // Otherwise: unsigned argument, result cast to std::int32_t.
+                return static_cast<std::int32_t>(__popcll(static_cast<unsigned long long>(value)));
+#        endif
+            }
+        };
+
+        template<>
+        struct Ffs<IntrinsicUniformCudaHipBuiltIn> // Device find-first-set through __ffs / __ffsll.
+        {
+            __device__ static auto ffs(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::int32_t value)
+                -> std::int32_t
+            {
+                return static_cast<std::int32_t>(__ffs(static_cast<int>(value)));
+            }
+
+            __device__ static auto ffs(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::int64_t value)
+                -> std::int32_t
+            {
+                return static_cast<std::int32_t>(__ffsll(static_cast<long long>(value)));
+            }
+        };
+    } // namespace trait
+
+#    endif
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/intrinsic/Traits.hpp b/include/alpaka/intrinsic/Traits.hpp
new file mode 100644
index 0000000..8aea0a4
--- /dev/null
+++ b/include/alpaka/intrinsic/Traits.hpp
@@ -0,0 +1,84 @@
+/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#include <cstdint>
+#include <type_traits>
+
+namespace alpaka
+{
+    struct ConceptIntrinsic // Empty tag naming the intrinsic concept; used with concepts::ImplementationBase below.
+    {
+    };
+
+    //! Customization points for the intrinsics; each is specialized per intrinsic implementation type.
+    namespace trait
+    {
+        //! The popcount trait, keyed on the intrinsic implementation.
+        template<typename TIntrinsic, typename TSfinae = void>
+        struct Popcount;
+
+        //! The ffs (find first set) trait, keyed on the intrinsic implementation.
+        template<typename TIntrinsic, typename TSfinae = void>
+        struct Ffs;
+    } // namespace trait
+
+    //! Counts the bits set to 1 in a 32-bit value.
+    //!
+    //! \tparam TIntrinsic The intrinsic implementation type.
+    //! \param intrinsic The intrinsic implementation; it is forwarded to the trait.
+    //! \param value The input value.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIntrinsic>
+    ALPAKA_FN_ACC auto popcount(TIntrinsic const& intrinsic, std::uint32_t value) -> std::int32_t
+    {
+        // The trait is keyed on concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>, not on TIntrinsic itself.
+        return trait::Popcount<concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>>::popcount(intrinsic, value);
+    }
+
+    //! Counts the bits set to 1 in a 64-bit value.
+    //!
+    //! \tparam TIntrinsic The intrinsic implementation type.
+    //! \param intrinsic The intrinsic implementation; it is forwarded to the trait.
+    //! \param value The input value.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIntrinsic>
+    ALPAKA_FN_ACC auto popcount(TIntrinsic const& intrinsic, std::uint64_t value) -> std::int32_t
+    {
+        // The trait is keyed on concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>, not on TIntrinsic itself.
+        return trait::Popcount<concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>>::popcount(intrinsic, value);
+    }
+
+    //! Finds the least significant bit set to 1 in a 32-bit value and
+    //! reports its 1-based position. An input of 0 yields 0.
+    //!
+    //! \tparam TIntrinsic The intrinsic implementation type.
+    //! \param intrinsic The intrinsic implementation; it is forwarded to the trait.
+    //! \param value The input value.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIntrinsic>
+    ALPAKA_FN_ACC auto ffs(TIntrinsic const& intrinsic, std::int32_t value) -> std::int32_t
+    {
+        // The trait is keyed on concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>, not on TIntrinsic itself.
+        return trait::Ffs<concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>>::ffs(intrinsic, value);
+    }
+
+    //! Finds the least significant bit set to 1 in a 64-bit value and
+    //! reports its 1-based position. An input of 0 yields 0.
+    //!
+    //! \tparam TIntrinsic The intrinsic implementation type.
+    //! \param intrinsic The intrinsic implementation; it is forwarded to the trait.
+    //! \param value The input value.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIntrinsic>
+    ALPAKA_FN_ACC auto ffs(TIntrinsic const& intrinsic, std::int64_t value) -> std::int32_t
+    {
+        // The trait is keyed on concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>, not on TIntrinsic itself.
+        return trait::Ffs<concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>>::ffs(intrinsic, value);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/kernel/KernelFunctionAttributes.hpp b/include/alpaka/kernel/KernelFunctionAttributes.hpp
new file mode 100644
index 0000000..0371430
--- /dev/null
+++ b/include/alpaka/kernel/KernelFunctionAttributes.hpp
@@ -0,0 +1,25 @@
+/* Copyright 2022 René Widera, Mehmet Yusufoglu
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <cstddef>
+
+namespace alpaka
+{
+    //! Attributes of a kernel function. They are filled in by querying the API of the accelerator with the kernel
+    //! function as an argument. For a CPU backend maxThreadsPerBlock is set to 1 and the other values remain zero,
+    //! since there are no corresponding API functions to get the values.
+    struct KernelFunctionAttributes
+    {
+        std::size_t constSizeBytes = 0;
+        std::size_t localSizeBytes = 0;
+        std::size_t sharedSizeBytes = 0;
+        int maxDynamicSharedSizeBytes = 0;
+        int numRegs = 0;
+        // For a GPU backend this field holds the PTX or ISA version.
+        int asmVersion = 0;
+        int maxThreadsPerBlock = 0;
+    };
+} // namespace alpaka
diff --git a/include/alpaka/kernel/SyclSubgroupSize.hpp b/include/alpaka/kernel/SyclSubgroupSize.hpp
new file mode 100644
index 0000000..1c7124b
--- /dev/null
+++ b/include/alpaka/kernel/SyclSubgroupSize.hpp
@@ -0,0 +1,120 @@
+/* Copyright 2023 Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+// SYCL_SUBGROUP_SIZE: bitwise OR of the sizes listed for the matching __SYCL_TARGET_* macro; 0 if unknown or on host.
+#    ifdef __SYCL_DEVICE_ONLY__
+
+#        if(__SYCL_TARGET_INTEL_GPU_BDW__) || /* Broadwell Intel graphics architecture */                             \
+            (__SYCL_TARGET_INTEL_GPU_SKL__) || /* Skylake Intel graphics architecture */                              \
+            (__SYCL_TARGET_INTEL_GPU_KBL__) || /* Kaby Lake Intel graphics architecture */                            \
+            (__SYCL_TARGET_INTEL_GPU_CFL__) || /* Coffee Lake Intel graphics architecture */                          \
+            (__SYCL_TARGET_INTEL_GPU_APL__) || /* Apollo Lake Intel graphics architecture */                          \
+            (__SYCL_TARGET_INTEL_GPU_GLK__) || /* Gemini Lake Intel graphics architecture */                          \
+            (__SYCL_TARGET_INTEL_GPU_WHL__) || /* Whiskey Lake Intel graphics architecture */                         \
+            (__SYCL_TARGET_INTEL_GPU_AML__) || /* Amber Lake Intel graphics architecture */                           \
+            (__SYCL_TARGET_INTEL_GPU_CML__) || /* Comet Lake Intel graphics architecture */                           \
+            (__SYCL_TARGET_INTEL_GPU_ICLLP__) || /* Ice Lake Intel graphics architecture */                           \
+            (__SYCL_TARGET_INTEL_GPU_EHL__) || /* Elkhart Lake or Jasper Lake Intel graphics architecture */          \
+            (__SYCL_TARGET_INTEL_GPU_TGLLP__) || /* Tiger Lake Intel graphics architecture */                         \
+            (__SYCL_TARGET_INTEL_GPU_RKL__) || /* Rocket Lake Intel graphics architecture */                          \
+            (__SYCL_TARGET_INTEL_GPU_ADL_S__) || /* Alder Lake S or Raptor Lake S Intel graphics architecture */      \
+            (__SYCL_TARGET_INTEL_GPU_ADL_P__) || /* Alder Lake P Intel graphics architecture */                       \
+            (__SYCL_TARGET_INTEL_GPU_ADL_N__) || /* Alder Lake N Intel graphics architecture */                       \
+            (__SYCL_TARGET_INTEL_GPU_DG1__) || /* DG1 Intel graphics architecture */                                  \
+            (__SYCL_TARGET_INTEL_GPU_ACM_G10__) || /* Alchemist G10 Intel graphics architecture */                    \
+            (__SYCL_TARGET_INTEL_GPU_ACM_G11__) || /* Alchemist G11 Intel graphics architecture */                    \
+            (__SYCL_TARGET_INTEL_GPU_ACM_G12__) || /* Alchemist G12 Intel graphics architecture */                    \
+            (__SYCL_TARGET_INTEL_GPU_MTL_U__) || /* Meteor Lake U/S or Arrow Lake U/S Intel graphics architecture */  \
+            (__SYCL_TARGET_INTEL_GPU_MTL_H__) || /* Meteor Lake H Intel graphics architecture */                      \
+            (__SYCL_TARGET_INTEL_GPU_ARL_H__) || /* Arrow Lake H Intel graphics architecture */                       \
+            (__SYCL_TARGET_INTEL_GPU_BMG_G21__) || /* Battlemage G21 Intel graphics architecture */                   \
+            (__SYCL_TARGET_INTEL_GPU_LNL_M__) /* Lunar Lake Intel graphics architecture */
+
+#            define SYCL_SUBGROUP_SIZE (8 | 16 | 32)
+
+#        elif(__SYCL_TARGET_INTEL_GPU_PVC__) || /* Ponte Vecchio Intel graphics architecture */                       \
+            (__SYCL_TARGET_INTEL_GPU_PVC_VG__) /* Ponte Vecchio VG Intel graphics architecture */
+
+#            define SYCL_SUBGROUP_SIZE (16 | 32)
+
+#        elif(__SYCL_TARGET_INTEL_X86_64__) /* generate code ahead of time for x86_64 CPUs */
+
+#            define SYCL_SUBGROUP_SIZE (4 | 8 | 16 | 32 | 64)
+
+#        elif(__SYCL_TARGET_NVIDIA_GPU_SM50__) || /* NVIDIA Maxwell architecture (compute capability 5.0) */          \
+            (__SYCL_TARGET_NVIDIA_GPU_SM52__) || /* NVIDIA Maxwell architecture (compute capability 5.2) */           \
+            (__SYCL_TARGET_NVIDIA_GPU_SM53__) || /* NVIDIA Jetson TX1 / Nano (compute capability 5.3) */              \
+            (__SYCL_TARGET_NVIDIA_GPU_SM60__) || /* NVIDIA Pascal architecture (compute capability 6.0) */            \
+            (__SYCL_TARGET_NVIDIA_GPU_SM61__) || /* NVIDIA Pascal architecture (compute capability 6.1) */            \
+            (__SYCL_TARGET_NVIDIA_GPU_SM62__) || /* NVIDIA Jetson TX2 (compute capability 6.2) */                     \
+            (__SYCL_TARGET_NVIDIA_GPU_SM70__) || /* NVIDIA Volta architecture (compute capability 7.0) */             \
+            (__SYCL_TARGET_NVIDIA_GPU_SM72__) || /* NVIDIA Jetson AGX (compute capability 7.2) */                     \
+            (__SYCL_TARGET_NVIDIA_GPU_SM75__) || /* NVIDIA Turing architecture (compute capability 7.5) */            \
+            (__SYCL_TARGET_NVIDIA_GPU_SM80__) || /* NVIDIA Ampere architecture (compute capability 8.0) */            \
+            (__SYCL_TARGET_NVIDIA_GPU_SM86__) || /* NVIDIA Ampere architecture (compute capability 8.6) */            \
+            (__SYCL_TARGET_NVIDIA_GPU_SM87__) || /* NVIDIA Jetson/Drive AGX Orin (compute capability 8.7) */          \
+            (__SYCL_TARGET_NVIDIA_GPU_SM89__) || /* NVIDIA Ada Lovelace arch. (compute capability 8.9) */             \
+            (__SYCL_TARGET_NVIDIA_GPU_SM90__) /* NVIDIA Hopper architecture (compute capability 9.0) */
+
+#            define SYCL_SUBGROUP_SIZE (32)
+
+#        elif(__SYCL_TARGET_AMD_GPU_GFX700__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                 \
+            (__SYCL_TARGET_AMD_GPU_GFX701__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                  \
+            (__SYCL_TARGET_AMD_GPU_GFX702__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                  \
+            (__SYCL_TARGET_AMD_GPU_GFX801__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */             \
+            (__SYCL_TARGET_AMD_GPU_GFX802__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */             \
+            (__SYCL_TARGET_AMD_GPU_GFX803__) || /* AMD GCN 4.0 Arctic Islands architecture (gfx 8.0) */               \
+            (__SYCL_TARGET_AMD_GPU_GFX805__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */             \
+            (__SYCL_TARGET_AMD_GPU_GFX810__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.1) */             \
+            (__SYCL_TARGET_AMD_GPU_GFX900__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                         \
+            (__SYCL_TARGET_AMD_GPU_GFX902__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                         \
+            (__SYCL_TARGET_AMD_GPU_GFX904__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                         \
+            (__SYCL_TARGET_AMD_GPU_GFX906__) || /* AMD GCN 5.1 Vega II architecture (gfx 9.0) */                      \
+            (__SYCL_TARGET_AMD_GPU_GFX908__) || /* AMD CDNA 1.0 Arcturus architecture (gfx 9.0) */                    \
+            (__SYCL_TARGET_AMD_GPU_GFX909__) || /* AMD GCN 5.0 Raven 2 architecture (gfx 9.0) */                      \
+            (__SYCL_TARGET_AMD_GPU_GFX90A__) || /* AMD CDNA 2.0 Aldebaran architecture (gfx 9.0) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX90C__) || /* AMD GCN 5.1 Renoir architecture (gfx 9.0) */                       \
+            (__SYCL_TARGET_AMD_GPU_GFX940__) || /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */               \
+            (__SYCL_TARGET_AMD_GPU_GFX941__) || /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */               \
+            (__SYCL_TARGET_AMD_GPU_GFX942__) /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */
+
+#            define SYCL_SUBGROUP_SIZE (64)
+
+#        elif(__SYCL_TARGET_AMD_GPU_GFX1010__) || /* AMD RDNA 1.0 Navi 10 architecture (gfx 10.1) */                  \
+            (__SYCL_TARGET_AMD_GPU_GFX1011__) || /* AMD RDNA 1.0 Navi 12 architecture (gfx 10.1) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1012__) || /* AMD RDNA 1.0 Navi 14 architecture (gfx 10.1) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1013__) || /* AMD RDNA 2.0 Oberon architecture (gfx 10.1) */                    \
+            (__SYCL_TARGET_AMD_GPU_GFX1030__) || /* AMD RDNA 2.0 Navi 21 architecture (gfx 10.3) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1031__) || /* AMD RDNA 2.0 Navi 22 architecture (gfx 10.3) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1032__) || /* AMD RDNA 2.0 Navi 23 architecture (gfx 10.3) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1033__) || /* AMD RDNA 2.0 Van Gogh architecture (gfx 10.3) */                  \
+            (__SYCL_TARGET_AMD_GPU_GFX1034__) || /* AMD RDNA 2.0 Navi 24 architecture (gfx 10.3) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1035__) || /* AMD RDNA 2.0 Rembrandt Mobile architecture (gfx 10.3) */          \
+            (__SYCL_TARGET_AMD_GPU_GFX1036__) || /* AMD RDNA 2.0 Raphael architecture (gfx 10.3) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1100__) || /* AMD RDNA 3.0 Navi 31 architecture (gfx 11.0) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1101__) || /* AMD RDNA 3.0 Navi 32 architecture (gfx 11.0) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1102__) || /* AMD RDNA 3.0 Navi 33 architecture (gfx 11.0) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1103__) || /* AMD RDNA 3.0 Phoenix mobile architecture (gfx 11.0) */            \
+            (__SYCL_TARGET_AMD_GPU_GFX1150__) || /* AMD RDNA 3.5 Strix Point architecture (gfx 11.5) */               \
+            (__SYCL_TARGET_AMD_GPU_GFX1151__) || /* AMD RDNA 3.5 Strix Halo architecture (gfx 11.5) */                \
+            (__SYCL_TARGET_AMD_GPU_GFX1200__) || /* AMD RDNA 4.0 Navi 44 architecture (gfx 12.0) */                   \
+            (__SYCL_TARGET_AMD_GPU_GFX1201__) /* AMD RDNA 4.0 Navi 48 architecture (gfx 12.0) */
+
+// starting from gfx10, HIP supports only wavefront size 32
+#            define SYCL_SUBGROUP_SIZE (32)
+
+#        else // __SYCL_TARGET_*
+
+#            define SYCL_SUBGROUP_SIZE (0) /* unknown target */
+
+#        endif // __SYCL_TARGET_*
+
+#    else // !__SYCL_DEVICE_ONLY__
+
+#        define SYCL_SUBGROUP_SIZE (0) /* host compilation */
+
+#    endif // __SYCL_DEVICE_ONLY__
+
+#endif // ALPAKA_ACC_SYCL_ENABLED
diff --git a/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp b/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
new file mode 100644
index 0000000..f0d6056
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
@@ -0,0 +1,991 @@
+/* Copyright 2022 Benjamin Worpitz, Bert Wesarg, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/acc/AccCpuOmp2Blocks.hpp"
+#include "alpaka/core/Decay.hpp"
+#include "alpaka/core/OmpSchedule.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/idx/MapIdx.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+#include <functional>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    include <iostream>
+#endif
+
+#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wswitch-default"
+#    endif
+
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
+
+#    include <omp.h>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //! Executor of a parallel OpenMP loop with the given schedule.
+        //!
+        //! Only declared here: every supported schedule kind is provided as a separate specialization, which is
+        //! meant to help code optimization by compilers.
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        //! \tparam TScheduleKind The schedule kind value (of type omp::Schedule::Kind).
+        template<typename TKernel, typename TSchedule, omp::Schedule::Kind TScheduleKind>
+        struct ParallelForImpl;
+
+        //! Executor of a parallel OpenMP loop with no schedule set.
+        //!
+        //! Emits `omp for nowait` without a schedule clause, so no chunk size is used.
+        //! NOTE(review): the pragma has no `parallel`; presumably the caller opens the parallel region - confirm.
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule>
+        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::NoSchedule>
+        {
+            //! Run the parallel OpenMP loop over [0, numIterations), calling loopBody with each iteration index.
+            //! The index is a std::intmax_t for OpenMP < 3.0 and a TIdx otherwise; the unnamed arguments are unused.
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const&,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const&)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // The extra lambda is deliberate: calling loopBody through it works around issue #1288.
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        /* Implementations for Static, Dynamic and Guided follow the same pattern.
+         * There are two specializations of ParallelForImpl for compile-time dispatch depending on whether the
+         * OmpSchedule trait is specialized.
+         * The no trait case is further compile-time dispatched with a helper ParallelForStaticImpl.
+         * It is based on whether ompScheduleChunkSize member is available.
+         */
+
+        //! Executor of a parallel OpenMP loop with the static schedule.
+        //!
+        //! Specialization for kernels specializing the OmpSchedule trait: the schedule type is omp::Schedule and
+        //! the chunk size is taken from the schedule object (schedule.chunkSize).
+        //! \tparam TKernel The kernel type.
+        template<typename TKernel>
+        struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Static>
+        {
+            //! Run the parallel OpenMP loop over [0, numIterations), calling loopBody with each iteration index.
+            //! The index is a std::intmax_t for OpenMP < 3.0 and a TIdx otherwise; the kernel argument is unused.
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            //! \param schedule The schedule object; its chunkSize member is passed to the schedule clause.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const&,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                omp::Schedule const& schedule)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait schedule(static, schedule.chunkSize)
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(static, schedule.chunkSize)
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // The extra lambda is deliberate: calling loopBody through it works around issue #1288.
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        //! Helper executor of a parallel OpenMP loop with the static schedule.
+        //!
+        //! General implementation for TKernel types without member ompScheduleChunkSize: uses schedule(static)
+        //! without a chunk size. TSfinae defaults to void and exists only so partial specializations can match on it.
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule, typename TSfinae = void>
+        struct ParallelForStaticImpl
+        {
+            //! Run the parallel OpenMP loop over [0, numIterations), calling loopBody with each iteration index.
+            //! The index is a std::intmax_t for OpenMP < 3.0 and a TIdx otherwise; the unnamed arguments are unused.
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const&,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const&)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait schedule(static)
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(static)
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // The extra lambda is deliberate: calling loopBody through it works around issue #1288.
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        //! Helper type to check whether TKernel has a member named ompScheduleChunkSize.
+        //!
+        //! Is void for those types and ill-formed otherwise, so a partial specialization that uses it as a template
+        //! argument is only considered for kernels declaring that member (SFINAE).
+        //! \tparam TKernel The kernel type.
+        template<typename TKernel>
+        using HasScheduleChunkSize = std::void_t<decltype(TKernel::ompScheduleChunkSize)>;
+
+        //! Helper executor of a parallel OpenMP loop with the static schedule.
+        //!
+        //! Specialization for kernels with an ompScheduleChunkSize member (selected via HasScheduleChunkSize);
+        //! that member is used as the chunk size of the static schedule.
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule>
+        struct ParallelForStaticImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
+        {
+            //! Run the parallel OpenMP loop over [0, numIterations), calling loopBody with each iteration index.
+            //! The index is a std::intmax_t for OpenMP < 3.0 and a TIdx otherwise; the schedule argument is unused.
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param kernel The kernel instance; its ompScheduleChunkSize member is passed to the schedule clause.
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const& kernel,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const&)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // The extra lambda is deliberate: calling loopBody through it works around issue #1288.
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        //! Executor of a parallel OpenMP loop with the static schedule.
+        //!
+        //! Specialization for kernels not specializing the OmpSchedule trait.
+        //! Has no members of its own: it inherits operator() from ParallelForStaticImpl, which dispatches further.
+        //! For TSchedule = omp::Schedule the more specialized ParallelForImpl on omp::Schedule is selected instead.
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule>
+        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Static> : ParallelForStaticImpl<TKernel, TSchedule>
+        {
+        };
+
+        //! Executor of a parallel OpenMP loop with the dynamic schedule.
+        //!
+        //! Specialization for kernels specializing the OmpSchedule trait: the schedule type is omp::Schedule and
+        //! the chunk size is taken from the schedule object (schedule.chunkSize).
+        //! \tparam TKernel The kernel type.
+        template<typename TKernel>
+        struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Dynamic>
+        {
+            //! Run the parallel OpenMP loop over [0, numIterations), calling loopBody with each iteration index.
+            //! The index is a std::intmax_t for OpenMP < 3.0 and a TIdx otherwise; the kernel argument is unused.
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            //! \param schedule The schedule object; its chunkSize member is passed to the schedule clause.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const&,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                omp::Schedule const& schedule)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait schedule(dynamic, schedule.chunkSize)
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(dynamic, schedule.chunkSize)
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // The extra lambda is deliberate: calling loopBody through it works around issue #1288.
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        //! Helper executor of a parallel OpenMP loop with the dynamic schedule.
+        //!
+        //! General implementation for TKernel types without member ompScheduleChunkSize: uses schedule(dynamic)
+        //! without a chunk size. TSfinae defaults to void and exists only so partial specializations can match on it.
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule, typename TSfinae = void>
+        struct ParallelForDynamicImpl
+        {
+            //! Run the parallel OpenMP loop over [0, numIterations), calling loopBody with each iteration index.
+            //! The index is a std::intmax_t for OpenMP < 3.0 and a TIdx otherwise; the unnamed arguments are unused.
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const&,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const&)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait schedule(dynamic)
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(dynamic)
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // The extra lambda is deliberate: calling loopBody through it works around issue #1288.
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        //! Helper executor of a parallel OpenMP loop with the dynamic schedule.
+        //!
+        //! Specialization for kernels with an ompScheduleChunkSize member (selected via HasScheduleChunkSize);
+        //! that member is used as the chunk size of the dynamic schedule.
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule>
+        struct ParallelForDynamicImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
+        {
+            //! Run the parallel OpenMP loop over [0, numIterations), calling loopBody with each iteration index.
+            //! The index is a std::intmax_t for OpenMP < 3.0 and a TIdx otherwise; the schedule argument is unused.
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param kernel The kernel instance; its ompScheduleChunkSize member is passed to the schedule clause.
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const& kernel,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const&)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // The extra lambda is deliberate: calling loopBody through it works around issue #1288.
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        //! Executor of a parallel OpenMP loop with the dynamic schedule.
+        //!
+        //! Specialization for kernels not specializing the OmpSchedule trait.
+        //! Has no members of its own: it inherits operator() from ParallelForDynamicImpl, which dispatches further.
+        //! For TSchedule = omp::Schedule the more specialized ParallelForImpl on omp::Schedule is selected instead.
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule>
+        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Dynamic> : ParallelForDynamicImpl<TKernel, TSchedule>
+        {
+        };
+
+        //! Executor of a parallel OpenMP loop with the guided schedule.
+        //!
+        //! Specialization for kernels specializing the OmpSchedule trait: the schedule type is omp::Schedule and
+        //! the chunk size is taken from the schedule object (schedule.chunkSize).
+        //! \tparam TKernel The kernel type.
+        template<typename TKernel>
+        struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Guided>
+        {
+            //! Run the parallel OpenMP loop over [0, numIterations), calling loopBody with each iteration index.
+            //! The index is a std::intmax_t for OpenMP < 3.0 and a TIdx otherwise; the kernel argument is unused.
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            //! \param schedule The schedule object; its chunkSize member is passed to the schedule clause.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const&,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                omp::Schedule const& schedule)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait schedule(guided, schedule.chunkSize)
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(guided, schedule.chunkSize)
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // The extra lambda is deliberate: calling loopBody through it works around issue #1288.
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        //! Helper executor of a parallel OpenMP loop with the guided schedule.
+        //!
+        //! General implementation for TKernel types without member ompScheduleChunkSize: uses schedule(guided)
+        //! without a chunk size. TSfinae defaults to void and exists only so partial specializations can match on it.
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule, typename TSfinae = void>
+        struct ParallelForGuidedImpl
+        {
+            //! Run the parallel OpenMP loop over [0, numIterations), calling loopBody with each iteration index.
+            //! The index is a std::intmax_t for OpenMP < 3.0 and a TIdx otherwise; the unnamed arguments are unused.
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const&,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const&)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait schedule(guided)
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(guided)
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // The extra lambda is deliberate: calling loopBody through it works around issue #1288.
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        //! Helper executor of a parallel OpenMP loop with the guided schedule.
+        //!
+        //! Specialization for kernels with an ompScheduleChunkSize member, which is passed as the chunk size.
+        //!
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule>
+        struct ParallelForGuidedImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
+        {
+            //! Run the loop with schedule(guided, kernel.ompScheduleChunkSize); the schedule argument is unused.
+            //!
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param kernel The kernel instance reference, only its ompScheduleChunkSize member is read.
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const& kernel,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const&)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // Wrap the loop body in another generic lambda to work around #1288
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        //! Executor of a parallel OpenMP loop with the guided schedule.
+        //!
+        //! Specialization for the Guided schedule kind and kernels not specializing the OmpSchedule trait.
+        //! Adds no members of its own: inherits operator() from ParallelForGuidedImpl for further dispatch.
+        //!
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule>
+        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Guided> : ParallelForGuidedImpl<TKernel, TSchedule>
+        {
+        };
+
+#    if _OPENMP >= 200805
+        //! Executor of a parallel OpenMP loop with the auto schedule.
+        //!
+        //! Does not use chunk size. Only compiled when _OPENMP >= 200805.
+        //!
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule>
+        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Auto>
+        {
+            //! Run the loop with schedule(auto); the kernel and schedule arguments are unused.
+            //!
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const&,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const&)
+            {
+#        pragma omp for nowait schedule(auto)
+                for(TIdx i = 0; i < numIterations; ++i)
+                {
+                    // Wrap the loop body in another generic lambda to work around #1288
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+#    endif
+
+        //! Executor of a parallel OpenMP loop with the runtime schedule.
+        //!
+        //! Does not use chunk size.
+        //!
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule>
+        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Runtime>
+        {
+            //! Run the loop with schedule(runtime); the kernel and schedule arguments are unused.
+            //!
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const&,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const&)
+            {
+#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
+                         // header.
+                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
+                std::intmax_t i;
+#        pragma omp for nowait schedule(runtime)
+                for(i = 0; i < iNumBlocksInGrid; ++i)
+#    else
+#        pragma omp for nowait schedule(runtime)
+                for(TIdx i = 0; i < numIterations; ++i)
+#    endif
+                {
+                    // Wrap the loop body in another generic lambda to work around #1288
+                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
+                    wrappedLoopBody(i);
+                }
+            }
+        };
+
+        //! Executor of a parallel OpenMP loop.
+        //!
+        //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
+        //! The default implementation is for the kernels that do not set schedule in any way, compile-time dispatch.
+        //!
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule, typename TSfinae = void>
+        struct ParallelFor
+        {
+            //! Run the parallel OpenMP loop through the NoSchedule variant of ParallelForImpl.
+            //!
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param kernel The kernel instance reference
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            //! \param schedule The schedule object.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const& kernel,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const& schedule)
+            {
+                // No schedule kind set: use the NoSchedule ParallelForImpl, which performs dispatch by chunk size
+                ParallelForImpl<TKernel, TSchedule, omp::Schedule::NoSchedule>{}(
+                    kernel,
+                    std::forward<TLoopBody>(loopBody),
+                    numIterations,
+                    schedule);
+            }
+        };
+
+        //! Executor of a parallel OpenMP loop.
+        //!
+        //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
+        //! Specialization for kernels specializing the OmpSchedule trait, run-time dispatch.
+        //!
+        //! \tparam TKernel The kernel type.
+        template<typename TKernel>
+        struct ParallelFor<TKernel, omp::Schedule>
+        {
+            //! Run the parallel OpenMP loop through the ParallelForImpl matching schedule.kind.
+            //!
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param kernel The kernel instance reference
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            //! \param schedule The schedule object; its kind member selects the ParallelForImpl at run time.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const& kernel,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                omp::Schedule const& schedule)
+            {
+                // Select the ParallelForImpl for schedule.kind; that one then performs dispatch by chunk size
+                switch(schedule.kind)
+                { // NOTE(review): no default label, so a kind matching no case below runs no loop at all
+                case omp::Schedule::NoSchedule:
+                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::NoSchedule>{}(
+                        kernel,
+                        std::forward<TLoopBody>(loopBody),
+                        numIterations,
+                        schedule);
+                    break;
+                case omp::Schedule::Static:
+                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Static>{}(
+                        kernel,
+                        std::forward<TLoopBody>(loopBody),
+                        numIterations,
+                        schedule);
+                    break;
+                case omp::Schedule::Dynamic:
+                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Dynamic>{}(
+                        kernel,
+                        std::forward<TLoopBody>(loopBody),
+                        numIterations,
+                        schedule);
+                    break;
+                case omp::Schedule::Guided:
+                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Guided>{}(
+                        kernel,
+                        std::forward<TLoopBody>(loopBody),
+                        numIterations,
+                        schedule);
+                    break;
+#    if _OPENMP >= 200805
+                case omp::Schedule::Auto:
+                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Auto>{}(
+                        kernel,
+                        std::forward<TLoopBody>(loopBody),
+                        numIterations,
+                        schedule);
+                    break;
+#    endif
+                case omp::Schedule::Runtime:
+                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Runtime>{}(
+                        kernel,
+                        std::forward<TLoopBody>(loopBody),
+                        numIterations,
+                        schedule);
+                    break;
+                }
+            }
+        };
+
+        //! Helper type to check if TSchedule is a type originating from an OmpSchedule trait definition.
+        //! Its value is true exactly when TSchedule is the same type as omp::Schedule.
+        //! \tparam TSchedule The schedule type.
+        template<typename TSchedule>
+        using IsOmpScheduleTraitSpecialized
+            = std::integral_constant<bool, std::is_same<TSchedule, omp::Schedule>::value>;
+
+        //! Helper type to check if member ompScheduleKind of TKernel should be used.
+        //!
+        //! For that the member has to be present and TSchedule must not be omp::Schedule (no OmpSchedule trait).
+        //! Is void for those types, ill-formed otherwise, so it can serve as the SFINAE argument of ParallelFor.
+        //!
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type.
+        template<typename TKernel, typename TSchedule>
+        using UseScheduleKind
+            = std::enable_if_t<sizeof(TKernel::ompScheduleKind) && !IsOmpScheduleTraitSpecialized<TSchedule>::value>;
+
+        //! Executor of a parallel OpenMP loop.
+        //!
+        //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
+        //! Specialization for kernels with ompScheduleKind member, compile-time dispatch.
+        //!
+        //! \tparam TKernel The kernel type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        template<typename TKernel, typename TSchedule>
+        struct ParallelFor<TKernel, TSchedule, UseScheduleKind<TKernel, TSchedule>>
+        {
+            //! Run the parallel OpenMP loop through the ParallelForImpl selected by TKernel::ompScheduleKind.
+            //!
+            //! \tparam TLoopBody The loop body functor type.
+            //! \tparam TIdx The index type.
+            //!
+            //! \param kernel The kernel instance reference
+            //! \param loopBody The loop body functor instance, takes iteration index as input.
+            //! \param numIterations The number of loop iterations.
+            //! \param schedule The schedule object.
+            template<typename TLoopBody, typename TIdx>
+            ALPAKA_FN_HOST void operator()(
+                TKernel const& kernel,
+                TLoopBody&& loopBody,
+                TIdx const numIterations,
+                TSchedule const& schedule)
+            {
+                // The kind is the constant TKernel::ompScheduleKind; ParallelForImpl then dispatches by chunk size
+                ParallelForImpl<TKernel, TSchedule, TKernel::ompScheduleKind>{}(
+                    kernel,
+                    std::forward<TLoopBody>(loopBody),
+                    numIterations,
+                    schedule);
+            }
+        };
+
+        //! Run a parallel OpenMP loop; entry point that forwards all arguments to ParallelFor<TKernel, TSchedule>.
+        //!
+        //! \tparam TKernel The kernel type.
+        //! \tparam TLoopBody The loop body functor type.
+        //! \tparam TIdx The index type.
+        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
+        //!
+        //! \param kernel The kernel instance reference,
+        //!        not perfect-forwarded to shorten SFINAE internally.
+        //! \param loopBody The loop body functor instance, takes iteration index as input.
+        //! \param numIterations The number of loop iterations.
+        //! \param schedule The schedule object.
+        template<typename TKernel, typename TLoopBody, typename TIdx, typename TSchedule>
+        ALPAKA_FN_HOST ALPAKA_FN_INLINE void parallelFor(
+            TKernel const& kernel,
+            TLoopBody&& loopBody,
+            TIdx const numIterations,
+            TSchedule const& schedule)
+        {
+            // Forward to ParallelFor that performs first a dispatch by schedule kind, and then by chunk size
+            ParallelFor<TKernel, TSchedule>{}(kernel, std::forward<TLoopBody>(loopBody), numIterations, schedule);
+        }
+
+    } // namespace detail
+
+    //! The CPU OpenMP 2.0 block accelerator execution task; one parallel-loop iteration is run per grid block.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuOmp2Blocks final : public WorkDivMembers<TDim, TIdx>
+    {
+    public:
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj) // kept as a copy inside the task
+            , m_args(std::forward<TArgs>(args)...) // kept as decayed values inside a tuple
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+
+        //! Executes the kernel function object, opening an OpenMP parallel region unless already inside one.
+        ALPAKA_FN_HOST auto operator()() const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
+            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
+            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
+
+            // Get the size of the block shared dynamic memory for this kernel, work division and arguments.
+            auto const blockSharedMemDynSizeBytes = std::apply(
+                [&](std::decay_t<TArgs> const&... args)
+                {
+                    return getBlockSharedMemDynSizeBytes<AccCpuOmp2Blocks<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args);
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+
+            // The number of blocks in the grid, which is also the number of parallel loop iterations.
+            TIdx const numBlocksInGrid(gridBlockExtent.prod());
+
+            // Get the OpenMP schedule information for the given kernel and parameter types
+            auto const schedule = std::apply(
+                [&](std::decay_t<TArgs> const&... args) {
+                    return getOmpSchedule<AccCpuOmp2Blocks<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args);
+
+            if(::omp_in_parallel() != 0)
+            { // Reuse the parallel region that is already active instead of opening another one.
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " already within a parallel region." << std::endl;
+#    endif
+                parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
+            }
+            else
+            { // Open a parallel region; the call below is then made by each thread of the team.
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " opening new parallel region." << std::endl;
+#    endif
+#    pragma omp parallel
+                parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
+            }
+        }
+
+    private:
+        template<typename TSchedule>
+        ALPAKA_FN_HOST auto parallelFn(
+            std::size_t const& blockSharedMemDynSizeBytes,
+            TIdx const& numBlocksInGrid,
+            Vec<TDim, TIdx> const& gridBlockExtent,
+            TSchedule const& schedule) const -> void
+        { // Called from operator(): sanity check, then a local accelerator, then the loop over all grid blocks.
+#    pragma omp single nowait
+            { // One thread performs this sanity check; nowait: the other threads do not wait for it.
+                // The OpenMP runtime does not create a parallel region when either:
+                // * only one thread is required in the num_threads clause
+                // * or only one thread is available
+                // In all other cases we expect to be in a parallel region now.
+                if((numBlocksInGrid > 1) && (::omp_get_max_threads() > 1) && (::omp_in_parallel() == 0))
+                {
+                    throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
+                }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                std::cout << __func__ << " omp_get_num_threads: " << ::omp_get_num_threads() << std::endl;
+#    endif
+            }
+
+            AccCpuOmp2Blocks<TDim, TIdx> acc(
+                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
+                blockSharedMemDynSizeBytes); // local object: each calling thread gets its own accelerator
+
+            // Body of the OpenMP parallel loop to be executed, called once per grid block.
+            // Index type is auto since we have a difference for OpenMP 2.0 and later ones
+            auto loopBody = [&](auto currentIndex)
+            {
+#    if _OPENMP < 200805
+                auto const i_tidx = static_cast<TIdx>(currentIndex); // for issue #840
+                auto const index = Vec<DimInt<1u>, TIdx>(i_tidx); // for issue #840
+#    else
+                auto const index = Vec<DimInt<1u>, TIdx>(currentIndex); // for issue #840
+#    endif
+                acc.m_gridBlockIdx = mapIdx<TDim::value>(index, gridBlockExtent);
+
+                std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args)); // m_kernelFnObj(acc, args...)
+
+                // After a block has been processed, the shared memory has to be deleted.
+                freeSharedVars(acc);
+            };
+
+            detail::parallelFor(m_kernelFnObj, loopBody, numBlocksInGrid, schedule);
+        }
+
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
+
+    namespace trait
+    { // Trait specializations for TaskKernelCpuOmp2Blocks and for AccCpuOmp2Blocks (FunctionAttributes).
+        //! The CPU OpenMP 2.0 grid block execution task accelerator type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = AccCpuOmp2Blocks<TDim, TIdx>;
+        };
+
+        //! The CPU OpenMP 2.0 grid block execution task device type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU OpenMP 2.0 grid block execution task dimension getter trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TDim;
+        };
+
+        //! The CPU OpenMP 2.0 grid block execution task platform type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PlatformType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = PlatformCpu;
+        };
+
+        //! The CPU OpenMP 2.0 grid block execution task idx type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+
+        //! \brief Specialisation of the class template FunctionAttributes for AccCpuOmp2Blocks
+        //! \tparam TDev The device type.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccCpuOmp2Blocks<TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed (not used here).
+            //! \param args The kernel invocation arguments (not used here).
+            //! \return KernelFunctionAttributes with maxThreadsPerBlock taken from the device properties and
+            //! maxDynamicSharedSizeBytes set to BlockSharedDynMemberAllocKiB * 1024; other fields keep defaults.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
+                // properties function.
+                auto const& props = alpaka::getAccDevProps<AccCpuOmp2Blocks<TDim, TIdx>>(dev);
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes
+                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
+                return kernelFunctionAttributes;
+            }
+        };
+
+    } // namespace trait
+} // namespace alpaka
+
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+
+#endif
diff --git a/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp b/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
new file mode 100644
index 0000000..6b08e96
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
@@ -0,0 +1,232 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/acc/AccCpuOmp2Threads.hpp"
+#include "alpaka/core/Decay.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+#include <functional>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    include <iostream>
+#endif
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
+
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
+
+#    include <omp.h>
+
+namespace alpaka
+{
+    //! The CPU OpenMP 2.0 thread accelerator execution task; blocks run serially, block threads as OpenMP threads.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuOmp2Threads final : public WorkDivMembers<TDim, TIdx>
+    {
+    public:
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj) // kept as a copy inside the task
+            , m_args(std::forward<TArgs>(args)...) // kept as decayed values inside a tuple
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+
+        //! Executes the kernel function object; throws std::runtime_error if called inside a parallel region.
+        ALPAKA_FN_HOST auto operator()() const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
+            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
+            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
+
+            // Get the size of the block shared dynamic memory for this kernel, work division and arguments.
+            auto const blockSharedMemDynSizeBytes = std::apply(
+                [&](std::decay_t<TArgs> const&... args)
+                {
+                    return getBlockSharedMemDynSizeBytes<AccCpuOmp2Threads<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args);
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+
+            AccCpuOmp2Threads<TDim, TIdx> acc(
+                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
+                blockSharedMemDynSizeBytes); // a single accelerator object, referenced by the lambda below
+
+            // The number of threads in this block, also converted to int for the OpenMP num_threads clause.
+            TIdx const blockThreadCount(blockThreadExtent.prod());
+            [[maybe_unused]] int const iBlockThreadCount(static_cast<int>(blockThreadCount));
+
+            if(::omp_in_parallel() != 0)
+            {
+                throw std::runtime_error(
+                    "The OpenMP 2.0 thread backend can not be used within an existing parallel region!");
+            }
+
+            // Force the environment to use the given number of threads.
+            int const ompIsDynamic(::omp_get_dynamic()); // remembered so it can be restored afterwards
+            ::omp_set_dynamic(0);
+
+            // Execute the blocks serially.
+            meta::ndLoopIncIdx(
+                gridBlockExtent,
+                [&](Vec<TDim, TIdx> const& gridBlockIdx)
+                {
+                    acc.m_gridBlockIdx = gridBlockIdx;
+
+// Execute the threads in parallel.
+
+// Parallel execution of the threads in a block is required because when syncBlockThreads is called all of them have to
+// be done with their work up to this line. So we have to spawn one OS thread per thread in a block. 'omp for' is not
+// useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1
+// mapping is required. Therefore we use 'omp parallel' with the specified number of threads in a block.
+#    pragma omp parallel num_threads(iBlockThreadCount)
+                    {
+                        // The guard is for gcc internal compiler error, as discussed in #735
+                        if constexpr((!BOOST_COMP_GNUC) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(8, 1, 0)))
+                        {
+#    pragma omp single nowait
+                            { // One thread performs these checks; nowait: the other threads do not wait for it.
+                                // The OpenMP runtime does not create a parallel region when only one thread is
+                                // required in the num_threads clause. In all other cases we expect to be in a parallel
+                                // region now.
+                                if((iBlockThreadCount > 1) && (::omp_in_parallel() == 0))
+                                {
+                                    throw std::runtime_error(
+                                        "The OpenMP 2.0 runtime did not create a parallel region!");
+                                }
+
+                                int const numThreads = ::omp_get_num_threads();
+                                if(numThreads != iBlockThreadCount)
+                                { // NOTE(review): thrown from within the parallel region; confirm handling.
+                                    throw std::runtime_error(
+                                        "The OpenMP 2.0 runtime did not use the number of threads "
+                                        "that had been required!");
+                                }
+                            }
+                        }
+
+                        std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args)); // kernel(acc, args...)
+
+                        // All threads have to finish before the shared memory is deleted.
+                        // The end of the 'omp parallel' region waits for all threads, hence no explicit call to:
+                        // syncBlockThreads(acc);
+                    }
+
+                    // After a block has been processed, the shared memory has to be deleted.
+                    freeSharedVars(acc);
+                });
+
+            // Reset the dynamic thread number setting (not reached if the loop above exits via an exception).
+            ::omp_set_dynamic(ompIsDynamic);
+        }
+
+    private:
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
+
+    namespace trait
+    { // Trait specializations for TaskKernelCpuOmp2Threads and for AccCpuOmp2Threads (FunctionAttributes).
+        //! The CPU OpenMP 2.0 block thread execution task accelerator type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = AccCpuOmp2Threads<TDim, TIdx>;
+        };
+
+        //! The CPU OpenMP 2.0 block thread execution task device type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU OpenMP 2.0 block thread execution task dimension getter trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TDim;
+        };
+
+        //! The CPU OpenMP 2.0 block thread execution task platform type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PlatformType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = PlatformCpu;
+        };
+
+        //! The CPU OpenMP 2.0 block thread execution task idx type trait specialization.
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+
+        //! \brief Specialisation of the class template FunctionAttributes for AccCpuOmp2Threads
+        //! \tparam TDev The device type.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccCpuOmp2Threads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed (not used here).
+            //! \param args The kernel invocation arguments (not used here).
+            //! \return KernelFunctionAttributes with maxThreadsPerBlock taken from the device properties and
+            //! maxDynamicSharedSizeBytes set to BlockSharedDynMemberAllocKiB * 1024; other fields keep defaults.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
+                // properties function.
+                auto const& props = alpaka::getAccDevProps<AccCpuOmp2Threads<TDim, TIdx>>(dev);
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes
+                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
+                return kernelFunctionAttributes;
+            }
+        };
+
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/kernel/TaskKernelCpuSerial.hpp b/include/alpaka/kernel/TaskKernelCpuSerial.hpp
new file mode 100644
index 0000000..a9a370d
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelCpuSerial.hpp
@@ -0,0 +1,171 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/acc/AccCpuSerial.hpp"
+#include "alpaka/core/Decay.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+#include <functional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    include <iostream>
+#endif
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+
+namespace alpaka
+{
+    //! The CPU serial execution task: executes the blocks of the grid one after another.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuSerial final : public WorkDivMembers<TDim, TIdx>
+    {
+    public:
+        //! Stores the work division, the kernel function object and decayed copies of the kernel arguments.
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuSerial(TWorkDiv&& workDiv, TKernelFnObj kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(std::move(kernelFnObj))
+            , m_args(std::forward<TArgs>(args)...)
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+
+        //! Executes the kernel function object once for every block of the grid.
+        ALPAKA_FN_HOST auto operator()() const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
+            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
+            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
+
+            // Get the size of the block shared dynamic memory (queried once, reused for every block).
+            auto const blockSharedMemDynSizeBytes = std::apply(
+                [&](std::decay_t<TArgs> const&... args)
+                {
+                    return getBlockSharedMemDynSizeBytes<AccCpuSerial<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        args...);
+                },
+                m_args);
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                      << std::endl;
+#    endif
+
+            AccCpuSerial<TDim, TIdx> acc(
+                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
+                blockSharedMemDynSizeBytes);
+
+            // Execute the blocks serially; the same accelerator object is reused with an updated grid block index.
+            meta::ndLoopIncIdx(
+                gridBlockExtent,
+                [&](Vec<TDim, TIdx> const& gridBlockIdx)
+                {
+                    acc.m_gridBlockIdx = gridBlockIdx;
+                    // Invoke the kernel with the accelerator followed by a per-block copy of the stored arguments.
+                    std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
+                    // After a block has been processed, the shared memory has to be deleted.
+                    freeSharedVars(acc);
+                });
+        }
+
+    private:
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
+
+    namespace trait
+    {
+        //! The CPU serial execution task accelerator type trait specialization (AccCpuSerial).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = AccCpuSerial<TDim, TIdx>;
+        };
+
+        //! The CPU serial execution task device type trait specialization (DevCpu).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU serial execution task dimension getter trait specialization (the task's TDim).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TDim;
+        };
+
+        //! The CPU serial execution task platform type trait specialization (PlatformCpu).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PlatformType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = PlatformCpu;
+        };
+
+        //! The CPU serial execution task idx type trait specialization (the task's TIdx).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+
+        //! \brief Specialisation of the class template FunctionAttributes for the CPU serial accelerator.
+        //! \tparam TDev The device type.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccCpuSerial<TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance whose accelerator properties are queried.
+            //! \param kernelFn The kernel function object which should be executed (not inspected here).
+            //! \param args The kernel invocation arguments (not inspected here).
+            //! \return KernelFunctionAttributes with maxThreadsPerBlock set to the device's m_blockThreadCountMax
+            //! and maxDynamicSharedSizeBytes set to BlockSharedDynMemberAllocKiB * 1024; other fields stay default.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+                // There is no per-kernel attribute query for this back-end, so maxThreadsPerBlock is taken from the
+                // accelerator device properties.
+                auto const& props = alpaka::getAccDevProps<AccCpuSerial<TDim, TIdx>>(dev);
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes
+                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
+                return kernelFunctionAttributes;
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/kernel/TaskKernelCpuSycl.hpp b/include/alpaka/kernel/TaskKernelCpuSycl.hpp
new file mode 100644
index 0000000..b811a63
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelCpuSycl.hpp
@@ -0,0 +1,20 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/kernel/TaskKernelGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
+
+namespace alpaka
+{
+    //! The SYCL CPU execution task: TaskKernelGenericSycl instantiated with TagCpuSycl and AccCpuSycl.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    using TaskKernelCpuSycl
+        = TaskKernelGenericSycl<TagCpuSycl, AccCpuSycl<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp b/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
new file mode 100644
index 0000000..4ca90dd
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
@@ -0,0 +1,183 @@
+/* Copyright 2022 Benjamin Worpitz, Erik Zenker, René Widera, Felice Pantaleo, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/acc/AccCpuTbbBlocks.hpp"
+#include "alpaka/core/Decay.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/idx/MapIdx.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+#include <functional>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    include <iostream>
+#endif
+
+#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+
+#    include <tbb/blocked_range.h>
+#    include <tbb/parallel_for.h>
+#    include <tbb/task_group.h>
+
+namespace alpaka
+{
+    //! Execution task of the CPU TBB block accelerator; the grid blocks are run through tbb::parallel_for.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuTbbBlocks final : public WorkDivMembers<TDim, TIdx>
+    {
+    public:
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuTbbBlocks(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+
+        //! Runs the kernel function object once for every block of the grid.
+        ALPAKA_FN_HOST auto operator()() const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
+            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
+            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
+
+            // Query the dynamic block shared memory size once; every block uses the same value.
+            auto const smBytes = std::apply(
+                [&](std::decay_t<TArgs> const&... kernelArgs)
+                {
+                    return getBlockSharedMemDynSizeBytes<AccCpuTbbBlocks<TDim, TIdx>>(
+                        m_kernelFnObj,
+                        blockThreadExtent,
+                        threadElemExtent,
+                        kernelArgs...);
+                },
+                m_args);
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << smBytes << " B"
+                      << std::endl;
+#    endif
+
+            // Total number of blocks, i.e. the length of the linear iteration space.
+            TIdx const blockCount = gridBlockExtent.prod();
+
+            tbb::this_task_arena::isolate(
+                [&]
+                {
+                    tbb::parallel_for(
+                        static_cast<TIdx>(0),
+                        blockCount,
+                        [&](TIdx blockNr)
+                        {
+                            AccCpuTbbBlocks<TDim, TIdx> acc(
+                                static_cast<WorkDivMembers<TDim, TIdx> const&>(*this),
+                                smBytes);
+                            // Map the 1-D block number to a TDim-dimensional grid block index.
+                            acc.m_gridBlockIdx = mapIdx<TDim::value>(
+                                Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(blockNr)), gridBlockExtent);
+
+                            std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
+                            // Release the shared variables of this block once the kernel has returned.
+                            freeSharedVars(acc);
+                        });
+                });
+        }
+
+    private:
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
+
+    namespace trait
+    {
+        //! The CPU TBB block execution task accelerator type trait specialization (AccCpuTbbBlocks).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = AccCpuTbbBlocks<TDim, TIdx>;
+        };
+
+        //! The CPU TBB block execution task device type trait specialization (DevCpu).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU TBB block execution task dimension getter trait specialization (the task's TDim).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TDim;
+        };
+
+        //! The CPU TBB block execution task platform type trait specialization (PlatformCpu).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PlatformType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = PlatformCpu;
+        };
+
+        //! The CPU TBB block execution task idx type trait specialization (the task's TIdx).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+
+        //! \brief Specialisation of the class template FunctionAttributes for the CPU TBB block accelerator.
+        //! \tparam TDev The device type.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccCpuTbbBlocks<TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance whose accelerator properties are queried.
+            //! \param kernelFn The kernel function object which should be executed (not inspected here).
+            //! \param args The kernel invocation arguments (not inspected here).
+            //! \return KernelFunctionAttributes with maxThreadsPerBlock set to the device's m_blockThreadCountMax
+            //! and maxDynamicSharedSizeBytes set to BlockSharedDynMemberAllocKiB * 1024; other fields stay default.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+                // There is no per-kernel attribute query for this back-end, so maxThreadsPerBlock is taken from the
+                // accelerator device properties.
+                auto const& props = alpaka::getAccDevProps<AccCpuTbbBlocks<TDim, TIdx>>(dev);
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes
+                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
+                return kernelFunctionAttributes;
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/kernel/TaskKernelCpuThreads.hpp b/include/alpaka/kernel/TaskKernelCpuThreads.hpp
new file mode 100644
index 0000000..850b661
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelCpuThreads.hpp
@@ -0,0 +1,240 @@
+/* Copyright 2023 Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+// Specialized traits.
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+// Implementation details.
+#include "alpaka/acc/AccCpuThreads.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Decay.hpp"
+#include "alpaka/core/ThreadPool.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+#include <algorithm>
+#include <functional>
+#include <future>
+#include <thread>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    include <iostream>
+#endif
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+
+namespace alpaka
+{
+    //! The CPU threads execution task: blocks run serially, the threads of a block run as thread pool tasks.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelCpuThreads final : public WorkDivMembers<TDim, TIdx>
+    {
+    private:
+        // When using the thread pool the threads are yielding because this is faster.
+        // Using condition variables and going to sleep is very costly for real threads.
+        // Especially when the time to wait is really short (syncBlockThreads) yielding is much faster.
+        using ThreadPool = alpaka::core::detail::ThreadPool;
+
+    public:
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelCpuThreads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(
+                  args)...) // FIXME(bgruber): this does not forward, since TArgs is not a deduced template parameter
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+
+        //! Executes the kernel function object by unpacking the stored arguments into runWithArgs.
+        ALPAKA_FN_HOST auto operator()() const -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            std::apply([&](auto const&... args) { runWithArgs(args...); }, m_args);
+        }
+
+    private:
+        ALPAKA_FN_HOST auto runWithArgs(std::decay_t<TArgs> const&... args) const -> void
+        {
+            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
+            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
+            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
+
+            // Get the size of the block shared dynamic memory (queried once, reused for every block).
+            auto const smBytes = getBlockSharedMemDynSizeBytes<AccCpuThreads<TDim, TIdx>>(
+                m_kernelFnObj,
+                blockThreadExtent,
+                threadElemExtent,
+                args...);
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            std::cout << __func__ << " smBytes: " << smBytes << " B" << std::endl;
+#    endif
+            AccCpuThreads<TDim, TIdx> acc(*static_cast<WorkDivMembers<TDim, TIdx> const*>(this), smBytes);
+
+            auto const threadsPerBlock = blockThreadExtent.prod();
+            ThreadPool threadPool(static_cast<std::size_t>(threadsPerBlock));
+
+            // Execute the blocks serially, sharing one accelerator object and one thread pool between all blocks.
+            meta::ndLoopIncIdx(
+                gridBlockExtent,
+                [&](Vec<TDim, TIdx> const& gridBlockIdx)
+                { runBlock(acc, gridBlockIdx, blockThreadExtent, threadPool, m_kernelFnObj, args...); });
+        }
+
+        //! The function executed for each grid block: enqueues one task per block thread and waits for all of them.
+        ALPAKA_FN_HOST static auto runBlock(
+            AccCpuThreads<TDim, TIdx>& acc,
+            Vec<TDim, TIdx> const& gridBlockIdx,
+            Vec<TDim, TIdx> const& blockThreadExtent,
+            ThreadPool& threadPool,
+            TKernelFnObj const& kernelFnObj,
+            std::decay_t<TArgs> const&... args) -> void
+        {
+            std::vector<std::future<void>> futuresInBlock;
+            acc.m_gridBlockIdx = gridBlockIdx;
+
+            // Execute the threads of this block in parallel.
+            meta::ndLoopIncIdx(
+                blockThreadExtent,
+                [&](Vec<TDim, TIdx> const& blockThreadIdx)
+                {
+                    // copy blockThreadIdx because it will get changed for the next iteration/thread.
+                    futuresInBlock.emplace_back(threadPool.enqueueTask(
+                        [&, blockThreadIdx] { runThread(acc, blockThreadIdx, kernelFnObj, args...); }));
+                });
+
+            // Wait for all threads of this block. NOTE(review): wait() does not rethrow a task's exception.
+            for(auto& t : futuresInBlock)
+                t.wait();
+
+            // Clean up the per-block state so that the next block starts with an empty thread map.
+            futuresInBlock.clear();
+            acc.m_threadToIndexMap.clear();
+            freeSharedVars(acc); // After a block has been processed, the shared memory has to be deleted.
+        }
+
+        //! The thread entry point on the accelerator; registers the thread, then runs the kernel between two syncs.
+        ALPAKA_FN_HOST static auto runThread(
+            AccCpuThreads<TDim, TIdx>& acc,
+            Vec<TDim, TIdx> const& blockThreadIdx,
+            TKernelFnObj const& kernelFnObj,
+            std::decay_t<TArgs> const&... args) -> void
+        {
+            // We have to store the thread data before the kernel is calling any of the methods of this class depending
+            // on them.
+            auto const threadId = std::this_thread::get_id();
+
+            // Only the thread whose index components sum to zero records itself as the master thread.
+            if(blockThreadIdx.sum() == 0)
+            {
+                acc.m_idMasterThread = threadId;
+            }
+
+            {
+                // Save the thread id, and index; the mutex serializes concurrent insertions into the shared map.
+                std::lock_guard<std::mutex> lock(acc.m_mtxMapInsert);
+                acc.m_threadToIndexMap.emplace(threadId, blockThreadIdx);
+            }
+
+            // Sync all threads so that the maps with thread id's are complete and not changed after here.
+            syncBlockThreads(acc);
+
+            // Execute the kernel itself; the accelerator is handed to the kernel as const.
+            kernelFnObj(std::as_const(acc), args...);
+
+            // We have to sync all threads here because if a thread would finish before all threads have been started,
+            // a new thread could get the recycled (then duplicate) thread id!
+            syncBlockThreads(acc);
+        }
+
+        TKernelFnObj m_kernelFnObj;
+        std::tuple<std::decay_t<TArgs>...> m_args;
+    };
+
+    namespace trait
+    {
+        //! The CPU threads execution task accelerator type trait specialization (AccCpuThreads).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = AccCpuThreads<TDim, TIdx>;
+        };
+
+        //! The CPU threads execution task device type trait specialization (DevCpu).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU threads execution task dimension getter trait specialization (the task's TDim).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TDim;
+        };
+
+        //! The CPU threads execution task platform type trait specialization (PlatformCpu).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PlatformType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = PlatformCpu;
+        };
+
+        //! The CPU threads execution task idx type trait specialization (the task's TIdx).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+
+        //! \brief Specialisation of the class template FunctionAttributes for the CPU threads accelerator.
+        //! \tparam TDev The device type.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccCpuThreads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance whose accelerator properties are queried.
+            //! \param kernelFn The kernel function object which should be executed (not inspected here).
+            //! \param args The kernel invocation arguments (not inspected here).
+            //! \return KernelFunctionAttributes with maxThreadsPerBlock set to the device's m_blockThreadCountMax
+            //! and maxDynamicSharedSizeBytes set to BlockSharedDynMemberAllocKiB * 1024; other fields stay default.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+
+                // There is no per-kernel attribute query for this back-end, so maxThreadsPerBlock is taken from the
+                // accelerator device properties.
+                auto const& props = alpaka::getAccDevProps<AccCpuThreads<TDim, TIdx>>(dev);
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes
+                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
+                return kernelFunctionAttributes;
+            }
+        };
+
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp b/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
new file mode 100644
index 0000000..6163165
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
@@ -0,0 +1,20 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/kernel/TaskKernelGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
+
+namespace alpaka
+{
+    //! The Intel FPGA SYCL execution task: TaskKernelGenericSycl with TagFpgaSyclIntel and AccFpgaSyclIntel.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    using TaskKernelFpgaSyclIntel
+        = TaskKernelGenericSycl<TagFpgaSyclIntel, AccFpgaSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
new file mode 100644
index 0000000..11cc2ca
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
@@ -0,0 +1,314 @@
+/* Copyright 2024 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/AccGenericSycl.hpp"
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Sycl.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
+#include "alpaka/kernel/SyclSubgroupSize.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/PlatformGenericSycl.hpp"
+#include "alpaka/platform/Traits.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+#include <cassert>
+#include <functional>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wunused-lambda-capture"
+#        pragma clang diagnostic ignored "-Wunused-parameter"
+#    endif
+
+#    include <sycl/sycl.hpp>
+// Submits the kernel with a required sub-group size. Relies on names of the expansion site (cgh, global_size, ...).
+#    define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)                                                    \
+        cgh.parallel_for(                                                                                             \
+            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
+            [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args](                                 \
+                sycl::nd_item<TDim::value> work_item) [[intel::reqd_sub_group_size(sub_group_size)]]                  \
+            {                                                                                                         \
+                auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor};                   \
+                std::apply(                                                                                           \
+                    [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },            \
+                    k_args);                                                                                          \
+            });
+// Submits the kernel without a sub-group size attribute. Relies on the same names of the expansion site.
+#    define LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE                                                             \
+        cgh.parallel_for(                                                                                             \
+            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
+            [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args](                                 \
+                sycl::nd_item<TDim::value> work_item)                                                                 \
+            {                                                                                                         \
+                auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor};                   \
+                std::apply(                                                                                           \
+                    [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },            \
+                    k_args);                                                                                          \
+            });
+// Throws kernel_not_supported. The empty parallel_for after the throw is never executed.
+#    define THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL                                                                        \
+        throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));                               \
+        cgh.parallel_for(                                                                                             \
+            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
+            [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args](                                 \
+                sycl::nd_item<TDim::value> work_item) {});
+
+namespace alpaka
+{
+    //! The SYCL accelerator execution task: a work division plus the kernel function object and its arguments.
+    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelGenericSycl final : public WorkDivMembers<TDim, TIdx>
+    {
+    public:
+        static_assert(TDim::value > 0 && TDim::value <= 3, "Invalid kernel dimensionality");
+
+        //! Stores the work division, a copy of the kernel function object and the decayed kernel arguments.
+        template<typename TWorkDiv>
+        TaskKernelGenericSycl(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj{kernelFnObj}
+            , m_args{std::forward<TArgs>(args)...}
+        {
+        }
+
+        //! Command-group function: sets up the local-memory accessors and submits the kernel through cgh.
+        auto operator()(sycl::handler& cgh) const -> void
+        {
+            auto const work_groups = WorkDivMembers<TDim, TIdx>::m_gridBlockExtent;
+            auto const group_items = WorkDivMembers<TDim, TIdx>::m_blockThreadExtent;
+            auto const item_elements = WorkDivMembers<TDim, TIdx>::m_threadElemExtent;
+
+            auto const global_size = get_global_size(work_groups, group_items);
+            auto const local_size = get_local_size(group_items);
+
+            // allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
+            auto const dyn_shared_mem_bytes = std::max(
+                std::size_t{1},
+                std::apply(
+                    [&](std::decay_t<TArgs> const&... args) {
+                        return getBlockSharedMemDynSizeBytes<TAcc>(m_kernelFnObj, group_items, item_elements, args...);
+                    },
+                    m_args));
+
+            auto dyn_shared_accessor = sycl::local_accessor<std::byte>{sycl::range<1>{dyn_shared_mem_bytes}, cgh};
+
+            // allocate static shared memory -- value comes from the build system
+            constexpr auto st_shared_mem_bytes = std::size_t{ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB * 1024};
+            auto st_shared_accessor = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
+
+            // copy-by-value so we don't access 'this' on the device
+            auto k_func = m_kernelFnObj;
+            auto k_args = m_args;
+
+            constexpr std::size_t sub_group_size = trait::warpSize<TKernelFnObj, TAcc>;
+            bool supported = false;
+
+            if constexpr(sub_group_size == 0)
+            {
+                // no explicit subgroup size requirement
+                LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE
+                supported = true;
+            }
+            else
+            {
+#    if(SYCL_SUBGROUP_SIZE == 0)
+                // no explicit SYCL target, assume JIT compilation
+                LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)
+                supported = true;
+#    else
+                // check if the kernel should be launched with a subgroup size of 4
+                if constexpr(sub_group_size == 4)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 4)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(4)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 8
+                if constexpr(sub_group_size == 8)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 8)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(8)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 16
+                if constexpr(sub_group_size == 16)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 16)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(16)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 32
+                if constexpr(sub_group_size == 32)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 32)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(32)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 64
+                if constexpr(sub_group_size == 64)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 64)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(64)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+#    endif
+
+                // this subgroup size is not supported, raise an exception
+                if(not supported)
+                    throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));
+            }
+        }
+
+        static constexpr auto is_sycl_task = true;
+        // Distinguish from other tasks
+        static constexpr auto is_sycl_kernel = true;
+
+    private:
+        //! Global range: blocks times threads per block for each dimension, components in reverse order.
+        auto get_global_size(Vec<TDim, TIdx> const& work_groups, Vec<TDim, TIdx> const& group_items) const
+        {
+            // Widen both factors before multiplying so the product cannot overflow a narrow TIdx.
+            auto const extent = [&](std::size_t const dim)
+            { return static_cast<std::size_t>(work_groups[dim]) * static_cast<std::size_t>(group_items[dim]); };
+
+            if constexpr(TDim::value == 1)
+                return sycl::range<1>{extent(0)};
+            else if constexpr(TDim::value == 2)
+                return sycl::range<2>{extent(1), extent(0)};
+            else
+                return sycl::range<3>{extent(2), extent(1), extent(0)};
+        }
+
+        //! Local range: threads per block for each dimension, components in reverse order.
+        auto get_local_size(Vec<TDim, TIdx> const& group_items) const
+        {
+            auto const extent = [&](std::size_t const dim) { return static_cast<std::size_t>(group_items[dim]); };
+
+            if constexpr(TDim::value == 1)
+                return sycl::range<1>{extent(0)};
+            else if constexpr(TDim::value == 2)
+                return sycl::range<2>{extent(1), extent(0)};
+            else
+                return sycl::range<3>{extent(2), extent(1), extent(0)};
+        }
+
+    public:
+        TKernelFnObj m_kernelFnObj; //!< Copy of the kernel function object.
+        std::tuple<std::decay_t<TArgs>...> m_args; //!< Kernel arguments, stored as decayed copies.
+    };
+
+} // namespace alpaka
+
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+
+namespace alpaka::trait
+{
+    //! The SYCL execution task accelerator type trait specialization.
+    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    struct AccType<TaskKernelGenericSycl<TTag, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+    {
+        using type = TAcc;
+    };
+
+    //! The SYCL execution task device type trait specialization.
+    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    struct DevType<TaskKernelGenericSycl<TTag, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+    {
+        using type = typename DevType<TAcc>::type;
+    };
+
+    //! The SYCL execution task platform type trait specialization.
+    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    struct PlatformType<TaskKernelGenericSycl<TTag, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+    {
+        using type = typename PlatformType<TAcc>::type;
+    };
+
+    //! The SYCL execution task dimension getter trait specialization.
+    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    struct DimType<TaskKernelGenericSycl<TTag, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+    {
+        using type = TDim;
+    };
+
+    //! The SYCL execution task idx type trait specialization.
+    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    struct IdxType<TaskKernelGenericSycl<TTag, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+    {
+        using type = TIdx;
+    };
+
+    //! \brief FunctionAttributes specialisation for the generic SYCL accelerator.
+    //! \tparam TTag The SYCL device selector.
+    //! \tparam TDev The device type.
+    //! \tparam TDim The dimensionality of the accelerator device properties.
+    //! \tparam TIdx The idx type of the accelerator device properties.
+    //! \tparam TKernelFn Kernel function object type.
+    //! \tparam TArgs Kernel function object argument types as a parameter pack.
+    template<typename TTag, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+    struct FunctionAttributes<AccGenericSycl<TTag, TDim, TIdx>, TDev, TKernelFn, TArgs...>
+    {
+        //! \param dev The device instance whose accelerator properties are queried.
+        //! \param kernelFn The kernel function object which should be executed (unused here).
+        //! \param args The kernel invocation arguments (unused here).
+        //! \return KernelFunctionAttributes with maxThreadsPerBlock set to the device's block thread limit; all
+        //! other fields keep the values given by the KernelFunctionAttributes default initialisation.
+        ALPAKA_FN_HOST static auto getFunctionAttributes(
+            TDev const& dev,
+            [[maybe_unused]] TKernelFn const& kernelFn,
+            [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+        {
+            // No per-kernel query is performed: the limit is the accelerator's device-wide block thread maximum.
+            auto const& devProps = alpaka::getAccDevProps<AccGenericSycl<TTag, TDim, TIdx>>(dev);
+
+            alpaka::KernelFunctionAttributes attrs;
+            attrs.maxThreadsPerBlock = static_cast<int>(devProps.m_blockThreadCountMax);
+            return attrs;
+        }
+    };
+} // namespace alpaka::trait
+#    undef LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS
+#    undef LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE
+#    undef THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#endif
diff --git a/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp b/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
new file mode 100644
index 0000000..416e893
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
@@ -0,0 +1,19 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiCudaRt.hpp"
+#include "alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+namespace alpaka
+{
+    //! The GPU CUDA kernel task: TaskKernelGpuUniformCudaHipRt bound to ApiCudaRt.
+    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    using TaskKernelGpuCudaRt = TaskKernelGpuUniformCudaHipRt<ApiCudaRt, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/kernel/TaskKernelGpuHipRt.hpp b/include/alpaka/kernel/TaskKernelGpuHipRt.hpp
new file mode 100644
index 0000000..b4b284c
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelGpuHipRt.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiHipRt.hpp"
+#include "alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+namespace alpaka
+{
+    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs> // HIP task alias
+    using TaskKernelGpuHipRt = TaskKernelGpuUniformCudaHipRt<ApiHipRt, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp b/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
new file mode 100644
index 0000000..e5c5a9a
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
@@ -0,0 +1,20 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/kernel/TaskKernelGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
+
+namespace alpaka
+{
+    //! The Intel GPU SYCL kernel task: TaskKernelGenericSycl bound to TagGpuSyclIntel and AccGpuSyclIntel.
+    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    using TaskKernelGpuSyclIntel
+        = TaskKernelGenericSycl<TagGpuSyclIntel, AccGpuSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
new file mode 100644
index 0000000..53bbaf6
--- /dev/null
+++ b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
@@ -0,0 +1,373 @@
+/* Copyright 2024 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
+ * Manfred Gruber, Antonio Di Pilato, Mehmet Yusufoglu
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/AccGpuUniformCudaHipRt.hpp"
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Decay.hpp"
+#include "alpaka/core/DemangleTypeNames.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/core/RemoveRestrict.hpp"
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
+#include "alpaka/workdiv/WorkDivHelpers.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    include <iostream>
+#endif
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        include "alpaka/core/BoostPredef.hpp"
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+namespace alpaka
+{
+    namespace detail
+    {
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic push
+#            pragma clang diagnostic ignored "-Wunused-template"
+#        endif
+        //! The GPU CUDA/HIP kernel entry point.
+        // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
+        // The function lives in a shallow namespace and has a short name, so that its mangled name in the
+        // profiler (e.g. ncu) is as short as possible.
+        template<typename TKernelFnObj, typename TApi, typename TAcc, typename TDim, typename TIdx, typename... TArgs>
+        __global__ void gpuKernel(
+            Vec<TDim, TIdx> const threadElemExtent,
+            TKernelFnObj const kernelFnObj,
+            TArgs... args)
+        {
+            TAcc const acc(threadElemExtent);
+
+// with clang it is not possible to query std::result_of for a pure device lambda created on the host side
+#        if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
+            static_assert(
+                std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)), void>,
+                "The TKernelFnObj is required to return void!");
+#        endif
+            kernelFnObj(const_cast<TAcc const&>(acc), args...);
+        }
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic pop
+#        endif
+    } // namespace detail
+
+    namespace uniform_cuda_hip
+    {
+        namespace detail
+        {
+            //! Throws std::runtime_error if any component in front of the last three differs from 1.
+            template<typename TDim, typename TIdx>
+            ALPAKA_FN_HOST auto checkVecOnly3Dim(Vec<TDim, TIdx> const& vec) -> void
+            {
+                if constexpr(TDim::value > 3)
+                {
+                    // Only vectors with more than three components have leading components to check.
+                    for(auto lead = typename TDim::value_type{0}; lead < TDim::value - 3u; ++lead)
+                    {
+                        if(vec[lead] != 1)
+                            throw std::runtime_error(
+                                "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
+                                "work division extents of the dimensions higher 3 have to be 1!");
+                    }
+                }
+            }
+
+            template<typename TDim, typename TIdx>
+            ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(Vec<TDim, TIdx> const& vec) -> dim3
+            {
+                checkVecOnly3Dim(vec); // components in front of the last three have to be 1
+                dim3 result(1, 1, 1); // components that vec does not provide keep the extent 1
+                if constexpr(TDim::value >= 1)
+                    result.x = static_cast<unsigned>(vec[TDim::value - 1u]);
+                if constexpr(TDim::value >= 2)
+                    result.y = static_cast<unsigned>(vec[TDim::value - 2u]);
+                if constexpr(TDim::value >= 3)
+                    result.z = static_cast<unsigned>(vec[TDim::value - 3u]);
+                return result;
+            }
+        } // namespace detail
+    } // namespace uniform_cuda_hip
+
+    //! The GPU CUDA/HIP execution task: work division, kernel function object and its stored arguments.
+    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+    class TaskKernelGpuUniformCudaHipRt final : public WorkDivMembers<TDim, TIdx>
+    {
+    public:
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST TaskKernelGpuUniformCudaHipRt(
+            TWorkDiv&& workDiv,
+            TKernelFnObj const& kernelFnObj,
+            TArgs&&... args)
+            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
+            , m_kernelFnObj(kernelFnObj)
+            , m_args(std::forward<TArgs>(args)...)
+        {
+            static_assert(
+                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
+                "The work division and the execution task have to be of the same dimensionality!");
+        }
+
+        TKernelFnObj m_kernelFnObj; //!< Copy of the kernel function object.
+        std::tuple<remove_restrict_t<std::decay_t<TArgs>>...> m_args; //!< Kernel arguments, stored by value.
+    };
+
+    namespace trait
+    {
+        //! Accelerator type of a CUDA/HIP kernel task: always AccGpuUniformCudaHipRt<TApi, TDim, TIdx>.
+        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct AccType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = AccGpuUniformCudaHipRt<TApi, TDim, TIdx>;
+        };
+
+        //! Device type of a CUDA/HIP kernel task: DevUniformCudaHipRt<TApi>.
+        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DevType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = DevUniformCudaHipRt<TApi>;
+        };
+
+        //! Dimensionality of a CUDA/HIP kernel task: the task's TDim parameter.
+        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct DimType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TDim;
+        };
+
+        //! The GPU CUDA/HIP execution task platform type trait specialization.
+        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct PlatformType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = PlatformUniformCudaHipRt<TApi>;
+        };
+
+        //! Index type of a CUDA/HIP kernel task: the task's TIdx parameter.
+        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct IdxType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            using type = TIdx;
+        };
+
+        //! The CUDA/HIP kernel enqueue trait specialization: launches a kernel task on a CUDA/HIP queue.
+        template<
+            typename TApi,
+            bool TBlocking,
+            typename TAcc,
+            typename TDim,
+            typename TIdx,
+            typename TKernelFnObj,
+            typename... TArgs>
+        struct Enqueue<
+            uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
+            TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
+                TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
+
+#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // std::size_t printfFifoSize;
+                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
+                // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
+                // TApi::deviceSetLimit(TApi::limitPrintfFifoSize, printfFifoSize*10);
+                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
+                // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
+#        endif
+                auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
+                auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
+                auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
+
+                dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
+                dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
+                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent); // validated only; passed as kernel arg
+
+#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " gridDim: (" << gridDim.z << ", " << gridDim.y << ", " << gridDim.x << ")\n";
+                std::cout << __func__ << " blockDim: (" << blockDim.z << ", " << blockDim.y << ", " << blockDim.x
+                          << ")\n";
+#        endif
+
+#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                // This checks for a valid work division that is also compliant with the hardware maxima of the
+                // accelerator.
+                if(!isValidWorkDiv<TAcc>(task, getDev(queue)))
+                {
+                    throw std::runtime_error(
+                        "The given work division is not valid or not supported by the device of type "
+                        + getAccName<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>() + "!");
+                }
+#        endif
+
+                // Get the size of the block shared dynamic memory.
+                auto const blockSharedMemDynSizeBytes = std::apply(
+                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
+                        return getBlockSharedMemDynSizeBytes<TAcc>(
+                            task.m_kernelFnObj,
+                            blockThreadExtent,
+                            threadElemExtent,
+                            args...);
+                    },
+                    task.m_args);
+
+#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // Log the size of the block shared dynamic memory.
+                std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
+                          << std::endl;
+#        endif
+
+                auto kernelName = alpaka::detail::
+                    gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;
+
+#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                // Log the function attributes. NOTE(review): queried before setDevice() below - confirm the device.
+                typename TApi::FuncAttributes_t funcAttrs;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::funcGetAttributes(&funcAttrs, kernelName));
+                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
+                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
+                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
+                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
+                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
+                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
+#        endif
+
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(queue.m_spQueueImpl->m_dev.getNativeHandle()));
+
+                // Enqueue the kernel execution.
+                // \NOTE: No const reference (const &) is allowed as the parameter type because the kernel launch
+                // language extension expects the arguments by value. This forces the type of a float argument given
+                // with std::forward to this function to be of type float instead of e.g. "float const & __ptr64"
+                // (MSVC). If not given by value, the kernel launch code does not copy the value but the pointer to the
+                // value location.
+                std::apply(
+                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args)
+                    {
+                        kernelName<<<
+                            gridDim,
+                            blockDim,
+                            static_cast<std::size_t>(blockSharedMemDynSizeBytes),
+                            queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
+                    },
+                    task.m_args);
+
+                if constexpr(TBlocking || ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
+                {
+                    // Wait for the kernel execution to finish but do not check error return of this call.
+                    // Do not use the alpaka::wait method because it checks the error itself but we want to give a
+                    // custom error message.
+                    std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
+                }
+                if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
+                {
+                    auto const msg
+                        = std::string{"execution of kernel '" + core::demangled<TKernelFnObj> + "' failed with"};
+                    ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
+                }
+            }
+        };
+
+        //! \brief Specialisation of the class template FunctionAttributes for the uniform CUDA/HIP accelerator.
+        //! \tparam TApi The type of the API of the GPU accelerator backend. Currently Cuda or Hip.
+        //! \tparam TDim The dimensionality of the accelerator device properties.
+        //! \tparam TIdx The idx type of the accelerator device properties.
+        //! \tparam TKernelFn Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TApi, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
+        struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, TKernelFn, TArgs...>
+        {
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
+            //! \return KernelFunctionAttributes instance filled with the values that TApi::funcGetAttributes
+            //! reports for the gpuKernel instantiation selected by TKernelFn and the decayed TArgs.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                [[maybe_unused]] TDev const& dev,
+                [[maybe_unused]] TKernelFn const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                auto kernelName = alpaka::detail::gpuKernel<
+                    TKernelFn,
+                    TApi,
+                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
+                    TDim,
+                    TIdx,
+                    remove_restrict_t<std::decay_t<TArgs>>...>;
+
+                typename TApi::FuncAttributes_t funcAttrs;
+#        if BOOST_COMP_GNUC
+                // Silence gcc for the function pointer to void pointer cast below.
+#            pragma GCC diagnostic push
+#            pragma GCC diagnostic ignored "-Wconditionally-supported"
+#        endif
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    TApi::funcGetAttributes(&funcAttrs, reinterpret_cast<void const*>(kernelName)));
+#        if BOOST_COMP_GNUC
+#            pragma GCC diagnostic pop
+#        endif
+
+                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
+                kernelFunctionAttributes.constSizeBytes = funcAttrs.constSizeBytes;
+                kernelFunctionAttributes.localSizeBytes = funcAttrs.localSizeBytes;
+                kernelFunctionAttributes.sharedSizeBytes = funcAttrs.sharedSizeBytes;
+                kernelFunctionAttributes.maxDynamicSharedSizeBytes = funcAttrs.maxDynamicSharedSizeBytes;
+                kernelFunctionAttributes.numRegs = funcAttrs.numRegs;
+                kernelFunctionAttributes.asmVersion = funcAttrs.ptxVersion;
+                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(funcAttrs.maxThreadsPerBlock);
+
+#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printf("Kernel Function Attributes: \n");
+                printf("binaryVersion: %d \n", funcAttrs.binaryVersion);
+                printf( // the three *SizeBytes fields are size_t in the CUDA/HIP structs, hence %zu instead of %lu
+                    "constSizeBytes: %zu \n localSizeBytes: %zu, sharedSizeBytes %zu  maxDynamicSharedSizeBytes: %d "
+                    "\n",
+                    funcAttrs.constSizeBytes,
+                    funcAttrs.localSizeBytes,
+                    funcAttrs.sharedSizeBytes,
+                    funcAttrs.maxDynamicSharedSizeBytes);
+
+                printf(
+                    "numRegs: %d, ptxVersion: %d \n maxThreadsPerBlock: %d .\n ",
+                    funcAttrs.numRegs,
+                    funcAttrs.ptxVersion,
+                    funcAttrs.maxThreadsPerBlock);
+#        endif
+                return kernelFunctionAttributes;
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#    endif
+
+#endif
diff --git a/include/alpaka/kernel/Traits.hpp b/include/alpaka/kernel/Traits.hpp
new file mode 100644
index 0000000..c2c0a55
--- /dev/null
+++ b/include/alpaka/kernel/Traits.hpp
@@ -0,0 +1,383 @@
+/* Copyright 2023 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber,
+ *                Andrea Bocci, Aurora Perego, Mehmet Yusufoglu
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Debug.hpp"
+#include "alpaka/core/DemangleTypeNames.hpp"
+#include "alpaka/core/OmpSchedule.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+#include "alpaka/workdiv/Traits.hpp"
+
+#include <type_traits>
+
+//! The alpaka accelerator library.
+namespace alpaka
+{
+    //! The kernel traits.
+    namespace trait
+    {
+        //! The kernel execution task creation trait.
+        template<
+            typename TAcc,
+            typename TWorkDiv,
+            typename TKernelFnObj,
+            typename... TArgs/*,
+            typename TSfinae = void*/>
+        struct CreateTaskKernel;
+
+        //! The trait for getting the size of the block shared dynamic memory of a kernel.
+        //!
+        //! \tparam TKernelFnObj The kernel function object.
+        //! \tparam TAcc The accelerator.
+        //!
+        //! The default implementation returns 0.
+        template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
+        struct BlockSharedMemDynSizeBytes
+        {
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
+#endif
+            //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
+            //! \param blockThreadExtent The block thread extent.
+            //! \param threadElemExtent The thread element extent.
+            //! \tparam TArgs The kernel invocation argument types pack.
+            //! \param args,... The kernel invocation arguments.
+            //! \return The size of the shared memory allocated for a block in bytes.
+            //! The default version always returns zero.
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TDim, typename... TArgs>
+            ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
+                [[maybe_unused]] TKernelFnObj const& kernelFnObj,
+                [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
+                [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& threadElemExtent,
+                [[maybe_unused]] TArgs const&... args) -> std::size_t
+            {
+                return 0u;
+            }
+        };
+
+        //! \brief The structure template to access the function attributes of a kernel function object.
+        //! \tparam TAcc The accelerator type
+        //! \tparam TKernelFnObj Kernel function object type.
+        //! \tparam TArgs Kernel function object argument types as a parameter pack.
+        template<typename TAcc, typename TDev, typename TKernelFnObj, typename... TArgs>
+        struct FunctionAttributes
+        {
+            //! \param dev The device instance
+            //! \param kernelFn The kernel function object which should be executed.
+            //! \param args The kernel invocation arguments.
+            //! \return Nothing in this default version: it always throws std::invalid_argument to signal that the
+            //! trait is not specialised for the given template arguments.
+            ALPAKA_FN_HOST static auto getFunctionAttributes(
+                [[maybe_unused]] TDev const& dev,
+                [[maybe_unused]] TKernelFnObj const& kernelFn,
+                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
+            {
+                std::string const str
+                    = std::string(__func__) + " function is not specialised for the given arguments.\n";
+                throw std::invalid_argument{str}; // NOTE(review): confirm <string>/<stdexcept> arrive transitively
+            }
+        };
+
+        //! The trait for getting the warp size required by a kernel.
+        //!
+        //! \tparam TKernelFnObj The kernel function object.
+        //! \tparam TAcc The accelerator.
+        //!
+        //! The default implementation returns 0, which lets the accelerator compiler and runtime choose the warp size.
+        template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
+        struct WarpSize : std::integral_constant<std::uint32_t, 0>
+        {
+        };
+
+        //! Variable template shortcut for WarpSize<TKernelFnObj, TAcc>::value
+        template<typename TKernelFnObj, typename TAcc>
+        inline constexpr std::uint32_t warpSize = WarpSize<TKernelFnObj, TAcc>::value;
+
+        //! The trait for getting the schedule to use when a kernel is run using the CpuOmp2Blocks accelerator.
+        //!
+        //! Has no effect on other accelerators.
+        //!
+        //! A user could either specialize this trait for their kernel, or define a public static member
+        //! ompScheduleKind of type alpaka::omp::Schedule, and additionally also int member ompScheduleChunkSize. In
+        //! the latter case, alpaka never odr-uses these members.
+        //!
+        //! In case schedule kind and chunk size are compile-time constants, setting them inside the kernel may benefit
+        //! performance.
+        //!
+        //! \tparam TKernelFnObj The kernel function object.
+        //! \tparam TAcc The accelerator.
+        //!
+        //! The default implementation behaves as if the trait was not specialized.
+        template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
+        struct OmpSchedule
+        {
+        private:
+            //! Type returned when the trait is not specialized
+            struct TraitNotSpecialized
+            {
+            };
+
+        public:
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
+#endif
+            //! \param kernelFnObj The kernel object for which the schedule should be returned.
+            //! \param blockThreadExtent The block thread extent.
+            //! \param threadElemExtent The thread element extent.
+            //! \tparam TArgs The kernel invocation argument types pack.
+            //! \param args,... The kernel invocation arguments.
+            //! \return The OpenMP schedule information as an alpaka::omp::Schedule object,
+            //!         returning an object of any other type is treated as if the trait is not specialized.
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TDim, typename... TArgs>
+            ALPAKA_FN_HOST static auto getOmpSchedule(
+                [[maybe_unused]] TKernelFnObj const& kernelFnObj,
+                [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
+                [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& threadElemExtent,
+                [[maybe_unused]] TArgs const&... args) -> TraitNotSpecialized
+            {
+                return TraitNotSpecialized{};
+            }
+        };
+    } // namespace trait
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
+#endif
+//! \tparam TAcc The accelerator type.
+//! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
+//! \param blockThreadExtent The block thread extent.
+//! \param threadElemExtent The thread element extent.
+//! \param args,... The kernel invocation arguments.
+//! \return The size of the shared memory allocated for a block in bytes.
+//! The default implementation always returns zero.
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc, typename TKernelFnObj, typename TDim, typename... TArgs>
+    ALPAKA_FN_HOST_ACC auto getBlockSharedMemDynSizeBytes(
+        TKernelFnObj const& kernelFnObj,
+        Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
+        Vec<TDim, Idx<TAcc>> const& threadElemExtent,
+        TArgs const&... args) -> std::size_t
+    {
+        return trait::BlockSharedMemDynSizeBytes<TKernelFnObj, TAcc>::getBlockSharedMemDynSizeBytes(
+            kernelFnObj,
+            blockThreadExtent,
+            threadElemExtent,
+            args...);
+    }
+
+    //! \tparam TAcc The accelerator type.
+    //! \tparam TDev The device type.
+    //! \param dev The device instance
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args The kernel invocation arguments.
+    //! \return KernelFunctionAttributes instance as produced by the trait::FunctionAttributes specialisation
+    //! selected for TAcc. The unspecialised trait in this header does not return a value; it throws
+    //! std::invalid_argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc, typename TDev, typename TKernelFnObj, typename... TArgs>
+    ALPAKA_FN_HOST auto getFunctionAttributes(TDev const& dev, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+        -> alpaka::KernelFunctionAttributes
+    {
+        return trait::FunctionAttributes<TAcc, TDev, TKernelFnObj, TArgs...>::getFunctionAttributes(
+            dev,
+            kernelFnObj,
+            std::forward<TArgs>(args)...);
+    }
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
+#endif
+//! \tparam TAcc The accelerator type.
+//! \param kernelFnObj The kernel object for which the OpenMP schedule should be returned.
+//! \param blockThreadExtent The block thread extent.
+//! \param threadElemExtent The thread element extent.
+//! \param args,... The kernel invocation arguments.
+//! \return The OpenMP schedule information as an alpaka::omp::Schedule object if the kernel specialized the
+//!         OmpSchedule trait, an object of another type if the kernel didn't specialize the trait.
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+    template<typename TAcc, typename TKernelFnObj, typename TDim, typename... TArgs>
+    ALPAKA_FN_HOST auto getOmpSchedule(
+        TKernelFnObj const& kernelFnObj,
+        Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
+        Vec<TDim, Idx<TAcc>> const& threadElemExtent,
+        TArgs const&... args)
+    {
+        return trait::OmpSchedule<TKernelFnObj, TAcc>::getOmpSchedule(
+            kernelFnObj,
+            blockThreadExtent,
+            threadElemExtent,
+            args...);
+    }
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
+#endif
+
+
+    //! Check if a type used as kernel argument is trivially copyable
+    //!
+    //! \attention In case this trait is specialized for a user type the user should be sure that the result of calling
+    //! the copy constructor is equal to using memcpy to duplicate the object. An existing destructor should be free
+    //! of side effects.
+    //!
+    //! It is implementation-defined whether the closure type of a lambda is trivially copyable.
+    //! Therefore the default implementation is true for trivially copyable or empty (stateless) types.
+    //!
+    //! @tparam T type to check
+    //! @{
+    template<typename T, typename = void>
+    struct IsKernelArgumentTriviallyCopyable
+        : std::bool_constant<std::is_empty_v<T> || std::is_trivially_copyable_v<T>>
+    {
+    };
+
+    template<typename T>
+    inline constexpr bool isKernelArgumentTriviallyCopyable = IsKernelArgumentTriviallyCopyable<T>::value;
+
+    //! @}
+
+    namespace detail
+    {
+        //! Check at compile time that invoking TKernelFnObj with (TAcc const&, TArgs const&...) returns void
+        template<typename TAcc, typename TSfinae = void>
+        struct CheckFnReturnType
+        {
+            template<typename TKernelFnObj, typename... TArgs>
+            void operator()(TKernelFnObj const&, TArgs const&...)
+            {
+                using Result = std::invoke_result_t<TKernelFnObj, TAcc const&, TArgs const&...>;
+                static_assert(std::is_same_v<Result, void>, "The TKernelFnObj is required to return void!");
+            }
+        };
+
+        // Statically asserts isKernelArgumentTriviallyCopyable<T>. This is a separate function so that we can see
+        // which T would fail the test, when it is called from a fold expression.
+        template<typename T>
+        inline void assertKernelArgIsTriviallyCopyable()
+        {
+            static_assert(isKernelArgumentTriviallyCopyable<T>, "The kernel argument T must be trivially copyable!");
+        }
+    } // namespace detail
+
+    //! Check if the kernel type is trivially copyable
+    //!
+    //! \attention In case this trait is specialized for a user type the user should be sure that the result of calling
+    //! the copy constructor is equal to using memcpy to duplicate the object. An existing destructor should be free
+    //! of side effects.
+    //!
+    //! The default implementation is true for trivially copyable types (and for CUDA extended lambda closure types).
+    //!
+    //! @tparam T type to check
+    //! @{
+    template<typename T, typename = void>
+    struct IsKernelTriviallyCopyable
+#if BOOST_COMP_NVCC
+        : std::bool_constant<
+              std::is_trivially_copyable_v<T> || __nv_is_extended_device_lambda_closure_type(T)
+              || __nv_is_extended_host_device_lambda_closure_type(T)>
+#else
+        : std::is_trivially_copyable<T>
+#endif
+    {
+    };
+
+    template<typename T>
+    inline constexpr bool isKernelTriviallyCopyable = IsKernelTriviallyCopyable<T>::value;
+
+//! @}
+
+//! Creates a kernel execution task after statically validating the kernel, its arguments and the work division.
+//!
+//! \tparam TAcc The accelerator type.
+//! \param workDiv The index domain work division.
+//! \param kernelFnObj The kernel function object which should be executed.
+//! \param args,... The kernel invocation arguments.
+//! \return The kernel execution task.
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+    template<typename TAcc, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+    ALPAKA_FN_HOST auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+    {
+        // check for void return type
+        detail::CheckFnReturnType<TAcc>{}(kernelFnObj, args...);
+
+#if BOOST_COMP_NVCC
+        static_assert(
+            isKernelTriviallyCopyable<TKernelFnObj>,
+            "Kernels must be trivially copyable or an extended CUDA lambda expression!");
+#else
+        static_assert(isKernelTriviallyCopyable<TKernelFnObj>, "Kernels must be trivially copyable!");
+#endif
+        (detail::assertKernelArgIsTriviallyCopyable<std::decay_t<TArgs>>(), ...); // one check per argument type
+        static_assert(
+            Dim<std::decay_t<TWorkDiv>>::value == Dim<TAcc>::value,
+            "The dimensions of TAcc and TWorkDiv have to be identical!");
+        static_assert(
+            std::is_same_v<Idx<std::decay_t<TWorkDiv>>, Idx<TAcc>>,
+            "The idx type of TAcc and the idx type of TWorkDiv have to be identical!");
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+        std::cout << __func__ << " workDiv: " << workDiv << ", kernelFnObj: " << core::demangled<decltype(kernelFnObj)>
+                  << std::endl;
+#endif
+        return trait::CreateTaskKernel<TAcc, TWorkDiv, TKernelFnObj, TArgs...>::createTaskKernel(
+            workDiv,
+            kernelFnObj,
+            std::forward<TArgs>(args)...);
+    }
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored                                                                                  \
+        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
+#endif
+//! Executes the given kernel in the given queue.
+//!
+//! \tparam TAcc The accelerator type.
+//! \param queue The queue to enqueue the kernel execution task into.
+//! \param workDiv The index domain work division.
+//! \param kernelFnObj The kernel function object which should be executed.
+//! \param args,... The kernel invocation arguments.
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+    template<typename TAcc, typename TQueue, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
+    ALPAKA_FN_HOST auto exec(TQueue& queue, TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
+        -> void
+    {
+        enqueue(queue, createTaskKernel<TAcc>(workDiv, kernelFnObj, std::forward<TArgs>(args)...));
+    }
+} // namespace alpaka
diff --git a/include/alpaka/math/Complex.hpp b/include/alpaka/math/Complex.hpp
new file mode 100644
index 0000000..f265c7b
--- /dev/null
+++ b/include/alpaka/math/Complex.hpp
@@ -0,0 +1,582 @@
+/* Copyright 2022 Sergei Bastrakov
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/math/FloatEqualExact.hpp"
+
+#include <cmath>
+#include <complex>
+#include <iostream>
+#include <type_traits>
+
+namespace alpaka
+{
+    namespace internal
+    {
+        //! Implementation of a complex number useable on host and device.
+        //!
+        //! It follows the layout of std::complex and so allows array-oriented access.
+        //! The class template implements all methods and operators as std::complex<T>.
+        //! Additionally, it provides an implicit conversion to and from std::complex<T>.
+        //! All methods besides operators << and >> are host-device.
+        //! It does not provide non-member functions of std::complex besides the operators.
+        //! Those are provided the same way as alpaka math functions for real numbers.
+        //!
+        //! Note that unlike most of alpaka, this is a concrete type template, and not merely a concept.
+        //!
+        //! Naming and order of the methods match https://en.cppreference.com/w/cpp/numeric/complex in C++17.
+        //! Implementation chose to not extend it e.g. by adding constexpr to some places that would get it in C++20.
+        //! The motivation is that with internal conversion to std::complex<T> for CPU backends, it would define the
+        //! common interface for generic code anyways. So it is more clear to have alpaka's interface exactly matching
+        //! when possible, and not "improving".
+        //!
+        //! @tparam T type of the real and imaginary part: float, double, or long double.
+        template<typename T>
+        class Complex
+        {
+        public:
+            // Make sure the input type is floating-point
+            static_assert(std::is_floating_point_v<T>);
+
+            //! Type of the real and imaginary parts
+            using value_type = T;
+
+            //! Constructor from the given real and imaginary parts
+            constexpr ALPAKA_FN_HOST_ACC Complex(T const& real = T{}, T const& imag = T{}) : m_real(real), m_imag(imag)
+            {
+            }
+
+            //! Copy constructor
+            constexpr Complex(Complex const& other) = default;
+
+            //! Constructor from Complex of another type
+            template<typename U>
+            constexpr ALPAKA_FN_HOST_ACC Complex(Complex<U> const& other)
+                : m_real(static_cast<T>(other.real()))
+                , m_imag(static_cast<T>(other.imag()))
+            {
+            }
+
+            //! Constructor from std::complex
+            constexpr ALPAKA_FN_HOST_ACC Complex(std::complex<T> const& other)
+                : m_real(other.real())
+                , m_imag(other.imag())
+            {
+            }
+
+            //! Conversion to std::complex
+            constexpr ALPAKA_FN_HOST_ACC operator std::complex<T>() const
+            {
+                return std::complex<T>{m_real, m_imag};
+            }
+
+            //! Assignment
+            Complex& operator=(Complex const&) = default;
+
+            //! Get the real part
+            constexpr ALPAKA_FN_HOST_ACC T real() const
+            {
+                return m_real;
+            }
+
+            //! Set the real part
+            constexpr ALPAKA_FN_HOST_ACC void real(T value)
+            {
+                m_real = value;
+            }
+
+            //! Get the imaginary part
+            constexpr ALPAKA_FN_HOST_ACC T imag() const
+            {
+                return m_imag;
+            }
+
+            //! Set the imaginary part
+            constexpr ALPAKA_FN_HOST_ACC void imag(T value)
+            {
+                m_imag = value;
+            }
+
+            //! Addition assignment with a real number
+            ALPAKA_FN_HOST_ACC Complex& operator+=(T const& other)
+            {
+                m_real += other;
+                return *this;
+            }
+
+            //! Addition assignment with a complex number
+            template<typename U>
+            ALPAKA_FN_HOST_ACC Complex& operator+=(Complex<U> const& other)
+            {
+                m_real += static_cast<T>(other.real());
+                m_imag += static_cast<T>(other.imag());
+                return *this;
+            }
+
+            //! Subtraction assignment with a real number
+            ALPAKA_FN_HOST_ACC Complex& operator-=(T const& other)
+            {
+                m_real -= other;
+                return *this;
+            }
+
+            //! Subtraction assignment with a complex number
+            template<typename U>
+            ALPAKA_FN_HOST_ACC Complex& operator-=(Complex<U> const& other)
+            {
+                m_real -= static_cast<T>(other.real());
+                m_imag -= static_cast<T>(other.imag());
+                return *this;
+            }
+
+            //! Multiplication assignment with a real number
+            ALPAKA_FN_HOST_ACC Complex& operator*=(T const& other)
+            {
+                m_real *= other;
+                m_imag *= other;
+                return *this;
+            }
+
+            //! Multiplication assignment with a complex number (both parts are computed before any member changes)
+            template<typename U>
+            ALPAKA_FN_HOST_ACC Complex& operator*=(Complex<U> const& other)
+            {
+                auto const newReal = m_real * static_cast<T>(other.real()) - m_imag * static_cast<T>(other.imag());
+                auto const newImag = m_imag * static_cast<T>(other.real()) + m_real * static_cast<T>(other.imag());
+                m_real = newReal;
+                m_imag = newImag;
+                return *this;
+            }
+
+            //! Division assignment with a real number
+            ALPAKA_FN_HOST_ACC Complex& operator/=(T const& other)
+            {
+                m_real /= other;
+                m_imag /= other;
+                return *this;
+            }
+
+            //! Division assignment with a complex number, done as multiplication by conj(other) / |other|^2
+            template<typename U>
+            ALPAKA_FN_HOST_ACC Complex& operator/=(Complex<U> const& other)
+            {
+                return *this *= Complex{
+                           static_cast<T>(other.real() / (other.real() * other.real() + other.imag() * other.imag())),
+                           static_cast<T>(
+                               -other.imag() / (other.real() * other.real() + other.imag() * other.imag()))};
+            }
+
+        private:
+            //! Real and imaginary parts, storage enables array-oriented access
+            T m_real, m_imag;
+        };
+
+        //! Host-device arithmetic operations matching std::complex<T>.
+        //!
+        //! They take and return alpaka::Complex.
+        //!
+        //! @{
+        //!
+
+        //! Unary plus (added for compatibility with std::complex)
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& val)
+        {
+            return val;
+        }
+
+        //! Unary minus
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& val)
+        {
+            return Complex<T>{-val.real(), -val.imag()};
+        }
+
+        //! Addition of two complex numbers
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& lhs, Complex<T> const& rhs)
+        {
+            return Complex<T>{lhs.real() + rhs.real(), lhs.imag() + rhs.imag()};
+        }
+
+        //! Addition of a complex and a real number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& lhs, T const& rhs)
+        {
+            return Complex<T>{lhs.real() + rhs, lhs.imag()};
+        }
+
+        //! Addition of a real and a complex number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator+(T const& lhs, Complex<T> const& rhs)
+        {
+            return Complex<T>{lhs + rhs.real(), rhs.imag()};
+        }
+
+        //! Subtraction of two complex numbers
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& lhs, Complex<T> const& rhs)
+        {
+            return Complex<T>{lhs.real() - rhs.real(), lhs.imag() - rhs.imag()};
+        }
+
+        //! Subtraction of a complex and a real number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& lhs, T const& rhs)
+        {
+            return Complex<T>{lhs.real() - rhs, lhs.imag()};
+        }
+
+        //! Subtraction of a real and a complex number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator-(T const& lhs, Complex<T> const& rhs)
+        {
+            return Complex<T>{lhs - rhs.real(), -rhs.imag()};
+        }
+
+        //! Multiplication of two complex numbers
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator*(Complex<T> const& lhs, Complex<T> const& rhs)
+        {
+            return Complex<T>{
+                lhs.real() * rhs.real() - lhs.imag() * rhs.imag(),
+                lhs.imag() * rhs.real() + lhs.real() * rhs.imag()};
+        }
+
+        //! Multiplication of a complex and a real number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator*(Complex<T> const& lhs, T const& rhs)
+        {
+            return Complex<T>{lhs.real() * rhs, lhs.imag() * rhs};
+        }
+
+        //! Multiplication of a real and a complex number
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator*(T const& lhs, Complex<T> const& rhs)
+        {
+            return Complex<T>{lhs * rhs.real(), lhs * rhs.imag()};
+        }
+
+        //! Division of two complex numbers, computed as lhs * conj(rhs) / |rhs|^2
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator/(Complex<T> const& lhs, Complex<T> const& rhs)
+        {
+            // Both components share the squared magnitude of rhs as divisor
+            T const divisor{rhs.real() * rhs.real() + rhs.imag() * rhs.imag()};
+            T const realPart{(lhs.real() * rhs.real() + lhs.imag() * rhs.imag()) / divisor};
+            T const imagPart{(lhs.imag() * rhs.real() - lhs.real() * rhs.imag()) / divisor};
+            return Complex<T>{realPart, imagPart};
+        }
+
+        //! Division of a complex number by a real number: both components are divided by rhs
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator/(Complex<T> const& lhs, T const& rhs)
+        {
+            return Complex<T>{lhs.real() / rhs, lhs.imag() / rhs};
+        }
+
+        //! Division of a real number by a complex number, computed as lhs * conj(rhs) / |rhs|^2
+        template<typename T>
+        ALPAKA_FN_HOST_ACC Complex<T> operator/(T const& lhs, Complex<T> const& rhs)
+        {
+            // The squared magnitude of rhs is the divisor of both components
+            T const divisor{rhs.real() * rhs.real() + rhs.imag() * rhs.imag()};
+            return Complex<T>{lhs * rhs.real() / divisor, -lhs * rhs.imag() / divisor};
+        }
+
+        //! Exact equality of two complex numbers: real and imaginary parts must both compare equal
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator==(Complex<T> const& lhs, Complex<T> const& rhs)
+        {
+            bool const realMatches = math::floatEqualExactNoWarning(lhs.real(), rhs.real());
+            return realMatches && math::floatEqualExactNoWarning(lhs.imag(), rhs.imag());
+        }
+
+        //! Exact equality of a complex and a real number: the imaginary part has to be zero
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator==(Complex<T> const& lhs, T const& rhs)
+        {
+            bool const realMatches = math::floatEqualExactNoWarning(lhs.real(), rhs);
+            return realMatches && math::floatEqualExactNoWarning(lhs.imag(), static_cast<T>(0));
+        }
+
+        //! Exact equality of a real and a complex number: the imaginary part has to be zero
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator==(T const& lhs, Complex<T> const& rhs)
+        {
+            bool const realMatches = math::floatEqualExactNoWarning(lhs, rhs.real());
+            return realMatches && math::floatEqualExactNoWarning(static_cast<T>(0), rhs.imag());
+        }
+
+        //! Inequality of two complex numbers, defined as the negation of operator==.
+        //!
+        //! @note this and the other overloads of operator!= can be removed once C++20 is required, as std::complex did
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator!=(Complex<T> const& lhs, Complex<T> const& rhs)
+        {
+            return !(lhs == rhs);
+        }
+
+        //! Inequality of a complex and a real number: true unless the real parts match and the imaginary part is zero
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator!=(Complex<T> const& lhs, T const& rhs)
+        {
+            return !(math::floatEqualExactNoWarning(lhs.real(), rhs)
+                     && math::floatEqualExactNoWarning(lhs.imag(), static_cast<T>(0)));
+        }
+
+        //! Inequality of a real and a complex number: true unless the real parts match and the imaginary part is zero
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC bool operator!=(T const& lhs, Complex<T> const& rhs)
+        {
+            return !(math::floatEqualExactNoWarning(lhs, rhs.real())
+                     && math::floatEqualExactNoWarning(static_cast<T>(0), rhs.imag()));
+        }
+
+        //! @}
+
+        //! Host-only output of a complex number, written in the format of std::complex<T>
+        template<typename T, typename TChar, typename TTraits>
+        std::basic_ostream<TChar, TTraits>& operator<<(std::basic_ostream<TChar, TTraits>& os, Complex<T> const& x)
+        {
+            // Delegate the formatting to std::complex and hand back the same stream
+            return os << static_cast<std::complex<T>>(x);
+        }
+
+        //! Host-only input of a complex number: reads a std::complex<T> from the stream and assigns it to x
+        template<typename T, typename TChar, typename TTraits>
+        std::basic_istream<TChar, TTraits>& operator>>(std::basic_istream<TChar, TTraits>& is, Complex<T>& x)
+        {
+            std::complex<T> z;
+            is >> z;
+            x = z; // x is the extraction target, so it must be a mutable (non-const) reference
+            return is;
+        }
+
+        //! Host-only math functions matching std::complex<T>.
+        //!
+        //! Due to issue #1688, these functions are technically marked host-device and suppress related warnings.
+        //! However, they must be called for host only.
+        //!
+        //! They take and return alpaka::Complex (or a real number when appropriate).
+        //! Internally cast, fall back to std::complex implementation and cast back.
+        //! These functions can be used directly on the host side.
+        //! They are also picked up by ADL in math traits for CPU backends.
+        //!
+        //! On the device side, alpaka math traits must be used instead.
+        //! Note that the set of the traits is currently a bit smaller.
+        //!
+        //! @{
+        //!
+
+        //! Absolute value (magnitude), evaluated by std::abs on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC T abs(Complex<T> const& x)
+        {
+            return std::abs(static_cast<std::complex<T>>(x));
+        }
+
+        //! Arc cosine, evaluated by std::acos on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> acos(Complex<T> const& x)
+        {
+            return std::acos(static_cast<std::complex<T>>(x));
+        }
+
+        //! Arc hyperbolic cosine, evaluated by std::acosh on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> acosh(Complex<T> const& x)
+        {
+            return std::acosh(static_cast<std::complex<T>>(x));
+        }
+
+        //! Argument (phase angle), evaluated by std::arg on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC T arg(Complex<T> const& x)
+        {
+            return std::arg(static_cast<std::complex<T>>(x));
+        }
+
+        //! Arc sine, evaluated by std::asin on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> asin(Complex<T> const& x)
+        {
+            return std::asin(static_cast<std::complex<T>>(x));
+        }
+
+        //! Arc hyperbolic sine, evaluated by std::asinh on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> asinh(Complex<T> const& x)
+        {
+            return std::asinh(static_cast<std::complex<T>>(x));
+        }
+
+        //! Arc tangent, evaluated by std::atan on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> atan(Complex<T> const& x)
+        {
+            return std::atan(static_cast<std::complex<T>>(x));
+        }
+
+        //! Arc hyperbolic tangent, evaluated by std::atanh on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> atanh(Complex<T> const& x)
+        {
+            return std::atanh(static_cast<std::complex<T>>(x));
+        }
+
+        //! Complex conjugate, evaluated by std::conj on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> conj(Complex<T> const& x)
+        {
+            return std::conj(static_cast<std::complex<T>>(x));
+        }
+
+        //! Cosine, evaluated by std::cos on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> cos(Complex<T> const& x)
+        {
+            return std::cos(static_cast<std::complex<T>>(x));
+        }
+
+        //! Hyperbolic cosine, evaluated by std::cosh on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> cosh(Complex<T> const& x)
+        {
+            return std::cosh(static_cast<std::complex<T>>(x));
+        }
+
+        //! Exponential, evaluated by std::exp on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> exp(Complex<T> const& x)
+        {
+            return std::exp(static_cast<std::complex<T>>(x));
+        }
+
+        //! Natural logarithm, evaluated by std::log on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> log(Complex<T> const& x)
+        {
+            return std::log(static_cast<std::complex<T>>(x));
+        }
+
+        //! Base 10 logarithm, evaluated by std::log10 on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> log10(Complex<T> const& x)
+        {
+            return std::log10(static_cast<std::complex<T>>(x));
+        }
+
+        //! Squared magnitude, evaluated by std::norm on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC T norm(Complex<T> const& x)
+        {
+            return std::norm(static_cast<std::complex<T>>(x));
+        }
+
+        //! Get a complex number with magnitude r and phase angle theta (theta defaults to T()); delegates to std::polar
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> polar(T const& r, T const& theta = T())
+        {
+            return std::polar(r, theta);
+        }
+
+        //! Complex power of a complex number; the value type of the result follows std::pow
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, typename U>
+        constexpr ALPAKA_FN_HOST_ACC auto pow(Complex<T> const& x, Complex<U> const& y)
+        {
+            // The value type of the result is whatever std::pow promotes T and U to
+            auto const powered = std::pow(static_cast<std::complex<T>>(x), static_cast<std::complex<U>>(y));
+            using Promoted = typename decltype(powered)::value_type;
+            return Complex<Promoted>(powered);
+        }
+
+        //! Real power of a complex number: wraps the exponent y in Complex<U> and forwards to pow(x, Complex<U>(y))
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, typename U>
+        constexpr ALPAKA_FN_HOST_ACC auto pow(Complex<T> const& x, U const& y)
+        {
+            return pow(x, Complex<U>(y));
+        }
+
+        //! Complex power of a real number: wraps the base x in Complex<T> and forwards to pow(Complex<T>(x), y)
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, typename U>
+        constexpr ALPAKA_FN_HOST_ACC auto pow(T const& x, Complex<U> const& y)
+        {
+            return pow(Complex<T>(x), y);
+        }
+
+        //! Projection onto the Riemann sphere, evaluated by std::proj on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> proj(Complex<T> const& x)
+        {
+            return std::proj(static_cast<std::complex<T>>(x));
+        }
+
+        //! Sine, evaluated by std::sin on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> sin(Complex<T> const& x)
+        {
+            return std::sin(static_cast<std::complex<T>>(x));
+        }
+
+        //! Hyperbolic sine, evaluated by std::sinh on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> sinh(Complex<T> const& x)
+        {
+            return std::sinh(static_cast<std::complex<T>>(x));
+        }
+
+        //! Square root, evaluated by std::sqrt on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> sqrt(Complex<T> const& x)
+        {
+            return std::sqrt(static_cast<std::complex<T>>(x));
+        }
+
+        //! Tangent, evaluated by std::tan on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> tan(Complex<T> const& x)
+        {
+            return std::tan(static_cast<std::complex<T>>(x));
+        }
+
+        //! Hyperbolic tangent, evaluated by std::tanh on the std::complex<T> equivalent of x
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T>
+        constexpr ALPAKA_FN_HOST_ACC Complex<T> tanh(Complex<T> const& x)
+        {
+            return std::tanh(static_cast<std::complex<T>>(x));
+        }
+
+        //! @}
+    } // namespace internal
+
+    using internal::Complex;
+} // namespace alpaka
diff --git a/include/alpaka/math/FloatEqualExact.hpp b/include/alpaka/math/FloatEqualExact.hpp
new file mode 100644
index 0000000..8c252b4
--- /dev/null
+++ b/include/alpaka/math/FloatEqualExact.hpp
@@ -0,0 +1,50 @@
+/* Copyright 2021 Jiri Vyskocil
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    namespace math
+    {
+        /** Compare two floating point numbers for exact equivalence. Use only when necessary, and be aware of the
+         * implications. Most codes should not use this function and instead implement a correct epsilon-based
+         * comparison. If you are unfamiliar with the topic, check out
+         * https://www.geeksforgeeks.org/problem-in-comparing-floating-point-numbers-and-how-to-compare-them-correctly/
+         * or Goldberg 1991: "What every computer scientist should know about floating-point arithmetic",
+         * https://dl.acm.org/doi/10.1145/103162.103163
+         *
+         * This function calls the == operator for floating point types, but disables the warning issued by the
+         * compiler when compiling with the float equality warning checks enabled. This warning is valid and valuable in
+         * most codes and should be generally enabled, but there are specific instances where a piece of code might
+         * need to do an exact comparison (e.g. @a CudaVectorArrayWrapperTest.cpp). The verbose name for the function
+         * is intentional as it should raise a red flag if used while not absolutely needed. Users are advised to add a
+         * justification whenever they use this function.
+         *
+         * @tparam T both operands have to be the same type and conform to std::is_floating_point
+         * @param a first operand
+         * @param b second operand
+         * @return a == b
+         */
+        template<typename T>
+        ALPAKA_FN_INLINE ALPAKA_FN_HOST_ACC auto floatEqualExactNoWarning(T a, T b) -> bool
+        {
+            static_assert(std::is_floating_point_v<T>, "floatEqualExactNoWarning is for floating point values only!");
+
+            // So far only GCC and Clang check for float comparison and both accept the GCC pragmas.
+#ifdef __GNUC__
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+            return a == b;
+#ifdef __GNUC__
+#    pragma GCC diagnostic pop
+#endif
+        }
+    } // namespace math
+} // namespace alpaka
diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp
new file mode 100644
index 0000000..086c480
--- /dev/null
+++ b/include/alpaka/math/MathGenericSycl.hpp
@@ -0,0 +1,751 @@
+/* Copyright 2023 Jan Stephan, Sergei Bastrakov, René Widera, Luca Ferragina, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/math/Complex.hpp"
+#include "alpaka/math/Traits.hpp"
+
+#include <type_traits>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+//! The mathematical operation specifics.
+namespace alpaka::math
+{
+    //! The SYCL abs.
+    class AbsGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAbs, AbsGenericSycl>
+    {
+    };
+
+    //! The SYCL acos.
+    class AcosGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAcos, AcosGenericSycl>
+    {
+    };
+
+    //! The SYCL acosh.
+    class AcoshGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAcosh, AcoshGenericSycl>
+    {
+    };
+
+    //! The SYCL arg.
+    class ArgGenericSycl : public concepts::Implements<alpaka::math::ConceptMathArg, ArgGenericSycl>
+    {
+    };
+
+    //! The SYCL asin.
+    class AsinGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAsin, AsinGenericSycl>
+    {
+    };
+
+    //! The SYCL asinh.
+    class AsinhGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAsinh, AsinhGenericSycl>
+    {
+    };
+
+    //! The SYCL atan.
+    class AtanGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAtan, AtanGenericSycl>
+    {
+    };
+
+    //! The SYCL atanh.
+    class AtanhGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAtanh, AtanhGenericSycl>
+    {
+    };
+
+    //! The SYCL atan2.
+    class Atan2GenericSycl : public concepts::Implements<alpaka::math::ConceptMathAtan2, Atan2GenericSycl>
+    {
+    };
+
+    //! The SYCL cbrt.
+    class CbrtGenericSycl : public concepts::Implements<alpaka::math::ConceptMathCbrt, CbrtGenericSycl>
+    {
+    };
+
+    //! The SYCL ceil.
+    class CeilGenericSycl : public concepts::Implements<alpaka::math::ConceptMathCeil, CeilGenericSycl>
+    {
+    };
+
+    //! The SYCL conj.
+    class ConjGenericSycl : public concepts::Implements<alpaka::math::ConceptMathConj, ConjGenericSycl>
+    {
+    };
+
+    //! The SYCL copysign.
+    class CopysignGenericSycl : public concepts::Implements<alpaka::math::ConceptMathCopysign, CopysignGenericSycl>
+    {
+    };
+
+    //! The SYCL cos.
+    class CosGenericSycl : public concepts::Implements<alpaka::math::ConceptMathCos, CosGenericSycl>
+    {
+    };
+
+    //! The SYCL cosh.
+    class CoshGenericSycl : public concepts::Implements<alpaka::math::ConceptMathCosh, CoshGenericSycl>
+    {
+    };
+
+    //! The SYCL erf.
+    class ErfGenericSycl : public concepts::Implements<alpaka::math::ConceptMathErf, ErfGenericSycl>
+    {
+    };
+
+    //! The SYCL exp.
+    class ExpGenericSycl : public concepts::Implements<alpaka::math::ConceptMathExp, ExpGenericSycl>
+    {
+    };
+
+    //! The SYCL floor.
+    class FloorGenericSycl : public concepts::Implements<alpaka::math::ConceptMathFloor, FloorGenericSycl>
+    {
+    };
+
+    //! The SYCL fma.
+    class FmaGenericSycl : public concepts::Implements<alpaka::math::ConceptMathFma, FmaGenericSycl>
+    {
+    };
+
+    //! The SYCL fmod.
+    class FmodGenericSycl : public concepts::Implements<alpaka::math::ConceptMathFmod, FmodGenericSycl>
+    {
+    };
+
+    //! The SYCL isfinite.
+    class IsfiniteGenericSycl : public concepts::Implements<alpaka::math::ConceptMathIsfinite, IsfiniteGenericSycl>
+    {
+    };
+
+    //! The SYCL isinf.
+    class IsinfGenericSycl : public concepts::Implements<alpaka::math::ConceptMathIsinf, IsinfGenericSycl>
+    {
+    };
+
+    //! The SYCL isnan.
+    class IsnanGenericSycl : public concepts::Implements<alpaka::math::ConceptMathIsnan, IsnanGenericSycl>
+    {
+    };
+
+    //! The SYCL log.
+    class LogGenericSycl : public concepts::Implements<alpaka::math::ConceptMathLog, LogGenericSycl>
+    {
+    };
+
+    //! The SYCL log2.
+    class Log2GenericSycl : public concepts::Implements<alpaka::math::ConceptMathLog2, Log2GenericSycl>
+    {
+    };
+
+    //! The SYCL log10.
+    class Log10GenericSycl : public concepts::Implements<alpaka::math::ConceptMathLog10, Log10GenericSycl>
+    {
+    };
+
+    //! The SYCL max.
+    class MaxGenericSycl : public concepts::Implements<alpaka::math::ConceptMathMax, MaxGenericSycl>
+    {
+    };
+
+    //! The SYCL min.
+    class MinGenericSycl : public concepts::Implements<alpaka::math::ConceptMathMin, MinGenericSycl>
+    {
+    };
+
+    //! The SYCL pow.
+    class PowGenericSycl : public concepts::Implements<alpaka::math::ConceptMathPow, PowGenericSycl>
+    {
+    };
+
+    //! The SYCL remainder.
+    class RemainderGenericSycl : public concepts::Implements<alpaka::math::ConceptMathRemainder, RemainderGenericSycl>
+    {
+    };
+
+    //! The SYCL round.
+    class RoundGenericSycl : public concepts::Implements<alpaka::math::ConceptMathRound, RoundGenericSycl>
+    {
+    };
+
+    //! The SYCL rsqrt.
+    class RsqrtGenericSycl : public concepts::Implements<alpaka::math::ConceptMathRsqrt, RsqrtGenericSycl>
+    {
+    };
+
+    //! The SYCL sin.
+    class SinGenericSycl : public concepts::Implements<alpaka::math::ConceptMathSin, SinGenericSycl>
+    {
+    };
+
+    //! The SYCL sinh.
+    class SinhGenericSycl : public concepts::Implements<alpaka::math::ConceptMathSinh, SinhGenericSycl>
+    {
+    };
+
+    //! The SYCL sincos.
+    class SinCosGenericSycl : public concepts::Implements<alpaka::math::ConceptMathSinCos, SinCosGenericSycl>
+    {
+    };
+
+    //! The SYCL sqrt.
+    class SqrtGenericSycl : public concepts::Implements<alpaka::math::ConceptMathSqrt, SqrtGenericSycl>
+    {
+    };
+
+    //! The SYCL tan.
+    class TanGenericSycl : public concepts::Implements<alpaka::math::ConceptMathTan, TanGenericSycl>
+    {
+    };
+
+    //! The SYCL tanh.
+    class TanhGenericSycl : public concepts::Implements<alpaka::math::ConceptMathTanh, TanhGenericSycl>
+    {
+    };
+
+    //! The SYCL trunc.
+    class TruncGenericSycl : public concepts::Implements<alpaka::math::ConceptMathTrunc, TruncGenericSycl>
+    {
+    };
+
+    //! The SYCL math trait specializations.
+    class MathGenericSycl
+        : public AbsGenericSycl
+        , public AcosGenericSycl
+        , public AcoshGenericSycl
+        , public ArgGenericSycl
+        , public AsinGenericSycl
+        , public AsinhGenericSycl
+        , public AtanGenericSycl
+        , public AtanhGenericSycl
+        , public Atan2GenericSycl
+        , public CbrtGenericSycl
+        , public CeilGenericSycl
+        , public ConjGenericSycl
+        , public CopysignGenericSycl
+        , public CosGenericSycl
+        , public CoshGenericSycl
+        , public ErfGenericSycl
+        , public ExpGenericSycl
+        , public FloorGenericSycl
+        , public FmaGenericSycl
+        , public FmodGenericSycl
+        , public IsfiniteGenericSycl
+        , public IsinfGenericSycl
+        , public IsnanGenericSycl
+        , public LogGenericSycl
+        , public Log2GenericSycl
+        , public Log10GenericSycl
+        , public MaxGenericSycl
+        , public MinGenericSycl
+        , public PowGenericSycl
+        , public RemainderGenericSycl
+        , public RoundGenericSycl
+        , public RsqrtGenericSycl
+        , public SinGenericSycl
+        , public SinhGenericSycl
+        , public SinCosGenericSycl
+        , public SqrtGenericSycl
+        , public TanGenericSycl
+        , public TanhGenericSycl
+        , public TruncGenericSycl
+    {
+    };
+} // namespace alpaka::math
+
+namespace alpaka::math::trait
+{
+    //! The SYCL abs trait specialization: sycl::fabs for floating-point and sycl::abs for integral arguments.
+    template<typename TArg>
+    struct Abs<math::AbsGenericSycl, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
+    {
+        auto operator()(math::AbsGenericSycl const&, TArg const& arg)
+        {
+            if constexpr(std::is_floating_point_v<TArg>)
+                return sycl::fabs(arg);
+            else if constexpr(std::is_integral_v<TArg>)
+                return sycl::abs(arg);
+            else
+                static_assert(!sizeof(TArg), "Unsupported data type");
+        }
+    };
+
+    //! The SYCL acos trait specialization: forwards floating-point arguments to sycl::acos.
+    template<typename TArg>
+    struct Acos<math::AcosGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::AcosGenericSycl const&, TArg const& arg)
+        {
+            return sycl::acos(arg);
+        }
+    };
+
+    //! The SYCL acosh trait specialization: forwards floating-point arguments to sycl::acosh.
+    template<typename TArg>
+    struct Acosh<math::AcoshGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::AcoshGenericSycl const&, TArg const& arg)
+        {
+            return sycl::acosh(arg);
+        }
+    };
+
+    //! The SYCL arg trait specialization for real numbers: atan2(0, argument), with integers evaluated as double.
+    template<typename TArgument>
+    struct Arg<math::ArgGenericSycl, TArgument, std::enable_if_t<std::is_arithmetic_v<TArgument>>>
+    {
+        auto operator()(math::ArgGenericSycl const&, TArgument const& argument)
+        {
+            if constexpr(std::is_floating_point_v<TArgument>)
+                return sycl::atan2(static_cast<TArgument>(0.0), argument);
+            else if constexpr(std::is_integral_v<TArgument>)
+                return sycl::atan2(0.0, static_cast<double>(argument));
+            else
+                static_assert(!sizeof(TArgument), "Unsupported data type");
+        }
+    };
+
+    //! The SYCL asin trait specialization: forwards floating-point arguments to sycl::asin.
+    template<typename TArg>
+    struct Asin<math::AsinGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::AsinGenericSycl const&, TArg const& arg)
+        {
+            return sycl::asin(arg);
+        }
+    };
+
+    //! The SYCL asinh trait specialization: forwards floating-point arguments to sycl::asinh.
+    template<typename TArg>
+    struct Asinh<math::AsinhGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::AsinhGenericSycl const&, TArg const& arg)
+        {
+            return sycl::asinh(arg);
+        }
+    };
+
+    //! The SYCL atan trait specialization: forwards floating-point arguments to sycl::atan.
+    template<typename TArg>
+    struct Atan<math::AtanGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::AtanGenericSycl const&, TArg const& arg)
+        {
+            return sycl::atan(arg);
+        }
+    };
+
+    //! The SYCL atanh trait specialization: forwards floating-point arguments to sycl::atanh.
+    template<typename TArg>
+    struct Atanh<math::AtanhGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::AtanhGenericSycl const&, TArg const& arg)
+        {
+            return sycl::atanh(arg);
+        }
+    };
+
+    //! The SYCL atan2 trait specialization: both floating-point operands are cast to their common type first.
+    template<typename Ty, typename Tx>
+    struct Atan2<
+        math::Atan2GenericSycl,
+        Ty,
+        Tx,
+        std::enable_if_t<std::is_floating_point_v<Ty> && std::is_floating_point_v<Tx>>>
+    {
+        using TCommon = std::common_type_t<Ty, Tx>;
+
+        auto operator()(math::Atan2GenericSycl const&, Ty const& y, Tx const& x)
+        {
+            return sycl::atan2(static_cast<TCommon>(y), static_cast<TCommon>(x));
+        }
+    };
+
+    //! The SYCL cbrt trait specialization: floating-point values are passed through, integers are widened to double.
+    template<typename TArg>
+    struct Cbrt<math::CbrtGenericSycl, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
+    {
+        auto operator()(math::CbrtGenericSycl const&, TArg const& arg)
+        {
+            if constexpr(std::is_floating_point_v<TArg>)
+                return sycl::cbrt(arg);
+            else if constexpr(std::is_integral_v<TArg>)
+                return sycl::cbrt(static_cast<double>(arg)); // Mirror CUDA back-end and use double for ints
+            else
+                static_assert(!sizeof(TArg), "Unsupported data type");
+        }
+    };
+
+    //! The SYCL ceil trait specialization: forwards floating-point arguments to sycl::ceil.
+    template<typename TArg>
+    struct Ceil<math::CeilGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::CeilGenericSycl const&, TArg const& arg)
+        {
+            return sycl::ceil(arg);
+        }
+    };
+
+    //! The SYCL conj trait specialization for real values: returns arg as Complex<TArg> with zero imaginary part.
+    template<typename TArg>
+    struct Conj<math::ConjGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::ConjGenericSycl const&, TArg const& arg)
+        {
+            return Complex<TArg>{arg, TArg{0.0}};
+        }
+    };
+
+    //! The SYCL copysign trait specialization: magnitude (y) and sign (x) operands are cast to their common type.
+    template<typename TMag, typename TSgn>
+    struct Copysign<
+        math::CopysignGenericSycl,
+        TMag,
+        TSgn,
+        std::enable_if_t<std::is_floating_point_v<TMag> && std::is_floating_point_v<TSgn>>>
+    {
+        using TCommon = std::common_type_t<TMag, TSgn>;
+
+        auto operator()(math::CopysignGenericSycl const&, TMag const& y, TSgn const& x)
+        {
+            return sycl::copysign(static_cast<TCommon>(y), static_cast<TCommon>(x));
+        }
+    };
+
+    //! The SYCL cos trait specialization: forwards floating-point arguments to sycl::cos.
+    template<typename TArg>
+    struct Cos<math::CosGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::CosGenericSycl const&, TArg const& arg)
+        {
+            return sycl::cos(arg);
+        }
+    };
+
+    //! The SYCL cosh trait specialization: forwards floating-point arguments to sycl::cosh.
+    template<typename TArg>
+    struct Cosh<math::CoshGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::CoshGenericSycl const&, TArg const& arg)
+        {
+            return sycl::cosh(arg);
+        }
+    };
+
+    //! The SYCL erf trait specialization: forwards floating-point arguments to sycl::erf.
+    template<typename TArg>
+    struct Erf<math::ErfGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::ErfGenericSycl const&, TArg const& arg)
+        {
+            return sycl::erf(arg);
+        }
+    };
+
+    //! The SYCL exp trait specialization: forwards floating-point arguments to sycl::exp.
+    template<typename TArg>
+    struct Exp<math::ExpGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::ExpGenericSycl const&, TArg const& arg)
+        {
+            return sycl::exp(arg);
+        }
+    };
+
+    //! The SYCL floor trait specialization: forwards floating-point arguments to sycl::floor.
+    template<typename TArg>
+    struct Floor<math::FloorGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::FloorGenericSycl const&, TArg const& arg)
+        {
+            return sycl::floor(arg);
+        }
+    };
+
+    //! The SYCL fma trait specialization: computes x * y + z after casting all operands to their common type.
+    template<typename Tx, typename Ty, typename Tz>
+    struct Fma<
+        math::FmaGenericSycl,
+        Tx, Ty, Tz,
+        std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty> && std::is_floating_point_v<Tz>>>
+    {
+        using TCommon = std::common_type_t<Tx, Ty, Tz>;
+
+        auto operator()(math::FmaGenericSycl const&, Tx const& x, Ty const& y, Tz const& z)
+        {
+            return sycl::fma(static_cast<TCommon>(x), static_cast<TCommon>(y), static_cast<TCommon>(z));
+        }
+    };
+
+    //! The SYCL fmod trait specialization: both floating-point operands are cast to their common type first.
+    template<typename Tx, typename Ty>
+    struct Fmod<
+        math::FmodGenericSycl,
+        Tx,
+        Ty,
+        std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
+    {
+        using TCommon = std::common_type_t<Tx, Ty>;
+
+        auto operator()(math::FmodGenericSycl const&, Tx const& x, Ty const& y)
+        {
+            return sycl::fmod(static_cast<TCommon>(x), static_cast<TCommon>(y));
+        }
+    };
+
+    //! The SYCL isfinite trait specialization: the result of sycl::isfinite is converted to bool.
+    template<typename TArg>
+    struct Isfinite<math::IsfiniteGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::IsfiniteGenericSycl const&, TArg const& arg)
+        {
+            return static_cast<bool>(sycl::isfinite(arg));
+        }
+    };
+
+    //! The SYCL isinf trait specialization: the result of sycl::isinf is converted to bool.
+    template<typename TArg>
+    struct Isinf<math::IsinfGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::IsinfGenericSycl const&, TArg const& arg)
+        {
+            return static_cast<bool>(sycl::isinf(arg));
+        }
+    };
+
+    //! The SYCL isnan trait specialization: the result of sycl::isnan is converted to bool.
+    template<typename TArg>
+    struct Isnan<math::IsnanGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::IsnanGenericSycl const&, TArg const& arg)
+        {
+            return static_cast<bool>(sycl::isnan(arg));
+        }
+    };
+
+    //! The SYCL log trait specialization: forwards floating-point arguments to sycl::log.
+    template<typename TArg>
+    struct Log<math::LogGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::LogGenericSycl const&, TArg const& arg)
+        {
+            return sycl::log(arg);
+        }
+    };
+
+    //! The SYCL log2 trait specialization: forwards floating-point arguments to sycl::log2.
+    template<typename TArg>
+    struct Log2<math::Log2GenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::Log2GenericSycl const&, TArg const& arg)
+        {
+            return sycl::log2(arg);
+        }
+    };
+
+    //! The SYCL log10 trait specialization: forwards floating-point arguments to sycl::log10.
+    template<typename TArg>
+    struct Log10<math::Log10GenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::Log10GenericSycl const&, TArg const& arg)
+        {
+            return sycl::log10(arg);
+        }
+    };
+
+    //! The SYCL max trait specialization: same-kind operands use their common type, mixed operands use double.
+    template<typename Tx, typename Ty>
+    struct Max<math::MaxGenericSycl, Tx, Ty, std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
+    {
+        using TCommon = std::common_type_t<Tx, Ty>;
+
+        auto operator()(math::MaxGenericSycl const&, Tx const& x, Ty const& y)
+        {
+            if constexpr(std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>)
+                return sycl::fmax(static_cast<TCommon>(x), static_cast<TCommon>(y));
+            else if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
+                return sycl::max(static_cast<TCommon>(x), static_cast<TCommon>(y));
+            else if constexpr(
+                (std::is_floating_point_v<Tx> && std::is_integral_v<Ty>)
+                || (std::is_integral_v<Tx> && std::is_floating_point_v<Ty>) )
+                return sycl::fmax(static_cast<double>(x), static_cast<double>(y)); // mirror CUDA back-end
+            else
+                static_assert(!sizeof(Tx), "Unsupported data types");
+        }
+    };
+
+    //! The SYCL min trait specialization; operands are brought to one type first, like the max trait does.
+    template<typename Tx, typename Ty>
+    struct Min<math::MinGenericSycl, Tx, Ty, std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
+    {
+        auto operator()(math::MinGenericSycl const& /* min_ctx */, Tx const& x, Ty const& y)
+        {
+            using TCommon = std::common_type_t<Tx, Ty>; // sycl::min / sycl::fmin take both operands as one type
+            if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
+                return sycl::min(static_cast<TCommon>(x), static_cast<TCommon>(y));
+            else if constexpr(std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>)
+                return sycl::fmin(static_cast<TCommon>(x), static_cast<TCommon>(y));
+            else if constexpr(
+                std::is_floating_point_v<Tx> != std::is_floating_point_v<Ty>) // mixed integer / floating point
+                return sycl::fmin(static_cast<double>(x), static_cast<double>(y)); // mirror CUDA back-end
+            else
+                static_assert(!sizeof(Tx), "Unsupported data types");
+        }
+    };
+
+    //! SYCL specialization of the pow trait for a floating-point base and exponent.
+    template<typename TBase, typename TExp>
+    struct Pow<
+        math::PowGenericSycl,
+        TBase,
+        TExp,
+        std::enable_if_t<std::is_floating_point_v<TBase> && std::is_floating_point_v<TExp>>>
+    {
+        using TCommon = std::common_type_t<TBase, TExp>;
+
+        auto operator()(math::PowGenericSycl const& /* pow_ctx */, TBase const& base, TExp const& exponent)
+        {
+            return sycl::pow(static_cast<TCommon>(base), static_cast<TCommon>(exponent)); // one common type
+        }
+    };
+
+    //! SYCL specialization of the remainder trait for floating-point operands.
+    template<typename Tx, typename Ty>
+    struct Remainder<
+        math::RemainderGenericSycl,
+        Tx,
+        Ty,
+        std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
+    {
+        using TCommon = std::common_type_t<Tx, Ty>;
+
+        auto operator()(math::RemainderGenericSycl const& /* remainder_ctx */, Tx const& lhs, Ty const& rhs)
+        {
+            return sycl::remainder(static_cast<TCommon>(lhs), static_cast<TCommon>(rhs)); // one common type
+        }
+    };
+
+    //! SYCL specialization of the round trait for floating-point arguments.
+    template<typename TArg>
+    struct Round<math::RoundGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::RoundGenericSycl const& /* round_ctx */, TArg const& value)
+        {
+            return sycl::round(value);
+        }
+    };
+
+    //! SYCL specialization of the lround trait: sycl::round followed by a conversion to long.
+    template<typename TArg>
+    struct Lround<math::RoundGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::RoundGenericSycl const& /* round_ctx */, TArg const& value)
+        {
+            return static_cast<long>(sycl::round(value));
+        }
+    };
+
+    //! SYCL specialization of the llround trait: sycl::round followed by a conversion to long long.
+    template<typename TArg>
+    struct Llround<math::RoundGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::RoundGenericSycl const& /* round_ctx */, TArg const& value)
+        {
+            return static_cast<long long>(sycl::round(value));
+        }
+    };
+
+    //! SYCL specialization of the rsqrt trait for arithmetic arguments.
+    template<typename TArg>
+    struct Rsqrt<math::RsqrtGenericSycl, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
+    {
+        auto operator()(math::RsqrtGenericSycl const& /* rsqrt_ctx */, TArg const& value)
+        {
+            if constexpr(std::is_integral_v<TArg>)
+                return sycl::rsqrt(static_cast<double>(value)); // mirror CUDA back-end and use double for ints
+            else if constexpr(std::is_floating_point_v<TArg>)
+                return sycl::rsqrt(value);
+            else
+                static_assert(!sizeof(TArg), "Unsupported data type");
+        }
+    };
+
+    //! SYCL specialization of the sin trait for floating-point arguments.
+    template<typename TArg>
+    struct Sin<math::SinGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::SinGenericSycl const& /* sin_ctx */, TArg const& value)
+        {
+            return sycl::sin(value);
+        }
+    };
+
+    //! SYCL specialization of the sinh trait for floating-point arguments.
+    template<typename TArg>
+    struct Sinh<math::SinhGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::SinhGenericSycl const& /* sinh_ctx */, TArg const& value)
+        {
+            return sycl::sinh(value);
+        }
+    };
+
+    //! The SYCL sincos trait specialization: fills both output references from one floating-point argument.
+    template<typename TArg>
+    struct SinCos<math::SinCosGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::SinCosGenericSycl const&, TArg const& arg, TArg& result_sin, TArg& result_cos) -> void
+        {
+            result_sin = sycl::sincos(arg, &result_cos); // return value is the sine, the pointer receives the cosine
+        }
+    };
+
+    //! SYCL specialization of the sqrt trait for arithmetic arguments.
+    template<typename TArg>
+    struct Sqrt<math::SqrtGenericSycl, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
+    {
+        auto operator()(math::SqrtGenericSycl const& /* sqrt_ctx */, TArg const& value)
+        {
+            if constexpr(std::is_integral_v<TArg>)
+                return sycl::sqrt(static_cast<double>(value)); // mirror CUDA back-end and use double for ints
+            else if constexpr(std::is_floating_point_v<TArg>)
+                return sycl::sqrt(value);
+        }
+    };
+
+    //! SYCL specialization of the tan trait for floating-point arguments.
+    template<typename TArg>
+    struct Tan<math::TanGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::TanGenericSycl const& /* tan_ctx */, TArg const& value)
+        {
+            return sycl::tan(value);
+        }
+    };
+
+    //! SYCL specialization of the tanh trait for floating-point arguments.
+    template<typename TArg>
+    struct Tanh<math::TanhGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::TanhGenericSycl const& /* tanh_ctx */, TArg const& value)
+        {
+            return sycl::tanh(value);
+        }
+    };
+
+    //! SYCL specialization of the trunc trait for floating-point arguments.
+    template<typename TArg>
+    struct Trunc<math::TruncGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+    {
+        auto operator()(math::TruncGenericSycl const& /* trunc_ctx */, TArg const& value)
+        {
+            return sycl::trunc(value);
+        }
+    };
+} // namespace alpaka::math::trait
+
+#endif
diff --git a/include/alpaka/math/MathStdLib.hpp b/include/alpaka/math/MathStdLib.hpp
new file mode 100644
index 0000000..e74380f
--- /dev/null
+++ b/include/alpaka/math/MathStdLib.hpp
@@ -0,0 +1,299 @@
+/* Copyright 2023 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber,
+ * Jeffrey Kelling, Sergei Bastrakov, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Decay.hpp"
+#include "alpaka/math/Traits.hpp"
+
+namespace alpaka::math
+{
+    //! The standard library abs, implementation covered by the general template.
+    class AbsStdLib : public concepts::Implements<ConceptMathAbs, AbsStdLib>
+    {
+    };
+
+    //! The standard library acos, implementation covered by the general template.
+    class AcosStdLib : public concepts::Implements<ConceptMathAcos, AcosStdLib>
+    {
+    };
+
+    //! The standard library acosh, implementation covered by the general template.
+    class AcoshStdLib : public concepts::Implements<ConceptMathAcosh, AcoshStdLib>
+    {
+    };
+
+    //! The standard library arg, implementation covered by the general template.
+    class ArgStdLib : public concepts::Implements<ConceptMathArg, ArgStdLib>
+    {
+    };
+
+    //! The standard library asin, implementation covered by the general template.
+    class AsinStdLib : public concepts::Implements<ConceptMathAsin, AsinStdLib>
+    {
+    };
+
+    //! The standard library asinh, implementation covered by the general template.
+    class AsinhStdLib : public concepts::Implements<ConceptMathAsinh, AsinhStdLib>
+    {
+    };
+
+    //! The standard library atan, implementation covered by the general template.
+    class AtanStdLib : public concepts::Implements<ConceptMathAtan, AtanStdLib>
+    {
+    };
+
+    //! The standard library atanh, implementation covered by the general template.
+    class AtanhStdLib : public concepts::Implements<ConceptMathAtanh, AtanhStdLib>
+    {
+    };
+
+    //! The standard library atan2, implementation covered by the general template.
+    class Atan2StdLib : public concepts::Implements<ConceptMathAtan2, Atan2StdLib>
+    {
+    };
+
+    //! The standard library cbrt, implementation covered by the general template.
+    class CbrtStdLib : public concepts::Implements<ConceptMathCbrt, CbrtStdLib>
+    {
+    };
+
+    //! The standard library ceil, implementation covered by the general template.
+    class CeilStdLib : public concepts::Implements<ConceptMathCeil, CeilStdLib>
+    {
+    };
+
+    //! The standard library conj, implementation covered by the general template.
+    class ConjStdLib : public concepts::Implements<ConceptMathConj, ConjStdLib>
+    {
+    };
+
+    //! The standard library copysign, implementation covered by the general template.
+    class CopysignStdLib : public concepts::Implements<ConceptMathCopysign, CopysignStdLib>
+    {
+    };
+
+    //! The standard library cos, implementation covered by the general template.
+    class CosStdLib : public concepts::Implements<ConceptMathCos, CosStdLib>
+    {
+    };
+
+    //! The standard library cosh, implementation covered by the general template.
+    class CoshStdLib : public concepts::Implements<ConceptMathCosh, CoshStdLib>
+    {
+    };
+
+    //! The standard library erf, implementation covered by the general template.
+    class ErfStdLib : public concepts::Implements<ConceptMathErf, ErfStdLib>
+    {
+    };
+
+    //! The standard library exp, implementation covered by the general template.
+    class ExpStdLib : public concepts::Implements<ConceptMathExp, ExpStdLib>
+    {
+    };
+
+    //! The standard library floor, implementation covered by the general template.
+    class FloorStdLib : public concepts::Implements<ConceptMathFloor, FloorStdLib>
+    {
+    };
+
+    //! The standard library fma, implementation covered by the general template.
+    class FmaStdLib : public concepts::Implements<ConceptMathFma, FmaStdLib>
+    {
+    };
+
+    //! The standard library fmod, implementation covered by the general template.
+    class FmodStdLib : public concepts::Implements<ConceptMathFmod, FmodStdLib>
+    {
+    };
+
+    //! The standard library isfinite, implementation covered by the general template.
+    class IsfiniteStdLib : public concepts::Implements<ConceptMathIsfinite, IsfiniteStdLib>
+    {
+    };
+
+    //! The standard library isinf, implementation covered by the general template.
+    class IsinfStdLib : public concepts::Implements<ConceptMathIsinf, IsinfStdLib>
+    {
+    };
+
+    //! The standard library isnan, implementation covered by the general template.
+    class IsnanStdLib : public concepts::Implements<ConceptMathIsnan, IsnanStdLib>
+    {
+    };
+
+    //! The standard library log, implementation covered by the general template.
+    class LogStdLib : public concepts::Implements<ConceptMathLog, LogStdLib>
+    {
+    };
+
+    //! The standard library log2, implementation covered by the general template.
+    class Log2StdLib : public concepts::Implements<ConceptMathLog2, Log2StdLib>
+    {
+    };
+
+    //! The standard library log10, implementation covered by the general template.
+    class Log10StdLib : public concepts::Implements<ConceptMathLog10, Log10StdLib>
+    {
+    };
+
+    //! The standard library max.
+    class MaxStdLib : public concepts::Implements<ConceptMathMax, MaxStdLib>
+    {
+    };
+
+    //! The standard library min.
+    class MinStdLib : public concepts::Implements<ConceptMathMin, MinStdLib>
+    {
+    };
+
+    //! The standard library pow, implementation covered by the general template.
+    class PowStdLib : public concepts::Implements<ConceptMathPow, PowStdLib>
+    {
+    };
+
+    //! The standard library remainder, implementation covered by the general template.
+    class RemainderStdLib : public concepts::Implements<ConceptMathRemainder, RemainderStdLib>
+    {
+    };
+
+    //! The standard library round, implementation covered by the general template.
+    class RoundStdLib : public concepts::Implements<ConceptMathRound, RoundStdLib>
+    {
+    };
+
+    //! The standard library rsqrt, implementation covered by the general template.
+    class RsqrtStdLib : public concepts::Implements<ConceptMathRsqrt, RsqrtStdLib>
+    {
+    };
+
+    //! The standard library sin, implementation covered by the general template.
+    class SinStdLib : public concepts::Implements<ConceptMathSin, SinStdLib>
+    {
+    };
+
+    //! The standard library sinh, implementation covered by the general template.
+    class SinhStdLib : public concepts::Implements<ConceptMathSinh, SinhStdLib>
+    {
+    };
+
+    //! The standard library sincos, implementation covered by the general template.
+    class SinCosStdLib : public concepts::Implements<ConceptMathSinCos, SinCosStdLib>
+    {
+    };
+
+    //! The standard library sqrt, implementation covered by the general template.
+    class SqrtStdLib : public concepts::Implements<ConceptMathSqrt, SqrtStdLib>
+    {
+    };
+
+    //! The standard library tan, implementation covered by the general template.
+    class TanStdLib : public concepts::Implements<ConceptMathTan, TanStdLib>
+    {
+    };
+
+    //! The standard library tanh, implementation covered by the general template.
+    class TanhStdLib : public concepts::Implements<ConceptMathTanh, TanhStdLib>
+    {
+    };
+
+    //! The standard library trunc, implementation covered by the general template.
+    class TruncStdLib : public concepts::Implements<ConceptMathTrunc, TruncStdLib>
+    {
+    };
+
+    //! The standard library math trait specializations.
+    class MathStdLib
+        : public AbsStdLib
+        , public AcosStdLib
+        , public AcoshStdLib
+        , public ArgStdLib
+        , public AsinStdLib
+        , public AsinhStdLib
+        , public AtanStdLib
+        , public AtanhStdLib
+        , public Atan2StdLib
+        , public CbrtStdLib
+        , public CeilStdLib
+        , public ConjStdLib
+        , public CopysignStdLib
+        , public CosStdLib
+        , public CoshStdLib
+        , public ErfStdLib
+        , public ExpStdLib
+        , public FloorStdLib
+        , public FmaStdLib
+        , public FmodStdLib
+        , public LogStdLib
+        , public Log2StdLib
+        , public Log10StdLib
+        , public MaxStdLib
+        , public MinStdLib
+        , public PowStdLib
+        , public RemainderStdLib
+        , public RoundStdLib
+        , public RsqrtStdLib
+        , public SinStdLib
+        , public SinhStdLib
+        , public SinCosStdLib
+        , public SqrtStdLib
+        , public TanStdLib
+        , public TanhStdLib
+        , public TruncStdLib
+        , public IsnanStdLib
+        , public IsinfStdLib
+        , public IsfiniteStdLib
+    {
+    };
+
+    namespace trait
+    {
+        //! Standard library specialization of the max trait for arithmetic arguments.
+        template<typename Tx, typename Ty>
+        struct Max<MaxStdLib, Tx, Ty, std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
+        {
+            ALPAKA_FN_HOST auto operator()(MaxStdLib const& /* max_ctx */, Tx const& x, Ty const& y)
+            {
+                // Integer pairs go to std::max; anything involving float or double goes to std::fmax.
+                constexpr bool bothIntegral = std::is_integral_v<Tx> && std::is_integral_v<Ty>;
+                constexpr bool anyFloatOrDouble = is_decayed_v<Tx, float> || is_decayed_v<Ty, float>
+                                                  || is_decayed_v<Tx, double> || is_decayed_v<Ty, double>;
+
+                if constexpr(bothIntegral)
+                    return std::max(x, y);
+                else if constexpr(anyFloatOrDouble)
+                    return std::fmax(x, y);
+                else
+                    static_assert(!sizeof(Tx), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(std::common_type_t<Tx, Ty>{});
+            }
+        };
+
+        //! Standard library specialization of the min trait for arithmetic arguments.
+        template<typename Tx, typename Ty>
+        struct Min<MinStdLib, Tx, Ty, std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
+        {
+            ALPAKA_FN_HOST auto operator()(MinStdLib const& /* min_ctx */, Tx const& x, Ty const& y)
+            {
+                // Integer pairs go to std::min; anything involving float or double goes to std::fmin.
+                constexpr bool bothIntegral = std::is_integral_v<Tx> && std::is_integral_v<Ty>;
+                constexpr bool anyFloatOrDouble = is_decayed_v<Tx, float> || is_decayed_v<Ty, float>
+                                                  || is_decayed_v<Tx, double> || is_decayed_v<Ty, double>;
+
+                if constexpr(bothIntegral)
+                    return std::min(x, y);
+                else if constexpr(anyFloatOrDouble)
+                    return std::fmin(x, y);
+                else
+                    static_assert(!sizeof(Tx), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(std::common_type_t<Tx, Ty>{});
+            }
+        };
+    } // namespace trait
+
+} // namespace alpaka::math
diff --git a/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp b/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..ef89423
--- /dev/null
+++ b/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,1373 @@
+/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bert Wesarg, Valentin Gehrke, René Widera,
+ * Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Jeffrey Kelling, Sergei Bastrakov
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/CudaHipCommon.hpp"
+#include "alpaka/core/Decay.hpp"
+#include "alpaka/core/UniformCudaHip.hpp"
+#include "alpaka/core/Unreachable.hpp"
+#include "alpaka/math/Complex.hpp"
+#include "alpaka/math/Traits.hpp"
+
+#include <type_traits>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka::math
+{
+    //! The CUDA built in abs.
+    class AbsUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAbs, AbsUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in acos.
+    class AcosUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAcos, AcosUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in acosh.
+    class AcoshUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAcosh, AcoshUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in arg.
+    class ArgUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathArg, ArgUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in asin.
+    class AsinUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAsin, AsinUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in asinh.
+    class AsinhUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAsinh, AsinhUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in atan.
+    class AtanUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAtan, AtanUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in atanh.
+    class AtanhUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAtanh, AtanhUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in atan2.
+    class Atan2UniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAtan2, Atan2UniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in cbrt.
+    class CbrtUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCbrt, CbrtUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in ceil.
+    class CeilUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCeil, CeilUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in conj.
+    class ConjUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathConj, ConjUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in copysign.
+    class CopysignUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptMathCopysign, CopysignUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in cos.
+    class CosUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCos, CosUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in cosh.
+    class CoshUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCosh, CoshUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in erf.
+    class ErfUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathErf, ErfUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in exp.
+    class ExpUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathExp, ExpUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in floor.
+    class FloorUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathFloor, FloorUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in fma.
+    class FmaUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathFma, FmaUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in fmod.
+    class FmodUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathFmod, FmodUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in isfinite.
+    class IsfiniteUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptMathIsfinite, IsfiniteUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in isinf.
+    class IsinfUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathIsinf, IsinfUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in isnan.
+    class IsnanUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathIsnan, IsnanUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in log.
+    class LogUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathLog, LogUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in log2.
+    class Log2UniformCudaHipBuiltIn : public concepts::Implements<ConceptMathLog2, Log2UniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in log10.
+    class Log10UniformCudaHipBuiltIn : public concepts::Implements<ConceptMathLog10, Log10UniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in max.
+    class MaxUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathMax, MaxUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in min.
+    class MinUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathMin, MinUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in pow.
+    class PowUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathPow, PowUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA built in remainder.
+    class RemainderUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptMathRemainder, RemainderUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA round.
+    class RoundUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathRound, RoundUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA rsqrt.
+    class RsqrtUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathRsqrt, RsqrtUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA sin.
+    class SinUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSin, SinUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA sinh.
+    class SinhUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSinh, SinhUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA sincos.
+    class SinCosUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSinCos, SinCosUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA sqrt.
+    class SqrtUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSqrt, SqrtUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA tan.
+    class TanUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathTan, TanUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA tanh.
+    class TanhUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathTanh, TanhUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA trunc.
+    class TruncUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathTrunc, TruncUniformCudaHipBuiltIn>
+    {
+    };
+
+    //! The CUDA/HIP built in math trait specializations.
+    class MathUniformCudaHipBuiltIn
+        : public AbsUniformCudaHipBuiltIn
+        , public AcosUniformCudaHipBuiltIn
+        , public AcoshUniformCudaHipBuiltIn
+        , public ArgUniformCudaHipBuiltIn
+        , public AsinUniformCudaHipBuiltIn
+        , public AsinhUniformCudaHipBuiltIn
+        , public AtanUniformCudaHipBuiltIn
+        , public AtanhUniformCudaHipBuiltIn
+        , public Atan2UniformCudaHipBuiltIn
+        , public CbrtUniformCudaHipBuiltIn
+        , public CeilUniformCudaHipBuiltIn
+        , public ConjUniformCudaHipBuiltIn
+        , public CopysignUniformCudaHipBuiltIn
+        , public CosUniformCudaHipBuiltIn
+        , public CoshUniformCudaHipBuiltIn
+        , public ErfUniformCudaHipBuiltIn
+        , public ExpUniformCudaHipBuiltIn
+        , public FloorUniformCudaHipBuiltIn
+        , public FmaUniformCudaHipBuiltIn
+        , public FmodUniformCudaHipBuiltIn
+        , public LogUniformCudaHipBuiltIn
+        , public Log2UniformCudaHipBuiltIn
+        , public Log10UniformCudaHipBuiltIn
+        , public MaxUniformCudaHipBuiltIn
+        , public MinUniformCudaHipBuiltIn
+        , public PowUniformCudaHipBuiltIn
+        , public RemainderUniformCudaHipBuiltIn
+        , public RoundUniformCudaHipBuiltIn
+        , public RsqrtUniformCudaHipBuiltIn
+        , public SinUniformCudaHipBuiltIn
+        , public SinhUniformCudaHipBuiltIn
+        , public SinCosUniformCudaHipBuiltIn
+        , public SqrtUniformCudaHipBuiltIn
+        , public TanUniformCudaHipBuiltIn
+        , public TanhUniformCudaHipBuiltIn
+        , public TruncUniformCudaHipBuiltIn
+        , public IsnanUniformCudaHipBuiltIn
+        , public IsinfUniformCudaHipBuiltIn
+        , public IsfiniteUniformCudaHipBuiltIn
+    {
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)
+#            include <cuda_runtime.h>
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)
+#            include <hip/math_functions.h>
+#        endif
+
+    namespace trait
+    {
+        //! CUDA/HIP abs trait specialization for signed built-in types (int, long, long long, float, double).
+        template<typename TArg>
+        struct Abs<AbsUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_signed_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(AbsUniformCudaHipBuiltIn const& /* abs_ctx */, TArg const& value)
+            {
+                if constexpr(is_decayed_v<TArg, int>)
+                    return ::abs(value);
+                else if constexpr(is_decayed_v<TArg, long int>)
+                    return ::labs(value);
+                else if constexpr(is_decayed_v<TArg, long long int>)
+                    return ::llabs(value);
+                else if constexpr(is_decayed_v<TArg, float>)
+                    return ::fabsf(value);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::fabs(value);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(TArg{});
+            }
+        };
+
+        //! CUDA/HIP abs trait specialization for Complex<T>: square root of real^2 + imag^2.
+        template<typename T>
+        struct Abs<AbsUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! The context keeps its original (accelerator) type because it is passed on to sqrt.
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& z)
+            {
+                return sqrt(ctx, z.real() * z.real() + z.imag() * z.imag());
+            }
+        };
+
+        //! CUDA/HIP acos trait specialization for float and double.
+        template<typename TArg>
+        struct Acos<AcosUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(AcosUniformCudaHipBuiltIn const& /* acos_ctx */, TArg const& value)
+            {
+                if constexpr(is_decayed_v<TArg, double>)
+                    return ::acos(value);
+                else if constexpr(is_decayed_v<TArg, float>)
+                    return ::acosf(value);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(TArg{});
+            }
+        };
+
+        //! CUDA/HIP acos trait specialization for Complex<T>, expressed through log and sqrt.
+        template<typename T>
+        struct Acos<AcosUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! The context keeps its original (accelerator) type because it is passed on to log and sqrt.
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& z)
+            {
+                // This holds everywhere, including the branch cuts: acos(z) = -i * ln(z + i * sqrt(1 - z^2))
+                return Complex<T>{0.0, -1.0} * log(ctx, z + Complex<T>{0.0, 1.0} * sqrt(ctx, T(1.0) - z * z));
+            }
+        };
+
+        //! CUDA/HIP acosh trait specialization for float and double.
+        template<typename TArg>
+        struct Acosh<AcoshUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(AcoshUniformCudaHipBuiltIn const& /* acosh_ctx */, TArg const& x)
+            {
+                if constexpr(is_decayed_v<TArg, double>)
+                    return ::acosh(x);
+                else if constexpr(is_decayed_v<TArg, float>)
+                    return ::acoshf(x);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(TArg{});
+            }
+        };
+
+        //! The CUDA acosh trait specialization for complex types, expressed through log and sqrt.
+        template<typename T>
+        struct Acosh<AcoshUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // acosh(z) = ln(z + sqrt(z-1) * sqrt(z+1))
+                return log(ctx, arg + sqrt(ctx, arg - static_cast<T>(1.0)) * sqrt(ctx, arg + static_cast<T>(1.0)));
+            }
+        };
+
+        //! CUDA/HIP arg trait specialization for real floating-point values.
+        template<typename TArgument>
+        struct Arg<ArgUniformCudaHipBuiltIn, TArgument, std::enable_if_t<std::is_floating_point_v<TArgument>>>
+        {
+            //! The context keeps its original (accelerator) type because it is passed on to atan2.
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, TArgument const& value)
+            {
+                // Delegate to atan2(0, value) so that boundary cases are resolved consistently
+                return atan2(ctx, TArgument{0.0}, value);
+            }
+        };
+
+        //! The CUDA arg trait specialization for complex types: returns atan2(imag, real) of the argument.
+        template<typename T>
+        struct Arg<ArgUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& argument)
+            {
+                return atan2(ctx, argument.imag(), argument.real());
+            }
+        };
+
+        //! The CUDA asin trait specialization for real types (float -> ::asinf, double -> ::asin).
+        template<typename TArg>
+        struct Asin<AsinUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(AsinUniformCudaHipBuiltIn const& /* asin_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::asinf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::asin(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA asin trait specialization for complex types, expressed through log and sqrt.
+        template<typename T>
+        struct Asin<AsinUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // This holds everywhere, including the branch cuts: asin(z) = i * ln(sqrt(1 - z^2) - i * z)
+                return Complex<T>{0.0, 1.0} * log(ctx, sqrt(ctx, T(1.0) - arg * arg) - Complex<T>{0.0, 1.0} * arg);
+            }
+        };
+
+        //! The CUDA asinh trait specialization for real types (float -> ::asinhf, double -> ::asinh).
+        template<typename TArg>
+        struct Asinh<AsinhUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(AsinhUniformCudaHipBuiltIn const& /* asinh_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::asinhf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::asinh(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA asinh trait specialization for complex types, expressed through log and sqrt.
+        template<typename T>
+        struct Asinh<AsinhUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // asinh(z) = ln(z + sqrt(z^2 + 1))
+                return log(ctx, arg + sqrt(ctx, arg * arg + static_cast<T>(1.0)));
+            }
+        };
+
+        //! The CUDA atan trait specialization for real types (float -> ::atanf, double -> ::atan).
+        template<typename TArg>
+        struct Atan<AtanUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(AtanUniformCudaHipBuiltIn const& /* atan_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::atanf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::atan(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA atan trait specialization for complex types, expressed through the complex log.
+        template<typename T>
+        struct Atan<AtanUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // This holds everywhere, including the branch cuts: atan(z) = -i/2 * ln((i - z) / (i + z))
+                return Complex<T>{0.0, -0.5} * log(ctx, (Complex<T>{0.0, 1.0} - arg) / (Complex<T>{0.0, 1.0} + arg));
+            }
+        };
+
+        //! The CUDA atanh trait specialization for real types (float -> ::atanhf, double -> ::atanh).
+        template<typename TArg>
+        struct Atanh<AtanhUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(AtanhUniformCudaHipBuiltIn const& /* atanh_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::atanhf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::atanh(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA atanh trait specialization for complex types, expressed through two complex logs.
+        template<typename T>
+        struct Atanh<AtanhUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // atanh(z) = 0.5 * (ln(1 + z) - ln(1 - z))
+                return static_cast<T>(0.5)
+                       * (log(ctx, static_cast<T>(1.0) + arg) - log(ctx, static_cast<T>(1.0) - arg));
+            }
+        };
+
+        //! The CUDA atan2 trait specialization (float pairs -> ::atan2f, any double operand -> ::atan2).
+        template<typename Ty, typename Tx>
+        struct Atan2<
+            Atan2UniformCudaHipBuiltIn,
+            Ty,
+            Tx,
+            std::enable_if_t<std::is_floating_point_v<Ty> && std::is_floating_point_v<Tx>>>
+        {
+            __host__ __device__ auto operator()(
+                Atan2UniformCudaHipBuiltIn const& /* atan2_ctx */,
+                Ty const& y,
+                Tx const& x)
+            {
+                if constexpr(is_decayed_v<Ty, float> && is_decayed_v<Tx, float>)
+                    return ::atan2f(y, x);
+                else if constexpr(is_decayed_v<Ty, double> || is_decayed_v<Tx, double>)
+                    return ::atan2(y, x);
+                else
+                    static_assert(!sizeof(Ty), "Unsupported data type"); // always false; instantiated e.g. for long double
+                using Ret [[maybe_unused]] = std::common_type_t<Ty, Tx>; // promoted type: double for float/double mixes
+                ALPAKA_UNREACHABLE(Ret{}); // fallback must match the branch return type, Ty{} did not for (float, double)
+            }
+        };
+
+        //! The CUDA cbrt trait specialization (float -> ::cbrtf, double and integral types -> ::cbrt).
+        template<typename TArg>
+        struct Cbrt<CbrtUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(CbrtUniformCudaHipBuiltIn const& /* cbrt_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::cbrtf(arg);
+                else if constexpr(is_decayed_v<TArg, double> || std::is_integral_v<TArg>)
+                    return ::cbrt(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+                using Ret [[maybe_unused]] = std::conditional_t<is_decayed_v<TArg, float>, float, double>;
+                ALPAKA_UNREACHABLE(Ret{}); // fallback must match the branch return type: integral input yields double
+            }
+        };
+
+        //! The CUDA ceil trait specialization (float -> ::ceilf, double -> ::ceil).
+        template<typename TArg>
+        struct Ceil<CeilUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(CeilUniformCudaHipBuiltIn const& /* ceil_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::ceilf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::ceil(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA conj trait specialization for real types: wraps the value in a Complex with zero imaginary part.
+        template<typename TArg>
+        struct Conj<ConjUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(ConjUniformCudaHipBuiltIn const& /* conj_ctx */, TArg const& arg)
+            {
+                return Complex<TArg>{arg, TArg{0.0}};
+            }
+        };
+
+        //! The CUDA conj trait specialization for complex types: keeps the real part, negates the imaginary part.
+        template<typename T>
+        struct Conj<ConjUniformCudaHipBuiltIn, Complex<T>>
+        {
+            __host__ __device__ auto operator()(ConjUniformCudaHipBuiltIn const& /* conj_ctx */, Complex<T> const& arg)
+            {
+                return Complex<T>{arg.real(), -arg.imag()};
+            }
+        };
+
+        //! The CUDA copysign trait specialization (float pairs -> ::copysignf, any double operand -> ::copysign).
+        template<typename TMag, typename TSgn>
+        struct Copysign<
+            CopysignUniformCudaHipBuiltIn,
+            TMag,
+            TSgn,
+            std::enable_if_t<std::is_floating_point_v<TMag> && std::is_floating_point_v<TSgn>>>
+        {
+            __host__ __device__ auto operator()(
+                CopysignUniformCudaHipBuiltIn const& /* copysign_ctx */,
+                TMag const& mag,
+                TSgn const& sgn)
+            {
+                if constexpr(is_decayed_v<TMag, float> && is_decayed_v<TSgn, float>)
+                    return ::copysignf(mag, sgn);
+                else if constexpr(is_decayed_v<TMag, double> || is_decayed_v<TSgn, double>)
+                    return ::copysign(mag, sgn);
+                else
+                    static_assert(!sizeof(TMag), "Unsupported data type"); // always false; instantiated e.g. for long double
+                using Ret [[maybe_unused]] = std::common_type_t<TMag, TSgn>; // promoted type: double for float/double mixes
+                ALPAKA_UNREACHABLE(Ret{}); // fallback must match the branch return type, TMag{} did not for (float, double)
+            }
+        };
+
+        //! The CUDA cos trait specialization for real types (float -> ::cosf, double -> ::cos).
+        template<typename TArg>
+        struct Cos<CosUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(CosUniformCudaHipBuiltIn const& /* cos_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::cosf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::cos(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA cos trait specialization for complex types, expressed through the complex exp.
+        template<typename T>
+        struct Cos<CosUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // cos(z) = 0.5 * (exp(i * z) + exp(-i * z))
+                return T(0.5) * (exp(ctx, Complex<T>{0.0, 1.0} * arg) + exp(ctx, Complex<T>{0.0, -1.0} * arg));
+            }
+        };
+
+        //! The CUDA cosh trait specialization for real types (float -> ::coshf, double -> ::cosh).
+        template<typename TArg>
+        struct Cosh<CoshUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(CoshUniformCudaHipBuiltIn const& /* cosh_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::coshf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::cosh(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA cosh trait specialization for complex types, expressed through the complex exp.
+        template<typename T>
+        struct Cosh<CoshUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // cosh(z) = 0.5 * (exp(z) + exp(-z))
+                return T(0.5) * (exp(ctx, arg) + exp(ctx, static_cast<T>(-1.0) * arg));
+            }
+        };
+
+        //! The CUDA erf trait specialization (float -> ::erff, double -> ::erf).
+        template<typename TArg>
+        struct Erf<ErfUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(ErfUniformCudaHipBuiltIn const& /* erf_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::erff(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::erf(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA exp trait specialization for real types (float -> ::expf, double -> ::exp).
+        template<typename TArg>
+        struct Exp<ExpUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(ExpUniformCudaHipBuiltIn const& /* exp_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::expf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::exp(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA exp trait specialization for complex types, built from the real exp and sincos.
+        template<typename T>
+        struct Exp<ExpUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // exp(z) = exp(x + iy) = exp(x) * (cos(y) + i * sin(y))
+                auto re = T{}, im = T{};
+                sincos(ctx, arg.imag(), im, re); // presumably stores sin(y) in im and cos(y) in re, per the formula above
+                return exp(ctx, arg.real()) * Complex<T>{re, im};
+            }
+        };
+
+        //! The CUDA floor trait specialization (float -> ::floorf, double -> ::floor).
+        template<typename TArg>
+        struct Floor<FloorUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(FloorUniformCudaHipBuiltIn const& /* floor_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::floorf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::floor(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA fma trait specialization (three floats -> ::fmaf, any double operand -> ::fma).
+        template<typename Tx, typename Ty, typename Tz>
+        struct Fma<
+            FmaUniformCudaHipBuiltIn,
+            Tx,
+            Ty,
+            Tz,
+            std::enable_if_t<
+                std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty> && std::is_floating_point_v<Tz>>>
+        {
+            __host__ __device__ auto operator()(
+                FmaUniformCudaHipBuiltIn const& /* fma_ctx */,
+                Tx const& x,
+                Ty const& y,
+                Tz const& z)
+            {
+                if constexpr(is_decayed_v<Tx, float> && is_decayed_v<Ty, float> && is_decayed_v<Tz, float>)
+                    return ::fmaf(x, y, z);
+                else if constexpr(is_decayed_v<Tx, double> || is_decayed_v<Ty, double> || is_decayed_v<Tz, double>)
+                    return ::fma(x, y, z);
+                else
+                    static_assert(!sizeof(Tx), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                using Ret [[maybe_unused]] = std::conditional_t< // float only when all three operands are float
+                    is_decayed_v<Tx, float> && is_decayed_v<Ty, float> && is_decayed_v<Tz, float>,
+                    float,
+                    double>;
+                ALPAKA_UNREACHABLE(Ret{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA fmod trait specialization (float pairs -> ::fmodf, any double operand -> ::fmod).
+        template<typename Tx, typename Ty>
+        struct Fmod<
+            FmodUniformCudaHipBuiltIn,
+            Tx,
+            Ty,
+            std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
+        {
+            __host__ __device__ auto operator()(
+                FmodUniformCudaHipBuiltIn const& /* fmod_ctx */,
+                Tx const& x,
+                Ty const& y)
+            {
+                if constexpr(is_decayed_v<Tx, float> && is_decayed_v<Ty, float>)
+                    return ::fmodf(x, y);
+                else if constexpr(is_decayed_v<Tx, double> || is_decayed_v<Ty, double>)
+                    return ::fmod(x, y);
+                else
+                    static_assert(!sizeof(Tx), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                using Ret [[maybe_unused]] // float only when both operands are float, double otherwise
+                = std::conditional_t<is_decayed_v<Tx, float> && is_decayed_v<Ty, float>, float, double>;
+                ALPAKA_UNREACHABLE(Ret{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA isfinite trait specialization for floating-point types: forwards to ::isfinite.
+        template<typename TArg>
+        struct Isfinite<IsfiniteUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(IsfiniteUniformCudaHipBuiltIn const& /* ctx */, TArg const& arg)
+            {
+                return ::isfinite(arg);
+            }
+        };
+
+        //! The CUDA isinf trait specialization for floating-point types: forwards to ::isinf.
+        template<typename TArg>
+        struct Isinf<IsinfUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(IsinfUniformCudaHipBuiltIn const& /* ctx */, TArg const& arg)
+            {
+                return ::isinf(arg);
+            }
+        };
+
+        //! The CUDA isnan trait specialization for floating-point types: forwards to ::isnan.
+        template<typename TArg>
+        struct Isnan<IsnanUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(IsnanUniformCudaHipBuiltIn const& /* ctx */, TArg const& arg)
+            {
+                return ::isnan(arg);
+            }
+        };
+
+        //! The CUDA log trait specialization for real types (float -> ::logf, double -> ::log).
+        template<typename TArg>
+        struct Log<LogUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(LogUniformCudaHipBuiltIn const& /* log_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::logf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::log(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA log trait specialization for complex types, built from the real log, abs and arg.
+        template<typename T>
+        struct Log<LogUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& argument)
+            {
+                // Branch cut along the negative real axis (same as for std::complex),
+                // principal value of ln(z) = ln(|z|) + i * arg(z)
+                return log(ctx, abs(ctx, argument)) + Complex<T>{0.0, 1.0} * arg(ctx, argument);
+            }
+        };
+
+        //! The CUDA log2 trait specialization for real types (float -> ::log2f, double -> ::log2).
+        template<typename TArg>
+        struct Log2<Log2UniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(Log2UniformCudaHipBuiltIn const& /* log2_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::log2f(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::log2(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA log10 trait specialization for real types (float -> ::log10f, double -> ::log10).
+        template<typename TArg>
+        struct Log10<Log10UniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(Log10UniformCudaHipBuiltIn const& /* log10_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::log10f(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::log10(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA log10 trait specialization for complex types: log(z) divided by the real log(10).
+        template<typename T>
+        struct Log10<Log10UniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& argument)
+            {
+                return log(ctx, argument) / log(ctx, static_cast<T>(10));
+            }
+        };
+
+        //! The CUDA max trait specialization (integer pairs -> ::max, float pairs -> ::fmaxf, otherwise ::fmax).
+        template<typename Tx, typename Ty>
+        struct Max<
+            MaxUniformCudaHipBuiltIn,
+            Tx,
+            Ty,
+            std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
+        {
+            __host__ __device__ auto operator()(
+                MaxUniformCudaHipBuiltIn const& /* max_ctx */,
+                Tx const& x,
+                Ty const& y)
+            {
+                if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
+                    return ::max(x, y);
+                else if constexpr(is_decayed_v<Tx, float> && is_decayed_v<Ty, float>)
+                    return ::fmaxf(x, y);
+                else if constexpr(
+                    is_decayed_v<Tx, double> || is_decayed_v<Ty, double>
+                    || (is_decayed_v<Tx, float> && std::is_integral_v<Ty>)
+                    || (std::is_integral_v<Tx> && is_decayed_v<Ty, float>) )
+                    return ::fmax(x, y); // also taken when a float is mixed with an integer
+                else
+                    static_assert(!sizeof(Tx), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                using Ret [[maybe_unused]] = std::conditional_t< // mirrors the result type of the branches above
+                    std::is_integral_v<Tx> && std::is_integral_v<Ty>,
+                    decltype(::max(x, y)),
+                    std::conditional_t<is_decayed_v<Tx, float> && is_decayed_v<Ty, float>, float, double>>;
+                ALPAKA_UNREACHABLE(Ret{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA min trait specialization (integer pairs -> ::min, float pairs -> ::fminf, otherwise ::fmin).
+        template<typename Tx, typename Ty>
+        struct Min<
+            MinUniformCudaHipBuiltIn,
+            Tx,
+            Ty,
+            std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
+        {
+            __host__ __device__ auto operator()(
+                MinUniformCudaHipBuiltIn const& /* min_ctx */,
+                Tx const& x,
+                Ty const& y)
+            {
+                if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
+                    return ::min(x, y);
+                else if constexpr(is_decayed_v<Tx, float> && is_decayed_v<Ty, float>)
+                    return ::fminf(x, y);
+                else if constexpr(
+                    is_decayed_v<Tx, double> || is_decayed_v<Ty, double>
+                    || (is_decayed_v<Tx, float> && std::is_integral_v<Ty>)
+                    || (std::is_integral_v<Tx> && is_decayed_v<Ty, float>) )
+                    return ::fmin(x, y); // also taken when a float is mixed with an integer
+                else
+                    static_assert(!sizeof(Tx), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                using Ret [[maybe_unused]] = std::conditional_t< // mirrors the result type of the branches above
+                    std::is_integral_v<Tx> && std::is_integral_v<Ty>,
+                    decltype(::min(x, y)),
+                    std::conditional_t<is_decayed_v<Tx, float> && is_decayed_v<Ty, float>, float, double>>;
+                ALPAKA_UNREACHABLE(Ret{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA pow trait specialization for real types (float pairs -> ::powf, any double operand -> ::pow).
+        template<typename TBase, typename TExp>
+        struct Pow<
+            PowUniformCudaHipBuiltIn,
+            TBase,
+            TExp,
+            std::enable_if_t<std::is_floating_point_v<TBase> && std::is_floating_point_v<TExp>>>
+        {
+            __host__ __device__ auto operator()(
+                PowUniformCudaHipBuiltIn const& /* pow_ctx */,
+                TBase const& base,
+                TExp const& exp)
+            {
+                if constexpr(is_decayed_v<TBase, float> && is_decayed_v<TExp, float>)
+                    return ::powf(base, exp);
+                else if constexpr(is_decayed_v<TBase, double> || is_decayed_v<TExp, double>)
+                    return ::pow(static_cast<double>(base), static_cast<double>(exp)); // both operands widened to double
+                else
+                    static_assert(!sizeof(TBase), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                using Ret [[maybe_unused]] // float only when both operands are float, double otherwise
+                = std::conditional_t<is_decayed_v<TBase, float> && is_decayed_v<TExp, float>, float, double>;
+                ALPAKA_UNREACHABLE(Ret{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA pow trait specialization for complex types, computed as exp(exponent * log(base)).
+        template<typename T, typename U>
+        struct Pow<PowUniformCudaHipBuiltIn, Complex<T>, Complex<U>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& base, Complex<U> const& exponent)
+            {
+                // Type promotion matching rules of complex std::pow but simplified given our math only supports float
+                // and double, no long double.
+                using Promoted
+                    = Complex<std::conditional_t<is_decayed_v<T, float> && is_decayed_v<U, float>, float, double>>;
+                // pow(z1, z2) = e^(z2 * log(z1))
+                return exp(ctx, Promoted{exponent} * log(ctx, Promoted{base}));
+            }
+        };
+
+        //! The CUDA pow trait specialization for complex base and real exponent: converts the exponent to Complex<U>.
+        template<typename T, typename U>
+        struct Pow<PowUniformCudaHipBuiltIn, Complex<T>, U>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& base, U const& exponent)
+            {
+                return pow(ctx, base, Complex<U>{exponent});
+            }
+        };
+
+        //! The CUDA pow trait specialization for real base and complex exponent: converts the base to Complex<T>.
+        template<typename T, typename U>
+        struct Pow<PowUniformCudaHipBuiltIn, T, Complex<U>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, T const& base, Complex<U> const& exponent)
+            {
+                return pow(ctx, Complex<T>{base}, exponent);
+            }
+        };
+
+        //! The CUDA remainder trait specialization (float pairs -> ::remainderf, any double operand -> ::remainder).
+        template<typename Tx, typename Ty>
+        struct Remainder<
+            RemainderUniformCudaHipBuiltIn,
+            Tx,
+            Ty,
+            std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
+        {
+            __host__ __device__ auto operator()(
+                RemainderUniformCudaHipBuiltIn const& /* remainder_ctx */,
+                Tx const& x,
+                Ty const& y)
+            {
+                if constexpr(is_decayed_v<Tx, float> && is_decayed_v<Ty, float>)
+                    return ::remainderf(x, y);
+                else if constexpr(is_decayed_v<Tx, double> || is_decayed_v<Ty, double>)
+                    return ::remainder(x, y);
+                else
+                    static_assert(!sizeof(Tx), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                using Ret [[maybe_unused]] // float only when both operands are float, double otherwise
+                = std::conditional_t<is_decayed_v<Tx, float> && is_decayed_v<Ty, float>, float, double>;
+                ALPAKA_UNREACHABLE(Ret{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA round trait specialization (float -> ::roundf, double -> ::round).
+        template<typename TArg>
+        struct Round<RoundUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(RoundUniformCudaHipBuiltIn const& /* round_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::roundf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::round(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(TArg{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA lround trait specialization (float -> ::lroundf, double -> ::lround), keyed on the round context.
+        template<typename TArg>
+        struct Lround<RoundUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(RoundUniformCudaHipBuiltIn const& /* lround_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::lroundf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::lround(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                ALPAKA_UNREACHABLE(long{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA llround trait specialization (float -> ::llroundf, double -> ::llround), keyed on the round context.
+        template<typename TArg>
+        struct Llround<RoundUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(RoundUniformCudaHipBuiltIn const& /* llround_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::llroundf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::llround(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type"); // always false; instantiated e.g. for long double
+
+                // NVCC versions before 11.3 are unable to compile 'long long{}': "type name is not allowed".
+                using Ret [[maybe_unused]] = long long;
+                ALPAKA_UNREACHABLE(Ret{}); // every supported type returned above
+            }
+        };
+
+        //! The CUDA rsqrt trait specialization for arithmetic types (integral arguments are passed to ::rsqrt).
+        template<typename TArg>
+        struct Rsqrt<RsqrtUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(RsqrtUniformCudaHipBuiltIn const& /* rsqrt_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::rsqrtf(arg);
+                else if constexpr(is_decayed_v<TArg, double> || std::is_integral_v<TArg>)
+                    return ::rsqrt(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(TArg{});
+            }
+        };
+
+        //! The CUDA rsqrt trait specialization for complex types.
+        template<typename T>
+        struct Rsqrt<RsqrtUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                return T{1.0} / sqrt(ctx, arg);
+            }
+        };
+
+        //! The CUDA sin trait specialization for real types.
+        template<typename TArg>
+        struct Sin<SinUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(SinUniformCudaHipBuiltIn const& /* sin_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::sinf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::sin(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(TArg{});
+            }
+        };
+
+        //! The CUDA sin trait specialization for complex types.
+        template<typename T>
+        struct Sin<SinUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // sin(z) = (exp(i * z) - exp(-i * z)) / 2i
+                return (exp(ctx, Complex<T>{0.0, 1.0} * arg) - exp(ctx, Complex<T>{0.0, -1.0} * arg))
+                       / Complex<T>{0.0, 2.0};
+            }
+        };
+
+        //! The CUDA sinh trait specialization for real types.
+        template<typename TArg>
+        struct Sinh<SinhUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(SinhUniformCudaHipBuiltIn const& /* sinh_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::sinhf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::sinh(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(TArg{});
+            }
+        };
+
+        //! The CUDA sinh trait specialization for complex types.
+        template<typename T>
+        struct Sinh<SinhUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // sinh(z) = (exp(z) - exp(-z)) / 2
+                return (exp(ctx, arg) - exp(ctx, static_cast<T>(-1.0) * arg)) / static_cast<T>(2.0);
+            }
+        };
+
+        //! The CUDA sincos trait specialization for real types.
+        template<typename TArg>
+        struct SinCos<SinCosUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(
+                SinCosUniformCudaHipBuiltIn const& /* sincos_ctx */,
+                TArg const& arg,
+                TArg& result_sin,
+                TArg& result_cos) -> void
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    ::sincosf(arg, &result_sin, &result_cos);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    ::sincos(arg, &result_sin, &result_cos);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+            }
+        };
+
+        //! The CUDA sincos trait specialization for complex types.
+        template<typename T>
+        struct SinCos<SinCosUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(
+                TCtx const& ctx,
+                Complex<T> const& arg,
+                Complex<T>& result_sin,
+                Complex<T>& result_cos) -> void
+            {
+                result_sin = sin(ctx, arg);
+                result_cos = cos(ctx, arg);
+            }
+        };
+
+        //! The CUDA sqrt trait specialization for arithmetic types (integral arguments are passed to ::sqrt).
+        template<typename TArg>
+        struct Sqrt<SqrtUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(SqrtUniformCudaHipBuiltIn const& /* sqrt_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::sqrtf(arg);
+                else if constexpr(is_decayed_v<TArg, double> || std::is_integral_v<TArg>)
+                    return ::sqrt(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(TArg{});
+            }
+        };
+
+        //! The CUDA sqrt trait specialization for complex types.
+        template<typename T>
+        struct Sqrt<SqrtUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& argument)
+            {
+                // Branch cut along the negative real axis (same as for std::complex),
+                // principal value of sqrt(z) = sqrt(|z|) * e^(i * arg(z) / 2)
+                auto const halfArg = T(0.5) * arg(ctx, argument);
+                auto re = T{}, im = T{};
+                sincos(ctx, halfArg, im, re);
+                return sqrt(ctx, abs(ctx, argument)) * Complex<T>(re, im);
+            }
+        };
+
+        //! The CUDA tan trait specialization for real types.
+        template<typename TArg>
+        struct Tan<TanUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(TanUniformCudaHipBuiltIn const& /* tan_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::tanf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::tan(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(TArg{});
+            }
+        };
+
+        //! The CUDA tan trait specialization for complex types.
+        template<typename T>
+        struct Tan<TanUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // tan(z) = i * (e^-iz - e^iz) / (e^-iz + e^iz) = i * (1 - e^2iz) / (1 + e^2iz)
+                // Warning: this straightforward implementation can easily result in NaN as 0/0 or inf/inf.
+                auto const expValue = exp(ctx, Complex<T>{0.0, 2.0} * arg);
+                return Complex<T>{0.0, 1.0} * (T{1.0} - expValue) / (T{1.0} + expValue);
+            }
+        };
+
+        //! The CUDA tanh trait specialization for real types.
+        template<typename TArg>
+        struct Tanh<TanhUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(TanhUniformCudaHipBuiltIn const& /* tanh_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::tanhf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::tanh(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(TArg{});
+            }
+        };
+
+        //! The CUDA tanh trait specialization for complex types.
+        template<typename T>
+        struct Tanh<TanhUniformCudaHipBuiltIn, Complex<T>>
+        {
+            //! Take context as original (accelerator) type, since we call other math functions
+            template<typename TCtx>
+            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
+            {
+                // tanh(z) = (e^z - e^-z) / (e^z + e^-z); each exponential is evaluated once and reused.
+                auto const expPos = exp(ctx, arg), expNeg = exp(ctx, static_cast<T>(-1.0) * arg);
+                return (expPos - expNeg) / (expPos + expNeg);
+            }
+        };
+
+        //! The CUDA trunc trait specialization.
+        template<typename TArg>
+        struct Trunc<TruncUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
+        {
+            __host__ __device__ auto operator()(TruncUniformCudaHipBuiltIn const& /* trunc_ctx */, TArg const& arg)
+            {
+                if constexpr(is_decayed_v<TArg, float>)
+                    return ::truncf(arg);
+                else if constexpr(is_decayed_v<TArg, double>)
+                    return ::trunc(arg);
+                else
+                    static_assert(!sizeof(TArg), "Unsupported data type");
+
+                ALPAKA_UNREACHABLE(TArg{});
+            }
+        };
+    } // namespace trait
+#    endif
+} // namespace alpaka::math
+
+#endif
diff --git a/include/alpaka/math/Traits.hpp b/include/alpaka/math/Traits.hpp
new file mode 100644
index 0000000..c63b662
--- /dev/null
+++ b/include/alpaka/math/Traits.hpp
@@ -0,0 +1,1488 @@
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber, Sergei Bastrakov,
+ *                Andrea Bocci, René Widera
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#include <cmath>
+#include <complex>
+#if __has_include(<version>) // Not part of the C++17 standard but all major standard libraries include this
+#    include <version>
+#endif
+#ifdef __cpp_lib_math_constants
+#    include <numbers>
+#endif
+
+namespace alpaka::math
+{
+    namespace constants
+    {
+#ifdef __cpp_lib_math_constants
+        inline constexpr double e = std::numbers::e;
+        inline constexpr double log2e = std::numbers::log2e;
+        inline constexpr double log10e = std::numbers::log10e;
+        inline constexpr double pi = std::numbers::pi;
+        inline constexpr double inv_pi = std::numbers::inv_pi;
+        inline constexpr double ln2 = std::numbers::ln2;
+        inline constexpr double ln10 = std::numbers::ln10;
+        inline constexpr double sqrt2 = std::numbers::sqrt2;
+
+        template<typename T>
+        inline constexpr T e_v = std::numbers::e_v<T>;
+
+        template<typename T>
+        inline constexpr T log2e_v = std::numbers::log2e_v<T>;
+
+        template<typename T>
+        inline constexpr T log10e_v = std::numbers::log10e_v<T>;
+
+        template<typename T>
+        inline constexpr T pi_v = std::numbers::pi_v<T>;
+
+        template<typename T>
+        inline constexpr T inv_pi_v = std::numbers::inv_pi_v<T>;
+
+        template<typename T>
+        inline constexpr T ln2_v = std::numbers::ln2_v<T>;
+
+        template<typename T>
+        inline constexpr T ln10_v = std::numbers::ln10_v<T>;
+
+        template<typename T>
+        inline constexpr T sqrt2_v = std::numbers::sqrt2_v<T>;
+#else
+        inline constexpr double e = M_E;
+        inline constexpr double log2e = M_LOG2E;
+        inline constexpr double log10e = M_LOG10E;
+        inline constexpr double pi = M_PI;
+        inline constexpr double inv_pi = M_1_PI;
+        inline constexpr double ln2 = M_LN2;
+        inline constexpr double ln10 = M_LN10;
+        inline constexpr double sqrt2 = M_SQRT2;
+
+        template<typename T>
+        inline constexpr T e_v = static_cast<T>(e);
+
+        template<typename T>
+        inline constexpr T log2e_v = static_cast<T>(log2e);
+
+        template<typename T>
+        inline constexpr T log10e_v = static_cast<T>(log10e);
+
+        template<typename T>
+        inline constexpr T pi_v = static_cast<T>(pi);
+
+        template<typename T>
+        inline constexpr T inv_pi_v = static_cast<T>(inv_pi);
+
+        template<typename T>
+        inline constexpr T ln2_v = static_cast<T>(ln2);
+
+        template<typename T>
+        inline constexpr T ln10_v = static_cast<T>(ln10);
+
+        template<typename T>
+        inline constexpr T sqrt2_v = static_cast<T>(sqrt2);
+
+        // Use predefined float constants when available
+#    if defined(M_Ef)
+        template<>
+        inline constexpr float e_v<float> = M_Ef;
+#    endif
+
+#    if defined(M_LOG2Ef)
+        template<>
+        inline constexpr float log2e_v<float> = M_LOG2Ef;
+#    endif
+
+#    if defined(M_LOG10Ef)
+        template<>
+        inline constexpr float log10e_v<float> = M_LOG10Ef;
+#    endif
+
+#    if defined(M_PIf)
+        template<>
+        inline constexpr float pi_v<float> = M_PIf;
+#    endif
+
+#    if defined(M_1_PIf)
+        template<>
+        inline constexpr float inv_pi_v<float> = M_1_PIf;
+#    endif
+
+#    if defined(M_LN2f)
+        template<>
+        inline constexpr float ln2_v<float> = M_LN2f;
+#    endif
+
+#    if defined(M_LN10f)
+        template<>
+        inline constexpr float ln10_v<float> = M_LN10f;
+#    endif
+
+#    if defined(M_SQRT2f)
+        template<>
+        inline constexpr float sqrt2_v<float> = M_SQRT2f;
+#    endif
+
+#endif
+    } // namespace constants
+
+    struct ConceptMathAbs
+    {
+    };
+
+    struct ConceptMathAcos
+    {
+    };
+
+    struct ConceptMathAcosh
+    {
+    };
+
+    struct ConceptMathArg
+    {
+    };
+
+    struct ConceptMathAsin
+    {
+    };
+
+    struct ConceptMathAsinh
+    {
+    };
+
+    struct ConceptMathAtan
+    {
+    };
+
+    struct ConceptMathAtanh
+    {
+    };
+
+    struct ConceptMathAtan2
+    {
+    };
+
+    struct ConceptMathCbrt
+    {
+    };
+
+    struct ConceptMathCeil
+    {
+    };
+
+    struct ConceptMathConj
+    {
+    };
+
+    struct ConceptMathCopysign
+    {
+    };
+
+    struct ConceptMathCos
+    {
+    };
+
+    struct ConceptMathCosh
+    {
+    };
+
+    struct ConceptMathErf
+    {
+    };
+
+    struct ConceptMathExp
+    {
+    };
+
+    struct ConceptMathFloor
+    {
+    };
+
+    struct ConceptMathFma
+    {
+    };
+
+    struct ConceptMathFmod
+    {
+    };
+
+    struct ConceptMathIsfinite
+    {
+    };
+
+    struct ConceptMathIsinf
+    {
+    };
+
+    struct ConceptMathIsnan
+    {
+    };
+
+    struct ConceptMathLog
+    {
+    };
+
+    struct ConceptMathLog2
+    {
+    };
+
+    struct ConceptMathLog10
+    {
+    };
+
+    struct ConceptMathMax
+    {
+    };
+
+    struct ConceptMathMin
+    {
+    };
+
+    struct ConceptMathPow
+    {
+    };
+
+    struct ConceptMathRemainder
+    {
+    };
+
+    struct ConceptMathRound
+    {
+    };
+
+    struct ConceptMathRsqrt
+    {
+    };
+
+    struct ConceptMathSin
+    {
+    };
+
+    struct ConceptMathSinh
+    {
+    };
+
+    struct ConceptMathSinCos
+    {
+    };
+
+    struct ConceptMathSqrt
+    {
+    };
+
+    struct ConceptMathTan
+    {
+    };
+
+    struct ConceptMathTanh
+    {
+    };
+
+    struct ConceptMathTrunc
+    {
+    };
+
+    //! The math traits.
+    namespace trait
+    {
+        //! The abs trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Abs
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find abs(TArg) in the namespace of your type.
+                using std::abs;
+                return abs(arg);
+            }
+        };
+
+        //! The acos trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Acos
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find acos(TArg) in the namespace of your type.
+                using std::acos;
+                return acos(arg);
+            }
+        };
+
+        //! The acosh trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Acosh
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find acosh(TArg) in the namespace of your type.
+                using std::acosh;
+                return acosh(arg);
+            }
+        };
+
+        //! The arg trait.
+        template<typename T, typename TArgument, typename TSfinae = void>
+        struct Arg
+        {
+            // It is unclear why this is needed here and not in other math trait structs. But removing it causes
+            // warnings with calling a __host__ function from a __host__ __device__ function when building for CUDA.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArgument const& argument)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find arg(TArgument) in the namespace of your type.
+                using std::arg;
+                return arg(argument);
+            }
+        };
+
+        //! The asin trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Asin
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find asin(TArg) in the namespace of your type.
+                using std::asin;
+                return asin(arg);
+            }
+        };
+
+        //! The asinh trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Asinh
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find asinh(TArg) in the namespace of your type.
+                using std::asinh;
+                return asinh(arg);
+            }
+        };
+
+        //! The atan trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Atan
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find atan(TArg) in the namespace of your type.
+                using std::atan;
+                return atan(arg);
+            }
+        };
+
+        //! The atanh trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Atanh
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find atanh(TArg) in the namespace of your type.
+                using std::atanh;
+                return atanh(arg);
+            }
+        };
+
+        //! The atan2 trait.
+        template<typename T, typename Ty, typename Tx, typename TSfinae = void>
+        struct Atan2
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Ty const& y, Tx const& x)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find atan2(Ty, Tx) in the namespace of your type.
+                using std::atan2;
+                return atan2(y, x);
+            }
+        };
+
+        //! The cbrt trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Cbrt
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find cbrt(TArg) in the namespace of your type.
+                using std::cbrt;
+                return cbrt(arg);
+            }
+        };
+
+        //! The ceil trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Ceil
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find ceil(TArg) in the namespace of your type.
+                using std::ceil;
+                return ceil(arg);
+            }
+        };
+
+        //! The conj trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Conj
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find conj(TArg) in the namespace of your type.
+                using std::conj;
+                return conj(arg);
+            }
+        };
+
+        //! The copysign trait.
+        template<typename T, typename TMag, typename TSgn, typename TSfinae = void>
+        struct Copysign
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TMag const& mag, TSgn const& sgn)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find copysign(TMag, TSgn) in the namespace of your type.
+                using std::copysign;
+                return copysign(mag, sgn);
+            }
+        };
+
+        //! The cos trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Cos
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find cos(TArg) in the namespace of your type.
+                using std::cos;
+                return cos(arg);
+            }
+        };
+
+        //! The cosh trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Cosh
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find cosh(TArg) in the namespace of your type.
+                using std::cosh;
+                return cosh(arg);
+            }
+        };
+        //! The erf trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Erf
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find erf(TArg) in the namespace of your type.
+                using std::erf;
+                return erf(arg);
+            }
+        };
+
+        //! The exp trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Exp
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find exp(TArg) in the namespace of your type.
+                using std::exp;
+                return exp(arg);
+            }
+        };
+
+        //! The floor trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Floor
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find floor(TArg) in the namespace of your type.
+                using std::floor;
+                return floor(arg);
+            }
+        };
+
+        //! The fma trait.
+        template<typename T, typename Tx, typename Ty, typename Tz, typename TSfinae = void>
+        struct Fma
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Tx const& x, Ty const& y, Tz const& z)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find fma(Tx, Ty, Tz) in the namespace of your type.
+                using std::fma;
+                return fma(x, y, z);
+            }
+        };
+
+        //! The fmod trait.
+        template<typename T, typename Tx, typename Ty, typename TSfinae = void>
+        struct Fmod
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Tx const& x, Ty const& y)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find fmod(Tx, Ty) in the namespace of your type.
+                using std::fmod;
+                return fmod(x, y);
+            }
+        };
+
+        //! The isfinite trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Isfinite
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find isfinite(TArg) in the namespace of your type.
+                using std::isfinite;
+                return isfinite(arg);
+            }
+        };
+
+        //! The isinf trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Isinf
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find isinf(TArg) in the namespace of your type.
+                using std::isinf;
+                return isinf(arg);
+            }
+        };
+
+        //! The isnan trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Isnan
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find isnan(TArg) in the namespace of your type.
+                using std::isnan;
+                return isnan(arg);
+            }
+        };
+
+        //! The log trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Log
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find log(TArg) in the namespace of your type.
+                using std::log;
+                return log(arg);
+            }
+        };
+
+        //! The base 2 log trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Log2
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find log2(TArg) in the namespace of your type.
+                using std::log2;
+                return log2(arg);
+            }
+        };
+
+        //! The base 10 log trait.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Log10
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find log10(TArg) in the namespace of your type.
+                using std::log10;
+                return log10(arg);
+            }
+        };
+
+        //! The max trait.
+        template<typename T, typename Tx, typename Ty, typename TSfinae = void>
+        struct Max
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Tx const& x, Ty const& y)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find max(Tx, Ty) in the namespace of your type.
+                using std::max;
+                return max(x, y);
+            }
+        };
+
+        //! The min trait. The primary template calls min(x, y) unqualified, with std::min visible.
+        template<typename T, typename Tx, typename Ty, typename TSfinae = void>
+        struct Min
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Tx const& x, Ty const& y)
+            {
+                // ADL call: a compile error here means min(Tx, Ty) was not found for your type or backend.
+                // NOTE(review): std::min does not skip NaNs the way math::min documents; confirm backends override.
+                using std::min;
+                return min(x, y);
+            }
+        };
+
+        //! The pow trait. The primary template calls pow(base, exp) unqualified, with std::pow visible.
+        template<typename T, typename TBase, typename TExp, typename TSfinae = void>
+        struct Pow
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TBase const& base, TExp const& exp)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find pow(base, exp) in the namespace of your type.
+                using std::pow;
+                return pow(base, exp);
+            }
+        };
+
+        //! The remainder trait. The primary template calls remainder(x, y) unqualified, with std::remainder visible.
+        template<typename T, typename Tx, typename Ty, typename TSfinae = void>
+        struct Remainder
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Tx const& x, Ty const& y)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find remainder(Tx, Ty) in the namespace of your type.
+                using std::remainder;
+                return remainder(x, y);
+            }
+        };
+
+        //! The round trait. The primary template calls round(arg) unqualified, with std::round visible.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Round
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find round(TArg) in the namespace of your type.
+                using std::round;
+                return round(arg);
+            }
+        };
+
+        //! The lround trait. The primary template calls lround(arg) unqualified, with std::lround visible.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Lround
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find lround(TArg) in the namespace of your type.
+                using std::lround;
+                return lround(arg);
+            }
+        };
+
+        //! The llround trait. The primary template calls llround(arg) unqualified, with std::llround visible.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Llround
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find llround(TArg) in the namespace of your type.
+                using std::llround;
+                return llround(arg);
+            }
+        };
+
+        namespace detail
+        {
+            //! Fallback rsqrt used when ADL finds no better match: computes TArg(1) / sqrt(arg).
+            template<typename TArg>
+            ALPAKA_FN_HOST_ACC auto rsqrt(TArg const& arg)
+            {
+                // Unqualified sqrt call: std::sqrt is visible and ADL may still find a sqrt for TArg.
+                using std::sqrt;
+                return static_cast<TArg>(1) / sqrt(arg);
+            }
+        } // namespace detail
+
+        //! The rsqrt trait. The primary template calls rsqrt(arg) unqualified, with detail::rsqrt as the fallback.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Rsqrt
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find rsqrt(TArg) in the namespace of your type.
+                using detail::rsqrt;
+                return rsqrt(arg);
+            }
+        };
+
+        //! The sin trait. The primary template calls sin(arg) unqualified, with std::sin visible.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Sin
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find sin(TArg) in the namespace of your type.
+                using std::sin;
+                return sin(arg);
+            }
+        };
+
+        //! The sinh trait. The primary template calls sinh(arg) unqualified, with std::sinh visible.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Sinh
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find sinh(TArg) in the namespace of your type.
+                using std::sinh;
+                return sinh(arg);
+            }
+        };
+
+        namespace detail
+        {
+            //! Fallback sincos used when ADL finds no better match: stores sin(arg) and cos(arg) in the outputs.
+            template<typename TArg>
+            ALPAKA_FN_HOST_ACC auto sincos(TArg const& arg, TArg& result_sin, TArg& result_cos)
+            {
+                // Unqualified sin/cos calls: the std versions are visible and ADL may still find others for TArg.
+                using std::sin;
+                result_sin = sin(arg);
+                using std::cos;
+                result_cos = cos(arg);
+            }
+        } // namespace detail
+
+        //! The sincos trait. The primary template calls sincos unqualified, with detail::sincos as the fallback.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct SinCos
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg, TArg& result_sin, TArg& result_cos)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find sincos(TArg, TArg&, TArg&) in the namespace of your type.
+                using detail::sincos;
+                return sincos(arg, result_sin, result_cos);
+            }
+        };
+
+        //! The sqrt trait. The primary template calls sqrt(arg) unqualified, with std::sqrt visible.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Sqrt
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find sqrt(TArg) in the namespace of your type.
+                using std::sqrt;
+                return sqrt(arg);
+            }
+        };
+
+        //! The tan trait. The primary template calls tan(arg) unqualified, with std::tan visible.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Tan
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find tan(TArg) in the namespace of your type.
+                using std::tan;
+                return tan(arg);
+            }
+        };
+
+        //! The tanh trait. The primary template calls tanh(arg) unqualified, with std::tanh visible.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Tanh
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find tanh(TArg) in the namespace of your type.
+                using std::tanh;
+                return tanh(arg);
+            }
+        };
+
+        //! The trunc trait. The primary template calls trunc(arg) unqualified, with std::trunc visible.
+        template<typename T, typename TArg, typename TSfinae = void>
+        struct Trunc
+        {
+            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
+            {
+                // This is an ADL call. If you get a compile error here then your type is not supported by the
+                // backend and we could not find trunc(TArg) in the namespace of your type.
+                using std::trunc;
+                return trunc(arg);
+            }
+        };
+    } // namespace trait
+
+    //! Returns the absolute value of arg.
+    //!
+    //! \tparam T The type of the object specializing Abs.
+    //! \tparam TArg The argument type.
+    //! \param abs_ctx The object specializing Abs.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto abs(T const& abs_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathAbs, T>;
+        return trait::Abs<Impl, TArg>{}(abs_ctx, arg);
+    }
+
+    //! Returns the principal value of the arc cosine of arg.
+    //!
+    //! Real arguments must lie in [-1.0, 1.0]; outside that range the result is backend- and
+    //! compiler-option-dependent and will most likely be NaN.
+    //!
+    //! \tparam T The type of the object specializing Acos.
+    //! \tparam TArg The argument type.
+    //! \param acos_ctx The object specializing Acos.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto acos(T const& acos_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathAcos, T>;
+        return trait::Acos<Impl, TArg>{}(acos_ctx, arg);
+    }
+
+    //! Returns the principal value of the hyperbolic arc cosine of arg.
+    //!
+    //! Real arguments must lie in [1.0, Inf]; outside that range the result is backend- and
+    //! compiler-option-dependent and will most likely be NaN.
+    //!
+    //! \tparam T The type of the object specializing Acosh.
+    //! \tparam TArg The argument type.
+    //! \param acosh_ctx The object specializing Acosh.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto acosh(T const& acosh_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathAcosh, T>;
+        return trait::Acosh<Impl, TArg>{}(acosh_ctx, arg);
+    }
+
+    //! Returns the complex argument of the given value.
+    //!
+    //! \tparam T The type of the object specializing Arg.
+    //! \tparam TArgument The type of the value.
+    //! \param arg_ctx The object specializing Arg.
+    //! \param argument The value whose complex argument is requested.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArgument>
+    ALPAKA_FN_HOST_ACC auto arg(T const& arg_ctx, TArgument const& argument)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathArg, T>;
+        return trait::Arg<Impl, TArgument>{}(arg_ctx, argument);
+    }
+
+    //! Returns the principal value of the arc sine of arg.
+    //!
+    //! Real arguments must lie in [-1.0, 1.0]; outside that range the result is backend- and
+    //! compiler-option-dependent and will most likely be NaN.
+    //!
+    //! \tparam T The type of the object specializing Asin.
+    //! \tparam TArg The argument type.
+    //! \param asin_ctx The object specializing Asin.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto asin(T const& asin_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathAsin, T>;
+        return trait::Asin<Impl, TArg>{}(asin_ctx, arg);
+    }
+
+    //! Returns the principal value of the hyperbolic arc sine of arg.
+    //!
+    //! \tparam TArg The argument type.
+    //! \param asinh_ctx The object specializing Asinh.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto asinh(T const& asinh_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathAsinh, T>;
+        return trait::Asinh<Impl, TArg>{}(asinh_ctx, arg);
+    }
+
+    //! Returns the principal value of the arc tangent of arg.
+    //!
+    //! \tparam TArg The argument type.
+    //! \param atan_ctx The object specializing Atan.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto atan(T const& atan_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathAtan, T>;
+        return trait::Atan<Impl, TArg>{}(atan_ctx, arg);
+    }
+
+    //! Returns the principal value of the hyperbolic arc tangent of arg.
+    //!
+    //! Real arguments must lie in [-1.0, 1.0]; outside that range the result is backend- and
+    //! compiler-option-dependent and will most likely be NaN.
+    //!
+    //! \tparam T The type of the object specializing Atanh.
+    //! \tparam TArg The argument type.
+    //! \param atanh_ctx The object specializing Atanh.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto atanh(T const& atanh_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathAtanh, T>;
+        return trait::Atanh<Impl, TArg>{}(atanh_ctx, arg);
+    }
+
+    //! Returns the arc tangent of y/x, using the signs of both arguments to select the quadrant.
+    //!
+    //! \tparam T The type of the object specializing Atan2.
+    //! \tparam Ty The type of y.
+    //! \tparam Tx The type of x.
+    //! \param atan2_ctx The object specializing Atan2.
+    //! \param y The y argument (numerator of y/x).
+    //! \param x The x argument (denominator of y/x).
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename Ty, typename Tx>
+    ALPAKA_FN_HOST_ACC auto atan2(T const& atan2_ctx, Ty const& y, Tx const& x)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathAtan2, T>;
+        return trait::Atan2<Impl, Ty, Tx>{}(atan2_ctx, y, x);
+    }
+
+    //! Returns the cube root of arg.
+    //!
+    //! \tparam T The type of the object specializing Cbrt.
+    //! \tparam TArg The argument type.
+    //! \param cbrt_ctx The object specializing Cbrt.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto cbrt(T const& cbrt_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathCbrt, T>;
+        return trait::Cbrt<Impl, TArg>{}(cbrt_ctx, arg);
+    }
+
+    //! Returns the smallest integer value that is not less than arg.
+    //!
+    //! \tparam T The type of the object specializing Ceil.
+    //! \tparam TArg The argument type.
+    //! \param ceil_ctx The object specializing Ceil.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto ceil(T const& ceil_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathCeil, T>;
+        return trait::Ceil<Impl, TArg>{}(ceil_ctx, arg);
+    }
+
+    //! Returns the complex conjugate of arg.
+    //!
+    //! \tparam T The type of the object specializing Conj.
+    //! \tparam TArg The argument type.
+    //! \param conj_ctx The object specializing Conj.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto conj(T const& conj_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathConj, T>;
+        return trait::Conj<Impl, TArg>{}(conj_ctx, arg);
+    }
+
+    //! Returns a value that has the magnitude of mag and the sign of sgn.
+    //!
+    //! \tparam T The type of the object specializing Copysign.
+    //! \tparam TMag The type of mag.
+    //! \tparam TSgn The type of sgn.
+    //! \param copysign_ctx The object specializing Copysign.
+    //! \param mag The value supplying the magnitude.
+    //! \param sgn The value supplying the sign.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TMag, typename TSgn>
+    ALPAKA_FN_HOST_ACC auto copysign(T const& copysign_ctx, TMag const& mag, TSgn const& sgn)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathCopysign, T>;
+        return trait::Copysign<Impl, TMag, TSgn>{}(copysign_ctx, mag, sgn);
+    }
+
+    //! Returns the cosine of arg (arg is measured in radians).
+    //!
+    //! \tparam T The type of the object specializing Cos.
+    //! \tparam TArg The argument type.
+    //! \param cos_ctx The object specializing Cos.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto cos(T const& cos_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathCos, T>;
+        return trait::Cos<Impl, TArg>{}(cos_ctx, arg);
+    }
+
+    //! Returns the hyperbolic cosine of arg.
+    //!
+    //! \tparam T The type of the object specializing Cosh.
+    //! \tparam TArg The argument type.
+    //! \param cosh_ctx The object specializing Cosh.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto cosh(T const& cosh_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathCosh, T>;
+        return trait::Cosh<Impl, TArg>{}(cosh_ctx, arg);
+    }
+
+    //! Computes the error function of arg.
+    //!
+    //! \tparam T The type of the object specializing Erf.
+    //! \tparam TArg The arg type.
+    //! \param erf_ctx The object specializing Erf.
+    //! \param arg The arg.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto erf(T const& erf_ctx, TArg const& arg)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptMathErf, T>;
+        return trait::Erf<ImplementationBase, TArg>{}(erf_ctx, arg);
+    }
+
+    //! Returns e (Euler's number, 2.7182818) raised to the power arg.
+    //!
+    //! \tparam T The type of the object specializing Exp.
+    //! \tparam TArg The argument type.
+    //! \param exp_ctx The object specializing Exp.
+    //! \param arg The exponent.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto exp(T const& exp_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathExp, T>;
+        return trait::Exp<Impl, TArg>{}(exp_ctx, arg);
+    }
+
+    //! Returns the largest integer value that is not greater than arg.
+    //!
+    //! \tparam T The type of the object specializing Floor.
+    //! \tparam TArg The argument type.
+    //! \param floor_ctx The object specializing Floor.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto floor(T const& floor_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathFloor, T>;
+        return trait::Floor<Impl, TArg>{}(floor_ctx, arg);
+    }
+
+    //! Returns x * y + z, computed as if to infinite precision and rounded only once to the result type.
+    //!
+    //! \tparam T The type of the object specializing Fma.
+    //! \tparam Tx The type of x.
+    //! \tparam Ty The type of y.
+    //! \tparam Tz The type of z.
+    //! \param fma_ctx The object specializing Fma.
+    //! \param x The first factor.
+    //! \param y The second factor.
+    //! \param z The addend.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename Tx, typename Ty, typename Tz>
+    ALPAKA_FN_HOST_ACC auto fma(T const& fma_ctx, Tx const& x, Ty const& y, Tz const& z)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathFma, T>;
+        return trait::Fma<Impl, Tx, Ty, Tz>{}(fma_ctx, x, y, z);
+    }
+
+    //! Returns the floating-point remainder of the division x/y.
+    //!
+    //! \tparam T The type of the object specializing Fmod.
+    //! \tparam Tx The type of x.
+    //! \tparam Ty The type of y.
+    //! \param fmod_ctx The object specializing Fmod.
+    //! \param x The dividend.
+    //! \param y The divisor.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename Tx, typename Ty>
+    ALPAKA_FN_HOST_ACC auto fmod(T const& fmod_ctx, Tx const& x, Ty const& y)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathFmod, T>;
+        return trait::Fmod<Impl, Tx, Ty>{}(fmod_ctx, x, y);
+    }
+
+    //! Tests whether arg is finite.
+    //!
+    //! \tparam T The type of the object specializing Isfinite.
+    //! \tparam TArg The argument type.
+    //! \param ctx The object specializing Isfinite.
+    //! \param arg The value to test.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto isfinite(T const& ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathIsfinite, T>;
+        return trait::Isfinite<Impl, TArg>{}(ctx, arg);
+    }
+
+    //! Tests whether arg is an infinity.
+    //!
+    //! \tparam T The type of the object specializing Isinf.
+    //! \tparam TArg The argument type.
+    //! \param ctx The object specializing Isinf.
+    //! \param arg The value to test.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto isinf(T const& ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathIsinf, T>;
+        return trait::Isinf<Impl, TArg>{}(ctx, arg);
+    }
+
+    //! Tests whether arg is NaN.
+    //!
+    //! \tparam T The type of the object specializing Isnan.
+    //! \tparam TArg The argument type.
+    //! \param ctx The object specializing Isnan.
+    //! \param arg The value to test.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto isnan(T const& ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathIsnan, T>;
+        return trait::Isnan<Impl, TArg>{}(ctx, arg);
+    }
+
+    //! Returns the natural (base e) logarithm of arg.
+    //!
+    //! Real arguments must be non-negative. For any other value the
+    //! result is backend- and compiler-option-dependent and will most
+    //! likely be NaN.
+    //!
+    //! \tparam T The type of the object specializing Log.
+    //! \tparam TArg The argument type.
+    //! \param log_ctx The object specializing Log.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto log(T const& log_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathLog, T>;
+        return trait::Log<Impl, TArg>{}(log_ctx, arg);
+    }
+
+    //! Returns the binary (base 2) logarithm of arg.
+    //!
+    //! Real arguments must be non-negative. For any other value the
+    //! result is backend- and compiler-option-dependent and will most
+    //! likely be NaN.
+    //!
+    //! \tparam T The type of the object specializing Log2.
+    //! \tparam TArg The argument type.
+    //! \param log2_ctx The object specializing Log2.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto log2(T const& log2_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathLog2, T>;
+        return trait::Log2<Impl, TArg>{}(log2_ctx, arg);
+    }
+
+    //! Returns the common (base 10) logarithm of arg.
+    //!
+    //! Real arguments must be non-negative. For any other value the
+    //! result is backend- and compiler-option-dependent and will most
+    //! likely be NaN.
+    //!
+    //! \tparam T The type of the object specializing Log10.
+    //! \tparam TArg The argument type.
+    //! \param log10_ctx The object specializing Log10.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto log10(T const& log10_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathLog10, T>;
+        return trait::Log10<Impl, TArg>{}(log10_ctx, arg);
+    }
+
+    //! Returns the greater of x and y.
+    //! A NaN counts as missing data: between a NaN and a numeric value, the numeric value is chosen.
+    //!
+    //! \tparam T The type of the object specializing Max.
+    //! \tparam Tx The type of x.
+    //! \tparam Ty The type of y.
+    //! \param max_ctx The object specializing Max.
+    //! \param x The first value to compare.
+    //! \param y The second value to compare.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename Tx, typename Ty>
+    ALPAKA_FN_HOST_ACC auto max(T const& max_ctx, Tx const& x, Ty const& y)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathMax, T>;
+        return trait::Max<Impl, Tx, Ty>{}(max_ctx, x, y);
+    }
+
+    //! Returns the lesser of x and y.
+    //! A NaN counts as missing data: between a NaN and a numeric value, the numeric value is chosen.
+    //!
+    //! \tparam T The type of the object specializing Min.
+    //! \tparam Tx The type of x.
+    //! \tparam Ty The type of y.
+    //! \param min_ctx The object specializing Min.
+    //! \param x The first value to compare.
+    //! \param y The second value to compare.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename Tx, typename Ty>
+    ALPAKA_FN_HOST_ACC auto min(T const& min_ctx, Tx const& x, Ty const& y)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathMin, T>;
+        return trait::Min<Impl, Tx, Ty>{}(min_ctx, x, y);
+    }
+
+    //! Returns base raised to the power exp.
+    //!
+    //! Real values of base must be non-negative. For any other value
+    //! the result is backend- and compiler-option-dependent and will
+    //! most likely be NaN.
+    //!
+    //! \tparam T The type of the object specializing Pow.
+    //! \tparam TBase The type of base.
+    //! \tparam TExp The type of exp.
+    //! \param pow_ctx The object specializing Pow.
+    //! \param base The base.
+    //! \param exp The exponent.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TBase, typename TExp>
+    ALPAKA_FN_HOST_ACC auto pow(T const& pow_ctx, TBase const& base, TExp const& exp)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathPow, T>;
+        return trait::Pow<Impl, TBase, TExp>{}(pow_ctx, base, exp);
+    }
+
+    //! Returns the IEEE remainder of the floating-point division x/y.
+    //!
+    //! \tparam T The type of the object specializing Remainder.
+    //! \tparam Tx The type of x.
+    //! \tparam Ty The type of y.
+    //! \param remainder_ctx The object specializing Remainder.
+    //! \param x The dividend.
+    //! \param y The divisor.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename Tx, typename Ty>
+    ALPAKA_FN_HOST_ACC auto remainder(T const& remainder_ctx, Tx const& x, Ty const& y)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathRemainder, T>;
+        return trait::Remainder<Impl, Tx, Ty>{}(remainder_ctx, x, y);
+    }
+
+    //! Rounds arg to the nearest integer value, returned in floating-point format. Halfway cases are
+    //! rounded away from zero, independent of the current rounding mode.
+    //!
+    //! \tparam T The type of the object specializing Round.
+    //! \tparam TArg The argument type.
+    //! \param round_ctx The object specializing Round.
+    //! \param arg The value to round.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto round(T const& round_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathRound, T>;
+        return trait::Round<Impl, TArg>{}(round_ctx, arg);
+    }
+
+    //! Rounds arg to the nearest integer value, returned as long int. Halfway cases are rounded away
+    //! from zero, independent of the current rounding mode.
+    //!
+    //! \tparam T The type of the object specializing Lround (resolved through ConceptMathRound).
+    //! \tparam TArg The argument type.
+    //! \param lround_ctx The object specializing Lround.
+    //! \param arg The value to round.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto lround(T const& lround_ctx, TArg const& arg) -> long int
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathRound, T>;
+        return trait::Lround<Impl, TArg>{}(lround_ctx, arg);
+    }
+
+    //! Rounds arg to the nearest integer value, returned as long long int. Halfway cases are rounded
+    //! away from zero, independent of the current rounding mode.
+    //!
+    //! \tparam T The type of the object specializing Llround (resolved through ConceptMathRound).
+    //! \tparam TArg The argument type.
+    //! \param llround_ctx The object specializing Llround.
+    //! \param arg The value to round.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto llround(T const& llround_ctx, TArg const& arg) -> long long int
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathRound, T>;
+        return trait::Llround<Impl, TArg>{}(llround_ctx, arg);
+    }
+
+    //! Returns the reciprocal square root of arg.
+    //!
+    //! Real arguments must be positive. For any other value the
+    //! result is backend- and compiler-option-dependent and will most
+    //! likely be NaN.
+    //!
+    //! \tparam T The type of the object specializing Rsqrt.
+    //! \tparam TArg The argument type.
+    //! \param rsqrt_ctx The object specializing Rsqrt.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto rsqrt(T const& rsqrt_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathRsqrt, T>;
+        return trait::Rsqrt<Impl, TArg>{}(rsqrt_ctx, arg);
+    }
+
+    //! Returns the sine of arg (arg is measured in radians).
+    //!
+    //! \tparam T The type of the object specializing Sin.
+    //! \tparam TArg The argument type.
+    //! \param sin_ctx The object specializing Sin.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto sin(T const& sin_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathSin, T>;
+        return trait::Sin<Impl, TArg>{}(sin_ctx, arg);
+    }
+
+    //! Returns the hyperbolic sine of arg.
+    //!
+    //! \tparam T The type of the object specializing Sinh.
+    //! \tparam TArg The argument type.
+    //! \param sinh_ctx The object specializing Sinh.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto sinh(T const& sinh_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathSinh, T>;
+        return trait::Sinh<Impl, TArg>{}(sinh_ctx, arg);
+    }
+
+    //! Computes both the sine and the cosine of arg (arg is measured in radians).
+    //!
+    //! \tparam T The type of the object specializing SinCos.
+    //! \tparam TArg The argument type, also used for both outputs.
+    //! \param sincos_ctx The object specializing SinCos.
+    //! \param arg The argument.
+    //! \param result_sin Output: receives the sine.
+    //! \param result_cos Output: receives the cosine.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto sincos(T const& sincos_ctx, TArg const& arg, TArg& result_sin, TArg& result_cos) -> void
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathSinCos, T>;
+        trait::SinCos<Impl, TArg>{}(sincos_ctx, arg, result_sin, result_cos);
+    }
+
+    //! Returns the square root of arg.
+    //!
+    //! Real arguments must be non-negative. For any other value the
+    //! result is backend- and compiler-option-dependent and will most
+    //! likely be NaN.
+    //!
+    //! \tparam T The type of the object specializing Sqrt.
+    //! \tparam TArg The argument type.
+    //! \param sqrt_ctx The object specializing Sqrt.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto sqrt(T const& sqrt_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathSqrt, T>;
+        return trait::Sqrt<Impl, TArg>{}(sqrt_ctx, arg);
+    }
+
+    //! Returns the tangent of arg (arg is measured in radians).
+    //!
+    //! \tparam T The type of the object specializing Tan.
+    //! \tparam TArg The argument type.
+    //! \param tan_ctx The object specializing Tan.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto tan(T const& tan_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathTan, T>;
+        return trait::Tan<Impl, TArg>{}(tan_ctx, arg);
+    }
+
+    //! Returns the hyperbolic tangent of arg.
+    //!
+    //! \tparam T The type of the object specializing Tanh.
+    //! \tparam TArg The argument type.
+    //! \param tanh_ctx The object specializing Tanh.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto tanh(T const& tanh_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathTanh, T>;
+        return trait::Tanh<Impl, TArg>{}(tanh_ctx, arg);
+    }
+
+    //! Returns the nearest integer value whose magnitude is not greater than that of arg.
+    //!
+    //! \tparam T The type of the object specializing Trunc.
+    //! \tparam TArg The argument type.
+    //! \param trunc_ctx The object specializing Trunc.
+    //! \param arg The argument.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T, typename TArg>
+    ALPAKA_FN_HOST_ACC auto trunc(T const& trunc_ctx, TArg const& arg)
+    {
+        using Impl = concepts::ImplementationBase<ConceptMathTrunc, T>;
+        return trait::Trunc<Impl, TArg>{}(trunc_ctx, arg);
+    }
+} // namespace alpaka::math
diff --git a/include/alpaka/mem/alloc/AllocCpuAligned.hpp b/include/alpaka/mem/alloc/AllocCpuAligned.hpp
new file mode 100644
index 0000000..e458d99
--- /dev/null
+++ b/include/alpaka/mem/alloc/AllocCpuAligned.hpp
@@ -0,0 +1,67 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/AlignedAlloc.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/cpu/SysInfo.hpp"
+#include "alpaka/mem/alloc/Traits.hpp"
+
+#include <algorithm>
+
+namespace alpaka
+{
+    //! The aligned CPU allocator: an empty tag type whose behavior is provided by its trait::Malloc / trait::Free
+    //! specializations, which forward to core::alignedAlloc / core::alignedFree.
+    //! \tparam TAlignment An integral constant type; TAlignment::value is the requested alignment.
+    template<typename TAlignment>
+    class AllocCpuAligned : public concepts::Implements<ConceptMemAlloc, AllocCpuAligned<TAlignment>>
+    {
+    };
+
+    namespace trait
+    {
+        //! Malloc trait specialization for AllocCpuAligned: returns storage for sizeElems objects of type T.
+        template<typename T, typename TAlignment>
+        struct Malloc<T, AllocCpuAligned<TAlignment>>
+        {
+            ALPAKA_FN_HOST static auto malloc(
+                AllocCpuAligned<TAlignment> const& /* alloc */,
+                std::size_t const& sizeElems) -> T*
+            {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+                // With a CUDA/HIP back-end enabled, host memory must be page-aligned (4 KiB) before it can be pinned
+                // with `cudaHostRegister`. Older programming guides stated this; the statement was removed later, but
+                // tests with PIConGPU and cuda-memcheck showed that the alignment is still required.
+                // HIP itself only asks for cache-line alignment:
+                // https://rocm-developer-tools.github.io/HIP/group__Memory.html#gab8258f051e1a1f7385f794a15300e674
+                // The page size is 4 KiB on most x86 systems and 64 KiB on OpenPower (`getconf PAGE_SIZE`), so the
+                // larger of the requested alignment and the run-time page size is used.
+                auto const alignment = std::max<std::size_t>(TAlignment::value, cpu::detail::getPageSize());
+#else
+                constexpr std::size_t alignment = TAlignment::value;
+#endif
+                auto const sizeBytes = sizeElems * sizeof(T);
+                return reinterpret_cast<T*>(core::alignedAlloc(alignment, sizeBytes));
+            }
+        };
+
+        //! Free trait specialization for AllocCpuAligned: releases memory obtained from the matching Malloc trait.
+        template<typename T, typename TAlignment>
+        struct Free<T, AllocCpuAligned<TAlignment>>
+        {
+            ALPAKA_FN_HOST static auto free(AllocCpuAligned<TAlignment> const& /* alloc */, T const* const ptr) -> void
+            {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+                auto const alignment = std::max<std::size_t>(TAlignment::value, cpu::detail::getPageSize());
+#else
+                constexpr std::size_t alignment = TAlignment::value;
+#endif
+                core::alignedFree(alignment, const_cast<void*>(static_cast<void const*>(ptr)));
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/mem/alloc/AllocCpuNew.hpp b/include/alpaka/mem/alloc/AllocCpuNew.hpp
new file mode 100644
index 0000000..026d46e
--- /dev/null
+++ b/include/alpaka/mem/alloc/AllocCpuNew.hpp
@@ -0,0 +1,39 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/mem/alloc/Traits.hpp"
+
+namespace alpaka
+{
+    //! The CPU new allocator: an empty tag type; trait::Malloc uses `new T[]` and trait::Free uses `delete[]`.
+    class AllocCpuNew : public concepts::Implements<ConceptMemAlloc, AllocCpuNew>
+    {
+    };
+
+    namespace trait
+    {
+        //! The CPU new allocator memory allocation trait specialization (array of sizeElems default-initialized T).
+        template<typename T>
+        struct Malloc<T, AllocCpuNew>
+        {
+            ALPAKA_FN_HOST static auto malloc(AllocCpuNew const& /* alloc */, std::size_t const& sizeElems) -> T*
+            {
+                return new T[sizeElems]; // must be released with delete[] (see the Free specialization)
+            }
+        };
+
+        //! Free trait specialization for AllocCpuNew: releases an array that was obtained from `new T[]`.
+        template<typename T>
+        struct Free<T, AllocCpuNew>
+        {
+            ALPAKA_FN_HOST static auto free(AllocCpuNew const& /* alloc */, T const* const ptr) -> void
+            {
+                delete[] ptr;
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/mem/alloc/Traits.hpp b/include/alpaka/mem/alloc/Traits.hpp
new file mode 100644
index 0000000..4b9cfcc
--- /dev/null
+++ b/include/alpaka/mem/alloc/Traits.hpp
@@ -0,0 +1,46 @@
+/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/extent/Traits.hpp"
+
+namespace alpaka
+{
+    struct ConceptMemAlloc // Tag type naming the memory allocator concept for concepts::Implements.
+    {
+    };
+
+    //! The allocator traits.
+    namespace trait
+    {
+        //! The memory allocation trait. Specializations provide a static `malloc(alloc, sizeElems) -> T*`.
+        template<typename T, typename TAlloc, typename TSfinae = void>
+        struct Malloc;
+
+        //! The memory free trait. Specializations provide a static `free(alloc, ptr) -> void`.
+        template<typename T, typename TAlloc, typename TSfinae = void>
+        struct Free;
+    } // namespace trait
+
+    //! Allocates sizeElems elements of type T via trait::Malloc. \return The pointer to the allocated memory.
+    template<typename T, typename TAlloc>
+    ALPAKA_FN_HOST auto malloc(TAlloc const& alloc, std::size_t const& sizeElems) -> T*
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptMemAlloc, TAlloc>;
+        return trait::Malloc<T, ImplementationBase>::malloc(alloc, sizeElems);
+    }
+
+    //! Frees the memory identified by the given pointer through the trait::Free specialization for TAlloc.
+    template<typename TAlloc, typename T>
+    ALPAKA_FN_HOST auto free(TAlloc const& alloc, T const* const ptr) -> void
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptMemAlloc, TAlloc>;
+        trait::Free<T, ImplementationBase>::free(alloc, ptr);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/mem/buf/BufCpu.hpp b/include/alpaka/mem/buf/BufCpu.hpp
new file mode 100644
index 0000000..4bfc91c
--- /dev/null
+++ b/include/alpaka/mem/buf/BufCpu.hpp
@@ -0,0 +1,314 @@
+/* Copyright 2022 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiCudaRt.hpp"
+#include "alpaka/core/ApiHipRt.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/core/Vectorize.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/mem/alloc/AllocCpuAligned.hpp"
+#include "alpaka/mem/buf/Traits.hpp"
+#include "alpaka/mem/view/ViewAccessOps.hpp"
+#include "alpaka/meta/DependentFalseType.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //! The CPU memory buffer implementation: stores the device, the extent, the memory pointer and its deleter.
+        template<typename TElem, typename TDim, typename TIdx>
+        class BufCpuImpl final
+        {
+            static_assert(
+                !std::is_const_v<TElem>,
+                "The elem type of the buffer can not be const because the C++ Standard forbids containers of const "
+                "elements!");
+            static_assert(!std::is_const_v<TIdx>, "The idx type of the buffer can not be const!");
+
+        public:
+            template<typename TExtent>
+            ALPAKA_FN_HOST BufCpuImpl(
+                DevCpu dev,
+                TElem* pMem,
+                std::function<void(TElem*)> deleter,
+                TExtent const& extent) noexcept
+                : m_dev(std::move(dev))
+                , m_extentElements(getExtentVecEnd<TDim>(extent))
+                , m_pMem(pMem)
+                , m_deleter(std::move(deleter))
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                static_assert(
+                    TDim::value == Dim<TExtent>::value,
+                    "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
+                    "identical!");
+                static_assert(
+                    std::is_same_v<TIdx, Idx<TExtent>>,
+                    "The idx type of TExtent and the TIdx template parameter have to be identical!");
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " e: " << m_extentElements << " ptr: " << static_cast<void*>(m_pMem)
+                          << std::endl;
+#endif
+            }
+
+            BufCpuImpl(BufCpuImpl&&) = delete; // user-declared move operations also suppress the implicit copies
+            auto operator=(BufCpuImpl&&) -> BufCpuImpl& = delete;
+
+            ALPAKA_FN_HOST ~BufCpuImpl()
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // NOTE: m_pMem is allowed to be a nullptr here; it is handed to the deleter unconditionally.
+                m_deleter(m_pMem); // NOTE(review): an empty std::function would throw std::bad_function_call here
+            }
+
+        public:
+            DevCpu const m_dev; //!< The device passed to the constructor.
+            Vec<TDim, TIdx> const m_extentElements; //!< The extent, as computed by getExtentVecEnd<TDim>(extent).
+            TElem* const m_pMem; //!< The memory handed to m_deleter on destruction.
+            std::function<void(TElem*)> m_deleter; //!< Invoked exactly once, from the destructor.
+        };
+    } // namespace detail
+
+    //! The CPU memory buffer: a handle that shares ownership of a detail::BufCpuImpl through a std::shared_ptr.
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufCpu : public internal::ViewAccessOps<BufCpu<TElem, TDim, TIdx>>
+    {
+    public:
+        template<typename TExtent, typename Deleter>
+        ALPAKA_FN_HOST BufCpu(DevCpu const& dev, TElem* pMem, Deleter deleter, TExtent const& extent)
+            : m_spBufCpuImpl{
+                std::make_shared<detail::BufCpuImpl<TElem, TDim, TIdx>>(dev, pMem, std::move(deleter), extent)}
+        {
+        }
+
+    public:
+        std::shared_ptr<detail::BufCpuImpl<TElem, TDim, TIdx>> m_spBufCpuImpl; //!< Shared by all copies.
+    };
+
+    namespace trait
+    {
+        //! The BufCpu device type trait specialization: a BufCpu belongs to a DevCpu.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct DevType<BufCpu<TElem, TDim, TIdx>>
+        {
+            using type = DevCpu;
+        };
+
+        //! The BufCpu device get trait specialization: returns a copy of the device stored in the implementation.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetDev<BufCpu<TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getDev(BufCpu<TElem, TDim, TIdx> const& buf) -> DevCpu
+            {
+                return buf.m_spBufCpuImpl->m_dev;
+            }
+        };
+
+        //! The BufCpu dimension type trait specialization: the dimensionality is the TDim template argument.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct DimType<BufCpu<TElem, TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! The BufCpu memory element type get trait specialization: the element type is the TElem template argument.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct ElemType<BufCpu<TElem, TDim, TIdx>>
+        {
+            using type = TElem;
+        };
+
+        //! The BufCpu extent get trait specialization: returns a copy of the extent stored in the implementation.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetExtents<BufCpu<TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(BufCpu<TElem, TDim, TIdx> const& buf) const -> Vec<TDim, TIdx>
+            {
+                return buf.m_spBufCpuImpl->m_extentElements;
+            }
+        };
+
+        //! The BufCpu native pointer get trait specialization: const buffers yield a pointer to const elements.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrNative<BufCpu<TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getPtrNative(BufCpu<TElem, TDim, TIdx> const& buf) -> TElem const*
+            {
+                return buf.m_spBufCpuImpl->m_pMem;
+            }
+
+            ALPAKA_FN_HOST static auto getPtrNative(BufCpu<TElem, TDim, TIdx>& buf) -> TElem*
+            {
+                return buf.m_spBufCpuImpl->m_pMem;
+            }
+        };
+
+        //! GetPtrDev trait specialization for a BufCpu that is accessed from a DevCpu.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevCpu>
+        {
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevCpu const& dev)
+                -> TElem const*
+            {
+                // Read-only access: the pointer is only handed out to the device the buffer belongs to, any other
+                // device is rejected with an exception.
+                if(!(dev == getDev(buf)))
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+
+                return buf.m_spBufCpuImpl->m_pMem;
+            }
+
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevCpu const& dev) -> TElem*
+            {
+                // Mutable access: the pointer is only handed out to the device the buffer belongs to, any other
+                // device is rejected with an exception.
+                if(!(dev == getDev(buf)))
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+
+                return buf.m_spBufCpuImpl->m_pMem;
+            }
+        };
+
+        //! BufAlloc trait specialization for DevCpu: allocates aligned host memory and wraps it into a BufCpu.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct BufAlloc<TElem, TDim, TIdx, DevCpu>
+        {
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocBuf(DevCpu const& dev, TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // A user-provided ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT (which must be positive and a power of 2)
+                // takes precedence as the alignment of host allocations. Without it, the alignment is picked for
+                // optimal performance on the target architecture.
+#if defined(ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT)
+                static_assert(
+                    ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT > 0
+                        && ((ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT & (ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT - 1)) == 0),
+                    "If defined, ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT must be a power of 2.");
+                constexpr std::size_t alignment = static_cast<std::size_t>(ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT);
+#else
+                constexpr std::size_t alignment = core::vectorization::defaultAlignment;
+#endif
+                // The allocator carries no state, so a fresh instance is created wherever one is needed.
+                using Allocator = AllocCpuAligned<std::integral_constant<std::size_t, alignment>>;
+                static_assert(std::is_empty_v<Allocator>, "AllocCpuAligned is expected to be stateless");
+                auto const elemCount = static_cast<std::size_t>(getExtentProduct(extent));
+                TElem* const pMem = alpaka::malloc<TElem>(Allocator{}, elemCount);
+                auto freeMem = [](TElem* ptr) { alpaka::free(Allocator{}, ptr); };
+                return BufCpu<TElem, TDim, TIdx>(dev, pMem, std::move(freeMem), extent);
+            }
+        };
+
+        //! The BufCpu stream-ordered memory allocation trait specialization: allocates now, frees through the queue.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct AsyncBufAlloc<TElem, TDim, TIdx, DevCpu>
+        {
+            template<typename TQueue, typename TExtent>
+            ALPAKA_FN_HOST static auto allocAsyncBuf(TQueue queue, TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                static_assert(
+                    std::is_same_v<Dev<TQueue>, DevCpu>,
+                    "The BufCpu buffer can only be used with a queue on a DevCpu device!");
+                DevCpu const& dev = getDev(queue); // queried before the queue is moved into the deleter below
+
+                // If ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT is defined, positive, and a power of 2, use it as the
+                // default alignment for host memory allocations. Otherwise, the alignment is chosen to enable optimal
+                // performance dependent on the target architecture.
+#if defined(ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT)
+                static_assert(
+                    ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT > 0
+                        && ((ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT & (ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT - 1)) == 0),
+                    "If defined, ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT must be a power of 2.");
+                constexpr std::size_t alignment = static_cast<std::size_t>(ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT);
+#else
+                constexpr std::size_t alignment = core::vectorization::defaultAlignment;
+#endif
+                // alpaka::AllocCpuAligned is stateless, so temporaries can be used for allocation and deallocation.
+                using Allocator = AllocCpuAligned<std::integral_constant<std::size_t, alignment>>;
+                static_assert(std::is_empty_v<Allocator>, "AllocCpuAligned is expected to be stateless");
+                auto* memPtr = alpaka::malloc<TElem>(Allocator{}, static_cast<std::size_t>(getExtentProduct(extent)));
+                auto deleter = [l_queue = std::move(queue)](TElem* ptr) mutable
+                {
+                    alpaka::enqueue(
+                        l_queue,
+                        [ptr]()
+                        {
+                            // Executed as a task of the queue: free the memory.
+                            alpaka::free(Allocator{}, ptr);
+                        });
+                };
+
+                return BufCpu<TElem, TDim, TIdx>(dev, memPtr, std::move(deleter), extent);
+            }
+        };
+
+        //! The BufCpu stream-ordered memory allocation capability trait specialization: true for every dimension.
+        template<typename TDim>
+        struct HasAsyncBufSupport<TDim, DevCpu> : public std::true_type
+        {
+        };
+
+        //! The pinned/mapped memory allocation trait specialization for PlatformCpu.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct BufAllocMapped<PlatformCpu, TElem, TDim, TIdx>
+        {
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocMappedBuf(
+                DevCpu const& host,
+                PlatformCpu const& /*platform*/,
+                TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
+            {
+                // Forward to allocBuf: the result is a standard host memory buffer, the platform is not used.
+                return allocBuf<TElem, TIdx>(host, extent);
+            }
+        };
+
+        //! The pinned/mapped memory allocation capability trait specialization: true for PlatformCpu.
+        template<>
+        struct HasMappedBufSupport<PlatformCpu> : public std::true_type
+        {
+        };
+
+        //! The BufCpu offset get trait specialization: the offset of a BufCpu is zero in every dimension.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct GetOffsets<BufCpu<TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(BufCpu<TElem, TDim, TIdx> const&) const -> Vec<TDim, TIdx>
+            {
+                return Vec<TDim, TIdx>::zeros();
+            }
+        };
+
+        //! The BufCpu idx type trait specialization: the index type is the TIdx template argument.
+        template<typename TElem, typename TDim, typename TIdx>
+        struct IdxType<BufCpu<TElem, TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#include "alpaka/mem/buf/cpu/Copy.hpp"
+#include "alpaka/mem/buf/cpu/Set.hpp"
diff --git a/include/alpaka/mem/buf/BufCpuSycl.hpp b/include/alpaka/mem/buf/BufCpuSycl.hpp
new file mode 100644
index 0000000..ab36f8b
--- /dev/null
+++ b/include/alpaka/mem/buf/BufCpuSycl.hpp
@@ -0,0 +1,19 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevCpuSycl.hpp"
+#include "alpaka/mem/buf/BufGenericSycl.hpp"
+#include "alpaka/platform/PlatformCpuSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
+
+namespace alpaka
+{
+    template<typename TElem, typename TDim, typename TIdx> // NOTE(review): confirm a platform type is valid as TTag
+    using BufCpuSycl = BufGenericSycl<TElem, TDim, TIdx, PlatformCpuSycl>; //!< The SYCL CPU memory buffer.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/mem/buf/BufCudaRt.hpp b/include/alpaka/mem/buf/BufCudaRt.hpp
new file mode 100644
index 0000000..a5e0020
--- /dev/null
+++ b/include/alpaka/mem/buf/BufCudaRt.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiCudaRt.hpp"
+#include "alpaka/mem/buf/BufUniformCudaHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+namespace alpaka
+{
+    template<typename TElem, typename TDim, typename TIdx>
+    using BufCudaRt = BufUniformCudaHipRt<ApiCudaRt, TElem, TDim, TIdx>; //!< The CUDA runtime memory buffer.
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/mem/buf/BufFpgaSyclIntel.hpp b/include/alpaka/mem/buf/BufFpgaSyclIntel.hpp
new file mode 100644
index 0000000..562fae9
--- /dev/null
+++ b/include/alpaka/mem/buf/BufFpgaSyclIntel.hpp
@@ -0,0 +1,19 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevFpgaSyclIntel.hpp"
+#include "alpaka/mem/buf/BufGenericSycl.hpp"
+#include "alpaka/platform/PlatformFpgaSyclIntel.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
+
+namespace alpaka
+{
+    template<typename TElem, typename TDim, typename TIdx> // NOTE(review): confirm a platform type is valid as TTag
+    using BufFpgaSyclIntel = BufGenericSycl<TElem, TDim, TIdx, PlatformFpgaSyclIntel>; //!< The SYCL FPGA buffer.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp
new file mode 100644
index 0000000..9beb16c
--- /dev/null
+++ b/include/alpaka/mem/buf/BufGenericSycl.hpp
@@ -0,0 +1,272 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Sycl.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/mem/buf/BufCpu.hpp"
+#include "alpaka/mem/buf/Traits.hpp"
+#include "alpaka/mem/view/ViewAccessOps.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <memory>
+#include <type_traits>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    //! The SYCL memory buffer: stores the device, the extent and a shared pointer owning the memory.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    class BufGenericSycl : public internal::ViewAccessOps<BufGenericSycl<TElem, TDim, TIdx, TTag>>
+    {
+    public:
+        static_assert(
+            !std::is_const_v<TElem>,
+            "The elem type of the buffer can not be const because the C++ Standard forbids containers of const "
+            "elements!");
+        static_assert(!std::is_const_v<TIdx>, "The idx type of the buffer can not be const!");
+
+        //! Constructor: takes ownership of pMem; deleter is called with pMem when the last copy is destroyed.
+        template<typename TExtent, typename Deleter>
+        BufGenericSycl(DevGenericSycl<TTag> const& dev, TElem* const pMem, Deleter deleter, TExtent const& extent)
+            : m_dev{dev}
+            , m_extentElements{getExtentVecEnd<TDim>(extent)}
+            , m_spMem(pMem, std::move(deleter))
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            static_assert(
+                TDim::value == Dim<TExtent>::value,
+                "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
+                "identical!");
+
+            static_assert(
+                std::is_same_v<TIdx, Idx<TExtent>>,
+                "The idx type of TExtent and the TIdx template parameter have to be identical!");
+        }
+
+        DevGenericSycl<TTag> m_dev; //!< The device passed to the constructor.
+        Vec<TDim, TIdx> m_extentElements; //!< The extent, as computed by getExtentVecEnd<TDim>(extent).
+        std::shared_ptr<TElem> m_spMem; //!< The memory together with its deleter; shared by all copies.
+    };
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    //! The BufGenericSycl device type trait specialization: the device type is DevGenericSycl<TTag>.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct DevType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
+    {
+        using type = DevGenericSycl<TTag>;
+    };
+
+    //! GetDev trait specialization for BufGenericSycl: yields a copy of the device stored in the buffer.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetDev<BufGenericSycl<TElem, TDim, TIdx, TTag>>
+    {
+        static auto getDev(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf) -> DevGenericSycl<TTag>
+        {
+            return buf.m_dev;
+        }
+    };
+
+    //! The BufGenericSycl dimension type trait specialization: the dimensionality is the TDim template argument.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct DimType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
+    {
+        using type = TDim;
+    };
+
+    //! The BufGenericSycl memory element type get trait specialization: the element type is TElem.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct ElemType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
+    {
+        using type = TElem;
+    };
+
+    //! GetExtents trait specialization for BufGenericSycl: yields a copy of the extent stored in the buffer.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetExtents<BufGenericSycl<TElem, TDim, TIdx, TTag>>
+    {
+        auto operator()(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf) const -> Vec<TDim, TIdx>
+        {
+            return buf.m_extentElements;
+        }
+    };
+
+    //! The BufGenericSycl native pointer get trait specialization: const buffers yield a pointer to const elements.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetPtrNative<BufGenericSycl<TElem, TDim, TIdx, TTag>>
+    {
+        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf) -> TElem const*
+        {
+            return buf.m_spMem.get();
+        }
+
+        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TTag>& buf) -> TElem*
+        {
+            return buf.m_spMem.get();
+        }
+    };
+
+    //! GetPtrDev trait specialization for a BufGenericSycl that is accessed from a DevGenericSycl of the same tag.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetPtrDev<BufGenericSycl<TElem, TDim, TIdx, TTag>, DevGenericSycl<TTag>>
+    {
+        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf, DevGenericSycl<TTag> const& dev)
+            -> TElem const*
+        {
+            // Read-only access: the pointer is only handed out to the device the buffer belongs to, any other
+            // device is rejected with an exception.
+            if(!(dev == getDev(buf)))
+            {
+                throw std::runtime_error("The buffer is not accessible from the given device!");
+            }
+
+            return buf.m_spMem.get();
+        }
+
+        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TTag>& buf, DevGenericSycl<TTag> const& dev) -> TElem*
+        {
+            // Mutable access: the pointer is only handed out to the device the buffer belongs to, any other
+            // device is rejected with an exception.
+            if(!(dev == getDev(buf)))
+            {
+                throw std::runtime_error("The buffer is not accessible from the given device!");
+            }
+
+            return buf.m_spMem.get();
+        }
+    };
+
+    //! The SYCL memory allocation trait specialization: allocates with sycl::malloc_device, frees with sycl::free.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct BufAlloc<TElem, TDim, TIdx, DevGenericSycl<TTag>>
+    {
+        template<typename TExtent>
+        static auto allocBuf(DevGenericSycl<TTag> const& dev, TExtent const& extent)
+            -> BufGenericSycl<TElem, TDim, TIdx, TTag>
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            if constexpr(TDim::value == 0) // debug output only; the printed pitch always equals the row width in bytes
+                std::cout << __func__ << " ewb: " << sizeof(TElem) << '\n';
+            else if constexpr(TDim::value == 1)
+            {
+                auto const width = getWidth(extent);
+
+                auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
+                std::cout << __func__ << " ew: " << width << " ewb: " << widthBytes << '\n';
+            }
+            else if constexpr(TDim::value == 2)
+            {
+                auto const width = getWidth(extent);
+                auto const height = getHeight(extent);
+
+                auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
+                std::cout << __func__ << " ew: " << width << " eh: " << height << " ewb: " << widthBytes
+                          << " pitch: " << widthBytes << '\n';
+            }
+            else if constexpr(TDim::value == 3)
+            {
+                auto const width = getWidth(extent);
+                auto const height = getHeight(extent);
+                auto const depth = getDepth(extent);
+
+                auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
+                std::cout << __func__ << " ew: " << width << " eh: " << height << " ed: " << depth
+                          << " ewb: " << widthBytes << " pitch: " << widthBytes << '\n';
+            }
+#    endif
+
+            auto const& [nativeDev, nativeContext] = dev.getNativeHandle();
+            TElem* memPtr = sycl::malloc_device<TElem>(
+                static_cast<std::size_t>(getExtentProduct(extent)),
+                nativeDev,
+                nativeContext); // NOTE(review): the returned pointer is not checked for nullptr
+            auto deleter = [ctx = nativeContext](TElem* ptr) { sycl::free(ptr, ctx); }; // keeps its own context copy
+
+            return BufGenericSycl<TElem, TDim, TIdx, TTag>(dev, memPtr, std::move(deleter), extent);
+        }
+    };
+
+    //! The BufGenericSycl stream-ordered memory allocation capability trait specialization: not supported.
+    template<typename TDim, typename TTag>
+    struct HasAsyncBufSupport<TDim, DevGenericSycl<TTag>> : std::false_type
+    {
+    };
+
+    //! The BufGenericSycl offset get trait specialization: the offset of a buffer is zero in every dimension.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetOffsets<BufGenericSycl<TElem, TDim, TIdx, TTag>>
+    {
+        auto operator()(BufGenericSycl<TElem, TDim, TIdx, TTag> const&) const -> Vec<TDim, TIdx>
+        {
+            return Vec<TDim, TIdx>::zeros();
+        }
+    };
+
+    //! The pinned/mapped memory allocation trait specialization for the SYCL devices; the result is a BufCpu.
+    template<typename TTag, typename TElem, typename TDim, typename TIdx>
+    struct BufAllocMapped<PlatformGenericSycl<TTag>, TElem, TDim, TIdx>
+    {
+        template<typename TExtent>
+        static auto allocMappedBuf(
+            DevCpu const& host,
+            PlatformGenericSycl<TTag> const& platform,
+            TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            // Allocate SYCL page-locked memory on the host, mapped into the SYCL platform's address space and
+            // accessible to all devices in the SYCL platform. The deleter captures a copy of the context.
+            auto ctx = platform.syclContext();
+            TElem* memPtr = sycl::malloc_host<TElem>(static_cast<std::size_t>(getExtentProduct(extent)), ctx);
+            auto deleter = [ctx](TElem* ptr) { sycl::free(ptr, ctx); };
+
+            return BufCpu<TElem, TDim, TIdx>(host, memPtr, std::move(deleter), extent);
+        }
+    };
+
+    //! The pinned/mapped memory allocation capability trait specialization: true for every SYCL platform.
+    template<typename TTag>
+    struct HasMappedBufSupport<PlatformGenericSycl<TTag>> : public std::true_type
+    {
+    };
+
+    //! The BufGenericSycl idx type trait specialization: the index type is the TIdx template argument.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct IdxType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
+    {
+        using type = TIdx;
+    };
+
+    //! The BufCpu pointer on SYCL device get trait specialization: returns the native pointer, the device is unused.
+    template<typename TElem, typename TDim, typename TIdx, typename TTag>
+    struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevGenericSycl<TTag>>
+    {
+        static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevGenericSycl<TTag> const&) -> TElem const*
+        {
+            return getPtrNative(buf); // NOTE(review): no check that buf is usable from the given device
+        }
+
+        static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevGenericSycl<TTag> const&) -> TElem*
+        {
+            return getPtrNative(buf); // NOTE(review): no check that buf is usable from the given device
+        }
+    };
+} // namespace alpaka::trait
+
+#    include "alpaka/mem/buf/sycl/Copy.hpp"
+#    include "alpaka/mem/buf/sycl/Set.hpp"
+
+#endif
diff --git a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
new file mode 100644
index 0000000..5597f70
--- /dev/null
+++ b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
@@ -0,0 +1,19 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevGpuSyclIntel.hpp"
+#include "alpaka/mem/buf/BufGenericSycl.hpp"
+#include "alpaka/platform/PlatformGpuSyclIntel.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
+// NOTE(review): BufGenericSycl's traits name their last parameter TTag; confirm a platform type is intended here.
+namespace alpaka
+{
+    template<typename TElem, typename TDim, typename TIdx>
+    using BufGpuSyclIntel = BufGenericSycl<TElem, TDim, TIdx, PlatformGpuSyclIntel>;
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/mem/buf/BufHipRt.hpp b/include/alpaka/mem/buf/BufHipRt.hpp
new file mode 100644
index 0000000..4a59bc4
--- /dev/null
+++ b/include/alpaka/mem/buf/BufHipRt.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiHipRt.hpp"
+#include "alpaka/mem/buf/BufUniformCudaHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+// BufHipRt is the shared CUDA/HIP buffer template instantiated with the HIP runtime API (ApiHipRt).
+namespace alpaka
+{
+    template<typename TElem, typename TDim, typename TIdx>
+    using BufHipRt = BufUniformCudaHipRt<ApiHipRt, TElem, TDim, TIdx>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp b/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
new file mode 100644
index 0000000..826edab
--- /dev/null
+++ b/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
@@ -0,0 +1,422 @@
+/* Copyright 2023 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan,
+ *                Bernhard Manfred Gruber, Antonio Di Pilato
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/mem/buf/Traits.hpp"
+#include "alpaka/mem/view/ViewAccessOps.hpp"
+#include "alpaka/meta/DependentFalseType.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <cstddef>
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    // Forward declarations.
+    struct ApiCudaRt;
+    struct ApiHipRt;
+
+    template<typename TElem, typename TDim, typename TIdx>
+    class BufCpu;
+
+    namespace detail
+    {
+        //! Row-pitch storage; below two dimensions there is no row pitch, so the constructor discards its argument.
+        template<typename TDim, typename SFINAE = void>
+        struct PitchHolder
+        {
+            explicit PitchHolder(std::size_t /*rowPitchInBytes*/) {}
+        };
+
+        //! For two or more dimensions the row pitch in bytes is kept as a plain aggregate member.
+        template<typename TDim>
+        struct PitchHolder<TDim, std::enable_if_t<TDim::value >= 2>>
+        {
+            std::size_t m_rowPitchInBytes;
+        };
+    } // namespace detail
+
+    //! The CUDA/HIP memory buffer: stores its device, its extents and a std::shared_ptr to the memory.
+    template<typename TApi, typename TElem, typename TDim, typename TIdx>
+    struct BufUniformCudaHipRt
+        : detail::PitchHolder<TDim>
+        , internal::ViewAccessOps<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
+    {
+        static_assert(!std::is_const_v<TElem>, "The elem type of the buffer must not be const");
+        static_assert(!std::is_const_v<TIdx>, "The idx type of the buffer must not be const!");
+
+        //! Constructor: pMem is released through deleter; pitchBytes is only stored for TDim >= 2.
+        template<typename TExtent, typename Deleter>
+        ALPAKA_FN_HOST BufUniformCudaHipRt(
+            DevUniformCudaHipRt<TApi> const& dev,
+            TElem* const pMem,
+            Deleter deleter,
+            TExtent const& extent,
+            std::size_t pitchBytes)
+            : detail::PitchHolder<TDim>{pitchBytes}
+            , m_dev(dev)
+            , m_extentElements(getExtents(extent))
+            , m_spMem(pMem, std::move(deleter))
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            static_assert(
+                TDim::value == alpaka::Dim<TExtent>::value,
+                "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
+                "identical!");
+            static_assert(
+                std::is_same_v<TIdx, alpaka::Idx<TExtent>>,
+                "The idx type of TExtent and the TIdx template parameter have to be identical!");
+        }
+
+        DevUniformCudaHipRt<TApi> m_dev;
+        Vec<TDim, TIdx> m_extentElements;
+        std::shared_ptr<TElem> m_spMem;
+    };
+
+    namespace trait
+    {
+        //! The BufUniformCudaHipRt device type trait specialization (DevUniformCudaHipRt of the same API).
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct DevType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
+        {
+            using type = DevUniformCudaHipRt<TApi>;
+        };
+
+        //! The BufUniformCudaHipRt device get trait specialization: returns a copy of the stored device.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct GetDev<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getDev(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const& buf)
+                -> DevUniformCudaHipRt<TApi>
+            {
+                return buf.m_dev;
+            }
+        };
+
+        //! The BufUniformCudaHipRt dimension getter trait specialization: forwards the TDim template argument.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct DimType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! The BufUniformCudaHipRt memory element type get trait specialization: forwards TElem.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct ElemType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
+        {
+            using type = TElem;
+        };
+
+        //! The BufUniformCudaHipRt extent get trait specialization: returns a copy of the stored extent vector.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct GetExtents<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const& buffer) const
+            {
+                return buffer.m_extentElements;
+            }
+        };
+
+        //! The BufUniformCudaHipRt native pointer get trait specialization: yields m_spMem.get().
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct GetPtrNative<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getPtrNative(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const& buf)
+                -> TElem const*
+            {
+                return buf.m_spMem.get();
+            }
+
+            ALPAKA_FN_HOST static auto getPtrNative(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>& buf) -> TElem*
+            {
+                return buf.m_spMem.get();
+            }
+        };
+
+        //! Trait specialization handing out the device-side pointer of a BufUniformCudaHipRt.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct GetPtrDev<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>, DevUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto getPtrDev(
+                BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const& buf,
+                DevUniformCudaHipRt<TApi> const& dev) -> TElem const*
+            {
+                // Only the device stored in the buffer may access it; any other device is rejected.
+                if(!(dev == getDev(buf)))
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+
+                // Matching device: the stored pointer is returned unchanged.
+                return buf.m_spMem.get();
+            }
+
+            ALPAKA_FN_HOST static auto getPtrDev(
+                BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>& buf,
+                DevUniformCudaHipRt<TApi> const& dev) -> TElem*
+            {
+                // Only the device stored in the buffer may access it; any other device is rejected.
+                if(!(dev == getDev(buf)))
+                {
+                    throw std::runtime_error("The buffer is not accessible from the given device!");
+                }
+
+                // Matching device: the stored pointer is returned unchanged.
+                return buf.m_spMem.get();
+            }
+        };
+
+        //! The BufUniformCudaHipRt pitches get trait specialization.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct GetPitchesInBytes<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const& buf) const
+                -> Vec<TDim, TIdx>
+            {
+                Vec<TDim, TIdx> pitches{};
+                if constexpr(TDim::value > 0)
+                    pitches.back() = sizeof(TElem);
+                if constexpr(TDim::value > 1)
+                {
+                    // Rows are separated by the pitch stored at construction; outer pitches build on it.
+                    pitches[TDim::value - 2] = static_cast<TIdx>(buf.m_rowPitchInBytes);
+                    for(TIdx i = TDim::value - 2; i > 0; i--)
+                        pitches[i - 1] = buf.m_extentElements[i] * pitches[i];
+                }
+                return pitches;
+            }
+        };
+
+        //! The CUDA/HIP memory allocation trait specialization (malloc, mallocPitch or malloc3D depending on Dim).
+        template<typename TApi, typename TElem, typename Dim, typename TIdx>
+        struct BufAlloc<TElem, Dim, TIdx, DevUniformCudaHipRt<TApi>>
+        {
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocBuf(DevUniformCudaHipRt<TApi> const& dev, TExtent const& extent)
+                -> BufUniformCudaHipRt<TApi, TElem, Dim, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
+                // NOTE(review): Dim > 3 matches no branch below; memPtr then stays null even for a non-empty extent.
+                void* memPtr = nullptr;
+                std::size_t rowPitchInBytes = 0u;
+                if(getExtentProduct(extent) != 0)
+                {
+                    if constexpr(Dim::value == 0)
+                    {
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::malloc(&memPtr, sizeof(TElem)));
+                    }
+                    else if constexpr(Dim::value == 1)
+                    {
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                            TApi::malloc(&memPtr, static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem)));
+                    }
+                    else if constexpr(Dim::value == 2)
+                    {
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::mallocPitch(
+                            &memPtr,
+                            &rowPitchInBytes,
+                            static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem),
+                            static_cast<std::size_t>(getHeight(extent))));
+                    }
+                    else if constexpr(Dim::value == 3)
+                    {
+                        typename TApi::Extent_t const extentVal = TApi::makeExtent(
+                            static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem),
+                            static_cast<std::size_t>(getHeight(extent)),
+                            static_cast<std::size_t>(getDepth(extent)));
+                        typename TApi::PitchedPtr_t pitchedPtrVal;
+                        pitchedPtrVal.ptr = nullptr;
+                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::malloc3D(&pitchedPtrVal, extentVal));
+                        memPtr = pitchedPtrVal.ptr;
+                        rowPitchInBytes = pitchedPtrVal.pitch;
+                    }
+                }
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__;
+                if constexpr(Dim::value >= 1)
+                    std::cout << " ew: " << getWidth(extent);
+                if constexpr(Dim::value >= 2)
+                    std::cout << " eh: " << getHeight(extent);
+                if constexpr(Dim::value >= 3)
+                    std::cout << " ed: " << getDepth(extent);
+                std::cout << " ptr: " << memPtr;
+                if constexpr(Dim::value >= 2)
+                    std::cout << " rowpitch: " << rowPitchInBytes;
+                std::cout << std::endl;
+#    endif
+                return {
+                    dev,
+                    reinterpret_cast<TElem*>(memPtr),
+                    [](TElem* ptr) { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::free(ptr)); },
+                    extent,
+                    rowPitchInBytes};
+            }
+        };
+
+        //! Stream-ordered (queue-ordered) buffer allocation for CUDA/HIP devices; limited to 0D and 1D buffers.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct AsyncBufAlloc<TElem, TDim, TIdx, DevUniformCudaHipRt<TApi>>
+        {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            static_assert(
+                std::is_same_v<TApi, ApiCudaRt> && TApi::version >= BOOST_VERSION_NUMBER(11, 2, 0),
+                "Support for stream-ordered memory buffers requires CUDA 11.2 or higher.");
+#    endif
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+            static_assert(
+                std::is_same_v<TApi, ApiHipRt> && TApi::version >= BOOST_VERSION_NUMBER(5, 3, 0),
+                "Support for stream-ordered memory buffers requires HIP/ROCm 5.3 or higher.");
+#    endif
+            static_assert(
+                TDim::value <= 1,
+                "CUDA/HIP devices support only one-dimensional stream-ordered memory buffers.");
+
+            template<typename TQueue, typename TExtent>
+            ALPAKA_FN_HOST static auto allocAsyncBuf(TQueue queue, [[maybe_unused]] TExtent const& extent)
+                -> BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                static_assert(TDim::value == Dim<TExtent>::value, "extent must have the same dimension as the buffer");
+                auto const width = getExtentProduct(extent); // handles 1D and 0D buffers
+                auto const sizeInBytes = static_cast<std::size_t>(width) * sizeof(TElem);
+
+                auto const& dev = getDev(queue);
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
+                void* memPtr = nullptr;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::mallocAsync(&memPtr, sizeInBytes, queue.getNativeHandle()));
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                std::cout << __func__ << " ew: " << width << " ptr: " << memPtr << std::endl;
+#    endif
+                // The deleter takes over the queue so that freeAsync is issued on the queue used for allocation.
+                // The byte size is also passed as pitch; buffers with TDim <= 1 do not store it.
+                return {
+                    dev,
+                    reinterpret_cast<TElem*>(memPtr),
+                    [q = std::move(queue)](TElem* ptr)
+                    { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::freeAsync(ptr, q.getNativeHandle())); },
+                    extent,
+                    sizeInBytes};
+            }
+        };
+
+        //! The CUDA/HIP stream-ordered allocation capability trait: true for 0D/1D on CUDA >= 11.2 or HIP >= 5.3.
+        template<typename TApi, typename TDim>
+        struct HasAsyncBufSupport<TDim, DevUniformCudaHipRt<TApi>>
+            : std::bool_constant<
+                  TDim::value <= 1
+                  && (
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                      std::is_same_v<TApi, ApiCudaRt> && TApi::version >= BOOST_VERSION_NUMBER(11, 2, 0)
+#    elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+                      std::is_same_v<TApi, ApiHipRt> && TApi::version >= BOOST_VERSION_NUMBER(5, 3, 0)
+#    else
+                      false
+#    endif
+                          )>
+        {
+        };
+
+        //! Pinned/mapped host memory allocation for the CUDA/HIP platforms (trait specialization).
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct BufAllocMapped<PlatformUniformCudaHipRt<TApi>, TElem, TDim, TIdx>
+        {
+            template<typename TExtent>
+            ALPAKA_FN_HOST static auto allocMappedBuf(
+                DevCpu const& host,
+                PlatformUniformCudaHipRt<TApi> const& /*platform*/,
+                TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Request page-locked host memory with the "mapped" and "portable" flags, i.e. mapped into the
+                // CUDA/HIP address space and accessible to all CUDA/HIP devices.
+                auto const sizeInBytes = sizeof(TElem) * static_cast<std::size_t>(getExtentProduct(extent));
+                TElem* memPtr = nullptr;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostMalloc(
+                    reinterpret_cast<void**>(&memPtr),
+                    sizeInBytes,
+                    TApi::hostMallocMapped | TApi::hostMallocPortable));
+                auto deleter = [](TElem* ptr) { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::hostFree(ptr)); };
+                return BufCpu<TElem, TDim, TIdx>(host, memPtr, std::move(deleter), extent);
+            }
+        };
+
+        //! Marks every CUDA/HIP platform as able to hand out pinned/mapped host buffers.
+        template<typename TApi>
+        struct HasMappedBufSupport<PlatformUniformCudaHipRt<TApi>> : std::true_type
+        {
+        };
+
+        //! The BufUniformCudaHipRt offset get trait specialization: the offset is zero in every dimension.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct GetOffsets<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const&) const
+                -> Vec<TDim, TIdx>
+            {
+                return Vec<TDim, TIdx>::zeros();
+            }
+        };
+
+        //! The BufUniformCudaHipRt idx type trait specialization: forwards the TIdx template argument.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct IdxType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        //! Trait specialization asking the CUDA/HIP runtime for the device-side pointer of a host BufCpu.
+        template<typename TApi, typename TElem, typename TDim, typename TIdx>
+        struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto getPtrDev(
+                BufCpu<TElem, TDim, TIdx> const& buf,
+                DevUniformCudaHipRt<TApi> const&) -> TElem const*
+            {
+                // TODO: Check if the memory is mapped at all!
+                TElem* devPtr = nullptr;
+
+                // const is cast away before the host pointer is handed to the runtime call.
+                auto* const hostPtr = const_cast<void*>(reinterpret_cast<void const*>(getPtrNative(buf)));
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostGetDevicePointer(&devPtr, hostPtr, 0));
+
+                return devPtr;
+            }
+
+            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevUniformCudaHipRt<TApi> const&)
+                -> TElem*
+            {
+                // TODO: Check if the memory is mapped at all!
+                TElem* devPtr = nullptr;
+
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostGetDevicePointer(&devPtr, getPtrNative(buf), 0));
+
+                return devPtr;
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#    include "alpaka/mem/buf/uniformCudaHip/Copy.hpp"
+#    include "alpaka/mem/buf/uniformCudaHip/Set.hpp"
+
+#endif
diff --git a/include/alpaka/mem/buf/SetKernel.hpp b/include/alpaka/mem/buf/SetKernel.hpp
new file mode 100644
index 0000000..229fce9
--- /dev/null
+++ b/include/alpaka/mem/buf/SetKernel.hpp
@@ -0,0 +1,58 @@
+/* Copyright 2022 Jeffrey Kelling, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/idx/Accessors.hpp"
+#include "alpaka/idx/MapIdx.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/mem/buf/Traits.hpp"
+#include "alpaka/meta/Fold.hpp"
+
+namespace alpaka
+{
+    //! Byte-wise N-dimensional memory set kernel, usable on any device.
+    class MemSetKernel
+    {
+    public:
+        //! The kernel entry point.
+        //!
+        //! All but the last element of the per-thread element extent must be one.
+        //!
+        //! \tparam TAcc The accelerator environment to be executed on.
+        //! \tparam TExtent extent type.
+        //! \tparam TPitch type of the pitch vector.
+        //! \param acc The accelerator to be executed on.
+        //! \param val value to set.
+        //! \param dst target mem ptr.
+        //! \param extent area to set.
+        //! \param pitch pitches used by mapIdxPitchBytes to turn an index into a byte offset from dst.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TAcc, typename TExtent, typename TPitch>
+        ALPAKA_FN_ACC auto operator()(
+            TAcc const& acc,
+            std::uint8_t const val,
+            std::uint8_t* dst,
+            TExtent extent,
+            TPitch pitch) const -> void
+        {
+            using Idx = typename alpaka::trait::IdxType<TExtent>::type;
+            auto const threadIdxInGrid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+            auto const elemsPerThread = alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc);
+            auto const firstElem = getIdxThreadFirstElem(acc, threadIdxInGrid, elemsPerThread);
+
+            // Threads whose first element lies outside the extent in any dimension write nothing.
+            if(!(firstElem < extent).foldrAll(std::logical_and<bool>()))
+                return;
+            // Clamp the run along the last dimension so that it does not cross the extent.
+            constexpr auto lastDim = Dim<TAcc>::value - 1;
+            auto const count
+                = std::min(elemsPerThread[lastDim], static_cast<Idx>(extent[lastDim] - firstElem[lastDim]));
+            auto const first = mapIdxPitchBytes<1u, Dim<TAcc>::value>(firstElem, pitch)[0];
+            auto const last = first + count;
+            for(auto offset = first; offset < last; ++offset)
+                dst[offset] = val;
+        }
+    };
+} // namespace alpaka
diff --git a/include/alpaka/mem/buf/Traits.hpp b/include/alpaka/mem/buf/Traits.hpp
new file mode 100644
index 0000000..e29cf5b
--- /dev/null
+++ b/include/alpaka/mem/buf/Traits.hpp
@@ -0,0 +1,192 @@
+/* Copyright 2023 Alexander Matthes, Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan,
+ *                Christian Kaever
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+namespace alpaka
+{
+    //! The CPU device handle.
+    class DevCpu;
+
+    //! The buffer traits.
+    namespace trait
+    {
+        //! The memory buffer type trait; specializations name the buffer type in a nested ::type.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx, typename TSfinae = void>
+        struct BufType;
+
+        //! The memory allocator trait; specializations provide a static allocBuf(dev, extent).
+        template<typename TElem, typename TDim, typename TIdx, typename TDev, typename TSfinae = void>
+        struct BufAlloc;
+
+        //! The stream-ordered memory allocator trait; specializations provide allocAsyncBuf(queue, extent).
+        template<typename TElem, typename TDim, typename TIdx, typename TDev, typename TSfinae = void>
+        struct AsyncBufAlloc;
+
+        //! The stream-ordered memory allocation capability trait; false unless specialized.
+        template<typename TDim, typename TDev>
+        struct HasAsyncBufSupport : public std::false_type
+        {
+        };
+
+        //! The pinned/mapped memory allocator trait; specializations provide allocMappedBuf(host, platform, extent).
+        template<typename TPlatform, typename TElem, typename TDim, typename TIdx>
+        struct BufAllocMapped;
+
+        //! The pinned/mapped memory allocation capability trait; false unless specialized.
+        template<typename TPlatform>
+        struct HasMappedBufSupport : public std::false_type
+        {
+        };
+    } // namespace trait
+
+    //! The memory buffer type trait alias template to remove the ::type; TDev is first mapped through alpaka::Dev.
+    template<typename TDev, typename TElem, typename TDim, typename TIdx>
+    using Buf = typename trait::BufType<alpaka::Dev<TDev>, TElem, TDim, TIdx>::type;
+
+    //! Allocates memory on the given device by forwarding to trait::BufAlloc.
+    //!
+    //! \tparam TElem The element type of the returned buffer.
+    //! \tparam TIdx The linear index type of the buffer.
+    //! \tparam TExtent The extent type of the buffer; Dim<TExtent> is used as the buffer dimensionality.
+    //! \tparam TDev The type of device the buffer is allocated on.
+    //! \param dev The device to allocate the buffer on.
+    //! \param extent The extent of the buffer.
+    //! \return The newly allocated buffer.
+    template<typename TElem, typename TIdx, typename TExtent, typename TDev>
+    ALPAKA_FN_HOST auto allocBuf(TDev const& dev, TExtent const& extent = TExtent())
+    {
+        return trait::BufAlloc<TElem, Dim<TExtent>, TIdx, TDev>::allocBuf(dev, extent);
+    }
+
+    //! Allocates stream-ordered memory on the device of the given queue by forwarding to trait::AsyncBufAlloc.
+    //!
+    //! \tparam TElem The element type of the returned buffer.
+    //! \tparam TIdx The linear index type of the buffer.
+    //! \tparam TExtent The extent type of the buffer; Dim<TExtent> is used as the buffer dimensionality.
+    //! \tparam TQueue The type of queue used to order the buffer allocation.
+    //! \param queue The queue used to order the buffer allocation (taken by value).
+    //! \param extent The extent of the buffer.
+    //! \return The newly allocated buffer.
+    template<typename TElem, typename TIdx, typename TExtent, typename TQueue>
+    ALPAKA_FN_HOST auto allocAsyncBuf(TQueue queue, TExtent const& extent = TExtent())
+    {
+        return trait::AsyncBufAlloc<TElem, Dim<TExtent>, TIdx, alpaka::Dev<TQueue>>::allocAsyncBuf(queue, extent);
+    }
+
+    /* TODO: Remove this pragma block once support for clang versions <= 13 is removed. These versions are unable to
+       figure out that the template parameters are attached to a C++17 inline variable. */
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdocumentation"
+#endif
+    //! Checks if the given device can allocate a stream-ordered memory buffer of the given dimensionality.
+    //! Forwards to trait::HasAsyncBufSupport, whose template parameters are ordered <TDim, TDev>.
+    //! \tparam TDev The type of device to allocate the buffer on.
+    //! \tparam TDim The dimensionality of the buffer to allocate.
+    template<typename TDev, typename TDim>
+    inline constexpr bool hasAsyncBufSupport = trait::HasAsyncBufSupport<TDim, TDev>::value;
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+
+    //! If supported, allocates stream-ordered memory on the given queue and the associated device.
+    //! Otherwise, allocates regular memory on the device associated to the queue.
+    //! Please note that stream-ordered and regular memory have different semantics:
+    //! this function is provided for convenience in the cases where the difference is not relevant,
+    //! and the stream-ordered memory is only used as a performance optimisation.
+    //!
+    //! \tparam TElem The element type of the returned buffer.
+    //! \tparam TIdx The linear index type of the buffer.
+    //! \tparam TExtent The extent type of the buffer.
+    //! \tparam TQueue The type of queue used to order the buffer allocation.
+    //! \param queue The queue used to order the buffer allocation.
+    //! \param extent The extent of the buffer.
+    //! \return The newly allocated buffer.
+    template<typename TElem, typename TIdx, typename TExtent, typename TQueue>
+    ALPAKA_FN_HOST auto allocAsyncBufIfSupported(TQueue queue, TExtent const& extent = TExtent())
+    {
+        if constexpr(hasAsyncBufSupport<alpaka::Dev<TQueue>, Dim<TExtent>>)
+        {
+            return allocAsyncBuf<TElem, TIdx>(queue, extent);
+        }
+        else
+        {
+            return allocBuf<TElem, TIdx>(getDev(queue), extent);
+        }
+        // NOTE(review): both branches return; presumably this only placates compilers that miss that - confirm.
+        ALPAKA_UNREACHABLE(allocBuf<TElem, TIdx>(getDev(queue), extent));
+    }
+
+    //! Allocates pinned/mapped host memory, accessible by all devices in the given platform.
+    //! \tparam TElem The element type of the returned buffer.
+    //! \tparam TIdx The linear index type of the buffer.
+    //! \tparam TExtent The extent type of the buffer.
+    //! \tparam TPlatform The platform from which the buffer is accessible.
+    //! \param host The host device to allocate the buffer on.
+    //! \param platform The platform whose devices are meant to access the buffer.
+    //! \param extent The extent of the buffer.
+    //! \return The newly allocated buffer.
+    template<typename TElem, typename TIdx, typename TExtent, typename TPlatform>
+    ALPAKA_FN_HOST auto allocMappedBuf(
+        DevCpu const& host,
+        TPlatform const& platform,
+        TExtent const& extent = TExtent())
+    {
+        return trait::BufAllocMapped<TPlatform, TElem, Dim<TExtent>, TIdx>::allocMappedBuf(host, platform, extent);
+    }
+
+    /* TODO: Remove this pragma block once support for clang versions <= 13 is removed. These versions are unable to
+       figure out that the template parameters are attached to a C++17 inline variable. */
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdocumentation"
+#endif
+    //! Checks if the host can allocate pinned/mapped host memory that is accessible by all devices in the platform.
+    //!
+    //! \tparam TPlatform The platform from which the buffer is accessible.
+    template<typename TPlatform>
+    inline constexpr bool hasMappedBufSupport = trait::HasMappedBufSupport<TPlatform>::value;
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+
+    //! If supported, allocates pinned/mapped host memory, accessible by all devices in the given platform.
+    //! Otherwise, allocates regular host memory.
+    //! Please note that pinned/mapped and regular memory may have different semantics:
+    //! this function is provided for convenience in the cases where the difference is not relevant,
+    //! and the pinned/mapped memory is only used as a performance optimisation.
+    //! \tparam TElem The element type of the returned buffer.
+    //! \tparam TIdx The linear index type of the buffer.
+    //! \tparam TExtent The extent type of the buffer.
+    //! \tparam TPlatform The platform from which the buffer is accessible.
+    //! \param host The host device to allocate the buffer on.
+    //! \param platform The platform whose devices are meant to access the buffer; only used if supported.
+    //! \param extent The extent of the buffer.
+    //! \return The newly allocated buffer.
+    template<typename TElem, typename TIdx, typename TExtent, typename TPlatform>
+    ALPAKA_FN_HOST auto allocMappedBufIfSupported(
+        DevCpu const& host,
+        TPlatform const& platform,
+        TExtent const& extent = TExtent())
+    {
+        using Platform = alpaka::Platform<TPlatform>;
+        if constexpr(hasMappedBufSupport<Platform>)
+        {
+            return allocMappedBuf<TElem, TIdx>(host, platform, extent);
+        }
+        else
+        {
+            return allocBuf<TElem, TIdx>(host, extent);
+        }
+        // NOTE(review): both branches return; presumably this only placates compilers that miss that - confirm.
+        ALPAKA_UNREACHABLE(allocBuf<TElem, TIdx>(host, extent));
+    }
+} // namespace alpaka
diff --git a/include/alpaka/mem/buf/cpu/Copy.hpp b/include/alpaka/mem/buf/cpu/Copy.hpp
new file mode 100644
index 0000000..dd707bd
--- /dev/null
+++ b/include/alpaka/mem/buf/cpu/Copy.hpp
@@ -0,0 +1,220 @@
+/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan, Bernhard
+ * Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/meta/Integral.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+
+#include <cstring>
+
+namespace alpaka
+{
+    class DevCpu;
+} // namespace alpaka
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //! The CPU device memory copy task base.
+        //!
+        //! Copies from CPU memory into CPU memory.
+        template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyCpuBase
+        {
+            static_assert(TDim::value > 0);
+
+            using ExtentSize = Idx<TExtent>;
+            using DstSize = Idx<TViewDst>;
+            using SrcSize = Idx<TViewSrc>;
+            using Elem = alpaka::Elem<TViewSrc>;
+
+            //! Captures the extent, the pitches and the native pointers of both views.
+            //! The extent has to fit into both views; this is only verified through ALPAKA_ASSERT.
+            template<typename TViewFwd>
+            TaskCopyCpuBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+                : m_extent(getExtents(extent))
+                , m_extentWidthBytes(m_extent.back() * static_cast<ExtentSize>(sizeof(Elem)))
+#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                , m_dstExtent(getExtents(viewDst))
+                , m_srcExtent(getExtents(viewSrc))
+#endif
+                , m_dstPitchBytes(getPitchesInBytes(viewDst))
+                , m_srcPitchBytes(getPitchesInBytes(viewSrc))
+                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<std::uint8_t const*>(getPtrNative(viewSrc)))
+            {
+                // TDim > 0 is guaranteed by the static_assert above, so these checks need no dimension guard.
+                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).all());
+                ALPAKA_ASSERT((castVec<SrcSize>(m_extent) <= m_srcExtent).all());
+                if constexpr(TDim::value > 1)
+                {
+                    ALPAKA_ASSERT(static_cast<DstSize>(m_extentWidthBytes) <= m_dstPitchBytes[TDim::value - 2]);
+                    ALPAKA_ASSERT(static_cast<SrcSize>(m_extentWidthBytes) <= m_srcPitchBytes[TDim::value - 2]);
+                }
+            }
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes
+                          << " de: " << m_dstExtent << " dptr: " << reinterpret_cast<void*>(m_dstMemNative)
+                          << " dpitchb: " << m_dstPitchBytes << " se: " << m_srcExtent
+                          << " sptr: " << reinterpret_cast<void const*>(m_srcMemNative)
+                          << " spitchb: " << m_srcPitchBytes << std::endl;
+            }
+#endif
+
+            Vec<TDim, ExtentSize> const m_extent; //!< The extent to copy, in elements.
+            ExtentSize const m_extentWidthBytes; //!< The innermost extent component times sizeof(Elem).
+#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            Vec<TDim, DstSize> const m_dstExtent; //!< Extent of the destination view (assertions and debug output).
+            Vec<TDim, SrcSize> const m_srcExtent; //!< Extent of the source view (assertions and debug output).
+#endif
+            Vec<TDim, DstSize> const m_dstPitchBytes; //!< Pitches of the destination view, in bytes.
+            Vec<TDim, SrcSize> const m_srcPitchBytes; //!< Pitches of the source view, in bytes.
+
+            std::uint8_t* const m_dstMemNative; //!< Native destination pointer, addressed bytewise.
+            std::uint8_t const* const m_srcMemNative; //!< Native source pointer, addressed bytewise.
+        };
+
+        //! The CPU device ND memory copy task: copies the extent line by line, one std::memcpy per innermost line.
+        template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyCpu : public TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>
+        {
+            using DimMin1 = DimInt<TDim::value - 1u>;
+            using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::ExtentSize;
+            using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::DstSize;
+            using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::SrcSize;
+
+            using TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopyCpuBase;
+
+            //! Performs the copy synchronously.
+            ALPAKA_FN_HOST auto operator()() const -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#endif
+                // An extent with a zero component contains no elements: nothing to copy.
+                if(static_cast<std::size_t>(this->m_extent.prod()) == 0u)
+                    return;
+
+                // Drop the innermost component, [z, y, x] -> [z, y]: every iteration below copies one whole line
+                // along x, so only the outer dimensions are iterated.
+                Vec<DimMin1, ExtentSize> const outerExtent = subVecBegin<DimMin1>(this->m_extent);
+                Vec<DimMin1, DstSize> const dstPitches = subVecBegin<DimMin1>(this->m_dstPitchBytes);
+                Vec<DimMin1, SrcSize> const srcPitches = subVecBegin<DimMin1>(this->m_srcPitchBytes);
+                auto const lineBytes = static_cast<std::size_t>(this->m_extentWidthBytes);
+
+                meta::ndLoopIncIdx(
+                    outerExtent,
+                    [&](Vec<DimMin1, ExtentSize> const& idx)
+                    {
+                        auto* const dst = this->m_dstMemNative + (castVec<DstSize>(idx) * dstPitches).sum();
+                        auto const* const src = this->m_srcMemNative + (castVec<SrcSize>(idx) * srcPitches).sum();
+                        std::memcpy(dst, src, lineBytes);
+                    });
+            }
+        };
+
+        //! The CPU device 1D memory copy task: the whole extent is copied by a single std::memcpy.
+        template<typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyCpu<DimInt<1u>, TViewDst, TViewSrc, TExtent>
+            : TaskCopyCpuBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>
+        {
+            using TaskCopyCpuBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopyCpuBase;
+
+            ALPAKA_FN_HOST auto operator()() const -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#endif
+                // An empty extent leaves the destination untouched.
+                if(static_cast<std::size_t>(this->m_extent.prod()) == 0u)
+                    return;
+
+                // The byte pointers convert implicitly to the void pointers std::memcpy expects.
+                auto const byteCount = static_cast<std::size_t>(this->m_extentWidthBytes);
+                std::memcpy(this->m_dstMemNative, this->m_srcMemNative, byteCount);
+            }
+        };
+
+        //! The CPU device scalar (zero-dimensional) memory copy task.
+        //!
+        //! Copies exactly one element from CPU memory into CPU memory.
+        template<typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyCpu<DimInt<0u>, TViewDst, TViewSrc, TExtent>
+        {
+            using Elem = alpaka::Elem<TViewSrc>;
+
+            template<typename TViewDstFwd>
+            TaskCopyCpu(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, [[maybe_unused]] TExtent const& extent)
+                : m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<std::uint8_t const*>(getPtrNative(viewSrc)))
+            {
+                // A zero-dimensional extent has no components, so its product is the empty product: one element.
+                ALPAKA_ASSERT(getExtents(extent).prod() == 1u);
+                ALPAKA_ASSERT(getExtents(viewDst).prod() == 1u);
+                ALPAKA_ASSERT(getExtents(viewSrc).prod() == 1u);
+            }
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                using ZeroDimVec = Vec<DimInt<0u>, Idx<TExtent>>;
+                std::cout << __func__ << " e: " << ZeroDimVec() << " ewb: " << sizeof(Elem) << " de: " << ZeroDimVec()
+                          << " dptr: " << reinterpret_cast<void*>(m_dstMemNative) << " dpitchb: " << ZeroDimVec()
+                          << " se: " << ZeroDimVec() << " sptr: " << reinterpret_cast<void const*>(m_srcMemNative)
+                          << " spitchb: " << ZeroDimVec() << std::endl;
+            }
+#endif
+
+            //! Copies the single element; only the debug output of a full-debug build makes this potentially throwing.
+            ALPAKA_FN_HOST auto operator()() const noexcept(ALPAKA_DEBUG < ALPAKA_DEBUG_FULL) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#endif
+                // Exactly sizeof(Elem) bytes are transferred. The byte pointers convert implicitly to the void
+                // pointers std::memcpy expects.
+                std::memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem));
+            }
+
+            std::uint8_t* const m_dstMemNative;
+            std::uint8_t const* const m_srcMemNative;
+        };
+    } // namespace detail
+
+    namespace trait
+    {
+        //! The CPU device memory copy trait specialization.
+        //!
+        //! Creates the task that copies from CPU memory into CPU memory.
+        template<typename TDim>
+        struct CreateTaskMemcpy<TDim, DevCpu, DevCpu>
+        {
+            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+                -> alpaka::detail::TaskCopyCpu<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
+            {
+                // The task is parameterised on the destination view type without its reference qualifier.
+                using Task = alpaka::detail::TaskCopyCpu<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>;
+                return Task{std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/mem/buf/cpu/Set.hpp b/include/alpaka/mem/buf/cpu/Set.hpp
new file mode 100644
index 0000000..1e617e2
--- /dev/null
+++ b/include/alpaka/mem/buf/cpu/Set.hpp
@@ -0,0 +1,186 @@
+/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/meta/Integral.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+
+#include <cstring>
+
+namespace alpaka
+{
+    class DevCpu;
+
+    namespace detail
+    {
+        //! The CPU device ND memory set task base: captures the fill byte, the extent and the destination layout.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetCpuBase
+        {
+            static_assert(TDim::value > 0);
+
+            using ExtentSize = Idx<TExtent>;
+            using DstSize = Idx<TView>;
+            using Elem = alpaka::Elem<TView>;
+
+            template<typename TViewFwd>
+            TaskSetCpuBase(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
+                : m_byte(byte)
+                , m_extent(getExtents(extent))
+                , m_extentWidthBytes(m_extent.back() * static_cast<ExtentSize>(sizeof(Elem)))
+#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                , m_dstExtent(getExtents(view))
+#endif
+                , m_dstPitchBytes(getPitchesInBytes(view))
+                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
+            {
+                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).all());
+                if constexpr(TDim::value > 1) // The extent and view index types may differ: compare as DstSize.
+                    ALPAKA_ASSERT(static_cast<DstSize>(m_extentWidthBytes) <= m_dstPitchBytes[TDim::value - 2]);
+            }
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes
+                          << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast<void*>(this->m_dstMemNative)
+                          << " dpitchb: " << this->m_dstPitchBytes << std::endl;
+            }
+#endif
+
+            std::uint8_t const m_byte; //!< The value every byte of the extent is set to.
+            Vec<TDim, ExtentSize> const m_extent; //!< The extent to set, in elements.
+            ExtentSize const m_extentWidthBytes; //!< The innermost extent component times sizeof(Elem).
+#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            Vec<TDim, DstSize> const m_dstExtent; //!< Extent of the view (assertions and debug output only).
+#endif
+            Vec<TDim, DstSize> const m_dstPitchBytes; //!< Pitches of the view, in bytes.
+            std::uint8_t* const m_dstMemNative; //!< Native pointer of the view, addressed bytewise.
+        };
+
+        //! The CPU device ND memory set task: fills the extent line by line, one std::memset per innermost line.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetCpu : public TaskSetCpuBase<TDim, TView, TExtent>
+        {
+            using DimMin1 = DimInt<TDim::value - 1u>;
+            using typename TaskSetCpuBase<TDim, TView, TExtent>::ExtentSize;
+            using typename TaskSetCpuBase<TDim, TView, TExtent>::DstSize;
+
+            using TaskSetCpuBase<TDim, TView, TExtent>::TaskSetCpuBase;
+
+            ALPAKA_FN_HOST auto operator()() const -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#endif
+                // An extent with a zero component contains no elements: nothing to set.
+                if(static_cast<std::size_t>(this->m_extent.prod()) == 0u)
+                    return;
+
+                // Drop the innermost component, [z, y, x] -> [z, y]: every iteration below fills one whole line
+                // along x, so only the outer dimensions are iterated.
+                Vec<DimMin1, ExtentSize> const outerExtent = subVecBegin<DimMin1>(this->m_extent);
+                Vec<DimMin1, DstSize> const dstPitches = subVecBegin<DimMin1>(this->m_dstPitchBytes);
+                auto const lineBytes = static_cast<std::size_t>(this->m_extentWidthBytes);
+
+                meta::ndLoopIncIdx(
+                    outerExtent,
+                    [&](Vec<DimMin1, ExtentSize> const& idx)
+                    {
+                        auto* const dst = this->m_dstMemNative + (castVec<DstSize>(idx) * dstPitches).sum();
+                        std::memset(dst, this->m_byte, lineBytes);
+                    });
+            }
+        };
+
+        //! The CPU device 1D memory set task: the whole extent is filled by a single std::memset.
+        template<typename TView, typename TExtent>
+        struct TaskSetCpu<DimInt<1u>, TView, TExtent> : public TaskSetCpuBase<DimInt<1u>, TView, TExtent>
+        {
+            using TaskSetCpuBase<DimInt<1u>, TView, TExtent>::TaskSetCpuBase;
+
+            ALPAKA_FN_HOST auto operator()() const -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#endif
+                // An empty extent leaves the destination untouched.
+                if(static_cast<std::size_t>(this->m_extent.prod()) == 0u)
+                    return;
+
+                // In 1D the innermost line is the complete extent.
+                auto const byteCount = static_cast<std::size_t>(this->m_extentWidthBytes);
+                std::memset(this->m_dstMemNative, this->m_byte, byteCount);
+            }
+        };
+
+        //! The CPU device scalar (zero-dimensional) memory set task: sets the bytes of exactly one element.
+        template<typename TView, typename TExtent>
+        struct TaskSetCpu<DimInt<0u>, TView, TExtent>
+        {
+            using ExtentSize = Idx<TExtent>;
+            using Scalar = Vec<DimInt<0u>, ExtentSize>;
+            using DstSize = Idx<TView>;
+            using Elem = alpaka::Elem<TView>;
+
+            template<typename TViewFwd>
+            TaskSetCpu(TViewFwd&& view, std::uint8_t const& byte, [[maybe_unused]] TExtent const& extent)
+                : m_byte(byte)
+                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
+            {
+                // A zero-dimensional extent has no components, so its product is the empty product: one element.
+                ALPAKA_ASSERT(getExtents(extent).prod() == 1u);
+                ALPAKA_ASSERT(getExtents(view).prod() == 1u);
+            }
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar()
+                          << " dptr: " << reinterpret_cast<void*>(m_dstMemNative) << " dpitchb: " << Scalar()
+                          << std::endl;
+            }
+#endif
+
+            ALPAKA_FN_HOST auto operator()() const noexcept(ALPAKA_DEBUG < ALPAKA_DEBUG_FULL) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#endif
+                std::memset(m_dstMemNative, m_byte, sizeof(Elem)); // exactly one element is written
+            }
+
+            std::uint8_t const m_byte; //!< The value every byte of the element is set to.
+            std::uint8_t* const m_dstMemNative; //!< Native pointer of the view, addressed bytewise.
+        };
+    } // namespace detail
+
+    namespace trait
+    {
+        //! The CPU device memory set trait specialization: creates the task that sets the bytes of a CPU view.
+        template<typename TDim>
+        struct CreateTaskMemset<TDim, DevCpu>
+        {
+            template<typename TExtent, typename TViewFwd>
+            ALPAKA_FN_HOST static auto createTaskMemset(
+                TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
+                -> alpaka::detail::TaskSetCpu<TDim, std::remove_reference_t<TViewFwd>, TExtent>
+            {
+                using Task = alpaka::detail::TaskSetCpu<TDim, std::remove_reference_t<TViewFwd>, TExtent>;
+                return Task{std::forward<TViewFwd>(view), byte, extent};
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/mem/buf/sycl/Common.hpp b/include/alpaka/mem/buf/sycl/Common.hpp
new file mode 100644
index 0000000..498577d
--- /dev/null
+++ b/include/alpaka/mem/buf/sycl/Common.hpp
@@ -0,0 +1,57 @@
+/* Copyright 2022 Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/offset/Traits.hpp"
+
+#include <cstddef>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka::detail
+{
+    //! Converts an alpaka extent into a SYCL range; the width is multiplied by \p multiplier.
+    //! Every component is cast to std::size_t explicitly, because brace-initialising a sycl::range from a signed
+    //! index type would be a narrowing conversion.
+    template<typename TExtent>
+    inline auto make_sycl_range(TExtent const& ext, std::size_t multiplier = 1)
+    {
+        constexpr auto dim = Dim<TExtent>::value;
+        [[maybe_unused]] auto const toSize = [](auto const value) { return static_cast<std::size_t>(value); };
+
+        if constexpr(dim == 0)
+            return sycl::range<1>{multiplier};
+        else if constexpr(dim == 1)
+            return sycl::range<1>{toSize(getWidth(ext)) * multiplier};
+        else if constexpr(dim == 2)
+            return sycl::range<2>{toSize(getWidth(ext)) * multiplier, toSize(getHeight(ext))};
+        else
+            return sycl::range<3>{toSize(getWidth(ext)) * multiplier, toSize(getHeight(ext)), toSize(getDepth(ext))};
+    }
+
+    //! Converts the offsets of an alpaka view into a SYCL id; components are cast to std::size_t explicitly.
+    //! A zero-dimensional view has no offset components, so it maps to the 1D origin sycl::id<1>{0}.
+    template<typename TView>
+    inline auto make_sycl_offset(TView const& view)
+    {
+        constexpr auto dim = Dim<TView>::value;
+        [[maybe_unused]] auto const toSize = [](auto const value) { return static_cast<std::size_t>(value); };
+
+        if constexpr(dim == 0)
+            return sycl::id<1>{0};
+        else if constexpr(dim == 1)
+            return sycl::id<1>{toSize(getOffsetX(view))};
+        else if constexpr(dim == 2)
+            return sycl::id<2>{toSize(getOffsetX(view)), toSize(getOffsetY(view))};
+        else
+            return sycl::id<3>{toSize(getOffsetX(view)), toSize(getOffsetY(view)), toSize(getOffsetZ(view))};
+    }
+} // namespace alpaka::detail
+
+#endif
diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp
new file mode 100644
index 0000000..44098f1
--- /dev/null
+++ b/include/alpaka/mem/buf/sycl/Copy.hpp
@@ -0,0 +1,240 @@
+/* Copyright 2024 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Debug.hpp"
+#include "alpaka/core/Sycl.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/elem/Traits.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/mem/buf/sycl/Common.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
+#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
+
+#include <memory>
+#include <type_traits>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka::detail
+{
+    //! The SYCL device memory copy task base: captures the extent, pitches and native pointers of both views.
+    template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySyclBase
+    {
+        static_assert(
+            std::is_same_v<std::remove_const_t<alpaka::Elem<TViewSrc>>, std::remove_const_t<alpaka::Elem<TViewDst>>>,
+            "The source and the destination view are required to have the same element type!");
+        using ExtentSize = Idx<TExtent>;
+        using DstSize = Idx<TViewDst>;
+        using SrcSize = Idx<TViewSrc>;
+        using Elem = alpaka::Elem<TViewSrc>;
+
+        template<typename TViewFwd>
+        TaskCopySyclBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+            : m_extent(getExtents(extent))
+            , m_extentWidthBytes(m_extent.back() * static_cast<ExtentSize>(sizeof(Elem)))
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            , m_dstExtent(getExtents(viewDst))
+            , m_srcExtent(getExtents(viewSrc))
+#    endif
+            , m_dstPitchBytes(getPitchesInBytes(viewDst))
+            , m_srcPitchBytes(getPitchesInBytes(viewSrc))
+            , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(viewDst)))
+            , m_srcMemNative(reinterpret_cast<std::uint8_t const*>(getPtrNative(viewSrc)))
+        {
+            if constexpr(TDim::value > 0)
+            {
+                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).all());
+                ALPAKA_ASSERT((castVec<SrcSize>(m_extent) <= m_srcExtent).all());
+            }
+        }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+        auto printDebug() const -> void
+        {
+            std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes
+                      << " de: " << m_dstExtent << " dptr: " << reinterpret_cast<void*>(m_dstMemNative)
+                      << " se: " << m_srcExtent << " sptr: " << reinterpret_cast<void const*>(m_srcMemNative)
+                      << std::endl;
+        }
+#    endif
+
+        Vec<TDim, ExtentSize> const m_extent; //!< The extent to copy, in elements.
+        ExtentSize const m_extentWidthBytes; //!< The innermost extent component times sizeof(Elem).
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+        Vec<TDim, DstSize> const m_dstExtent; //!< Extent of the destination view (assertions and debug output).
+        Vec<TDim, SrcSize> const m_srcExtent; //!< Extent of the source view (assertions and debug output).
+#    endif
+
+        Vec<TDim, DstSize> const m_dstPitchBytes; //!< Pitches of the destination view, in bytes.
+        Vec<TDim, SrcSize> const m_srcPitchBytes; //!< Pitches of the source view, in bytes.
+        std::uint8_t* const m_dstMemNative; //!< Native destination pointer, addressed bytewise.
+        std::uint8_t const* const m_srcMemNative; //!< Native source pointer, addressed bytewise.
+        static constexpr auto is_sycl_task = true; //!< Marker; presumably lets callers detect SYCL tasks - confirm.
+    };
+
+    //! The SYCL device ND memory copy task: submits one queue.memcpy per innermost line of the extent.
+    template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySycl : public TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>
+    {
+        using DimMin1 = DimInt<TDim::value - 1u>;
+        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::ExtentSize;
+        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::DstSize;
+        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::SrcSize;
+
+        using TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
+
+        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            this->printDebug();
+#    endif
+            // Drop the innermost component, [z, y, x] -> [z, y]: each iteration copies one whole line along x.
+            Vec<DimMin1, ExtentSize> const outerExtent = subVecBegin<DimMin1>(this->m_extent);
+            Vec<DimMin1, DstSize> const dstPitches = subVecBegin<DimMin1>(this->m_dstPitchBytes);
+            Vec<DimMin1, SrcSize> const srcPitches = subVecBegin<DimMin1>(this->m_srcPitchBytes);
+            auto const lineBytes = static_cast<std::size_t>(this->m_extentWidthBytes);
+
+            // One event is recorded per submitted memcpy.
+            std::vector<sycl::event> lineEvents;
+            lineEvents.reserve(static_cast<std::size_t>(outerExtent.prod()));
+
+            // NOTE(review): for an empty extent no memcpy is submitted, so the barrier below gets an empty event
+            // list and `requirements` is never consulted; the 1D task submits a plain barrier - confirm intent.
+            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+            {
+                meta::ndLoopIncIdx(
+                    outerExtent,
+                    [&](Vec<DimMin1, ExtentSize> const& idx)
+                    {
+                        auto* const dst = this->m_dstMemNative + (castVec<DstSize>(idx) * dstPitches).sum();
+                        auto const* const src = this->m_srcMemNative + (castVec<SrcSize>(idx) * srcPitches).sum();
+                        lineEvents.push_back(queue.memcpy(dst, src, lineBytes, requirements));
+                    });
+            }
+
+            // Return an event that depends on all the events associated with the memcpy calls.
+            return queue.ext_oneapi_submit_barrier(lineEvents);
+        }
+    };
+
+    //! The SYCL device 1D memory copy task: the whole extent is transferred by a single queue.memcpy.
+    template<typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySycl<DimInt<1u>, TViewDst, TViewSrc, TExtent>
+        : TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>
+    {
+        using TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
+        using Elem = alpaka::Elem<TViewSrc>;
+
+        //! Submits the copy to \p queue.
+        //! \param requirements The events passed on to queue.memcpy as its dependencies.
+        //! \return The event of the submitted memcpy, or of a barrier if the extent is empty.
+        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            this->printDebug();
+#    endif
+            auto const elemCount = static_cast<std::size_t>(this->m_extent.prod());
+
+            // Nothing to copy: an event is still required, so a barrier is submitted instead.
+            if(elemCount == 0u)
+                return queue.ext_oneapi_submit_barrier();
+
+            // The byte count is computed in std::size_t from the element count.
+            return queue.memcpy(
+                this->m_dstMemNative, this->m_srcMemNative, sizeof(Elem) * elemCount, requirements);
+        }
+    };
+
+    //! The scalar (zero-dimensional) SYCL memory copy task: copies exactly one element.
+    template<typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySycl<DimInt<0u>, TViewDst, TViewSrc, TExtent>
+    {
+        static_assert(
+            std::is_same_v<std::remove_const_t<alpaka::Elem<TViewSrc>>, std::remove_const_t<alpaka::Elem<TViewDst>>>,
+            "The source and the destination view are required to have the same element type!");
+
+        using Elem = alpaka::Elem<TViewSrc>;
+
+        template<typename TViewDstFwd>
+        TaskCopySycl(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, [[maybe_unused]] TExtent const& extent)
+            : m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+            , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+        {
+            // A zero-dimensional extent has no components, so its product is the empty product: one element.
+            ALPAKA_ASSERT(getExtents(extent).prod() == 1u);
+            ALPAKA_ASSERT(getExtents(viewDst).prod() == 1u);
+            ALPAKA_ASSERT(getExtents(viewSrc).prod() == 1u);
+        }
+
+        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+        {
+            return queue.memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem), requirements);
+        }
+
+        void* m_dstMemNative; //!< Native pointer of the destination view.
+        void const* m_srcMemNative; //!< Native pointer of the source view.
+        static constexpr auto is_sycl_task = true; //!< Marker; presumably lets callers detect SYCL tasks - confirm.
+    };
+} // namespace alpaka::detail
+
+// Trait specializations for CreateTaskMemcpy.
+namespace alpaka::trait
+{
+    //! The SYCL host-to-device memory copy trait specialization (CPU source, SYCL destination).
+    template<typename TTag, typename TDim>
+    struct CreateTaskMemcpy<TDim, DevGenericSycl<TTag>, DevCpu>
+    {
+        template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
+        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+            using Task = alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>;
+            return Task{std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
+        }
+    };
+
+    //! The SYCL device-to-host memory copy trait specialization (SYCL source, CPU destination).
+    template<typename TTag, typename TDim>
+    struct CreateTaskMemcpy<TDim, DevCpu, DevGenericSycl<TTag>>
+    {
+        template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
+        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+            using Task = alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>;
+            return Task{std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
+        }
+    };
+
+    //! The SYCL device-to-device memory copy trait specialization (SYCL source, SYCL destination).
+    template<typename TTagDst, typename TTagSrc, typename TDim>
+    struct CreateTaskMemcpy<TDim, DevGenericSycl<TTagDst>, DevGenericSycl<TTagSrc>>
+    {
+        template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
+        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+            using Task = alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>;
+            return Task{std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
+        }
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/mem/buf/sycl/Set.hpp b/include/alpaka/mem/buf/sycl/Set.hpp
new file mode 100644
index 0000000..73478d3
--- /dev/null
+++ b/include/alpaka/mem/buf/sycl/Set.hpp
@@ -0,0 +1,212 @@
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Debug.hpp"
+#include "alpaka/core/Sycl.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/mem/buf/sycl/Common.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
+#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
+#include "alpaka/queue/Traits.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+namespace alpaka
+{
+
+    namespace detail
+    {
+        //! Shared state of the SYCL ND memory set tasks: the fill byte, the extent to set and the destination layout.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetSyclBase
+        {
+            using ExtentSize = Idx<TExtent>;
+            using DstSize = Idx<TView>;
+            using Elem = alpaka::Elem<TView>;
+            //! Captures \p byte, the extents of \p extent and the pitches and native pointer of \p view.
+            template<typename TViewFwd>
+            TaskSetSyclBase(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
+                : m_byte(byte)
+                , m_extent(getExtents(extent))
+                , m_extentWidthBytes(m_extent.back() * static_cast<ExtentSize>(sizeof(Elem)))
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                , m_dstExtent(getExtents(view))
+#    endif
+                // The destination extent above is only stored when assertions or full debug output are compiled in.
+                , m_dstPitchBytes(getPitchesInBytes(view))
+                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
+                // Sanity checks only: the requested extent has to fit into the destination view.
+            {
+                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).all());
+                if constexpr(TDim::value > 1)
+                    ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 2]);
+            }
+            //! Prints the stored extents, pitches and destination pointer to std::cout (full debug builds only).
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            auto printDebug() const -> void
+            {
+                std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes
+                          << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast<void*>(this->m_dstMemNative)
+                          << " dpitchb: " << this->m_dstPitchBytes << std::endl;
+            }
+#    endif
+            // Members are initialized in declaration order; m_extentWidthBytes reads m_extent, so keep this order.
+            std::uint8_t const m_byte; //!< Byte value handed to every memset call.
+            Vec<TDim, ExtentSize> const m_extent; //!< Extent of the region to set, in elements.
+            ExtentSize const m_extentWidthBytes; //!< Innermost extent (m_extent.back()) times sizeof(Elem).
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            Vec<TDim, DstSize> const m_dstExtent; //!< Extents of the destination view, used by the checks and printDebug.
+#    endif
+            Vec<TDim, DstSize> const m_dstPitchBytes; //!< Result of getPitchesInBytes() for the destination view.
+            std::uint8_t* const m_dstMemNative; //!< Native destination pointer, addressed byte-wise.
+            static constexpr auto is_sycl_task = true; //!< NOTE(review): presumably a marker read by the SYCL queue; confirm.
+        };
+
+        //! The SYCL ND memory set task: every innermost row of the extent is filled by its own memset call.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetSycl : public TaskSetSyclBase<TDim, TView, TExtent>
+        {
+            using DimMin1 = DimInt<TDim::value - 1u>;
+            using typename TaskSetSyclBase<TDim, TView, TExtent>::ExtentSize;
+            using typename TaskSetSyclBase<TDim, TView, TExtent>::DstSize;
+
+            using TaskSetSyclBase<TDim, TView, TExtent>::TaskSetSyclBase;
+
+            //! Submits the memsets to \p queue, each depending on \p requirements; returns a barrier event over them.
+            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#    endif
+                // Drop the innermost x dimension, [z, y, x] -> [z, y]: all elements of one row are handled by a
+                // single memset call.
+                Vec<DimMin1, ExtentSize> const outerExtent(subVecBegin<DimMin1>(this->m_extent));
+                Vec<DimMin1, DstSize> const outerPitches(subVecBegin<DimMin1>(this->m_dstPitchBytes));
+
+                // One event is recorded per memset call.
+                std::vector<sycl::event> rowEvents;
+                rowEvents.reserve(static_cast<std::size_t>(outerExtent.prod()));
+
+                // An extent without any elements does not enqueue a single memset.
+                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+                {
+                    auto const setRow = [&](Vec<DimMin1, ExtentSize> const& rowIdx)
+                    {
+                        // The byte offset of a row is the sum of its outer indices scaled by the matching pitches.
+                        auto* const rowBegin = this->m_dstMemNative + (castVec<DstSize>(rowIdx) * outerPitches).sum();
+                        auto const rowBytes = static_cast<std::size_t>(this->m_extentWidthBytes);
+                        rowEvents.push_back(queue.memset(rowBegin, this->m_byte, rowBytes, requirements));
+                    };
+                    meta::ndLoopIncIdx(outerExtent, setRow);
+                }
+
+                // Return an event that depends on all the events associated with the memset calls.
+                return queue.ext_oneapi_submit_barrier(rowEvents);
+            }
+        };
+
+        //! The 1D SYCL memory set task: the requested width is filled by a single memset call.
+        template<typename TView, typename TExtent>
+        struct TaskSetSycl<DimInt<1u>, TView, TExtent> : public TaskSetSyclBase<DimInt<1u>, TView, TExtent>
+        {
+            using TaskSetSyclBase<DimInt<1u>, TView, TExtent>::TaskSetSyclBase;
+
+            //! Submits the memset to \p queue, depending on \p requirements, and returns the resulting event.
+            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#    endif
+                // Nothing to set for an empty extent: no memset is issued and the event of a plain queue
+                // barrier is returned instead.
+                if(static_cast<std::size_t>(this->m_extent.prod()) == 0u)
+                {
+                    return queue.ext_oneapi_submit_barrier();
+                }
+
+                // Fill the extent width, in bytes, starting at the native destination pointer.
+                auto* const dst = reinterpret_cast<void*>(this->m_dstMemNative);
+                auto const numBytes = static_cast<std::size_t>(this->m_extentWidthBytes);
+                return queue.memset(dst, this->m_byte, numBytes, requirements);
+            }
+        };
+
+        //! The SYCL scalar (zero-dimensional) memory set task: fills the bytes of exactly one element.
+        template<typename TView, typename TExtent>
+        struct TaskSetSycl<DimInt<0u>, TView, TExtent>
+        {
+            using ExtentSize = Idx<TExtent>;
+            using Scalar = Vec<DimInt<0u>, ExtentSize>;
+            using DstSize = Idx<TView>;
+            using Elem = alpaka::Elem<TView>;
+
+            template<typename TViewFwd>
+            TaskSetSycl(TViewFwd&& view, std::uint8_t const& byte, [[maybe_unused]] TExtent const& extent)
+                : m_byte(byte)
+                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
+            {
+                // All zero-dimensional extents are equivalent: both products are expected to be 1.
+                ALPAKA_ASSERT(getExtents(extent).prod() == 1u);
+                ALPAKA_ASSERT(getExtents(view).prod() == 1u);
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            auto printDebug() const -> void
+            {
+                std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar()
+                          << " dptr: " << reinterpret_cast<void*>(m_dstMemNative) << " dpitchb: " << Scalar()
+                          << std::endl;
+            }
+#    endif
+
+            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                constexpr std::size_t elemBytes = sizeof(Elem);
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#    endif
+                return queue.memset(static_cast<void*>(m_dstMemNative), m_byte, elemBytes, requirements);
+            }
+
+            std::uint8_t const m_byte;
+            std::uint8_t* const m_dstMemNative;
+            static constexpr auto is_sycl_task = true;
+        };
+
+    } // namespace detail
+
+    namespace trait
+    {
+        //! SYCL specialization of CreateTaskMemset: builds the TaskSetSycl matching the dimensionality TDim.
+        template<typename TDim, typename TPlatform>
+        struct CreateTaskMemset<TDim, DevGenericSycl<TPlatform>>
+        {
+            template<typename TExtent, typename TView>
+            static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                -> alpaka::detail::TaskSetSycl<TDim, TView, TExtent>
+            {
+                return alpaka::detail::TaskSetSycl<TDim, TView, TExtent>{view, byte, extent};
+            }
+        };
+
+    } // namespace trait
+
+} // namespace alpaka
+#endif
diff --git a/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp b/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp
new file mode 100644
index 0000000..37ee6fb
--- /dev/null
+++ b/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp
@@ -0,0 +1,643 @@
+/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan,
+ *                Bernhard Manfred Gruber, Antonio Di Pilato
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
+#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <set>
+#include <tuple>
+#include <type_traits>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //! The CUDA/HIP memory copy task; only declared here and specialized below per dimensionality (0D to 3D).
+        template<typename TApi, typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyUniformCudaHip;
+
+        //! The scalar (zero-dimensional) CUDA/HIP memory copy task: transfers exactly one element.
+        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>
+        {
+            using Idx = alpaka::Idx<TExtent>;
+            //! Stores the copy kind, the two device handles and the native pointers of both views.
+            template<typename TViewDstFwd>
+            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
+                TViewDstFwd&& viewDst,
+                TViewSrc const& viewSrc,
+                [[maybe_unused]] TExtent const& extent,
+                typename TApi::MemcpyKind_t const& uniformMemCpyKind,
+                int const& iDstDevice,
+                int const& iSrcDevice)
+                : m_uniformMemCpyKind(uniformMemCpyKind)
+                , m_iDstDevice(iDstDevice)
+                , m_iSrcDevice(iSrcDevice)
+                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                ALPAKA_ASSERT(getExtentProduct(extent) == 1);
+#    endif
+            }
+            //! Makes the destination device current, then enqueues a one-element TApi::memcpyAsync on \p queue.
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
+                // see https://github.com/fwyzard/nvidia_bug_3446335 .
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
+                // Initiate the memory copy; the size is always that of a single destination element.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpyAsync(
+                    m_dstMemNative,
+                    m_srcMemNative,
+                    sizeof(Elem<TViewDst>),
+                    m_uniformMemCpyKind,
+                    queue.getNativeHandle()));
+            }
+
+        private:
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << Idx(1u)
+                          << " ewb: " << static_cast<Idx>(sizeof(Elem<TViewDst>)) << " dw: " << Idx(1u)
+                          << " dptr: " << m_dstMemNative << " sdev: " << m_iSrcDevice << " sw: " << Idx(1u)
+                          << " sptr: " << m_srcMemNative << std::endl;
+            }
+#    endif
+            // Captured state; declaration order matches the constructor's initializer list.
+            typename TApi::MemcpyKind_t m_uniformMemCpyKind; //!< Copy kind forwarded to TApi::memcpyAsync.
+            int m_iDstDevice; //!< Device made current before the copy is issued.
+            int m_iSrcDevice; //!< Source device handle; in this specialization only read by printDebug.
+            void* m_dstMemNative; //!< Native pointer of the destination view.
+            void const* m_srcMemNative; //!< Native pointer of the source view.
+        };
+
+        //! The 1D CUDA/HIP memory copy task: transfers the extent width with a single TApi::memcpyAsync.
+        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>
+        {
+            using Idx = alpaka::Idx<TExtent>;
+            //! Stores the copy kind, the device handles, the copy size in bytes and the native pointers of both views.
+            template<typename TViewDstFwd>
+            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
+                TViewDstFwd&& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent,
+                typename TApi::MemcpyKind_t const& uniformMemCpyKind,
+                int const& iDstDevice,
+                int const& iSrcDevice)
+                : m_uniformMemCpyKind(uniformMemCpyKind)
+                , m_iDstDevice(iDstDevice)
+                , m_iSrcDevice(iSrcDevice)
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                , m_extentWidth(getWidth(extent))
+                , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
+                , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
+#    endif
+                , m_extentWidthBytes(static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem<TViewDst>))
+                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
+                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
+#    endif
+            }
+            //! Makes the destination device current and enqueues the copy on \p queue; empty extents are skipped.
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                if(m_extentWidthBytes == std::size_t{0}) // nothing to copy
+                {
+                    return;
+                }
+
+                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
+                // see https://github.com/fwyzard/nvidia_bug_3446335 .
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
+                // Initiate the memory copy.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpyAsync(
+                    m_dstMemNative,
+                    m_srcMemNative,
+                    m_extentWidthBytes,
+                    m_uniformMemCpyKind,
+                    queue.getNativeHandle()));
+            }
+
+        private:
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << m_extentWidth
+                          << " ewb: " << m_extentWidthBytes << " dw: " << m_dstWidth << " dptr: " << m_dstMemNative
+                          << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sptr: " << m_srcMemNative
+                          << std::endl;
+            }
+#    endif
+            // Captured state; declaration order matches the constructor's initializer list.
+            typename TApi::MemcpyKind_t m_uniformMemCpyKind; //!< Copy kind forwarded to TApi::memcpyAsync.
+            int m_iDstDevice; //!< Device made current before the copy is issued.
+            int m_iSrcDevice; //!< Source device handle; in this specialization only read by printDebug.
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            Idx m_extentWidth; //!< Width of the extent, kept for the checks and printDebug.
+            Idx m_dstWidth; //!< Width of the destination view, kept for the checks and printDebug.
+            Idx m_srcWidth; //!< Width of the source view, kept for the checks and printDebug.
+#    endif
+            std::size_t m_extentWidthBytes; //!< getWidth(extent) times sizeof(Elem<TViewDst>).
+            void* m_dstMemNative; //!< Native pointer of the destination view.
+            void const* m_srcMemNative; //!< Native pointer of the source view.
+        };
+
+        //! The 2D CUDA/HIP memory copy task: transfers a pitched region with a single TApi::memcpy2DAsync.
+        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>
+        {
+            using Idx = alpaka::Idx<TExtent>;
+            //! Stores the copy kind, the device handles, the extent, the row pitches and the native pointers.
+            template<typename TViewDstFwd>
+            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
+                TViewDstFwd&& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent,
+                typename TApi::MemcpyKind_t const& uniformMemcpyKind,
+                int const& iDstDevice,
+                int const& iSrcDevice)
+                : m_uniformMemCpyKind(uniformMemcpyKind)
+                , m_iDstDevice(iDstDevice)
+                , m_iSrcDevice(iSrcDevice)
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                , m_extentWidth(getWidth(extent))
+#    endif
+                , m_extentWidthBytes(static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem<TViewDst>))
+                , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
+                , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
+                , m_extentHeight(getHeight(extent))
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                , m_dstHeight(static_cast<Idx>(getHeight(viewDst)))
+                , m_srcHeight(static_cast<Idx>(getHeight(viewSrc)))
+#    endif
+                , m_dstRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[0]))
+                , m_srcRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[0]))
+                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
+                ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
+                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
+                ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
+                ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes);
+                ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes);
+#    endif
+            }
+            //! Makes the destination device current and enqueues the 2D copy on \p queue; empty extents are skipped.
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                // Nothing to copy for an empty extent, so the runtime is not called at all.
+                if(m_extentWidthBytes == std::size_t{0} || m_extentHeight == 0)
+                {
+                    return;
+                }
+
+                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
+                // see https://github.com/fwyzard/nvidia_bug_3446335 .
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
+                // Initiate the memory copy.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpy2DAsync(
+                    m_dstMemNative,
+                    m_dstRowPitchBytes,
+                    m_srcMemNative,
+                    m_srcRowPitchBytes,
+                    m_extentWidthBytes,
+                    static_cast<std::size_t>(m_extentHeight),
+                    m_uniformMemCpyKind,
+                    queue.getNativeHandle()));
+            }
+
+        private:
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight
+                          << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice << " dw: " << m_dstWidth
+                          << " dh: " << m_dstHeight << " dptr: " << m_dstMemNative << " dpitch: " << m_dstRowPitchBytes
+                          << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sh: " << m_srcHeight
+                          << " sptr: " << m_srcMemNative << " spitch: " << m_srcRowPitchBytes << std::endl;
+            }
+#    endif
+            // Captured state; declaration order matches the constructor's initializer list.
+            typename TApi::MemcpyKind_t m_uniformMemCpyKind; //!< Copy kind forwarded to TApi::memcpy2DAsync.
+            int m_iDstDevice; //!< Device made current before the copy is issued.
+            int m_iSrcDevice; //!< Source device handle; in this specialization only read by printDebug.
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            Idx m_extentWidth; //!< Width of the extent, kept for the checks and printDebug.
+#    endif
+            std::size_t m_extentWidthBytes; //!< getWidth(extent) times sizeof(Elem<TViewDst>).
+            Idx m_dstWidth; //!< Always stored, but only read by the debug checks and printDebug.
+            Idx m_srcWidth; //!< Always stored, but only read by the debug checks and printDebug.
+            // Number of rows to copy.
+            Idx m_extentHeight;
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            Idx m_dstHeight; //!< Height of the destination view, kept for the checks and printDebug.
+            Idx m_srcHeight; //!< Height of the source view, kept for the checks and printDebug.
+#    endif
+            std::size_t m_dstRowPitchBytes; //!< getPitchesInBytes(viewDst)[0].
+            std::size_t m_srcRowPitchBytes; //!< getPitchesInBytes(viewSrc)[0].
+            // Native pointers of the two views.
+            void* m_dstMemNative;
+            void const* m_srcMemNative;
+        };
+
+        //! The 3D CUDA/HIP memory copy task: describes the copy in a Memcpy3DParms_t and issues TApi::memcpy3DAsync.
+        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
+        struct TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>
+        {
+            using Idx = alpaka::Idx<TExtent>;
+            //! Stores the copy kind, the device handles, the extent, the row/slice pitches and the native pointers.
+            template<typename TViewDstFwd>
+            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
+                TViewDstFwd&& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent,
+                typename TApi::MemcpyKind_t const& uniformMemcpyKind,
+                int const& iDstDevice,
+                int const& iSrcDevice)
+                : m_uniformMemCpyKind(uniformMemcpyKind)
+                , m_iDstDevice(iDstDevice)
+                , m_iSrcDevice(iSrcDevice)
+                , m_extentWidth(getWidth(extent))
+                , m_extentWidthBytes(static_cast<std::size_t>(m_extentWidth) * sizeof(Elem<TViewDst>))
+                , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
+                , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
+                , m_extentHeight(getHeight(extent))
+                , m_extentDepth(getDepth(extent))
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                , m_dstHeight(static_cast<Idx>(getHeight(viewDst)))
+                , m_srcHeight(static_cast<Idx>(getHeight(viewSrc)))
+                , m_dstDepth(static_cast<Idx>(getDepth(viewDst)))
+                , m_srcDepth(static_cast<Idx>(getDepth(viewSrc)))
+#    endif
+                , m_dstRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[1]))
+                , m_srcRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[1]))
+                , m_dstSlicePitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[0]))
+                , m_srcSlicePitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[0]))
+                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
+                ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
+                ALPAKA_ASSERT(m_extentDepth <= m_dstDepth);
+                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
+                ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
+                ALPAKA_ASSERT(m_extentDepth <= m_srcDepth);
+                ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes);
+                ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes);
+#    endif
+            }
+            //! Makes the destination device current and enqueues the 3D copy on \p queue; empty extents are skipped.
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                // This is not only an optimization but also prevents a division by zero.
+                if(m_extentWidthBytes == std::size_t{0} || m_extentHeight == 0 || m_extentDepth == 0)
+                {
+                    return;
+                }
+                // The divisions in question are the slice-pitch / row-pitch ones in the parameter builder below.
+                // Create the struct describing the copy.
+                typename TApi::Memcpy3DParms_t const uniformCudaHipMemCpy3DParms(buildUniformCudaHipMemcpy3DParms());
+
+                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
+                // see https://github.com/fwyzard/nvidia_bug_3446335 .
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
+                // Initiate the memory copy.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    TApi::memcpy3DAsync(&uniformCudaHipMemCpy3DParms, queue.getNativeHandle()));
+            }
+
+        private:
+            ALPAKA_FN_HOST auto buildUniformCudaHipMemcpy3DParms() const -> typename TApi::Memcpy3DParms_t
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                // Fill CUDA/HIP parameter structure.
+                typename TApi::Memcpy3DParms_t memCpy3DParms{}; // zero-init required per CUDA documentation
+                memCpy3DParms.srcPtr = TApi::makePitchedPtr(
+                    const_cast<void*>(m_srcMemNative),
+                    m_srcRowPitchBytes,
+                    static_cast<std::size_t>(m_srcWidth),
+                    m_srcSlicePitchBytes / m_srcRowPitchBytes); // rows per slice, derived from the two pitches
+                memCpy3DParms.dstPtr = TApi::makePitchedPtr(
+                    m_dstMemNative,
+                    m_dstRowPitchBytes,
+                    static_cast<std::size_t>(m_dstWidth),
+                    m_dstSlicePitchBytes / m_dstRowPitchBytes); // rows per slice, derived from the two pitches
+                memCpy3DParms.extent = TApi::makeExtent(
+                    m_extentWidthBytes,
+                    static_cast<std::size_t>(m_extentHeight),
+                    static_cast<std::size_t>(m_extentDepth));
+                memCpy3DParms.kind = m_uniformMemCpyKind;
+                return memCpy3DParms;
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            ALPAKA_FN_HOST auto printDebug() const -> void
+            {
+                std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight
+                          << " ed: " << m_extentDepth << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice
+                          << " dw: " << m_dstWidth << " dh: " << m_dstHeight << " dd: " << m_dstDepth
+                          << " dptr: " << m_dstMemNative << " drowpitch: " << m_dstRowPitchBytes
+                          << " dslicepitch: " << m_dstSlicePitchBytes << " sdev: " << m_iSrcDevice
+                          << " sw: " << m_srcWidth << " sh: " << m_srcHeight << " sd: " << m_srcDepth
+                          << " sptr: " << m_srcMemNative << " srowpitch: " << m_srcRowPitchBytes
+                          << " sslicepitch: " << m_srcSlicePitchBytes << std::endl;
+            }
+#    endif
+            typename TApi::MemcpyKind_t m_uniformMemCpyKind; //!< Copy kind stored in Memcpy3DParms_t::kind.
+            int m_iDstDevice; //!< Device made current before the copy is issued.
+            int m_iSrcDevice; //!< Source device handle; in this specialization only read by printDebug.
+            // m_extentWidth has to stay declared before m_extentWidthBytes, whose initializer reads it.
+            Idx m_extentWidth;
+            std::size_t m_extentWidthBytes; //!< m_extentWidth times sizeof(Elem<TViewDst>).
+            Idx m_dstWidth; //!< Width of the destination view, passed to makePitchedPtr.
+            Idx m_srcWidth; //!< Width of the source view, passed to makePitchedPtr.
+            // Number of rows and slices to copy.
+            Idx m_extentHeight;
+            Idx m_extentDepth;
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            Idx m_dstHeight; //!< Kept for the checks and printDebug.
+            Idx m_srcHeight; //!< Kept for the checks and printDebug.
+            Idx m_dstDepth; //!< Kept for the checks and printDebug.
+            Idx m_srcDepth; //!< Kept for the checks and printDebug.
+#    endif
+            std::size_t m_dstRowPitchBytes; //!< getPitchesInBytes(viewDst)[1].
+            std::size_t m_srcRowPitchBytes; //!< getPitchesInBytes(viewSrc)[1].
+            std::size_t m_dstSlicePitchBytes; //!< getPitchesInBytes(viewDst)[0].
+            std::size_t m_srcSlicePitchBytes; //!< getPitchesInBytes(viewSrc)[0].
+            // Native pointers of the two views.
+            void* m_dstMemNative;
+            void const* m_srcMemNative;
+        };
+    } // namespace detail
+
+    // Trait specializations for CreateTaskMemcpy.
+    namespace trait
+    {
+        //! CUDA/HIP-to-CPU memory copy: CreateTaskMemcpy specialized for a DevCpu destination and a CUDA/HIP source.
+        template<typename TApi, typename TDim>
+        struct CreateTaskMemcpy<TDim, DevCpu, DevUniformCudaHipRt<TApi>>
+        {
+            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDstFwd&& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent) -> alpaka::detail::
+                TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                using Task = alpaka::detail::
+                    TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>;
+
+                // Only the source view lives on a CUDA/HIP device, so its native handle is handed to the
+                // task as both the destination and the source device.
+                auto const srcDev = getDev(viewSrc).getNativeHandle();
+
+                return Task{
+                    std::forward<TViewDstFwd>(viewDst), viewSrc, extent, TApi::memcpyDeviceToHost, srcDev, srcDev};
+            }
+        };
+
+        //! CPU-to-CUDA/HIP memory copy: CreateTaskMemcpy specialized for a CUDA/HIP destination and a DevCpu source.
+        template<typename TApi, typename TDim>
+        struct CreateTaskMemcpy<TDim, DevUniformCudaHipRt<TApi>, DevCpu>
+        {
+            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDstFwd&& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent) -> alpaka::detail::
+                TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                using Task = alpaka::detail::
+                    TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>;
+
+                // Only the destination view lives on a CUDA/HIP device, so its native handle is handed to
+                // the task as both the destination and the source device.
+                auto const dstDev = getDev(viewDst).getNativeHandle();
+
+                return Task{
+                    std::forward<TViewDstFwd>(viewDst), viewSrc, extent, TApi::memcpyHostToDevice, dstDev, dstDev};
+            }
+        };
+
+        //! CUDA/HIP-to-CUDA/HIP memory copy: CreateTaskMemcpy specialized for CUDA/HIP devices on both sides.
+        template<typename TApi, typename TDim>
+        struct CreateTaskMemcpy<TDim, DevUniformCudaHipRt<TApi>, DevUniformCudaHipRt<TApi>>
+        {
+            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
+            ALPAKA_FN_HOST static auto createTaskMemcpy(
+                TViewDstFwd&& viewDst,
+                TViewSrc const& viewSrc,
+                TExtent const& extent) -> alpaka::detail::
+                TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                using Task = alpaka::detail::
+                    TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>;
+
+                // Each view contributes the native handle of its own device.
+                auto const dstDev = getDev(viewDst).getNativeHandle();
+                auto const srcDev = getDev(viewSrc).getNativeHandle();
+
+                return Task{
+                    std::forward<TViewDstFwd>(viewDst), viewSrc, extent, TApi::memcpyDeviceToDevice, dstDev, srcDev};
+            }
+        };
+
+        //! Enqueue trait for scalar (0D) CUDA/HIP copy tasks on a non-blocking queue: issues the copy only.
+        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking<TApi>,
+            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent> const& task)
+                -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                // The task issues the copy on the queue; this overload does not synchronize afterwards.
+                task.enqueue(queue);
+            }
+        };
+
+        //! Enqueue trait for scalar (0D) CUDA/HIP copy tasks on a blocking queue: issues the copy, then synchronizes.
+        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking<TApi>,
+            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking<TApi>& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent> const& task)
+                -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                // Issue the copy on the queue.
+                task.enqueue(queue);
+                // Blocking semantics: synchronize the queue's native handle before returning.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 1D CUDA/HIP copy task to a non-blocking queue.
+        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking<TApi>,
+            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent> const& task)
+                -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                // The task submits itself to the queue; this non-blocking variant adds no synchronization.
+                task.enqueue(queue);
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 1D CUDA/HIP copy task to a blocking queue.
+        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking<TApi>,
+            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking<TApi>& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent> const& task)
+                -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                task.enqueue(queue);
+                // Blocking queue: synchronize the native stream before returning to the caller.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 2D CUDA/HIP copy task to a non-blocking queue.
+        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking<TApi>,
+            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent> const& task)
+                -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                // The task submits itself to the queue; this non-blocking variant adds no synchronization.
+                task.enqueue(queue);
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 2D CUDA/HIP copy task to a blocking queue.
+        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking<TApi>,
+            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking<TApi>& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent> const& task)
+                -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                task.enqueue(queue);
+                // Blocking queue: synchronize the native stream before returning to the caller.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 3D CUDA/HIP copy task to a non-blocking queue.
+        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking<TApi>,
+            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent> const& task)
+                -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                // The task submits itself to the queue; this non-blocking variant adds no synchronization.
+                task.enqueue(queue);
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 3D CUDA/HIP copy task to a blocking queue.
+        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking<TApi>,
+            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking<TApi>& queue,
+                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent> const& task)
+                -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                task.enqueue(queue);
+                // Blocking queue: synchronize the native stream before returning to the caller.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/mem/buf/uniformCudaHip/Set.hpp b/include/alpaka/mem/buf/uniformCudaHip/Set.hpp
new file mode 100644
index 0000000..3b6551c
--- /dev/null
+++ b/include/alpaka/mem/buf/uniformCudaHip/Set.hpp
@@ -0,0 +1,385 @@
+/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
+ *                Antonio Di Pilato, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
+#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <cstddef>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    template<typename TApi>
+    class DevUniformCudaHipRt;
+
+    namespace detail
+    {
+        //! Common state of the CUDA/HIP memory set tasks: target view, fill byte, extent and device handle.
+        template<typename TApi, typename TDim, typename TView, typename TExtent>
+        struct TaskSetUniformCudaHipBase
+        {
+            TaskSetUniformCudaHipBase(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                : m_view(view)
+                , m_byte(byte)
+                , m_extent(extent)
+                , m_iDevice(getDev(view).getNativeHandle())
+            {
+            }
+            // NOTE(review): std::uint8_t / std::int32_t are used here, but <cstdint> is not included directly.
+        protected:
+            TView& m_view; //!< The view to fill; held by reference, so it is not copied into the task.
+            std::uint8_t const m_byte; //!< The byte value handed to the memset call.
+            TExtent const m_extent; //!< Copy of the extent (in elements) to set.
+            std::int32_t const m_iDevice; //!< Native handle of the view's device (unused by the tasks in this file).
+        };
+
+        //! The CUDA/HIP memory set task; only the specializations for DimInt<0u> to DimInt<3u> are defined below.
+        template<typename TApi, typename TDim, typename TView, typename TExtent>
+        struct TaskSetUniformCudaHip;
+
+        //! The scalar (0D) CUDA/HIP memory set task: enqueue() sets exactly one element of the view.
+        template<typename TApi, typename TView, typename TExtent>
+        struct TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent>
+            : public TaskSetUniformCudaHipBase<TApi, DimInt<0u>, TView, TExtent>
+        {
+            template<typename TViewFwd>
+            TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
+                : TaskSetUniformCudaHipBase<TApi, DimInt<0u>, TView, TExtent>(
+                    std::forward<TViewFwd>(view),
+                    byte,
+                    extent)
+            {
+            }
+
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+                // One element only: sizeof(Elem<TView>) bytes are set and the stored extent is not consulted.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memsetAsync(
+                    getPtrNative(this->m_view),
+                    static_cast<int>(this->m_byte),
+                    sizeof(Elem<TView>),
+                    queue.getNativeHandle()));
+            }
+        };
+
+        //! The 1D CUDA/HIP memory set task: enqueue() sets the first getWidth(extent) elements of the view.
+        template<typename TApi, typename TView, typename TExtent>
+        struct TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent>
+            : public TaskSetUniformCudaHipBase<TApi, DimInt<1u>, TView, TExtent>
+        {
+            template<typename TViewFwd>
+            TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
+                : TaskSetUniformCudaHipBase<TApi, DimInt<1u>, TView, TExtent>(
+                    std::forward<TViewFwd>(view),
+                    byte,
+                    extent)
+            {
+            }
+
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+                auto& view = this->m_view;
+                auto const& extent = this->m_extent;
+                // The requested width (in elements) must fit into the view.
+                auto const extentWidth = getWidth(extent);
+                ALPAKA_ASSERT(extentWidth <= getWidth(view));
+                // Nothing to do for an empty extent.
+                if(extentWidth == 0)
+                {
+                    return;
+                }
+
+                // Initiate the memory set over one contiguous range, with its length converted to bytes.
+                auto const extentWidthBytes = static_cast<std::size_t>(extentWidth) * sizeof(Elem<TView>);
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memsetAsync(
+                    getPtrNative(view),
+                    static_cast<int>(this->m_byte),
+                    extentWidthBytes,
+                    queue.getNativeHandle()));
+            }
+        };
+
+        //! The 2D CUDA/HIP memory set task: enqueue() sets extentHeight rows of extentWidth elements each.
+        template<typename TApi, typename TView, typename TExtent>
+        struct TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent>
+            : public TaskSetUniformCudaHipBase<TApi, DimInt<2u>, TView, TExtent>
+        {
+            template<typename TViewFwd>
+            TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
+                : TaskSetUniformCudaHipBase<TApi, DimInt<2u>, TView, TExtent>(
+                    std::forward<TViewFwd>(view),
+                    byte,
+                    extent)
+            {
+            }
+
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+                auto& view = this->m_view;
+                auto const& extent = this->m_extent;
+
+                auto const extentWidth = getWidth(extent);
+                auto const extentHeight = getHeight(extent);
+                // Nothing to do for an empty extent.
+                if(extentWidth == 0 || extentHeight == 0)
+                {
+                    return;
+                }
+
+                auto const extentWidthBytes = static_cast<std::size_t>(extentWidth) * sizeof(Elem<TView>);
+                // dstWidth/dstHeight only feed the assertions below - presumably compiled out under NDEBUG.
+#    if !defined(NDEBUG)
+                auto const dstWidth = getWidth(view);
+                auto const dstHeight = getHeight(view);
+#    endif
+                auto const dstRowPitchBytes = static_cast<std::size_t>(getPitchesInBytes(view)[0]);
+                auto const dstNativePtr = reinterpret_cast<void*>(getPtrNative(view));
+                ALPAKA_ASSERT(extentWidth <= dstWidth);
+                ALPAKA_ASSERT(extentHeight <= dstHeight);
+
+                // Initiate the memory set: the width is passed in bytes, the height in rows of the pitched view.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memset2DAsync(
+                    dstNativePtr,
+                    dstRowPitchBytes,
+                    static_cast<int>(this->m_byte),
+                    extentWidthBytes,
+                    static_cast<std::size_t>(extentHeight),
+                    queue.getNativeHandle()));
+            }
+        };
+
+        //! The 3D CUDA/HIP memory set task: enqueue() sets a width x height x depth box of elements of the view.
+        template<typename TApi, typename TView, typename TExtent>
+        struct TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent>
+            : public TaskSetUniformCudaHipBase<TApi, DimInt<3u>, TView, TExtent>
+        {
+            template<typename TViewFwd>
+            TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
+                : TaskSetUniformCudaHipBase<TApi, DimInt<3u>, TView, TExtent>(
+                    std::forward<TViewFwd>(view),
+                    byte,
+                    extent)
+            {
+            }
+
+            template<typename TQueue>
+            auto enqueue(TQueue& queue) const -> void
+            {
+                using Elem = alpaka::Elem<TView>;
+
+                auto& view = this->m_view;
+                auto const& extent = this->m_extent;
+
+                auto const extentWidth = getWidth(extent);
+                auto const extentHeight = getHeight(extent);
+                auto const extentDepth = getDepth(extent);
+
+                // This is not only an optimization but also prevents a division by zero.
+                if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
+                {
+                    return;
+                }
+                // dstWidth is always needed for the pitched pointer; height and depth only feed the assertions.
+                auto const dstWidth = getWidth(view);
+#    if !defined(NDEBUG)
+                auto const dstHeight = getHeight(view);
+                auto const dstDepth = getDepth(view);
+#    endif
+                auto const [dstSlicePitchBytes, dstRowPitchBytes, _] = getPitchesInBytes(view);
+                auto const dstNativePtr = reinterpret_cast<void*>(getPtrNative(view));
+                ALPAKA_ASSERT(extentWidth <= dstWidth);
+                ALPAKA_ASSERT(extentHeight <= dstHeight);
+                ALPAKA_ASSERT(extentDepth <= dstDepth);
+
+                // Fill the CUDA/HIP parameter structures; the last pitched-pointer argument is rows per slice.
+                typename TApi::PitchedPtr_t const pitchedPtrVal = TApi::makePitchedPtr(
+                    dstNativePtr,
+                    static_cast<std::size_t>(dstRowPitchBytes),
+                    static_cast<std::size_t>(dstWidth) * sizeof(Elem),
+                    static_cast<std::size_t>(dstSlicePitchBytes / dstRowPitchBytes));
+
+                typename TApi::Extent_t const extentVal = TApi::makeExtent(
+                    static_cast<std::size_t>(extentWidth) * sizeof(Elem),
+                    static_cast<std::size_t>(extentHeight),
+                    static_cast<std::size_t>(extentDepth));
+
+                // Initiate the memory set.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memset3DAsync(
+                    pitchedPtrVal,
+                    static_cast<int>(this->m_byte),
+                    extentVal,
+                    queue.getNativeHandle()));
+            }
+        };
+    } // namespace detail
+
+    namespace trait
+    {
+        //! CreateTaskMemset specialization for CUDA/HIP devices: builds the memset task matching TDim.
+        template<typename TApi, typename TDim>
+        struct CreateTaskMemset<TDim, DevUniformCudaHipRt<TApi>>
+        {
+            template<typename TExtent, typename TView>
+            ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                -> alpaka::detail::TaskSetUniformCudaHip<TApi, TDim, TView, TExtent>
+            {
+                return {view, byte, extent}; // list-initializes the task type named in the trailing return type
+            }
+        };
+
+        //! Enqueue trait specialization submitting a scalar (0D) CUDA/HIP memset task to a non-blocking queue.
+        template<typename TApi, typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking<TApi>,
+            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // The task issues its asynchronous memset on the queue; no synchronization follows.
+                task.enqueue(queue);
+            }
+        };
+
+        //! Enqueue trait specialization submitting a scalar (0D) CUDA/HIP memset task to a blocking queue.
+        template<typename TApi, typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking<TApi>,
+            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking<TApi>& queue,
+                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                task.enqueue(queue);
+                // Blocking queue: wait on the queue before returning to the caller.
+                wait(queue);
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 1D CUDA/HIP memset task to a non-blocking queue.
+        template<typename TApi, typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking<TApi>,
+            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // The task issues its asynchronous memset on the queue; no synchronization follows.
+                task.enqueue(queue);
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 1D CUDA/HIP memset task to a blocking queue.
+        template<typename TApi, typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking<TApi>,
+            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking<TApi>& queue,
+                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                task.enqueue(queue);
+                // Blocking queue: wait on the queue before returning to the caller.
+                wait(queue);
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 2D CUDA/HIP memset task to a non-blocking queue.
+        template<typename TApi, typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking<TApi>,
+            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // The task issues its asynchronous memset on the queue; no synchronization follows.
+                task.enqueue(queue);
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 2D CUDA/HIP memset task to a blocking queue.
+        template<typename TApi, typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking<TApi>,
+            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking<TApi>& queue,
+                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                task.enqueue(queue);
+                // Blocking queue: wait on the queue before returning to the caller.
+                wait(queue);
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 3D CUDA/HIP memset task to a non-blocking queue.
+        template<typename TApi, typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtNonBlocking<TApi>,
+            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
+                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // The task issues its asynchronous memset on the queue; no synchronization follows.
+                task.enqueue(queue);
+            }
+        };
+
+        //! Enqueue trait specialization submitting a 3D CUDA/HIP memset task to a blocking queue.
+        template<typename TApi, typename TView, typename TExtent>
+        struct Enqueue<
+            QueueUniformCudaHipRtBlocking<TApi>,
+            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueUniformCudaHipRtBlocking<TApi>& queue,
+                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent> const& task) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                task.enqueue(queue);
+                // Blocking queue: wait on the queue before returning to the caller.
+                wait(queue);
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/mem/fence/MemFenceCpu.hpp b/include/alpaka/mem/fence/MemFenceCpu.hpp
new file mode 100644
index 0000000..43b8cd9
--- /dev/null
+++ b/include/alpaka/mem/fence/MemFenceCpu.hpp
@@ -0,0 +1,61 @@
+/* Copyright 2022 Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/mem/fence/Traits.hpp"
+
+#include <atomic>
+
+namespace alpaka
+{
+    //! The default CPU memory fence. Empty tag class; the fence is implemented by trait::MemFence below.
+    class MemFenceCpu : public concepts::Implements<ConceptMemFence, MemFenceCpu>
+    {
+    };
+
+    namespace trait
+    {
+        template<typename TMemScope>
+        struct MemFence<MemFenceCpu, TMemScope>
+        {
+            static auto mem_fence(MemFenceCpu const&, TMemScope const&)
+            {
+                /*
+                 * Intuitively, std::atomic_thread_fence creates a fence on the block level.
+                 * The same fence is issued for every TMemScope; the scope argument is never inspected.
+                 * Creating a block fence is enough for the whole device because the blocks are executed serially. By
+                 * definition of fences, preceding blocks don't have a guarantee to see the results of this block's
+                 * STORE operations (only that they will be ordered correctly); the following blocks see the results
+                 * once they start. Consider the following code:
+                 *
+                 * int x = 1;
+                 * int y = 2;
+                 *
+                 * void foo()
+                 * {
+                 *     x = 10;
+                 *     alpaka::mem_fence(acc, memory_scope::Device{});
+                 *     y = 20;
+                 * }
+                 *
+                 * void bar()
+                 * {
+                 *     auto b = y;
+                 *     alpaka::mem_fence(acc, memory_scope::Device{});
+                 *     auto a = x;
+                 * }
+                 *
+                 * The following are all valid outcomes (the fences only rule out a == 1 && b == 20):
+                 *   a == 1 && b == 2
+                 *   a == 10 && b == 2
+                 *   a == 10 && b == 20
+                 */
+                // Acquire-release fence, issued regardless of the requested memory scope.
+                std::atomic_thread_fence(std::memory_order_acq_rel);
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/mem/fence/MemFenceCpuSerial.hpp b/include/alpaka/mem/fence/MemFenceCpuSerial.hpp
new file mode 100644
index 0000000..df981f1
--- /dev/null
+++ b/include/alpaka/mem/fence/MemFenceCpuSerial.hpp
@@ -0,0 +1,49 @@
+/* Copyright 2022 Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/mem/fence/Traits.hpp"
+
+#include <atomic>
+
+namespace alpaka
+{
+    //! The serial CPU memory fence. Empty tag class; the per-scope behaviour is defined by trait::MemFence below.
+    class MemFenceCpuSerial : public concepts::Implements<ConceptMemFence, MemFenceCpuSerial>
+    {
+    };
+
+    namespace trait
+    {
+        template<>
+        struct MemFence<MemFenceCpuSerial, memory_scope::Block>
+        {
+            static auto mem_fence(MemFenceCpuSerial const&, memory_scope::Block const&)
+            {
+                /* Deliberately empty: nothing to be done on the block level for the serial case. */
+            }
+        };
+
+        template<>
+        struct MemFence<MemFenceCpuSerial, memory_scope::Grid>
+        {
+            static auto mem_fence(MemFenceCpuSerial const&, memory_scope::Grid const&)
+            {
+                /* Deliberately empty: nothing to be done on the grid level for the serial case. */
+            }
+        };
+
+        template<typename TMemScope>
+        struct MemFence<MemFenceCpuSerial, TMemScope>
+        {
+            static auto mem_fence(MemFenceCpuSerial const&, TMemScope const&)
+            {
+                /* All other scopes (device): real fence, since we may synchronize with other (serial) kernels. */
+                std::atomic_thread_fence(std::memory_order_acq_rel);
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
new file mode 100644
index 0000000..2c2cd9e
--- /dev/null
+++ b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
@@ -0,0 +1,60 @@
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/mem/fence/Traits.hpp"
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        template<typename TAlpakaMemScope>
+        struct SyclFenceProps // Primary template: provides no 'scope' member.
+        {
+        };
+
+        template<>
+        struct SyclFenceProps<alpaka::memory_scope::Block> // Block scope -> SYCL work-group scope.
+        {
+            static constexpr sycl::memory_scope scope = sycl::memory_scope::work_group;
+        };
+
+        template<>
+        struct SyclFenceProps<alpaka::memory_scope::Device> // Device scope -> SYCL device scope.
+        {
+            static constexpr sycl::memory_scope scope = sycl::memory_scope::device;
+        };
+
+        template<>
+        struct SyclFenceProps<alpaka::memory_scope::Grid> // Grid scope -> SYCL device scope as well.
+        {
+            static constexpr sycl::memory_scope scope = sycl::memory_scope::device;
+        };
+    } // namespace detail
+
+    //! The SYCL memory fence. Empty tag class; the fence is implemented by alpaka::trait::MemFence below.
+    class MemFenceGenericSycl : public concepts::Implements<ConceptMemFence, MemFenceGenericSycl>
+    { // NOTE(review): alpaka/core/Concepts.hpp is not included directly by this header - confirm.
+    };
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    template<typename TMemScope>
+    struct MemFence<MemFenceGenericSycl, TMemScope>
+    {
+        static auto mem_fence(MemFenceGenericSycl const&, TMemScope const&)
+        {
+            // Translate the alpaka scope tag into its SYCL scope and issue an acquire-release fence on it.
+            sycl::atomic_fence(sycl::memory_order::acq_rel, alpaka::detail::SyclFenceProps<TMemScope>::scope);
+        }
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/mem/fence/MemFenceOmp2Blocks.hpp b/include/alpaka/mem/fence/MemFenceOmp2Blocks.hpp
new file mode 100644
index 0000000..09f7811
--- /dev/null
+++ b/include/alpaka/mem/fence/MemFenceOmp2Blocks.hpp
@@ -0,0 +1,54 @@
+/* Copyright 2022 Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/mem/fence/Traits.hpp"
+
+#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
+
+namespace alpaka
+{
+    //! The CPU OpenMP 2.0 block memory fence. Empty tag class; behaviour is defined by trait::MemFence below.
+    class MemFenceOmp2Blocks : public concepts::Implements<ConceptMemFence, MemFenceOmp2Blocks>
+    {
+    };
+
+    namespace trait
+    {
+        template<>
+        struct MemFence<MemFenceOmp2Blocks, memory_scope::Block>
+        {
+            static auto mem_fence(MemFenceOmp2Blocks const&, memory_scope::Block const&)
+            {
+                // Only one thread per block allowed -> no memory fence required on block level
+            }
+        };
+
+        template<>
+        struct MemFence<MemFenceOmp2Blocks, memory_scope::Grid>
+        {
+            static auto mem_fence(MemFenceOmp2Blocks const&, memory_scope::Grid const&)
+            { // Grid scope: issue an OpenMP flush.
+#    pragma omp flush
+            } // NOTE(review): MemFenceOmp2Threads.hpp follows this pragma with an MSVC-only ';' - confirm here.
+        };
+
+        template<>
+        struct MemFence<MemFenceOmp2Blocks, memory_scope::Device>
+        {
+            static auto mem_fence(MemFenceOmp2Blocks const&, memory_scope::Device const&)
+            { // Device scope: same OpenMP flush as for the grid scope.
+#    pragma omp flush
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/mem/fence/MemFenceOmp2Threads.hpp b/include/alpaka/mem/fence/MemFenceOmp2Threads.hpp
new file mode 100644
index 0000000..45ba0d5
--- /dev/null
+++ b/include/alpaka/mem/fence/MemFenceOmp2Threads.hpp
@@ -0,0 +1,68 @@
+/* Copyright 2022 Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/mem/fence/Traits.hpp"
+
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
+
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
+
+namespace alpaka
+{
+    //! The CPU OpenMP 2.0 thread memory fence. Empty tag class; behaviour is defined by trait::MemFence below.
+    class MemFenceOmp2Threads : public concepts::Implements<ConceptMemFence, MemFenceOmp2Threads>
+    {
+    };
+
+    namespace trait
+    {
+        template<typename TMemScope>
+        struct MemFence<MemFenceOmp2Threads, TMemScope>
+        {
+            static auto mem_fence(MemFenceOmp2Threads const&, TMemScope const&)
+            {
+                /*
+                 * Intuitively, this pragma creates a fence on the block level.
+                 * The same flush is issued for every TMemScope; the scope argument is never inspected.
+                 * Creating a block fence is enough for the whole device because the blocks are executed serially. By
+                 * definition of fences, preceding blocks don't have a guarantee to see the results of this block's
+                 * STORE operations (only that they will be ordered correctly); the following blocks see the results
+                 * once they start. Consider the following code:
+                 *
+                 * int x = 1;
+                 * int y = 2;
+                 *
+                 * void foo()
+                 * {
+                 *     x = 10;
+                 *     alpaka::mem_fence(acc, memory_scope::Device{});
+                 *     y = 20;
+                 * }
+                 *
+                 * void bar()
+                 * {
+                 *     auto b = y;
+                 *     alpaka::mem_fence(acc, memory_scope::Device{});
+                 *     auto a = x;
+                 * }
+                 *
+                 * The following are all valid outcomes (the fences only rule out a == 1 && b == 20):
+                 *   a == 1 && b == 2
+                 *   a == 10 && b == 2
+                 *   a == 10 && b == 20
+                 */
+#    pragma omp flush
+#    ifdef _MSC_VER
+                ; // MSVC needs an empty statement here or it diagnoses a syntax error
+#    endif
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+#endif
diff --git a/include/alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp b/include/alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..d94b1bc
--- /dev/null
+++ b/include/alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,65 @@
+/* Copyright 2022 Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/mem/fence/Traits.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    //! The GPU CUDA/HIP memory fence: an empty implementation type that trait::MemFence is specialized on.
+    class MemFenceUniformCudaHipBuiltIn : public concepts::Implements<ConceptMemFence, MemFenceUniformCudaHipBuiltIn>
+    {
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+    namespace trait
+    {
+        template<>
+        struct MemFence<MemFenceUniformCudaHipBuiltIn, memory_scope::Block>
+        {
+            __device__ static auto mem_fence(MemFenceUniformCudaHipBuiltIn const&, memory_scope::Block const&)
+            {
+                __threadfence_block(); // block scope maps to the block-level built-in fence
+            }
+        };
+
+        template<>
+        struct MemFence<MemFenceUniformCudaHipBuiltIn, memory_scope::Grid>
+        {
+            __device__ static auto mem_fence(MemFenceUniformCudaHipBuiltIn const&, memory_scope::Grid const&)
+            {
+                // CUDA and HIP do not have a per-grid memory fence, so a device-level fence is used
+                __threadfence();
+            }
+        };
+
+        template<>
+        struct MemFence<MemFenceUniformCudaHipBuiltIn, memory_scope::Device>
+        {
+            __device__ static auto mem_fence(MemFenceUniformCudaHipBuiltIn const&, memory_scope::Device const&)
+            {
+                __threadfence(); // device scope maps to the device-level built-in fence
+            }
+        };
+    } // namespace trait
+
+#    endif
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/mem/fence/Traits.hpp b/include/alpaka/mem/fence/Traits.hpp
new file mode 100644
index 0000000..da02ff3
--- /dev/null
+++ b/include/alpaka/mem/fence/Traits.hpp
@@ -0,0 +1,66 @@
+/* Copyright 2022 Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+namespace alpaka
+{
+    struct ConceptMemFence // concept tag: fence implementations derive from concepts::Implements<ConceptMemFence, ...>
+    {
+    };
+
+    namespace memory_scope // empty tag types selecting the scope argument of mem_fence()
+    {
+        //! Block scope: memory fences are observed by all threads in the same block.
+        struct Block
+        {
+        };
+
+        //! Grid scope: memory fences are observed by all threads in the same grid.
+        struct Grid
+        {
+        };
+
+        //! Device scope: memory fences are observed by all threads on the device.
+        struct Device
+        {
+        };
+    } // namespace memory_scope
+
+    //! The memory fence traits.
+    namespace trait
+    {
+        //! The mem_fence trait. Specializations provide a static mem_fence(fence, scope) for one implementation/scope.
+        template<typename TMemFence, typename TMemScope, typename TSfinae = void>
+        struct MemFence;
+    } // namespace trait
+
+    //! Issues memory fence instructions.
+    //!
+    //! Issues a memory fence instruction for a given memory scope (\a memory_scope::Block, \a memory_scope::Grid
+    //! or \a memory_scope::Device). This guarantees the following:
+    //! * All \a LOAD instructions preceding the fence will always occur before the LOAD instructions following the
+    //!   fence (\a LoadLoad coherence)
+    //! * All \a STORE instructions preceding the fence will always occur before the STORE instructions following the
+    //!   fence (\a StoreStore coherence). The pre-fence STORE results will be propagated to the other threads in the
+    //!   scope at an unknown point in time.
+    //!
+    //! Note that there are no further guarantees, especially with regard to \a LoadStore ordering. Users should not
+    //! mistake this as a synchronization function between threads (please use syncBlockThreads() instead).
+    //!
+    //! \tparam TMemFence The memory fence implementation type.
+    //! \tparam TMemScope The memory scope type.
+    //! \param fence The memory fence implementation.
+    //! \param scope The memory scope.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TMemFence, typename TMemScope>
+    ALPAKA_FN_ACC auto mem_fence(TMemFence const& fence, TMemScope const& scope) -> void
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptMemFence, TMemFence>;
+        trait::MemFence<ImplementationBase, TMemScope>::mem_fence(fence, scope);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/mem/global/DeviceGlobalCpu.hpp b/include/alpaka/mem/global/DeviceGlobalCpu.hpp
new file mode 100644
index 0000000..aafcb06
--- /dev/null
+++ b/include/alpaka/mem/global/DeviceGlobalCpu.hpp
@@ -0,0 +1,151 @@
+/* Copyright 2024 Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/mem/buf/cpu/Copy.hpp"
+#include "alpaka/mem/global/Traits.hpp"
+#include "alpaka/mem/view/ViewPlainPtr.hpp"
+
+#include <type_traits>
+
+// memcpy specialization for device global variables
+namespace alpaka
+{
+
+    namespace detail // every CPU tag below maps its device global to the generic wrapper DevGlobalImplGeneric
+    {
+        template<typename T>
+        struct DevGlobalTrait<TagCpuOmp2Blocks, T>
+        {
+            using Type = DevGlobalImplGeneric<TagCpuOmp2Blocks, T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagCpuOmp2Threads, T>
+        {
+            using Type = DevGlobalImplGeneric<TagCpuOmp2Threads, T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagCpuSerial, T>
+        {
+            using Type = DevGlobalImplGeneric<TagCpuSerial, T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagCpuTbbBlocks, T>
+        {
+            using Type = DevGlobalImplGeneric<TagCpuTbbBlocks, T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagCpuThreads, T>
+        {
+            using Type = DevGlobalImplGeneric<TagCpuThreads, T>;
+        };
+    } // namespace detail
+
+    template<
+        typename TTag,
+        typename TViewSrc,
+        typename TTypeDst,
+        typename TQueue,
+        typename std::enable_if_t<
+            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
+                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
+                || std::is_same_v<TTag, TagCpuThreads>,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        TQueue& queue,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
+        TViewSrc const& viewSrc) -> void
+    {
+        // Source view -> CPU device global: wrap the global's storage in a ViewPlainPtr and enqueue a view copy.
+        using Elem = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
+        auto srcExtent = getExtents(viewSrc); // the whole source view is copied
+        using DstView = ViewPlainPtr<DevCpu, Elem, Dim<decltype(srcExtent)>, Idx<decltype(srcExtent)>>;
+        auto* dstPtr = reinterpret_cast<Elem*>(const_cast<std::remove_const_t<TTypeDst>*>(&viewDst));
+        auto dstView = DstView(dstPtr, alpaka::getDev(queue), srcExtent);
+        enqueue(queue, createTaskMemcpy(std::forward<DstView>(dstView), viewSrc, srcExtent));
+    }
+
+    template<
+        typename TTag,
+        typename TTypeSrc,
+        typename TViewDstFwd,
+        typename TQueue,
+        typename std::enable_if_t<
+            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
+                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
+                || std::is_same_v<TTag, TagCpuThreads>,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        TQueue& queue,
+        TViewDstFwd&& viewDst,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc) -> void
+    {
+        // CPU device global -> destination view: expose the global's storage as a ViewPlainPtr source and copy it.
+        using Elem = std::remove_all_extents_t<TTypeSrc>;
+        auto dstExtent = getExtents(viewDst); // the destination view determines how much is copied
+        using SrcView = ViewPlainPtr<DevCpu, Elem, Dim<decltype(dstExtent)>, Idx<decltype(dstExtent)>>;
+        auto* srcPtr = reinterpret_cast<Elem*>(&viewSrc);
+        auto srcView = SrcView(srcPtr, alpaka::getDev(queue), dstExtent);
+        enqueue(queue, createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), srcView, dstExtent));
+    }
+
+    template<
+        typename TTag,
+        typename TExtent,
+        typename TViewSrc,
+        typename TTypeDst,
+        typename TQueue,
+        typename std::enable_if_t<
+            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
+                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
+                || std::is_same_v<TTag, TagCpuThreads>,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        TQueue& queue,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
+        TViewSrc const& viewSrc,
+        TExtent const& extent) -> void
+    {
+        // Source view -> CPU device global, limited to `extent`: wrap the global's storage and enqueue a view copy.
+        using Elem = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
+        using DstView = ViewPlainPtr<DevCpu, Elem, Dim<TExtent>, Idx<TExtent>>;
+        auto* dstPtr = reinterpret_cast<Elem*>(const_cast<std::remove_const_t<TTypeDst>*>(&viewDst));
+        auto dstView = DstView(dstPtr, alpaka::getDev(queue), extent);
+        enqueue(queue, createTaskMemcpy(std::forward<DstView>(dstView), viewSrc, extent));
+    }
+
+    template<
+        typename TTag,
+        typename TExtent,
+        typename TTypeSrc,
+        typename TViewDstFwd,
+        typename TQueue,
+        typename std::enable_if_t<
+            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
+                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
+                || std::is_same_v<TTag, TagCpuThreads>,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        TQueue& queue,
+        TViewDstFwd&& viewDst,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc,
+        TExtent const& extent) -> void
+    {
+        // CPU device global -> destination view, limited to `extent`: expose the global's storage as a view source.
+        using Elem = std::remove_all_extents_t<TTypeSrc>;
+        using SrcView = ViewPlainPtr<DevCpu, Elem, Dim<TExtent>, Idx<TExtent>>;
+        auto* srcPtr = reinterpret_cast<Elem*>(&viewSrc);
+        auto srcView = SrcView(srcPtr, alpaka::getDev(queue), extent);
+        enqueue(queue, createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), srcView, extent));
+    }
+} // namespace alpaka
diff --git a/include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp b/include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp
new file mode 100644
index 0000000..56ee98c
--- /dev/null
+++ b/include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp
@@ -0,0 +1,96 @@
+/* Copyright 2024 Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/elem/Traits.hpp"
+#include "alpaka/mem/global/Traits.hpp"
+#include "alpaka/queue/sycl/QueueGenericSyclBase.hpp"
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        template<typename T>
+        struct DevGlobalTrait<TagCpuSycl, T>
+        {
+            // SYCL CPU implementation: the oneAPI experimental device_global is used directly
+            using Type = sycl::ext::oneapi::experimental::device_global<T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagGpuSyclIntel, T>
+        {
+            // SYCL GPU implementation: the oneAPI experimental device_global is used directly
+            using Type = sycl::ext::oneapi::experimental::device_global<T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagFpgaSyclIntel, T>
+        {
+            // SYCL FPGA implementation: the oneAPI experimental device_global is used directly
+            using Type = sycl::ext::oneapi::experimental::device_global<T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagGenericSycl, T>
+        {
+            // generic SYCL implementation: the oneAPI experimental device_global is used directly
+            using Type = sycl::ext::oneapi::experimental::device_global<T>;
+        };
+    } // namespace detail
+
+    // from device to host: copies the device global viewSrc to the native pointer of viewDst via the native queue
+    template<typename TDev, bool TBlocking, typename TViewDst, typename TTypeSrc>
+    ALPAKA_FN_HOST auto memcpy(
+        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
+        TViewDst&& viewDst,
+        sycl::ext::oneapi::experimental::device_global<TTypeSrc> const& viewSrc)
+    { // NOTE(review): the result of memcpy() is discarded, also when TBlocking is true - confirm no wait is needed
+        queue.getNativeHandle().memcpy(reinterpret_cast<void*>(getPtrNative(viewDst)), viewSrc);
+    }
+
+    // from host to device: copies from the native pointer of viewSrc into the device global viewDst
+    template<typename TDev, bool TBlocking, typename TTypeDst, typename TViewSrc>
+    ALPAKA_FN_HOST auto memcpy(
+        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
+        sycl::ext::oneapi::experimental::device_global<TTypeDst>& viewDst,
+        TViewSrc const& viewSrc)
+    {
+        queue.getNativeHandle().memcpy(viewDst, reinterpret_cast<void const*>(getPtrNative(viewSrc)));
+    }
+
+    // from device to host, limited to the byte count derived from extent and the destination element size
+    template<typename TDev, bool TBlocking, typename TViewDst, typename TTypeSrc, typename TExtent>
+    ALPAKA_FN_HOST auto memcpy(
+        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
+        TViewDst&& viewDst,
+        sycl::ext::oneapi::experimental::device_global<TTypeSrc> const& viewSrc,
+        TExtent extent)
+    { // NOTE(review): the result of memcpy() is discarded, also when TBlocking is true - confirm no wait is needed
+        using DstElem = alpaka::Elem<std::remove_reference_t<TViewDst>>;
+        auto const sz = [](auto v) { return static_cast<std::size_t>(v); };
+        auto const bytes = sz(getHeight(extent)) * sz(getDepth(extent)) * sz(getWidth(extent)) * sizeof(DstElem);
+        queue.getNativeHandle().memcpy(reinterpret_cast<void*>(getPtrNative(viewDst)), viewSrc, bytes);
+    }
+
+    // from host to device, limited to the byte count derived from extent and the source element size
+    template<typename TDev, bool TBlocking, typename TTypeDst, typename TViewSrc, typename TExtent>
+    ALPAKA_FN_HOST auto memcpy(
+        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
+        sycl::ext::oneapi::experimental::device_global<TTypeDst>& viewDst,
+        TViewSrc const& viewSrc,
+        TExtent extent)
+    {
+        using SrcElem = alpaka::Elem<TViewSrc>;
+        auto const sz = [](auto v) { return static_cast<std::size_t>(v); };
+        auto const bytes = sz(getHeight(extent)) * sz(getDepth(extent)) * sz(getWidth(extent)) * sizeof(SrcElem);
+        queue.getNativeHandle().memcpy(viewDst, reinterpret_cast<void const*>(getPtrNative(viewSrc)), bytes);
+    }
+} // namespace alpaka
+#endif
diff --git a/include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp b/include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..6b802fc
--- /dev/null
+++ b/include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,187 @@
+/* Copyright 2024 Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+#include "alpaka/mem/global/Traits.hpp"
+#include "alpaka/mem/view/ViewPlainPtr.hpp"
+#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include "alpaka/core/ApiCudaRt.hpp"
+#    endif
+
+#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#        include "alpaka/core/ApiHipRt.hpp"
+#    endif
+
+namespace alpaka
+{
+
+    namespace detail
+    {
+        template<typename T>
+        struct DevGlobalTrait<TagGpuCudaRt, T>
+        {
+            // CUDA implementation: generic wrapper; the memcpy overloads below resolve it via getSymbolAddress
+            using Type = detail::DevGlobalImplGeneric<TagGpuCudaRt, T>;
+        };
+
+        template<typename T>
+        struct DevGlobalTrait<TagGpuHipRt, T>
+        {
+            // HIP/ROCm implementation: generic wrapper; the memcpy overloads below resolve it via getSymbolAddress
+            using Type = detail::DevGlobalImplGeneric<TagGpuHipRt, T>;
+        };
+    } // namespace detail
+
+    // from device to host: resolves the global's device address and copies it into viewDst via a plain-pointer view
+    template<
+        typename TTag,
+        typename TApi,
+        bool TBlocking,
+        typename TViewDst,
+        typename TTypeSrc,
+        typename std::enable_if_t<
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
+#    else
+            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
+#    endif
+                ,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue, // queue the copy is enqueued into
+        TViewDst& viewDst, // destination view; its extents determine how much is copied
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc) // device global to read from
+    {
+        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeSrc>>;
+        using TypeExt = std::remove_const_t<TTypeSrc>;
+        auto extent = getExtents(viewDst);
+        Type* pMemAcc(nullptr); // element pointer, declared like in the sibling overloads
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *(const_cast<TypeExt*>(&viewSrc))));
+        // viewDst is a plain lvalue reference (not a forwarding reference), so it is passed on as an lvalue below.
+        auto view = alpaka::ViewPlainPtr<
+            DevUniformCudaHipRt<TApi>,
+            Type,
+            alpaka::Dim<decltype(extent)>,
+            alpaka::Idx<decltype(extent)>>(pMemAcc, alpaka::getDev(queue), extent);
+        enqueue(queue, createTaskMemcpy(viewDst, view, extent));
+    }
+
+    // from host to device: copies the whole source view into the device global
+    template<
+        typename TTag,
+        typename TApi,
+        bool TBlocking,
+        typename TTypeDst,
+        typename TViewSrc,
+        typename std::enable_if_t<
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
+#    else
+            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
+#    endif
+                ,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
+        TViewSrc const& viewSrc)
+    {
+        // Resolve the device address of the global, wrap it in a plain-pointer view and enqueue a regular copy.
+        using Elem = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
+        using Symbol = std::remove_const_t<TTypeDst>;
+        auto srcExtent = getExtents(viewSrc);
+        using DstView
+            = ViewPlainPtr<DevUniformCudaHipRt<TApi>, Elem, Dim<decltype(srcExtent)>, Idx<decltype(srcExtent)>>;
+
+        Elem* devPtr = nullptr;
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            TApi::getSymbolAddress(reinterpret_cast<void**>(&devPtr), *(const_cast<Symbol*>(&viewDst))));
+
+        auto dstView = DstView(devPtr, alpaka::getDev(queue), srcExtent);
+        enqueue(queue, createTaskMemcpy(std::forward<DstView>(dstView), viewSrc, srcExtent));
+    }
+
+    // from device to host: copies `extent` elements of the device global into viewDst via a plain-pointer view
+    template<
+        typename TTag,
+        typename TApi,
+        bool TBlocking,
+        typename TViewDst,
+        typename TTypeSrc,
+        typename TExtent,
+        typename std::enable_if_t<
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
+#    else
+            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
+#    endif
+                ,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue, // queue the copy is enqueued into
+        TViewDst& viewDst, // destination view
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc, // device global to read from
+        TExtent extent) // extent of the region to copy
+    {
+        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeSrc>>;
+        using TypeExt = std::remove_const_t<TTypeSrc>;
+        Type* pMemAcc(nullptr);
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *(const_cast<TypeExt*>(&viewSrc))));
+        // viewDst is a plain lvalue reference (not a forwarding reference), so it is passed on as an lvalue below.
+        auto view = alpaka::ViewPlainPtr<DevUniformCudaHipRt<TApi>, Type, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+            pMemAcc,
+            alpaka::getDev(queue),
+            extent);
+        enqueue(queue, createTaskMemcpy(viewDst, view, extent));
+    }
+
+    // from host to device: copies `extent` elements of the source view into the device global
+    template<
+        typename TTag,
+        typename TApi,
+        bool TBlocking,
+        typename TTypeDst,
+        typename TViewSrc,
+        typename TExtent,
+        typename std::enable_if_t<
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
+#    else
+            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
+#    endif
+                ,
+            int>
+        = 0>
+    ALPAKA_FN_HOST auto memcpy(
+        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
+        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
+        TViewSrc const& viewSrc,
+        TExtent extent)
+    {
+        // Resolve the device address of the global, wrap it in a plain-pointer view and enqueue a regular copy.
+        using Elem = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
+        using Symbol = std::remove_const_t<TTypeDst>;
+        using DstView = ViewPlainPtr<DevUniformCudaHipRt<TApi>, Elem, Dim<TExtent>, Idx<TExtent>>;
+
+        Elem* devPtr = nullptr;
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            TApi::getSymbolAddress(reinterpret_cast<void**>(&devPtr), *(const_cast<Symbol*>(&viewDst))));
+
+        auto dstView = DstView(devPtr, alpaka::getDev(queue), extent);
+        enqueue(queue, createTaskMemcpy(std::forward<DstView>(dstView), viewSrc, extent));
+    }
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/mem/global/Traits.hpp b/include/alpaka/mem/global/Traits.hpp
new file mode 100644
index 0000000..7b3c3d1
--- /dev/null
+++ b/include/alpaka/mem/global/Traits.hpp
@@ -0,0 +1,45 @@
+/* Copyright 2024 Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/meta/DependentFalseType.hpp"
+
+namespace alpaka
+{
+
+    namespace detail
+    {
+        template<typename TTag, typename T>
+        struct DevGlobalImplGeneric
+        {
+            // Generic wrapper around one variable of type T; does not make use of TTag
+            using Type = std::remove_const_t<T>;
+            Type value; // backend specific value, stored without the const qualifier of T
+
+            ALPAKA_FN_HOST_ACC T* operator&() // address-of yields a pointer to the wrapped value, not to the wrapper
+            {
+                return &value;
+            }
+
+            ALPAKA_FN_HOST_ACC T& get() // reference to the wrapped value
+            {
+                return value;
+            }
+        };
+
+        template<typename TTag, typename T>
+        struct DevGlobalTrait // primary template, used for every tag that has no specialization
+        {
+            static constexpr bool const IsImplementedFor = alpaka::meta::DependentFalseType<TTag>::value;
+            // NOTE(review): DependentFalseType is presumably always false, so the assert fires on instantiation only
+            static_assert(IsImplementedFor, "Error: device global variables are not implemented for the given Tag");
+        };
+    } // namespace detail
+
+    template<typename TAcc, typename T> // TAcc is mapped to its tag via trait::AccToTag; T is the variable's type
+    using DevGlobal = typename detail::DevGlobalTrait<typename alpaka::trait::AccToTag<TAcc>::type, T>::Type;
+} // namespace alpaka
diff --git a/include/alpaka/mem/view/Traits.hpp b/include/alpaka/mem/view/Traits.hpp
new file mode 100644
index 0000000..5a9db5b
--- /dev/null
+++ b/include/alpaka/mem/view/Traits.hpp
@@ -0,0 +1,614 @@
+/* Copyright 2024 Axel Hübl, Benjamin Worpitz, Matthias Werner, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber,
+ *                Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Unreachable.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/elem/Traits.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/meta/Fold.hpp"
+#include "alpaka/meta/Integral.hpp"
+#include "alpaka/offset/Traits.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/vec/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <array>
+#include <cstddef>
+#include <iosfwd>
+#include <type_traits>
+#include <vector>
+#ifdef ALPAKA_USE_MDSPAN
+#    include <experimental/mdspan>
+#endif
+
+namespace alpaka
+{
+    namespace detail
+    {
+        //! Derives the pitch of every dimension from the extents alone, i.e. assuming a view without padding.
+        template<typename TElem, typename TDim, typename TIdx>
+        ALPAKA_FN_HOST_ACC inline constexpr auto calculatePitchesFromExtents(Vec<TDim, TIdx> const& extent)
+        {
+            constexpr auto rank = TIdx{TDim::value};
+            Vec<TDim, TIdx> pitches{};
+            if constexpr(rank > 0)
+                pitches.back() = static_cast<TIdx>(sizeof(TElem)); // last entry: size of one element
+            if constexpr(rank > 1)
+                for(TIdx d = rank - 1; d > 0; --d)
+                    pitches[d - 1] = extent[d] * pitches[d]; // outer pitch = inner extent * inner pitch
+            return pitches;
+        }
+    } // namespace detail
+
+    //! The view traits.
+    namespace trait
+    {
+        //! The native pointer get trait.
+        template<typename TView, typename TSfinae = void>
+        struct GetPtrNative;
+
+        //! The pointer on device get trait.
+        template<typename TView, typename TDev, typename TSfinae = void>
+        struct GetPtrDev;
+
+        //! The pitch in bytes.
+        //! This is the distance in bytes in the linear memory between two consecutive elements in the next higher
+        //! dimension (TIdx-1).
+        //!
+        //! The default implementation uses the extent to calculate the pitch.
+        template<typename TIdx, typename TView, typename TSfinae = void>
+        struct [[deprecated("Use GetPitchesInBytes instead")]] GetPitchBytes
+        {
+            using ViewIdx = Idx<TView>;
+
+            ALPAKA_FN_HOST static auto getPitchBytes(TView const& view) -> ViewIdx
+            {
+                return getPitchBytesDefault(view);
+            }
+
+        private:
+            static auto getPitchBytesDefault(TView const& view) -> ViewIdx
+            {
+                constexpr auto idx = TIdx::value;
+                constexpr auto viewDim = Dim<TView>::value;
+                if constexpr(idx < viewDim - 1) // outer dimension: extent times the pitch of the next inner dimension
+                {
+#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+                    return getExtents(view)[idx] * GetPitchBytes<DimInt<idx + 1>, TView>::getPitchBytes(view);
+#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+                }
+                else if constexpr(idx == viewDim - 1) // innermost dimension: innermost extent times the element size
+                    return getExtents(view)[viewDim - 1] * static_cast<ViewIdx>(sizeof(Elem<TView>));
+                else // idx beyond the view's dimensions: size of a single element
+                    return static_cast<ViewIdx>(sizeof(Elem<TView>));
+                ALPAKA_UNREACHABLE({}); // every branch above returns
+            }
+        };
+
+        //! Customization point for \ref getPitchesInBytes.
+        //! The default implementation uses the extent to calculate the pitches.
+        template<typename TView, typename TSfinae = void>
+        struct GetPitchesInBytes
+        {
+            ALPAKA_FN_HOST_ACC constexpr auto operator()(TView const& view) const
+            {
+                return alpaka::detail::calculatePitchesFromExtents<Elem<TView>>(getExtents(view));
+            }
+        };
+
+        //! The memory set task trait.
+        //!
+        //! Fills the view with data.
+        template<typename TDim, typename TDev, typename TSfinae = void>
+        struct CreateTaskMemset;
+
+        //! The memory copy task trait.
+        //!
+        //! Copies memory from one view into another view possibly on a different device.
+        template<typename TDim, typename TDevDst, typename TDevSrc, typename TSfinae = void>
+        struct CreateTaskMemcpy;
+
+        //! The device memory view creation trait.
+        template<typename TDev, typename TSfinae = void>
+        struct CreateViewPlainPtr;
+
+        //! The sub view creation trait.
+        template<typename TDev, typename TSfinae = void>
+        struct CreateSubView;
+    } // namespace trait
+
+    //! Gets the native pointer of a const memory view.
+    //!
+    //! \param view The memory view.
+    //! \return The native pointer to const elements, as provided by trait::GetPtrNative.
+    template<typename TView>
+    ALPAKA_FN_HOST auto getPtrNative(TView const& view) -> Elem<TView> const*
+    {
+        return trait::GetPtrNative<TView>::getPtrNative(view);
+    }
+
+    //! Gets the native pointer of a non-const memory view.
+    //!
+    //! \param view The memory view.
+    //! \return The native pointer to mutable elements, as provided by trait::GetPtrNative.
+    template<typename TView>
+    ALPAKA_FN_HOST auto getPtrNative(TView& view) -> Elem<TView>*
+    {
+        return trait::GetPtrNative<TView>::getPtrNative(view);
+    }
+
+    //! Gets the pointer to a const view on the given device.
+    //!
+    //! \param view The memory view.
+    //! \param dev The device.
+    //! \return The pointer to const elements on the device, as provided by trait::GetPtrDev.
+    template<typename TView, typename TDev>
+    ALPAKA_FN_HOST auto getPtrDev(TView const& view, TDev const& dev) -> Elem<TView> const*
+    {
+        return trait::GetPtrDev<TView, TDev>::getPtrDev(view, dev);
+    }
+
+    //! Gets the pointer to a non-const view on the given device.
+    //!
+    //! \param view The memory view.
+    //! \param dev The device.
+    //! \return The pointer to mutable elements on the device, as provided by trait::GetPtrDev.
+    template<typename TView, typename TDev>
+    ALPAKA_FN_HOST auto getPtrDev(TView& view, TDev const& dev) -> Elem<TView>*
+    {
+        return trait::GetPtrDev<TView, TDev>::getPtrDev(view, dev);
+    }
+
+    //! \return The pitch in bytes for dimension Tidx, as computed by the deprecated trait::GetPitchBytes.
+    //! \deprecated Use getPitchesInBytes instead; the pragmas below only silence the deprecation warning of the trait.
+    template<std::size_t Tidx, typename TView>
+    [[deprecated("Use getPitchesInBytes instead")]] ALPAKA_FN_HOST auto getPitchBytes(TView const& view) -> Idx<TView>
+    {
+#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+        return trait::GetPitchBytes<DimInt<Tidx>, TView>::getPitchBytes(view);
+#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+    }
+
+    //! \return The pitches in bytes as an alpaka::Vec. Each entry is the distance in bytes between two consecutive
+    //! elements in the corresponding dimension; the entry of the innermost dimension comes last.
+    //! E.g. for a 3D view without padding, the last entry (the element pitch) is the distance in bytes to jump from
+    //! one element to the next within the same row, the middle entry (the row pitch) is the distance in bytes to jump
+    //! from one element to the neighboring element on the next row, and the first entry (the slice pitch) is the
+    //! distance in bytes to jump from one element to the neighboring element on the next slice.
+    //! E.g. a 3D view of floats without padding and the extents {42, 10, 2}, would have a pitch vector of {80, 8, 4}.
+    template<typename TView>
+    ALPAKA_FN_HOST auto getPitchesInBytes(TView const& view) -> Vec<Dim<TView>, Idx<TView>>
+    {
+        return trait::GetPitchesInBytes<TView>{}(view);
+    }
+
+    //! Builds (but does not enqueue) a task that fills a region of a view with a byte value.
+    //!
+    //! \param view The memory view to fill.
+    //! \param byte Value to set for each byte of the specified view region.
+    //! \param extent The extent of the view to fill.
+    template<typename TExtent, typename TViewFwd>
+    ALPAKA_FN_HOST auto createTaskMemset(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
+    {
+        using View = std::remove_reference_t<TViewFwd>;
+        // Compile-time contract between the view and the extent.
+        static_assert(!std::is_const_v<View>, "The view must not be const!");
+        static_assert(
+            Dim<View>::value == Dim<TExtent>::value,
+            "The view and the extent are required to have the same dimensionality!");
+        static_assert(
+            meta::IsIntegralSuperset<Idx<View>, Idx<TExtent>>::value,
+            "The view and the extent must have compatible index types!");
+
+        // The actual task type is chosen by the trait for the view's dimensionality and device.
+        using Impl = trait::CreateTaskMemset<Dim<View>, Dev<View>>;
+        return Impl::createTaskMemset(std::forward<TViewFwd>(view), byte, extent);
+    }
+
+    //! Sets the bytes of the memory of view, described by extent, to the given value.
+    //!
+    //! \param queue The queue to enqueue the view fill task into.
+    //! \param[in,out] view The memory view to fill. May be a temporary object.
+    //! \param byte Value to set for each byte of the specified view region.
+    //! \param extent The extent of the view to fill.
+    template<typename TExtent, typename TViewFwd, typename TQueue>
+    ALPAKA_FN_HOST auto memset(TQueue& queue, TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent) -> void
+    {
+        enqueue(queue, createTaskMemset(std::forward<TViewFwd>(view), byte, extent));
+    }
+
+    //! Sets each byte of the memory of the entire view to the given value. The filled extent is getExtents(view).
+    //!
+    //! \param queue The queue to enqueue the view fill task into.
+    //! \param[in,out] view The memory view to fill. May be a temporary object.
+    //! \param byte Value to set for each byte of the view's memory.
+    template<typename TViewFwd, typename TQueue>
+    ALPAKA_FN_HOST auto memset(TQueue& queue, TViewFwd&& view, std::uint8_t const& byte) -> void
+    {
+        enqueue(queue, createTaskMemset(std::forward<TViewFwd>(view), byte, getExtents(view)));
+    }
+
+    //! Creates a memory copy task.
+    //!
+    //! \param viewDst The destination memory view. Neither the view nor its element type may be const.
+    //! \param viewSrc The source memory view. Its element type must match the destination's, ignoring const.
+    //! \param extent The extent of the view to copy.
+    template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
+    ALPAKA_FN_HOST auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+    {
+        using TViewDst = std::remove_reference_t<TViewDstFwd>;
+        using ElemDst = Elem<TViewDst>;
+        using ElemSrc = Elem<TViewSrc>;
+        using IdxDst = Idx<TViewDst>;
+        using IdxSrc = Idx<TViewSrc>;
+        using IdxExtent = Idx<TExtent>;
+        using TaskFactory = trait::CreateTaskMemcpy<Dim<TViewDst>, Dev<TViewDst>, Dev<TViewSrc>>;
+
+        // The destination must be writable.
+        static_assert(!std::is_const_v<TViewDst>, "The destination view must not be const!");
+        static_assert(!std::is_const_v<ElemDst>, "The destination view's element type must not be const!");
+        // Both views and the extent must agree on the dimensionality.
+        static_assert(
+            Dim<TViewDst>::value == Dim<TViewSrc>::value,
+            "The source and the destination view must have the same dimensionality!");
+        static_assert(
+            Dim<TViewDst>::value == Dim<TExtent>::value,
+            "The destination view and the extent must have the same dimensionality!");
+        // Element types must match and the index types must be able to represent the extent.
+        static_assert(
+            std::is_same_v<ElemDst, std::remove_const_t<ElemSrc>>,
+            "The source and destination view must have the same element type!");
+        static_assert(
+            meta::IsIntegralSuperset<IdxDst, IdxExtent>::value,
+            "The destination view and the extent are required to have compatible index types!");
+        static_assert(
+            meta::IsIntegralSuperset<IdxSrc, IdxExtent>::value,
+            "The source view and the extent are required to have compatible index types!");
+        return TaskFactory::createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), viewSrc, extent);
+    }
+
+    //! Copies memory from a part of viewSrc to viewDst, described by extent. Possibly copies between different memory
+    //! spaces. The copy is expressed as the task built by createTaskMemcpy, which is enqueued into the queue.
+    //!
+    //! \param queue The queue to enqueue the view copy task into.
+    //! \param[in,out] viewDst The destination memory view. May be a temporary object.
+    //! \param viewSrc The source memory view. May be a temporary object.
+    //! \param extent The extent of the view to copy.
+    template<typename TExtent, typename TViewSrc, typename TViewDstFwd, typename TQueue>
+    ALPAKA_FN_HOST auto memcpy(TQueue& queue, TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+        -> void
+    {
+        enqueue(queue, createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), viewSrc, extent));
+    }
+
+    //! Copies the entire memory of viewSrc to viewDst. Possibly copies between different memory
+    //! spaces. The copied extent is getExtents(viewSrc).
+    //!
+    //! \param queue The queue to enqueue the view copy task into.
+    //! \param[in,out] viewDst The destination memory view. May be a temporary object.
+    //! \param viewSrc The source memory view. May be a temporary object.
+    template<typename TViewSrc, typename TViewDstFwd, typename TQueue>
+    ALPAKA_FN_HOST auto memcpy(TQueue& queue, TViewDstFwd&& viewDst, TViewSrc const& viewSrc) -> void
+    {
+        enqueue(queue, createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), viewSrc, getExtents(viewSrc)));
+    }
+
+    namespace detail
+    {
+        //! Prints dimension TDim of the view as one bracketed group and recurses into dimension TDim + 1 for every
+        //! index. The start of each sub-range is found by advancing ptr by the byte pitch of dimension TDim.
+        template<typename TDim, typename TView>
+        struct Print
+        {
+            ALPAKA_FN_HOST static auto print(
+                TView const& view,
+                Elem<TView> const* const ptr,
+                Vec<Dim<TView>, Idx<TView>> const& extent,
+                std::ostream& os,
+                std::string const& elementSeparator,
+                std::string const& rowSeparator,
+                std::string const& rowPrefix,
+                std::string const& rowSuffix) -> void
+            {
+                os << rowPrefix;
+                // Pitch component TDim is the byte stride along TDim; looping up to the extent is safe for extent 0.
+                auto const pitch = getPitchesInBytes(view)[TDim::value];
+                auto const count = extent[TDim::value];
+                for(auto i = decltype(count){0}; i < count; ++i)
+                {
+                    Print<DimInt<TDim::value + 1u>, TView>::print(
+                        view,
+                        reinterpret_cast<Elem<TView> const*>(reinterpret_cast<std::uint8_t const*>(ptr) + i * pitch),
+                        extent,
+                        os,
+                        elementSeparator,
+                        rowSeparator,
+                        rowPrefix,
+                        rowSuffix);
+                    // Separate consecutive sub-ranges, but do not emit a separator after the last one.
+                    if(i + 1 != count)
+                    {
+                        os << rowSeparator;
+                    }
+                }
+                os << rowSuffix;
+            }
+        };
+
+        //! Innermost dimension: prints the elements of one row, separated by elementSeparator.
+        template<typename TView>
+        struct Print<DimInt<Dim<TView>::value - 1u>, TView>
+        {
+            ALPAKA_FN_HOST static auto print(
+                TView const& /* view */,
+                Elem<TView> const* const ptr,
+                Vec<Dim<TView>, Idx<TView>> const& extent,
+                std::ostream& os,
+                std::string const& elementSeparator,
+                std::string const& /* rowSeparator */,
+                std::string const& rowPrefix,
+                std::string const& rowSuffix) -> void
+            {
+                os << rowPrefix;
+                // Count up to the extent instead of comparing against "extent - 1u": the latter can wrap around
+                // for an extent of 0 and would then read far past the end of the row.
+                auto const count = extent[Dim<TView>::value - 1u];
+                for(auto i = decltype(count){0}; i < count; ++i)
+                {
+                    // Add the current element.
+                    os << *(ptr + i);
+                    // While we are not at the end of a line, add the element separator.
+                    if(i + 1 != count)
+                    {
+                        os << elementSeparator;
+                    }
+                }
+                os << rowSuffix;
+            }
+        };
+    } // namespace detail
+
+    //! Prints the content of the view to the given output stream, reading elements through its native pointer.
+    // \TODO: Add precision flag.
+    // \TODO: Add column alignment flag.
+    template<typename TView>
+    ALPAKA_FN_HOST auto print(
+        TView const& view,
+        std::ostream& os,
+        std::string const& elementSeparator = ", ",
+        std::string const& rowSeparator = "\n",
+        std::string const& rowPrefix = "[",
+        std::string const& rowSuffix = "]") -> void
+    {
+        detail::Print<DimInt<0u>, TView>::print(
+            view,
+            getPtrNative(view),
+            getExtents(view),
+            os,
+            elementSeparator,
+            rowSeparator,
+            rowPrefix,
+            rowSuffix);
+    }
+
+    //! \return The pitch vector, i.e. the result of getPitchesInBytes(view). Deprecated in favor of getPitchesInBytes.
+    template<typename TView>
+    [[deprecated("Use getPitchesInBytes instead")]] auto getPitchBytesVec(TView const& view)
+        -> Vec<Dim<TView>, Idx<TView>>
+    {
+        return getPitchesInBytes(view);
+    }
+
+    //! \return The last TDim::value components of the pitch vector of the view (see getPitchesInBytes).
+    template<typename TDim, typename TView>
+    ALPAKA_FN_HOST auto getPitchBytesVecEnd(TView const& view = TView()) -> Vec<TDim, Idx<TView>>
+    {
+        return subVecEnd<TDim>(getPitchesInBytes(view));
+    }
+
+    //! Creates a view to a device pointer. The pitches are computed from the extent via
+    //! detail::calculatePitchesFromExtents.
+    //! \param dev Device from where pMem can be accessed.
+    //! \param pMem Pointer to memory. The pointer must be accessible from the given device.
+    //! \param extent Number of elements represented by the pMem.
+    //!               Using a multi dimensional extent will result in a multi dimension view to the memory represented
+    //!               by pMem.
+    //! \return A view to device memory.
+    template<typename TDev, typename TElem, typename TExtent>
+    auto createView(TDev const& dev, TElem* pMem, TExtent const& extent)
+    {
+        using ExtentVec = Vec<alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>;
+        auto const extentVec = ExtentVec(extent);
+        auto const pitches = detail::calculatePitchesFromExtents<TElem>(extentVec);
+        return trait::CreateViewPlainPtr<TDev>::createViewPlainPtr(
+            dev,
+            pMem,
+            extentVec,
+            pitches);
+    }
+
+    //! Creates a view to a device pointer with explicitly given pitches.
+    //!
+    //! \param dev Device from where pMem can be accessed.
+    //! \param pMem Pointer to memory. The pointer must be accessible from the given device.
+    //! \param extent Number of elements represented by the pMem.
+    //!               Using a multi dimensional extent will result in a multi dimension view to the memory represented
+    //!               by pMem.
+    //! \param pitch Pitch in bytes for each dimension. Dimensionality must be equal to that of extent.
+    //! \return A view to device memory.
+    template<typename TDev, typename TElem, typename TExtent, typename TPitch>
+    auto createView(TDev const& dev, TElem* pMem, TExtent const& extent, TPitch pitch)
+    {
+        return trait::CreateViewPlainPtr<TDev>::createViewPlainPtr(dev, pMem, extent, pitch);
+    }
+
+    //! Creates a view to a contiguous container of device-accessible memory.
+    //!
+    //! \param dev Device from which the container can be accessed.
+    //! \param con Contiguous container. Its data pointer is obtained with `std::data()`. The data held by the
+    //!            container must be accessible from the given device. `getExtents()` must work for the container.
+    //! \return A view to device memory.
+    template<typename TDev, typename TContainer>
+    auto createView(TDev const& dev, TContainer& con)
+    {
+        return createView(dev, std::data(con), getExtents(con));
+    }
+
+    //! Creates a view to a contiguous container of device-accessible memory, using an explicitly given extent.
+    //!
+    //! \param dev Device from which the container can be accessed.
+    //! \param con Contiguous container. Its data pointer is obtained with `std::data()`. The data held by the
+    //!            container must be accessible from the given device.
+    //! \param extent Number of elements held by the container. Using a multi-dimensional extent will result in a
+    //!               multi-dimensional view to the memory represented by the container.
+    //! \return A view to device memory.
+    template<typename TDev, typename TContainer, typename TExtent>
+    auto createView(TDev const& dev, TContainer& con, TExtent const& extent)
+    {
+        return createView(dev, std::data(con), extent);
+    }
+
+    //! Creates a sub view to an existing view.
+    //! \param view The view this view is a sub-view of.
+    //! \param extent Number of elements the resulting view holds.
+    //! \param offset Number of elements skipped in view for the new origin of the resulting view. May be omitted:
+    //!               TOffsets then defaults to TExtent and a value-initialized TExtent is used as the offset
+    //!               (presumably all zeros; confirm against the extent type's default constructor).
+    //! \return A sub view to an existing view.
+    template<typename TView, typename TExtent, typename TOffsets = TExtent>
+    auto createSubView(TView& view, TExtent const& extent, TOffsets const& offset = TExtent())
+    {
+        return trait::CreateSubView<typename trait::DevType<TView>::type>::createSubView(view, extent, offset);
+    }
+
+#ifdef ALPAKA_USE_MDSPAN
+    namespace experimental
+    {
+        // import mdspan into alpaka::experimental namespace. see: https://eel.is/c++draft/mdspan.syn
+        using std::experimental::default_accessor;
+        using std::experimental::dextents;
+        using std::experimental::extents;
+        using std::experimental::layout_left;
+        using std::experimental::layout_right;
+        using std::experimental::layout_stride;
+        using std::experimental::mdspan;
+        // import submdspan as well, which is not standardized yet
+        using std::experimental::full_extent;
+        using std::experimental::submdspan;
+
+        namespace traits
+        {
+            namespace detail
+            {
+                template<typename ElementType>
+                struct ByteIndexedAccessor
+                {
+                    using offset_policy = ByteIndexedAccessor;
+                    using element_type = ElementType;
+                    using reference = ElementType&;
+                    // Accessor working on byte offsets: the data handle is a byte pointer (const iff ElementType is).
+                    using data_handle_type
+                        = std::conditional_t<std::is_const_v<ElementType>, std::byte const*, std::byte*>;
+
+                    constexpr ByteIndexedAccessor() noexcept = default;
+                    // Advances the handle by i bytes (not by i elements).
+                    ALPAKA_FN_HOST_ACC constexpr data_handle_type offset(data_handle_type p, size_t i) const noexcept
+                    {
+                        return p + i;
+                    }
+                    // Reinterprets the bytes at byte offset i as ElementType; i must be a multiple of its alignment.
+                    ALPAKA_FN_HOST_ACC constexpr reference access(data_handle_type p, size_t i) const noexcept
+                    {
+                        assert(i % alignof(ElementType) == 0);
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic push
+#        pragma GCC diagnostic ignored "-Wcast-align"
+#    endif
+                        return *reinterpret_cast<ElementType*>(p + i);
+#    if BOOST_COMP_GNUC
+#        pragma GCC diagnostic pop
+#    endif
+                    }
+                };
+
+                template<typename TView, std::size_t... Is>
+                ALPAKA_FN_HOST auto makeExtents(TView const& view, std::index_sequence<Is...>)
+                {
+                    auto const ex = getExtents(view); // queried once, then expanded for every index in Is
+                    return std::experimental::dextents<Idx<TView>, Dim<TView>::value>{ex[Is]...};
+                }
+            } // namespace detail
+
+            //! Customization point for getting an mdspan from a view.
+            template<typename TView, typename TSfinae = void>
+            struct GetMdSpan
+            {
+                ALPAKA_FN_HOST static auto getMdSpan(TView& view)
+                {
+                    constexpr auto dim = Dim<TView>::value;
+                    using Element = Elem<TView>;
+                    auto extents = detail::makeExtents(view, std::make_index_sequence<dim>{});
+                    auto* ptr = reinterpret_cast<std::byte*>(getPtrNative(view));
+                    auto const strides = toArray(getPitchesInBytes(view));
+                    layout_stride::mapping<decltype(extents)> m{extents, strides};
+                    return mdspan<Element, decltype(extents), layout_stride, detail::ByteIndexedAccessor<Element>>{
+                        ptr,
+                        m};
+                }
+                //! Like getMdSpan, but with extents and byte strides both reversed: mds(i, j) is view element (j, i).
+                ALPAKA_FN_HOST static auto getMdSpanTransposed(TView& view)
+                {
+                    using Element = Elem<TView>;
+                    using Extents = dextents<Idx<TView>, Dim<TView>::value>;
+                    // Reversing only the strides would keep the original iteration space and alias neighboring rows.
+                    auto reversedExtents = toArray(getExtents(view));
+                    auto strides = toArray(getPitchesInBytes(view));
+                    std::reverse(begin(reversedExtents), end(reversedExtents));
+                    std::reverse(begin(strides), end(strides));
+                    auto* ptr = reinterpret_cast<std::byte*>(getPtrNative(view));
+                    layout_stride::mapping<Extents> m{Extents{reversedExtents}, strides};
+                    return mdspan<Element, Extents, layout_stride, detail::ByteIndexedAccessor<Element>>{ptr, m};
+                }
+            };
+        } // namespace traits
+
+        //! Gets a std::mdspan from the given view via traits::GetMdSpan. The layout is determined by the view's pitches.
+        template<typename TView>
+        ALPAKA_FN_HOST auto getMdSpan(TView& view)
+        {
+            return traits::GetMdSpan<TView>::getMdSpan(view);
+        }
+
+        //! Gets a std::mdspan from the given view. The memory layout is determined by the reversed pitches of the
+        //! view. This effectively also reverses the extents of the view. In other words, if you create a transposed
+        //! mdspan on a 10x5 element view, the mdspan will have an iteration space of 5x10.
+        template<typename TView>
+        ALPAKA_FN_HOST auto getMdSpanTransposed(TView& view)
+        {
+            return traits::GetMdSpan<TView>::getMdSpanTransposed(view);
+        }
+
+        //! The mdspan type that traits::GetMdSpan builds for a view with element type TElem, index type TIdx and
+        //! dimensionality TDim: all extents are dynamic, the layout is layout_stride and elements are addressed
+        //! through the byte-offset based ByteIndexedAccessor.
+        template<typename TElem, typename TIdx, typename TDim>
+        using MdSpan
+            = mdspan<TElem, dextents<TIdx, TDim::value>, layout_stride, traits::detail::ByteIndexedAccessor<TElem>>;
+    } // namespace experimental
+#endif
+} // namespace alpaka
diff --git a/include/alpaka/mem/view/ViewAccessOps.hpp b/include/alpaka/mem/view/ViewAccessOps.hpp
new file mode 100644
index 0000000..2705667
--- /dev/null
+++ b/include/alpaka/mem/view/ViewAccessOps.hpp
@@ -0,0 +1,151 @@
+/* Copyright 2023 Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+
+#include <cstdint>
+#include <sstream>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+namespace alpaka::internal
+{
+    template<typename T, typename SFINAE = void>
+    inline constexpr bool isView = false;
+    // isView<T> is true iff Idx<T>, Dim<T>, getPtrNative, getPitchesInBytes and getExtents are well-formed for T.
+    // TODO(bgruber): replace this by a concept in C++20
+    template<typename TView>
+    inline constexpr bool isView<
+        TView,
+        std::void_t<
+            Idx<TView>,
+            Dim<TView>,
+            decltype(getPtrNative(std::declval<TView>())),
+            decltype(getPitchesInBytes(std::declval<TView>())),
+            decltype(getExtents(std::declval<TView>()))>>
+        = true;
+
+    template<typename TView>
+    struct ViewAccessOps
+    {
+        static_assert(isView<TView>);
+        // Mixin base: TView is expected to derive from ViewAccessOps<TView>, since `this` is cast to TView below.
+    private:
+        using value_type = Elem<TView>;
+        using pointer = value_type*;
+        using const_pointer = value_type const*;
+        using reference = value_type&;
+        using const_reference = value_type const&;
+        using Idx = alpaka::Idx<TView>;
+        using Dim = alpaka::Dim<TView>;
+
+    public:
+        ALPAKA_FN_HOST auto data() -> pointer
+        {
+            return getPtrNative(*static_cast<TView*>(this));
+        }
+        // Both data() overloads return the native pointer of the derived view, as obtained from getPtrNative.
+        [[nodiscard]] ALPAKA_FN_HOST auto data() const -> const_pointer
+        {
+            return getPtrNative(*static_cast<TView const*>(this));
+        }
+        // Dereference and member access are only available for zero-dimensional views.
+        ALPAKA_FN_HOST auto operator*() -> reference
+        {
+            static_assert(Dim::value == 0, "operator* is only valid for Buffers and Views of dimension 0");
+            return *data();
+        }
+
+        ALPAKA_FN_HOST auto operator*() const -> const_reference
+        {
+            static_assert(Dim::value == 0, "operator* is only valid for Buffers and Views of dimension 0");
+            return *data();
+        }
+
+        ALPAKA_FN_HOST auto operator->() -> pointer
+        {
+            static_assert(Dim::value == 0, "operator-> is only valid for Buffers and Views of dimension 0");
+            return data();
+        }
+
+        ALPAKA_FN_HOST auto operator->() const -> const_pointer
+        {
+            static_assert(Dim::value == 0, "operator-> is only valid for Buffers and Views of dimension 0");
+            return data();
+        }
+        // Scalar subscript: only for one-dimensional views, indexes the native pointer without a bounds check.
+        ALPAKA_FN_HOST auto operator[](Idx i) -> reference
+        {
+            static_assert(Dim::value == 1, "operator[i] is only valid for Buffers and Views of dimension 1");
+            return data()[i];
+        }
+
+        ALPAKA_FN_HOST auto operator[](Idx i) const -> const_reference
+        {
+            static_assert(Dim::value == 1, "operator[i] is only valid for Buffers and Views of dimension 1");
+            return data()[i];
+        }
+
+    private:
+        template<typename TIdx>
+        [[nodiscard]] ALPAKA_FN_HOST auto ptr_at([[maybe_unused]] Vec<Dim, TIdx> index) const -> const_pointer
+        {
+            static_assert(
+                std::is_convertible_v<TIdx, Idx>,
+                "the index type must be convertible to the index of the Buffer or View");
+            // Element address = base address + sum over all dimensions of pitch[d] * index[d] (in bytes).
+            auto ptr = reinterpret_cast<std::uintptr_t>(data());
+            if constexpr(Dim::value > 0)
+            {
+                ptr += static_cast<std::uintptr_t>(
+                    (getPitchesInBytes(*static_cast<TView const*>(this)) * castVec<Idx>(index)).sum());
+            }
+            return reinterpret_cast<const_pointer>(ptr);
+        }
+        // Vec-indexed access goes through ptr_at(); operator[] is unchecked, at() throws std::out_of_range.
+    public:
+        template<typename TIdx>
+        ALPAKA_FN_HOST auto operator[](Vec<Dim, TIdx> index) -> reference
+        {
+            return *const_cast<pointer>(ptr_at(index));
+        }
+
+        template<typename TIdx>
+        ALPAKA_FN_HOST auto operator[](Vec<Dim, TIdx> index) const -> const_reference
+        {
+            return *ptr_at(index);
+        }
+        // NOTE(review): at() only tests index < extent; a negative signed index is not rejected - confirm intent.
+        template<typename TIdx>
+        ALPAKA_FN_HOST auto at(Vec<Dim, TIdx> index) -> reference
+        {
+            auto extent = getExtents(*static_cast<TView*>(this));
+            if(!(index < extent).all())
+            {
+                std::stringstream msg;
+                msg << "index " << index << " is outside of the Buffer or View extent " << extent;
+                throw std::out_of_range(msg.str());
+            }
+            return *const_cast<pointer>(ptr_at(index));
+        }
+
+        template<typename TIdx>
+        [[nodiscard]] ALPAKA_FN_HOST auto at(Vec<Dim, TIdx> index) const -> const_reference
+        {
+            auto extent = getExtents(*static_cast<TView const*>(this));
+            if(!(index < extent).all())
+            {
+                std::stringstream msg;
+                msg << "index " << index << " is outside of the Buffer or View extent " << extent;
+                throw std::out_of_range(msg.str());
+            }
+            return *ptr_at(index);
+        }
+    };
+} // namespace alpaka::internal
diff --git a/include/alpaka/mem/view/ViewConst.hpp b/include/alpaka/mem/view/ViewConst.hpp
new file mode 100644
index 0000000..a4cd5db
--- /dev/null
+++ b/include/alpaka/mem/view/ViewConst.hpp
@@ -0,0 +1,115 @@
+/* Copyright 2022 Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/mem/view/ViewAccessOps.hpp"
+#include "alpaka/offset/Traits.hpp"
+
+namespace alpaka
+{
+    //! A non-modifiable wrapper around a view. This view acts as the wrapped view, but the underlying data is only
+    //! exposed const-qualified.
+    template<typename TView>
+    struct ViewConst : internal::ViewAccessOps<ViewConst<TView>>
+    {
+        static_assert(!std::is_const_v<TView>, "ViewConst must be instantiated with a non-const type");
+        static_assert(
+            !std::is_reference_v<TView>,
+            "This is not implemented"); // It might even be dangerous for ViewConst to store a reference to the wrapped
+                                        // view, as this decouples the wrapped view's lifetime.
+        //! Wraps a copy of the given view.
+        ALPAKA_FN_HOST ViewConst(TView const& view) : m_view(view)
+        {
+        }
+        //! Wraps the given view by moving it into the wrapper.
+        ALPAKA_FN_HOST ViewConst(TView&& view) : m_view(std::move(view))
+        {
+        }
+        //! The wrapped view, stored by value.
+        TView m_view;
+    };
+    //! Deduction guide: ViewConst(v) wraps the decayed type of v.
+    template<typename TView>
+    ViewConst(TView) -> ViewConst<std::decay_t<TView>>;
+
+    namespace trait
+    {
+        template<typename TView>
+        struct DevType<ViewConst<TView>> : DevType<TView>
+        {
+        };
+        //! The ViewConst device get trait specialization: forwards to the wrapped view.
+        template<typename TView>
+        struct GetDev<ViewConst<TView>>
+        {
+            ALPAKA_FN_HOST static auto getDev(ViewConst<TView> const& view)
+            {
+                return alpaka::getDev(view.m_view);
+            }
+        };
+        //! The ViewConst dimension trait specialization: same dimensionality as the wrapped view.
+        template<typename TView>
+        struct DimType<ViewConst<TView>> : DimType<TView>
+        {
+        };
+        //! The ViewConst memory element type trait specialization.
+        template<typename TView>
+        struct ElemType<ViewConst<TView>>
+        {
+            // const qualify the element type of the inner view
+            using type = typename ElemType<TView>::type const;
+        };
+        //! The ViewConst extents get trait specialization: forwards to the wrapped view.
+        template<typename TView>
+        struct GetExtents<ViewConst<TView>>
+        {
+            ALPAKA_FN_HOST auto operator()(ViewConst<TView> const& view) const
+            {
+                return getExtents(view.m_view);
+            }
+        };
+        //! The ViewConst native pointer get trait specialization: only a pointer to const elements is exposed.
+        template<typename TView>
+        struct GetPtrNative<ViewConst<TView>>
+        {
+            using TElem = typename ElemType<TView>::type;
+
+            // const qualify the element type of the inner view
+            ALPAKA_FN_HOST static auto getPtrNative(ViewConst<TView> const& view) -> TElem const*
+            {
+                return alpaka::getPtrNative(view.m_view);
+            }
+        };
+        //! The ViewConst pitches get trait specialization: forwards to the wrapped view.
+        template<typename TView>
+        struct GetPitchesInBytes<ViewConst<TView>>
+        {
+            ALPAKA_FN_HOST auto operator()(ViewConst<TView> const& view) const
+            {
+                return alpaka::getPitchesInBytes(view.m_view);
+            }
+        };
+        //! The ViewConst offsets get trait specialization: forwards to the wrapped view.
+        template<typename TView>
+        struct GetOffsets<ViewConst<TView>>
+        {
+            ALPAKA_FN_HOST auto operator()(ViewConst<TView> const& view) const
+            {
+                return alpaka::getOffsets(view.m_view);
+            }
+        };
+        //! The ViewConst idx type trait specialization: same index type as the wrapped view.
+        template<typename TView>
+        struct IdxType<ViewConst<TView>> : IdxType<TView>
+        {
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp
new file mode 100644
index 0000000..dda4a17
--- /dev/null
+++ b/include/alpaka/mem/view/ViewPlainPtr.hpp
@@ -0,0 +1,192 @@
+/* Copyright 2024 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber,
+ *                Jan Stephan, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/mem/view/ViewAccessOps.hpp"
+#include "alpaka/meta/DependentFalseType.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <type_traits>
+#include <utility>
+
+namespace alpaka
+{
+    //! The memory view to wrap plain pointers. It stores the pointer, a device, the extent and the pitches.
+    template<typename TDev, typename TElem, typename TDim, typename TIdx>
+    struct ViewPlainPtr final : internal::ViewAccessOps<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+    {
+        static_assert(!std::is_const_v<TIdx>, "The idx type of the view can not be const!");
+        //! Derives the pitches from the extent. NOTE(review): TExtent is not deducible from the default argument.
+        template<typename TExtent>
+        ALPAKA_FN_HOST ViewPlainPtr(TElem* pMem, TDev dev, TExtent const& extent = TExtent())
+            : ViewPlainPtr(pMem, std::move(dev), extent, detail::calculatePitchesFromExtents<TElem>(extent))
+        {
+        }
+        //! Uses the given pitches; pitchBytes is converted to Vec<TDim, TIdx>.
+        template<typename TExtent, typename TPitch>
+        ALPAKA_FN_HOST ViewPlainPtr(TElem* pMem, TDev dev, TExtent const& extent, TPitch pitchBytes)
+            : m_pMem(pMem)
+            , m_dev(std::move(dev))
+            , m_extentElements(extent)
+            , m_pitchBytes(static_cast<Vec<TDim, TIdx>>(pitchBytes))
+        {
+        }
+        // Plain data members; they are read directly by the trait specializations for ViewPlainPtr.
+        TElem* m_pMem;
+        TDev m_dev;
+        Vec<TDim, TIdx> m_extentElements;
+        Vec<TDim, TIdx> m_pitchBytes;
+    };
+
+    // Trait specializations for ViewPlainPtr.
+    namespace trait
+    {
+        //! The ViewPlainPtr device type trait specialization: the device type is alpaka::Dev<TDev>.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct DevType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            using type = alpaka::Dev<TDev>;
+        };
+
+        //! The ViewPlainPtr device get trait specialization: returns the device stored in the view.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct GetDev<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            static auto getDev(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) -> alpaka::Dev<TDev>
+            {
+                return view.m_dev;
+            }
+        };
+
+        //! The ViewPlainPtr dimension getter trait specialization.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct DimType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! The ViewPlainPtr memory element type get trait specialization.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct ElemType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            using type = TElem;
+        };
+    } // namespace trait
+
+    namespace trait
+    {
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct GetExtents<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) const
+            {
+                return view.m_extentElements; // the extent the view was constructed with
+            }
+        };
+
+        //! The ViewPlainPtr native pointer get trait specialization.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct GetPtrNative<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            static auto getPtrNative(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) -> TElem const*
+            {
+                return view.m_pMem;
+            }
+
+            static auto getPtrNative(ViewPlainPtr<TDev, TElem, TDim, TIdx>& view) -> TElem*
+            {
+                return view.m_pMem;
+            }
+        };
+        //! The ViewPlainPtr pitches get trait specialization: returns the pitches in bytes stored in the view.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct GetPitchesInBytes<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) const
+            {
+                return view.m_pitchBytes;
+            }
+        };
+
+        //! The CPU device CreateViewPlainPtr trait specialization.
+        template<>
+        struct CreateViewPlainPtr<DevCpu>
+        {
+            template<typename TElem, typename TExtent, typename TPitch>
+            static auto createViewPlainPtr(DevCpu const& dev, TElem* pMem, TExtent const& extent, TPitch pitch)
+            {
+                return alpaka::ViewPlainPtr<DevCpu, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+                    pMem,
+                    dev,
+                    extent,
+                    pitch);
+            }
+        };
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+        //! The CUDA/HIP RT device CreateViewPlainPtr trait specialization.
+        template<typename TApi>
+        struct CreateViewPlainPtr<DevUniformCudaHipRt<TApi>>
+        {
+            template<typename TElem, typename TExtent, typename TPitch>
+            static auto createViewPlainPtr(
+                DevUniformCudaHipRt<TApi> const& dev,
+                TElem* pMem,
+                TExtent const& extent,
+                TPitch pitch)
+            {
+                return alpaka::
+                    ViewPlainPtr<DevUniformCudaHipRt<TApi>, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+                        pMem,
+                        dev,
+                        extent,
+                        pitch);
+            }
+        };
+#endif
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED)
+        //! The SYCL device CreateViewPlainPtr trait specialization.
+        template<typename TTag>
+        struct CreateViewPlainPtr<DevGenericSycl<TTag>>
+        {
+            template<typename TElem, typename TExtent, typename TPitch>
+            static auto createViewPlainPtr(
+                DevGenericSycl<TTag> const& dev,
+                TElem* pMem,
+                TExtent const& extent,
+                TPitch pitch)
+            {
+                return alpaka::ViewPlainPtr<DevGenericSycl<TTag>, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+                    pMem,
+                    dev,
+                    extent,
+                    pitch);
+            }
+        };
+#endif
+        //! The ViewPlainPtr offset get trait specialization: the offsets of a ViewPlainPtr are always all zero.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct GetOffsets<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(ViewPlainPtr<TDev, TElem, TDim, TIdx> const&) const -> Vec<TDim, TIdx>
+            {
+                return Vec<TDim, TIdx>::zeros();
+            }
+        };
+
+        //! The ViewPlainPtr idx type trait specialization.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct IdxType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/mem/view/ViewStdArray.hpp b/include/alpaka/mem/view/ViewStdArray.hpp
new file mode 100644
index 0000000..de01ec8
--- /dev/null
+++ b/include/alpaka/mem/view/ViewStdArray.hpp
@@ -0,0 +1,94 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+/* TODO: Once C++20 is available remove this file and replace with a generic ContiguousContainer solution based on
+ * concepts. It should be sufficient to check for the existence of Container.size() and Container.data() */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
+
+#include <array>
+
+namespace alpaka::trait
+{
+    //! The std::array device type trait specialization: a std::array is associated with the CPU device type.
+    template<typename TElem, std::size_t Tsize>
+    struct DevType<std::array<TElem, Tsize>>
+    {
+        using type = DevCpu;
+    };
+
+    //! The std::array device get trait specialization: always yields the CPU device with index 0.
+    template<typename TElem, std::size_t Tsize>
+    struct GetDev<std::array<TElem, Tsize>>
+    {
+        ALPAKA_FN_HOST static auto getDev(std::array<TElem, Tsize> const& /* view */) -> DevCpu
+        {
+            // Instantiating the CPU platform here is a hack we can do internally, because we know that the CPU
+            // platform does not contain any data. This does not hold for platforms in general.
+            return getDevByIdx(PlatformCpu{}, 0u);
+        }
+    };
+
+    //! The std::array dimension getter trait specialization: a std::array is always one-dimensional.
+    template<typename TElem, std::size_t Tsize>
+    struct DimType<std::array<TElem, Tsize>>
+    {
+        using type = DimInt<1u>;
+    };
+
+    //! The std::array memory element type get trait specialization: the array's value type TElem.
+    template<typename TElem, std::size_t Tsize>
+    struct ElemType<std::array<TElem, Tsize>>
+    {
+        using type = TElem;
+    };
+
+    template<typename TElem, std::size_t Tsize>
+    struct GetExtents<std::array<TElem, Tsize>> // The std::array extent get trait specialization.
+    {
+        ALPAKA_FN_HOST constexpr auto operator()(std::array<TElem, Tsize> const& a) const
+            -> Vec<DimInt<1>, Idx<std::array<TElem, Tsize>>>
+        {
+            return {std::size(a)}; // single extent: the number of elements of the array
+        }
+    };
+
+    //! The std::array native pointer get trait specialization; constness of the array carries over to the pointer.
+    template<typename TElem, std::size_t Tsize>
+    struct GetPtrNative<std::array<TElem, Tsize>>
+    {
+        ALPAKA_FN_HOST static auto getPtrNative(std::array<TElem, Tsize> const& view) -> TElem const*
+        {
+            return std::data(view);
+        }
+
+        ALPAKA_FN_HOST static auto getPtrNative(std::array<TElem, Tsize>& view) -> TElem*
+        {
+            return std::data(view);
+        }
+    };
+
+    //! The std::array offset get trait specialization. The array argument is ignored: the offset is always zero.
+    template<typename TElem, std::size_t Tsize>
+    struct GetOffsets<std::array<TElem, Tsize>>
+    {
+        ALPAKA_FN_HOST auto operator()(std::array<TElem, Tsize> const&) const
+            -> Vec<DimInt<1>, Idx<std::array<TElem, Tsize>>>
+        {
+            return {0};
+        }
+    };
+
+    //! The std::array idx type trait specialization.
+    template<typename TElem, std::size_t Tsize>
+    struct IdxType<std::array<TElem, Tsize>>
+    {
+        using type = std::size_t;
+    };
+} // namespace alpaka::trait
diff --git a/include/alpaka/mem/view/ViewStdVector.hpp b/include/alpaka/mem/view/ViewStdVector.hpp
new file mode 100644
index 0000000..e09b370
--- /dev/null
+++ b/include/alpaka/mem/view/ViewStdVector.hpp
@@ -0,0 +1,92 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+/* TODO: Once C++20 is available remove this file and replace with a generic ContiguousContainer solution based on
+ * concepts. It should be sufficient to check for the existence of Container.size() and Container.data() */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/platform/PlatformCpu.hpp"
+
+#include <vector>
+
+namespace alpaka::trait
+{
+    //! The std::vector device type trait specialization: a std::vector is associated with the CPU device type.
+    template<typename TElem, typename TAllocator>
+    struct DevType<std::vector<TElem, TAllocator>>
+    {
+        using type = DevCpu;
+    };
+
+    //! The std::vector device get trait specialization: always yields the CPU device with index 0.
+    template<typename TElem, typename TAllocator>
+    struct GetDev<std::vector<TElem, TAllocator>>
+    {
+        ALPAKA_FN_HOST static auto getDev(std::vector<TElem, TAllocator> const& /* view */) -> DevCpu
+        {
+            return getDevByIdx(PlatformCpu{}, 0u); // the CPU platform is default-constructed just for this lookup
+        }
+    };
+
+    //! The std::vector dimension getter trait specialization: a std::vector is always one-dimensional.
+    template<typename TElem, typename TAllocator>
+    struct DimType<std::vector<TElem, TAllocator>>
+    {
+        using type = DimInt<1u>;
+    };
+
+    //! The std::vector memory element type get trait specialization: the vector's value type TElem.
+    template<typename TElem, typename TAllocator>
+    struct ElemType<std::vector<TElem, TAllocator>>
+    {
+        using type = TElem;
+    };
+
+    template<typename TElem, typename TAllocator>
+    struct GetExtents<std::vector<TElem, TAllocator>> // The std::vector extent get trait specialization.
+    {
+        ALPAKA_FN_HOST constexpr auto operator()(std::vector<TElem, TAllocator> const& a) const
+            -> Vec<DimInt<1>, Idx<std::vector<TElem, TAllocator>>>
+        {
+            return {std::size(a)}; // single extent: the current number of elements of the vector
+        }
+    };
+
+    //! The std::vector native pointer get trait specialization; constness of the vector carries over to the pointer.
+    template<typename TElem, typename TAllocator>
+    struct GetPtrNative<std::vector<TElem, TAllocator>>
+    {
+        ALPAKA_FN_HOST static auto getPtrNative(std::vector<TElem, TAllocator> const& view) -> TElem const*
+        {
+            return std::data(view);
+        }
+
+        ALPAKA_FN_HOST static auto getPtrNative(std::vector<TElem, TAllocator>& view) -> TElem*
+        {
+            return std::data(view);
+        }
+    };
+
+    //! The std::vector offset get trait specialization. The vector argument is ignored: the offset is always zero.
+    template<typename TElem, typename TAllocator>
+    struct GetOffsets<std::vector<TElem, TAllocator>>
+    {
+        ALPAKA_FN_HOST auto operator()(std::vector<TElem, TAllocator> const&) const
+            -> Vec<DimInt<1>, Idx<std::vector<TElem, TAllocator>>>
+        {
+            return {0};
+        }
+    };
+
+    //! The std::vector idx type trait specialization: indices are always std::size_t.
+    template<typename TElem, typename TAllocator>
+    struct IdxType<std::vector<TElem, TAllocator>>
+    {
+        using type = std::size_t;
+    };
+} // namespace alpaka::trait
diff --git a/include/alpaka/mem/view/ViewSubView.hpp b/include/alpaka/mem/view/ViewSubView.hpp
new file mode 100644
index 0000000..a35fa22
--- /dev/null
+++ b/include/alpaka/mem/view/ViewSubView.hpp
@@ -0,0 +1,217 @@
+/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/mem/view/ViewAccessOps.hpp"
+#include "alpaka/mem/view/ViewPlainPtr.hpp"
+#include "alpaka/offset/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <type_traits>
+#include <utility>
+
+namespace alpaka
+{
+    //! A sub-view to a view: a window into a parent view given by an extent and an offset, both in elements.
+    template<typename TDev, typename TElem, typename TDim, typename TIdx>
+    class ViewSubView : public internal::ViewAccessOps<ViewSubView<TDev, TElem, TDim, TIdx>>
+    {
+        static_assert(!std::is_const_v<TIdx>, "The idx type of the view can not be const!");
+
+        using Dev = alpaka::Dev<TDev>;
+
+    public:
+        //! Constructor. The dev, idx and dim types of the arguments are statically checked against TDev, TIdx, TDim.
+        //! \param view The view this view is a sub-view of.
+        //! \param extentElements The extent in elements.
+        //! \param relativeOffsetsElements The offsets in elements.
+        template<typename TQualifiedView, typename TOffsets, typename TExtent>
+        ViewSubView(
+            TQualifiedView& view,
+            TExtent const& extentElements,
+            TOffsets const& relativeOffsetsElements = TOffsets())
+            : m_viewParentView(getPtrNative(view), getDev(view), getExtents(view), getPitchesInBytes(view))
+            , m_extentElements(getExtents(extentElements))
+            , m_offsetsElements(getOffsets(relativeOffsetsElements))
+            , m_nativePtr(computeNativePtr()) // reads m_viewParentView and m_offsetsElements initialized above
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+            using View = std::remove_cv_t<TQualifiedView>;
+
+            static_assert(
+                std::is_same_v<Dev, alpaka::Dev<View>>,
+                "The dev type of TView and the Dev template parameter have to be identical!");
+
+            static_assert(
+                std::is_same_v<TIdx, alpaka::Idx<View>>,
+                "The idx type of TView and the TIdx template parameter have to be identical!");
+            static_assert(
+                std::is_same_v<TIdx, alpaka::Idx<TExtent>>,
+                "The idx type of TExtent and the TIdx template parameter have to be identical!");
+            static_assert(
+                std::is_same_v<TIdx, alpaka::Idx<TOffsets>>,
+                "The idx type of TOffsets and the TIdx template parameter have to be identical!");
+
+            static_assert(
+                std::is_same_v<TDim, alpaka::Dim<View>>,
+                "The dim type of TView and the TDim template parameter have to be identical!");
+            static_assert(
+                std::is_same_v<TDim, alpaka::Dim<TExtent>>,
+                "The dim type of TExtent and the TDim template parameter have to be identical!");
+            static_assert(
+                std::is_same_v<TDim, alpaka::Dim<TOffsets>>,
+                "The dim type of TOffsets and the TDim template parameter have to be identical!");
+
+            ALPAKA_ASSERT(((m_offsetsElements + m_extentElements) <= getExtents(view)).all()); // window fits parent
+        }
+
+        //! \param view The view this view is a sub-view of; the sub-view spans it completely (full extent, no offset).
+        template<typename TView>
+        explicit ViewSubView(TView const& view) : ViewSubView(view, getExtents(view), Vec<TDim, TIdx>::zeros())
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+        }
+
+        //! \param view The view this view is a sub-view of; the sub-view spans it completely (full extent, no offset).
+        template<typename TView>
+        explicit ViewSubView(TView& view) : ViewSubView(view, getExtents(view), Vec<TDim, TIdx>::zeros())
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+        }
+
+    public: // the trait specializations for ViewSubView read the members below directly
+        ALPAKA_FN_HOST auto computeNativePtr() // parent pointer advanced by sum(offsets * pitches in bytes)
+        {
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+            // "cast from 'std::uint8_t*' to 'TElem*' increases required alignment of target type"
+#    pragma GCC diagnostic ignored "-Wcast-align"
+#endif
+            return reinterpret_cast<TElem*>( // NOTE(review): std::uint8_t needs <cstdint>, not included directly here
+                reinterpret_cast<std::uint8_t*>(alpaka::getPtrNative(m_viewParentView))
+                + (m_offsetsElements * getPitchesInBytes(m_viewParentView)).sum());
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+        }
+
+        ViewPlainPtr<Dev, TElem, TDim, TIdx> m_viewParentView; // This wraps the parent view.
+        Vec<TDim, TIdx> m_extentElements; // The extent of this view.
+        Vec<TDim, TIdx> m_offsetsElements; // The offset relative to the parent view.
+        TElem* m_nativePtr; // Pointer to the first element of this view, computed once in the constructor.
+    };
+
+    // Trait specializations for ViewSubView.
+    namespace trait
+    {
+        //! The ViewSubView device type trait specialization: the device type of TDev.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct DevType<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            using type = alpaka::Dev<TDev>;
+        };
+
+        //! The ViewSubView device get trait specialization: returns the device of the wrapped parent view.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct GetDev<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getDev(ViewSubView<TDev, TElem, TDim, TIdx> const& view) -> alpaka::Dev<TDev>
+            {
+                return alpaka::getDev(view.m_viewParentView);
+            }
+        };
+
+        //! The ViewSubView dimension getter trait specialization: exposes the TDim template parameter.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct DimType<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! The ViewSubView memory element type get trait specialization: exposes the TElem template parameter.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct ElemType<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            using type = TElem;
+        };
+
+        //! The ViewSubView extents get trait specialization: returns the extent of the sub-view, not of the parent.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct GetExtents<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(ViewSubView<TDev, TElem, TDim, TIdx> const& view) const
+            {
+                return view.m_extentElements;
+            }
+        };
+
+        //! The ViewSubView native pointer get trait specialization: returns the pointer stored in m_nativePtr.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct GetPtrNative<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST static auto getPtrNative(ViewSubView<TDev, TElem, TDim, TIdx> const& view) -> TElem const*
+            {
+                return view.m_nativePtr;
+            }
+
+            ALPAKA_FN_HOST static auto getPtrNative(ViewSubView<TDev, TElem, TDim, TIdx>& view) -> TElem*
+            {
+                return view.m_nativePtr;
+            }
+        };
+
+        //! The ViewSubView pitch get trait specialization: a sub-view reports the pitches of its parent view.
+        template<typename TDev, typename TElem, typename TDim, typename TIdx>
+        struct GetPitchesInBytes<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(ViewSubView<TDev, TElem, TDim, TIdx> const& view) const
+            {
+                return getPitchesInBytes(view.m_viewParentView);
+            }
+        };
+
+        //! The ViewSubView offsets get trait specialization.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct GetOffsets<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            ALPAKA_FN_HOST auto operator()(ViewSubView<TDev, TElem, TDim, TIdx> const& view) const
+            {
+                return view.m_offsetsElements; // offsets in elements, relative to the parent view
+            }
+        };
+
+        //! The ViewSubView idx type trait specialization: exposes the TIdx template parameter.
+        template<typename TElem, typename TDim, typename TDev, typename TIdx>
+        struct IdxType<ViewSubView<TDev, TElem, TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        //! The CreateSubView trait default implementation (primary template, generic in TDev).
+        template<typename TDev, typename TSfinae>
+        struct CreateSubView
+        {
+            template<typename TView, typename TExtent, typename TOffsets>
+            static auto createSubView(
+                TView& view,
+                TExtent const& extentElements,
+                TOffsets const& relativeOffsetsElements)
+            {
+                using Dim = alpaka::Dim<TExtent>; // dimensionality and index type are deduced from the extent
+                using Idx = alpaka::Idx<TExtent>;
+                using Elem = typename trait::ElemType<TView>::type;
+                return ViewSubView<TDev, Elem, Dim, Idx>(view, extentElements, relativeOffsetsElements);
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/meta/Apply.hpp b/include/alpaka/meta/Apply.hpp
new file mode 100644
index 0000000..bcffe8c
--- /dev/null
+++ b/include/alpaka/meta/Apply.hpp
@@ -0,0 +1,22 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        template<typename TList, template<typename...> class TApplicant>
+        struct ApplyImpl; // primary template is only declared; TList must be an instantiation SomeList<T...>
+
+        template<template<typename...> class TList, template<typename...> class TApplicant, typename... T>
+        struct ApplyImpl<TList<T...>, TApplicant>
+        {
+            using type = TApplicant<T...>; // re-wrap the elements of TList<T...> in TApplicant
+        };
+    } // namespace detail
+    template<typename TList, template<typename...> class TApplicant>
+    using Apply = typename detail::ApplyImpl<TList, TApplicant>::type; // e.g. Apply<L<A, B>, F> is F<A, B>
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/CartesianProduct.hpp b/include/alpaka/meta/CartesianProduct.hpp
new file mode 100644
index 0000000..dc1a1d6
--- /dev/null
+++ b/include/alpaka/meta/CartesianProduct.hpp
@@ -0,0 +1,84 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/meta/Concatenate.hpp"
+
+namespace alpaka::meta
+{
+    // This is based on code by Patrick Fromberg.
+    // See
+    // http://stackoverflow.com/questions/9122028/how-to-create-the-cartesian-product-of-a-type-list/19611856#19611856
+    namespace detail
+    {
+        template<typename... Ts>
+        struct CartesianProductImplHelper; // first argument: list of partial tuples; rest: remaining input lists
+
+        // Stop condition: no input list is left, the accumulated list is the result.
+        template<template<typename...> class TList, typename... Ts>
+        struct CartesianProductImplHelper<TList<Ts...>>
+        {
+            using type = TList<Ts...>;
+        };
+
+        // Catches first empty tuple.
+        template<template<typename...> class TList, typename... Ts>
+        struct CartesianProductImplHelper<TList<TList<>>, Ts...>
+        {
+            using type = TList<>;
+        };
+
+        // Catches any empty tuple except first.
+        template<template<typename...> class TList, typename... Ts, typename... Rests>
+        struct CartesianProductImplHelper<TList<Ts...>, TList<>, Rests...>
+        {
+            using type = TList<>;
+        };
+
+        template<template<typename...> class TList, typename... X, typename H, typename... Rests>
+        struct CartesianProductImplHelper<TList<X...>, TList<H>, Rests...>
+        {
+            using type1 = TList<Concatenate<X, TList<H>>...>; // append H to every partial tuple X
+            using type = typename CartesianProductImplHelper<type1, Rests...>::type;
+        };
+
+        template<
+            template<typename...>
+            class TList,
+            typename... X,
+            template<typename...>
+            class Head,
+            typename T,
+            typename... Ts,
+            typename... Rests>
+        struct CartesianProductImplHelper<TList<X...>, Head<T, Ts...>, Rests...>
+        {
+            using type1 = TList<Concatenate<X, TList<T>>...>; // append the first element T to every partial tuple X
+            using type2 = typename CartesianProductImplHelper<TList<X...>, TList<Ts...>>::type; // same for Ts...
+            using type3 = Concatenate<type1, type2>;
+            using type = typename CartesianProductImplHelper<type3, Rests...>::type; // go on with remaining lists
+        };
+
+        template<template<typename...> class TList, typename... Ts>
+        struct CartesianProductImpl;
+
+        // The base case for no input returns an empty sequence.
+        template<template<typename...> class TList>
+        struct CartesianProductImpl<TList>
+        {
+            using type = TList<>;
+        };
+
+        // TList is the list template used for the result, Head<Ts...> is the first input list.
+        template<template<typename...> class TList, template<typename...> class Head, typename... Ts, typename... Tail>
+        struct CartesianProductImpl<TList, Head<Ts...>, Tail...>
+        {
+            using type = typename detail::CartesianProductImplHelper<TList<TList<Ts>...>, Tail...>::type;
+        };
+    } // namespace detail
+
+    template<template<typename...> class TList, typename... Ts>
+    using CartesianProduct = typename detail::CartesianProductImpl<TList, Ts...>::type;
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Concatenate.hpp b/include/alpaka/meta/Concatenate.hpp
new file mode 100644
index 0000000..9133eb6
--- /dev/null
+++ b/include/alpaka/meta/Concatenate.hpp
@@ -0,0 +1,29 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        template<typename... T>
+        struct ConcatenateImpl; // primary template is only declared
+
+        template<typename T>
+        struct ConcatenateImpl<T>
+        {
+            using type = T; // recursion end: a single argument is returned unchanged
+        };
+
+        template<template<typename...> class TList, typename... As, typename... Bs, typename... TRest>
+        struct ConcatenateImpl<TList<As...>, TList<Bs...>, TRest...>
+        {
+            using type = typename ConcatenateImpl<TList<As..., Bs...>, TRest...>::type; // merge the first two lists
+        };
+    } // namespace detail
+
+    template<typename... T>
+    using Concatenate = typename detail::ConcatenateImpl<T...>::type; // lists must use the same list template
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/DependentFalseType.hpp b/include/alpaka/meta/DependentFalseType.hpp
new file mode 100644
index 0000000..a0f2855
--- /dev/null
+++ b/include/alpaka/meta/DependentFalseType.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <type_traits>
+
+namespace alpaka::meta
+{
+    //! A false_type that depends on an ignored template parameter.
+    //! This allows using static_assert in uninstantiated template specializations without triggering it.
+    template<typename T>
+    struct DependentFalseType : std::false_type
+    {
+    };
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Filter.hpp b/include/alpaka/meta/Filter.hpp
new file mode 100644
index 0000000..52e93dc
--- /dev/null
+++ b/include/alpaka/meta/Filter.hpp
@@ -0,0 +1,47 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/meta/Concatenate.hpp"
+
+#include <type_traits>
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        template<template<typename...> class TList, template<typename...> class TPred, typename... Ts>
+        struct FilterImplHelper;
+
+        template<template<typename...> class TList, template<typename...> class TPred>
+        struct FilterImplHelper<TList, TPred>
+        {
+            using type = TList<>; // recursion end: no element left
+        };
+
+        template<template<typename...> class TList, template<typename...> class TPred, typename T, typename... Ts>
+        struct FilterImplHelper<TList, TPred, T, Ts...>
+        {
+            using type = std::conditional_t<
+                TPred<T>::value, // keep T in front of the filtered rest if the predicate holds, else drop it
+                Concatenate<TList<T>, typename FilterImplHelper<TList, TPred, Ts...>::type>,
+                typename FilterImplHelper<TList, TPred, Ts...>::type>;
+        };
+
+        template<typename TList, template<typename...> class TPred>
+        struct FilterImpl;
+
+        template<template<typename...> class TList, template<typename...> class TPred, typename... Ts>
+        struct FilterImpl<TList<Ts...>, TPred>
+        {
+            using type = typename detail::FilterImplHelper<TList, TPred, Ts...>::type; // unpack the list elements
+        };
+    } // namespace detail
+
+    /// \tparam TPred Only the first parameter is used, all others must be set by TPred to some default.
+    ///               Using '...' instead of a single type is a workaround for CrayClang.
+    template<typename TList, template<typename...> class TPred>
+    using Filter = typename detail::FilterImpl<TList, TPred>::type;
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Fold.hpp b/include/alpaka/meta/Fold.hpp
new file mode 100644
index 0000000..1a258f4
--- /dev/null
+++ b/include/alpaka/meta/Fold.hpp
@@ -0,0 +1,24 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+namespace alpaka::meta
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TFnObj, typename T>
+    ALPAKA_FN_HOST_ACC constexpr auto foldr(TFnObj const& /* f */, T const& t) -> T
+    {
+        return t; // recursion end: a single value folds to itself, f is not called
+    }
+
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TFnObj, typename T0, typename T1, typename... Ts>
+    ALPAKA_FN_HOST_ACC constexpr auto foldr(TFnObj const& f, T0 const& t0, T1 const& t1, Ts const&... ts)
+    {
+        return f(t0, foldr(f, t1, ts...)); // right fold: f(t0, f(t1, ... f(tn-1, tn)))
+    }
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/ForEachType.hpp b/include/alpaka/meta/ForEachType.hpp
new file mode 100644
index 0000000..030851f
--- /dev/null
+++ b/include/alpaka/meta/ForEachType.hpp
@@ -0,0 +1,52 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+#include <utility>
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        template<typename TList>
+        struct ForEachTypeHelper;
+
+        template<template<typename...> class TList>
+        struct ForEachTypeHelper<TList<>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TFnObj, typename... TArgs>
+            ALPAKA_FN_HOST_ACC static auto forEachTypeHelper(TFnObj&& /* f */, TArgs&&... /* args */) -> void
+            {
+            } // recursion end: empty list, nothing to call
+        };
+
+        template<template<typename...> class TList, typename T, typename... Ts>
+        struct ForEachTypeHelper<TList<T, Ts...>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            template<typename TFnObj, typename... TArgs>
+            ALPAKA_FN_HOST_ACC static auto forEachTypeHelper(TFnObj&& f, TArgs&&... args) -> void
+            {
+                f.template operator()<T>(std::forward<TArgs>(args)...); // call f with T as explicit template argument
+                ForEachTypeHelper<TList<Ts...>>::forEachTypeHelper(
+                    std::forward<TFnObj>(f),
+                    std::forward<TArgs>(args)...); // NOTE(review): args were already forwarded to f above
+            }
+        };
+    } // namespace detail
+
+    //! Equivalent to boost::mpl::for_each but does not require the types of the sequence to be default
+    //! constructible. This function does not create instances of the types; instead it passes each type as a
+    //! template parameter to the call operator of f, in list order.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TList, typename TFnObj, typename... TArgs>
+    ALPAKA_FN_HOST_ACC auto forEachType(TFnObj&& f, TArgs&&... args) -> void
+    {
+        detail::ForEachTypeHelper<TList>::forEachTypeHelper(std::forward<TFnObj>(f), std::forward<TArgs>(args)...);
+    }
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Functional.hpp b/include/alpaka/meta/Functional.hpp
new file mode 100644
index 0000000..0a5d848
--- /dev/null
+++ b/include/alpaka/meta/Functional.hpp
@@ -0,0 +1,30 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+namespace alpaka::meta
+{
+    template<typename T>
+    struct min // binary function object returning the smaller of two values of type T (by value)
+    {
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC constexpr auto operator()(T const& lhs, T const& rhs) const
+        {
+            return (lhs < rhs) ? lhs : rhs; // rhs is returned when both compare equal
+        }
+    };
+
+    template<typename T>
+    struct max // binary function object returning the larger of two values of type T (by value)
+    {
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC constexpr auto operator()(T const& lhs, T const& rhs) const
+        {
+            return (lhs > rhs) ? lhs : rhs; // rhs is returned when both compare equal
+        }
+    };
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/InheritFromList.hpp b/include/alpaka/meta/InheritFromList.hpp
new file mode 100644
index 0000000..e0a8fac
--- /dev/null
+++ b/include/alpaka/meta/InheritFromList.hpp
@@ -0,0 +1,16 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+namespace alpaka::meta
+{
+    template<typename TBaseList>
+    class InheritFromList; // primary template is only declared; TBaseList must be an instantiation SomeList<TBases...>
+
+    template<template<typename...> class TList, typename... TBases>
+    class InheritFromList<TList<TBases...>> : public TBases... // publicly derives from every element of the list
+    {
+    };
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/IntegerSequence.hpp b/include/alpaka/meta/IntegerSequence.hpp
new file mode 100644
index 0000000..bc8bfac
--- /dev/null
+++ b/include/alpaka/meta/IntegerSequence.hpp
@@ -0,0 +1,125 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/meta/Set.hpp"
+
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        template<typename TDstType, typename TIntegerSequence>
+        struct ConvertIntegerSequence;
+
+        template<typename TDstType, typename T, T... Tvals>
+        struct ConvertIntegerSequence<TDstType, std::integer_sequence<T, Tvals...>>
+        {
+            using type = std::integer_sequence<TDstType, static_cast<TDstType>(Tvals)...>; // cast every value
+        };
+    } // namespace detail
+
+    template<typename TDstType, typename TIntegerSequence>
+    using ConvertIntegerSequence = typename detail::ConvertIntegerSequence<TDstType, TIntegerSequence>::type;
+
+    namespace detail
+    {
+        template<bool TisSizeNegative, bool TbIsBegin, typename T, T Tbegin, typename TIntCon, typename TIntSeq>
+        struct MakeIntegerSequenceHelper // primary template: has no ::type, only reports a negative size
+        {
+            static_assert(!TisSizeNegative, "MakeIntegerSequence<T, N> requires N to be non-negative.");
+        };
+
+        template<typename T, T Tbegin, T... Tvals>
+        struct MakeIntegerSequenceHelper<
+            false,
+            true,
+            T,
+            Tbegin,
+            std::integral_constant<T, Tbegin>,
+            std::integer_sequence<T, Tvals...>>
+        {
+            using type = std::integer_sequence<T, Tvals...>; // recursion end: the counter has reached Tbegin
+        };
+
+        template<typename T, T Tbegin, T TIdx, T... Tvals>
+        struct MakeIntegerSequenceHelper<
+            false,
+            false,
+            T,
+            Tbegin,
+            std::integral_constant<T, TIdx>,
+            std::integer_sequence<T, Tvals...>>
+        {
+            using type = typename MakeIntegerSequenceHelper<
+                false,
+                TIdx == (Tbegin + 1), // the next step is the last one
+                T,
+                Tbegin,
+                std::integral_constant<T, TIdx - 1>, // count down towards Tbegin ...
+                std::integer_sequence<T, TIdx - 1, Tvals...>>::type; // ... and prepend the new value
+        };
+    } // namespace detail
+
+    template<typename T, T Tbegin, T Tsize> // yields the values Tbegin, Tbegin + 1, ..., Tbegin + Tsize - 1
+    using MakeIntegerSequenceOffset = typename detail::MakeIntegerSequenceHelper<
+        (Tsize < 0),
+        (Tsize == 0),
+        T,
+        Tbegin,
+        std::integral_constant<T, Tbegin + Tsize>,
+        std::integer_sequence<T>>::type;
+
+    //! Checks if the integral values are unique.
+    template<typename T, T... Tvals>
+    struct IntegralValuesUnique
+    {
+        static constexpr bool value = meta::IsParameterPackSet<std::integral_constant<T, Tvals>...>::value;
+    };
+
+    //! Checks if the values in the integer sequence are unique (primary template, declaration only).
+    template<typename TIntegerSequence>
+    struct IntegerSequenceValuesUnique;
+
+    //! Checks if the values in the integer sequence are unique.
+    template<typename T, T... Tvals>
+    struct IntegerSequenceValuesUnique<std::integer_sequence<T, Tvals...>>
+    {
+        static constexpr bool value = IntegralValuesUnique<T, Tvals...>::value;
+    };
+
+    //! Checks if the integral values are within the given range (primary template, declaration only).
+    template<typename T, T Tmin, T Tmax, T... Tvals>
+    struct IntegralValuesInRange;
+
+    //! Checks if the integral values are within the given range. An empty value list is always in range.
+    template<typename T, T Tmin, T Tmax>
+    struct IntegralValuesInRange<T, Tmin, Tmax>
+    {
+        static constexpr bool value = true;
+    };
+
+    //! Checks if the integral values are within the given range. Both Tmin and Tmax are inclusive.
+    template<typename T, T Tmin, T Tmax, T I, T... Tvals>
+    struct IntegralValuesInRange<T, Tmin, Tmax, I, Tvals...>
+    {
+        static constexpr bool value
+            = (I >= Tmin) && (I <= Tmax) && IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
+    };
+
+    //! Checks if the values in the integer sequence are within the given range (primary template, declaration only).
+    template<typename TIntegerSequence, typename T, T Tmin, T Tmax>
+    struct IntegerSequenceValuesInRange;
+
+    //! Checks if the values in the integer sequence are within the given range. Both Tmin and Tmax are inclusive.
+    template<typename T, T... Tvals, T Tmin, T Tmax>
+    struct IntegerSequenceValuesInRange<std::integer_sequence<T, Tvals...>, T, Tmin, Tmax>
+    {
+        static constexpr bool value = IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
+    };
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Integral.hpp b/include/alpaka/meta/Integral.hpp
new file mode 100644
index 0000000..48f4867
--- /dev/null
+++ b/include/alpaka/meta/Integral.hpp
@@ -0,0 +1,56 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <type_traits>
+
+namespace alpaka::meta
+{
+    //! The trait is true if all values of TSubset are contained in TSuperset.
+    template<typename TSuperset, typename TSubset>
+    using IsIntegralSuperset = std::integral_constant<
+        bool,
+        std::is_integral_v<TSuperset> && std::is_integral_v<TSubset>
+            && (
+                // If the signedness is equal, the size of the superset has to be greater or equal.
+                ((std::is_unsigned_v<TSuperset>
+                  == std::is_unsigned_v<TSubset>) &&(sizeof(TSuperset) >= sizeof(TSubset)))
+                // If the signedness differs, the superset has to be strictly larger (which side is signed is not checked).
+                || ((std::is_unsigned_v<TSuperset> != std::is_unsigned_v<TSubset>) &&(
+                    sizeof(TSuperset) > sizeof(TSubset))))>;
+
+    //! The type that has the higher max value.
+    template<typename T0, typename T1>
+    using HigherMax = std::conditional_t<
+        (sizeof(T0) > sizeof(T1)),
+        T0,
+        std::conditional_t<((sizeof(T0) == sizeof(T1)) && std::is_unsigned_v<T0> && std::is_signed_v<T1>), T0, T1>>;
+
+    //! The type that has the lower max value.
+    template<typename T0, typename T1>
+    using LowerMax = std::conditional_t<
+        (sizeof(T0) < sizeof(T1)),
+        T0,
+        std::conditional_t<((sizeof(T0) == sizeof(T1)) && std::is_signed_v<T0> && std::is_unsigned_v<T1>), T0, T1>>;
+
+    //! The type that has the higher min value. If both types have the same min value, the type with the wider
+    //! range is chosen.
+    template<typename T0, typename T1>
+    using HigherMin = std::conditional_t<
+        (std::is_unsigned_v<T0> == std::is_unsigned_v<T1>),
+        std::conditional_t<
+            std::is_unsigned_v<T0>,
+            std::conditional_t<(sizeof(T0) < sizeof(T1)), T1, T0>,
+            std::conditional_t<(sizeof(T0) < sizeof(T1)), T0, T1>>,
+        std::conditional_t<std::is_unsigned_v<T0>, T0, T1>>;
+
+    //! The type that has the lower min value. If both types have the same min value, the type with the wider range
+    //! is chosen.
+    template<typename T0, typename T1>
+    using LowerMin = std::conditional_t<
+        (std::is_unsigned_v<T0> == std::is_unsigned_v<T1>),
+        std::conditional_t<(sizeof(T0) > sizeof(T1)), T0, T1>,
+        std::conditional_t<std::is_signed_v<T0>, T0, T1>>;
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/IsArrayOrVector.hpp b/include/alpaka/meta/IsArrayOrVector.hpp
new file mode 100644
index 0000000..f755916
--- /dev/null
+++ b/include/alpaka/meta/IsArrayOrVector.hpp
@@ -0,0 +1,65 @@
+/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/vec/Vec.hpp"
+
+#include <functional>
+#include <numeric>
+#include <type_traits>
+#include <vector>
+
+namespace alpaka::meta
+{
+    /** Checks whether T is an array or a vector type
+     *
+     * @tparam T a type to check
+     */
+    template<typename T>
+    struct IsArrayOrVector : std::false_type
+    {
+    };
+
+    /** Specialization of \a IsArrayOrVector for vector types
+     *
+     * @tparam T inner type held in the vector
+     * @tparam A vector allocator
+     */
+    template<typename T, typename A>
+    struct IsArrayOrVector<std::vector<T, A>> : std::true_type
+    {
+    };
+
+    /** Specialization of \a IsArrayOrVector for plain arrays
+     *
+     * @tparam T inner type held in the array
+     * @tparam N size of the array
+     */
+    template<typename T, std::size_t N>
+    struct IsArrayOrVector<T[N]> : std::true_type
+    {
+    };
+
+    /** Specialization of \a IsArrayOrVector for std::array
+     *
+     * @tparam T inner type held in the array
+     * @tparam N size of the array
+     */
+    template<typename T, std::size_t N>
+    struct IsArrayOrVector<std::array<T, N>> : std::true_type
+    {
+    };
+
+    /** Specialization of \a IsArrayOrVector for alpaka::Vec
+     *
+     * @tparam T inner type held in the array
+     * @tparam N size of the array
+     */
+    template<typename T, typename N>
+    struct IsArrayOrVector<alpaka::Vec<N, T>> : std::true_type
+    {
+    };
+
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/IsStrictBase.hpp b/include/alpaka/meta/IsStrictBase.hpp
new file mode 100644
index 0000000..80ece93
--- /dev/null
+++ b/include/alpaka/meta/IsStrictBase.hpp
@@ -0,0 +1,15 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <type_traits>
+
+namespace alpaka::meta
+{
+    //! The trait is true if TDerived is derived from TBase but is not TBase itself.
+    template<typename TBase, typename TDerived>
+    using IsStrictBase = std::
+        integral_constant<bool, std::is_base_of_v<TBase, TDerived> && !std::is_same_v<TBase, std::decay_t<TDerived>>>;
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/NdLoop.hpp b/include/alpaka/meta/NdLoop.hpp
new file mode 100644
index 0000000..a9a3267
--- /dev/null
+++ b/include/alpaka/meta/NdLoop.hpp
@@ -0,0 +1,85 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <utility>
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TIndex, typename TExtentVec, typename TFnObj>
+        ALPAKA_FN_HOST_ACC constexpr void ndLoopImpl(
+            std::index_sequence<>,
+            TIndex& idx,
+            TExtentVec const&,
+            TFnObj const& f)
+        {
+            f(idx);
+        }
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<std::size_t Tdim0, std::size_t... Tdims, typename TIndex, typename TExtentVec, typename TFnObj>
+        ALPAKA_FN_HOST_ACC constexpr void ndLoopImpl(
+            std::index_sequence<Tdim0, Tdims...>,
+            TIndex& idx,
+            TExtentVec const& extent,
+            TFnObj const& f)
+        {
+            static_assert(Dim<TIndex>::value > 0u, "The dimension given to ndLoop has to be larger than zero!");
+            static_assert(
+                Dim<TIndex>::value == Dim<TExtentVec>::value,
+                "The dimensions of the iteration vector and the extent vector have to be identical!");
+            static_assert(Dim<TIndex>::value > Tdim0, "The current dimension has to be in the range [0,dim-1]!");
+
+            for(idx[Tdim0] = 0u; idx[Tdim0] < extent[Tdim0]; ++idx[Tdim0])
+            {
+                ndLoopImpl(std::index_sequence<Tdims...>{}, idx, extent, f);
+            }
+        }
+    } // namespace detail
+
+    //! Loops over an n-dimensional iteration index variable calling f(idx) for each iteration.
+    //! The loops are nested in the order given by the index_sequence with the first element being the outermost
+    //! and the last index the innermost loop.
+    //!
+    //! \param indexSequence A sequence of indices being a permutation of the values [0, dim-1].
+    //! \param extent N-dimensional loop extent.
+    //! \param f The function called at each iteration.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TExtentVec, typename TFnObj, std::size_t... Tdims>
+    ALPAKA_FN_HOST_ACC auto ndLoop(
+        [[maybe_unused]] std::index_sequence<Tdims...> indexSequence,
+        TExtentVec const& extent,
+        TFnObj const& f) -> void
+    {
+        static_assert(
+            ((Tdims < Dim<TExtentVec>::value) && ...), // strict upper bound: dim itself is not a valid index
+            "The values in the index_sequence have to be in the range [0,dim-1]!");
+        static_assert(
+            IntegerSequenceValuesUnique<std::index_sequence<Tdims...>>::value,
+            "The values in the index_sequence have to be unique!");
+
+        auto idx = Vec<Dim<TExtentVec>, Idx<TExtentVec>>::zeros();
+        detail::ndLoopImpl(std::index_sequence<Tdims...>{}, idx, extent, f);
+    }
+
+    //! Loops over an n-dimensional iteration index variable calling f(idx) for each iteration.
+    //! The loops are nested from index zero outermost to index (dim-1) innermost.
+    //!
+    //! \param extent N-dimensional loop extent.
+    //! \param f The function called at each iteration.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TExtentVec, typename TFnObj>
+    ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(TExtentVec const& extent, TFnObj const& f) -> void
+    {
+        ndLoop(std::make_index_sequence<Dim<TExtentVec>::value>(), extent, f);
+    }
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/NonZero.hpp b/include/alpaka/meta/NonZero.hpp
new file mode 100644
index 0000000..49d9bf9
--- /dev/null
+++ b/include/alpaka/meta/NonZero.hpp
@@ -0,0 +1,27 @@
+/* Copyright 2023 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <type_traits>
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        template<typename T>
+        struct NonZeroImpl : std::false_type
+        {
+        };
+
+        template<typename T, T TValue>
+        struct NonZeroImpl<std::integral_constant<T, TValue>> : std::bool_constant<TValue != static_cast<T>(0)>
+        {
+        };
+    } // namespace detail
+
+    template<typename T>
+    using NonZero = typename detail::NonZeroImpl<T>;
+
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Set.hpp b/include/alpaka/meta/Set.hpp
new file mode 100644
index 0000000..a4e387c
--- /dev/null
+++ b/include/alpaka/meta/Set.hpp
@@ -0,0 +1,60 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <utility>
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        //! Empty dependent type.
+        template<typename T>
+        struct Empty
+        {
+        };
+
+        template<typename... Ts>
+        struct IsParameterPackSetImpl;
+
+        template<>
+        struct IsParameterPackSetImpl<>
+        {
+            static constexpr bool value = true;
+        };
+
+        // Based on code by Roland Bock: https://gist.github.com/rbock/ad8eedde80c060132a18
+        // Linearly inherits from Empty<T> and checks if it has already inherited from this type.
+        template<typename T, typename... Ts>
+        struct IsParameterPackSetImpl<T, Ts...>
+            : public IsParameterPackSetImpl<Ts...>
+            , public virtual Empty<T>
+        {
+            using Base = IsParameterPackSetImpl<Ts...>;
+
+            static constexpr bool value = Base::value && !std::is_base_of_v<Empty<T>, Base>;
+        };
+    } // namespace detail
+
+    //! Trait that tells if the parameter pack contains only unique (no equal) types.
+    template<typename... Ts>
+    using IsParameterPackSet = detail::IsParameterPackSetImpl<Ts...>;
+
+    namespace detail
+    {
+        template<typename TList>
+        struct IsSetImpl;
+
+        template<template<typename...> class TList, typename... Ts>
+        struct IsSetImpl<TList<Ts...>>
+        {
+            static constexpr bool value = IsParameterPackSet<Ts...>::value;
+        };
+    } // namespace detail
+
+    //! Trait that tells if the template contains only unique (no equal) types.
+    template<typename TList>
+    using IsSet = detail::IsSetImpl<TList>;
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Transform.hpp b/include/alpaka/meta/Transform.hpp
new file mode 100644
index 0000000..d7d079a
--- /dev/null
+++ b/include/alpaka/meta/Transform.hpp
@@ -0,0 +1,22 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        template<typename Ts, template<typename...> class TOp>
+        struct TransformImpl;
+
+        template<template<typename...> class TList, typename... Ts, template<typename...> class TOp>
+        struct TransformImpl<TList<Ts...>, TOp>
+        {
+            using type = TList<TOp<Ts>...>;
+        };
+    } // namespace detail
+    template<typename Ts, template<typename...> class TOp>
+    using Transform = typename detail::TransformImpl<Ts, TOp>::type;
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/TypeListOps.hpp b/include/alpaka/meta/TypeListOps.hpp
new file mode 100644
index 0000000..c63b656
--- /dev/null
+++ b/include/alpaka/meta/TypeListOps.hpp
@@ -0,0 +1,95 @@
+/* Copyright 2022 Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <tuple>
+#include <type_traits>
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        template<typename List>
+        struct Front
+        {
+        };
+
+        template<template<typename...> class List, typename Head, typename... Tail>
+        struct Front<List<Head, Tail...>>
+        {
+            using type = Head;
+        };
+    } // namespace detail
+
+    template<typename List>
+    using Front = typename detail::Front<List>::type;
+
+    template<typename List, typename Value>
+    struct Contains : std::false_type
+    {
+    };
+
+    template<template<typename...> class List, typename Head, typename... Tail, typename Value>
+    struct Contains<List<Head, Tail...>, Value>
+    {
+        static constexpr bool value = std::is_same_v<Head, Value> || Contains<List<Tail...>, Value>::value;
+    };
+
+    // copied from https://stackoverflow.com/a/51073558/22035743
+    template<typename T>
+    struct IsList : std::false_type
+    {
+    };
+
+    template<template<typename...> class TList, typename... TTypes>
+    struct IsList<TList<TTypes...>> : std::true_type
+    {
+    };
+
+    //! \brief Checks whether the specified type is a list. List is a type with a variadic number of template types.
+    template<typename T>
+    constexpr bool isList = IsList<std::decay_t<T>>::value;
+
+    namespace detail
+    {
+        template<template<typename...> class TListType, typename TType, typename = void>
+        struct ToListImpl
+        {
+            using type = TListType<TType>;
+        };
+
+        template<template<typename...> class TListType, typename TList>
+        struct ToListImpl<TListType, TList, std::enable_if_t<alpaka::meta::isList<TList>>>
+        {
+            using type = TList;
+        };
+    } // namespace detail
+
+    //! \brief Takes an arbitrary number of types (T) and creates a type list of type TListType with the types (T). If
+    //! T is a single template parameter and it satisfies alpaka::meta::isList, the type of the structure is T (no type
+    //! change). For example std::tuple can be used as TListType.
+    //! \tparam TListType type of the created list
+    //! \tparam T possible list types or type list
+    template<template<typename...> class TListType, typename... T>
+    struct ToList;
+
+    template<template<typename...> class TListType, typename T>
+    struct ToList<TListType, T> : detail::ToListImpl<TListType, T>
+    {
+    };
+
+    template<template<typename...> class TListType, typename T, typename... Ts>
+    struct ToList<TListType, T, Ts...>
+    {
+        using type = TListType<T, Ts...>;
+    };
+
+    //! \brief If T is a single argument and a type list (fulfills alpaka::meta::isList), the return type is T.
+    //! Otherwise, std::tuple is returned with T types as template parameters.
+    template<typename... T>
+    using ToTuple = typename ToList<std::tuple, T...>::type;
+
+
+} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Unique.hpp b/include/alpaka/meta/Unique.hpp
new file mode 100644
index 0000000..ea20ff2
--- /dev/null
+++ b/include/alpaka/meta/Unique.hpp
@@ -0,0 +1,41 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <type_traits>
+
+namespace alpaka::meta
+{
+    namespace detail
+    {
+        template<typename T, typename... Ts>
+        struct UniqueHelper
+        {
+            using type = T;
+        };
+
+        template<template<typename...> class TList, typename... Ts, typename U, typename... Us>
+        struct UniqueHelper<TList<Ts...>, U, Us...>
+            : std::conditional_t<
+                  (std::is_same_v<U, Ts> || ...),
+                  UniqueHelper<TList<Ts...>, Us...>,
+                  UniqueHelper<TList<Ts..., U>, Us...>>
+        {
+        };
+
+        template<typename T>
+        struct UniqueImpl;
+
+        template<template<typename...> class TList, typename... Ts>
+        struct UniqueImpl<TList<Ts...>>
+        {
+            using type = typename UniqueHelper<TList<>, Ts...>::type;
+        };
+    } // namespace detail
+
+    //! Trait that returns a list with only unique (no equal) types (a set). Duplicates will be filtered out.
+    template<typename TList>
+    using Unique = typename detail::UniqueImpl<TList>::type;
+} // namespace alpaka::meta
diff --git a/include/alpaka/offset/Traits.hpp b/include/alpaka/offset/Traits.hpp
new file mode 100644
index 0000000..c2edb3b
--- /dev/null
+++ b/include/alpaka/offset/Traits.hpp
@@ -0,0 +1,132 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <type_traits>
+
+namespace alpaka
+{
+    //! The offset traits.
+    namespace trait
+    {
+        //! The per-dimension offset get trait (deprecated in favor of GetOffsets).
+        //!
+        //! If not specialized explicitly it returns 0.
+        template<typename TIdx, typename TOffsets, typename TSfinae = void>
+        struct [[deprecated("Specialize GetOffsets instead")]] GetOffset
+        {
+            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const&) -> Idx<TOffsets>
+            {
+                return static_cast<Idx<TOffsets>>(0);
+            }
+        };
+
+        //! The GetOffsets trait for getting the offsets of an object as an alpaka::Vec.
+        template<typename TExtent, typename TSfinae = void>
+        struct GetOffsets;
+    } // namespace trait
+
+    //! \return The offset in the given dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<std::size_t Tidx, typename TOffsets>
+    [[deprecated("use getOffsets(offsets)[Tidx] instead")]] ALPAKA_FN_HOST_ACC auto getOffset(TOffsets const& offsets)
+        -> Idx<TOffsets>
+    {
+#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+        return trait::GetOffset<DimInt<Tidx>, TOffsets>::getOffset(offsets);
+#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+    }
+
+    //! \return The offsets of the given object.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T>
+    ALPAKA_FN_HOST_ACC constexpr auto getOffsets(T const& object) -> Vec<Dim<T>, Idx<T>>
+    {
+        return trait::GetOffsets<T>{}(object);
+    }
+
+    //! \tparam T has to specialize GetOffsets.
+    //! \return The offset vector.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename T>
+    ALPAKA_FN_HOST_ACC constexpr auto getOffsetVec(T const& object = {}) -> Vec<Dim<T>, Idx<T>>
+    {
+        return getOffsets(object);
+    }
+
+    //! \tparam T has to specialize GetOffsets.
+    //! \return The offset vector but only the last TDim elements.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename T>
+    ALPAKA_FN_HOST_ACC constexpr auto getOffsetVecEnd(T const& object = {}) -> Vec<TDim, Idx<T>>
+    {
+        static_assert(TDim::value <= Dim<T>::value, "Cannot get more items than the offsets hold");
+
+        auto const o = getOffsets(object);
+        Vec<TDim, Idx<T>> v;
+        for(unsigned i = 0; i < TDim::value; i++)
+            v[i] = o[(Dim<T>::value - TDim::value) + i];
+        return v;
+    }
+
+    //! \return The offset in x dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOffsets>
+    ALPAKA_FN_HOST_ACC auto getOffsetX(TOffsets const& offsets = TOffsets()) -> Idx<TOffsets>
+    {
+        return getOffsets(offsets)[Dim<TOffsets>::value - 1u];
+    }
+
+    //! \return The offset in y dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOffsets>
+    ALPAKA_FN_HOST_ACC auto getOffsetY(TOffsets const& offsets = TOffsets()) -> Idx<TOffsets>
+    {
+        return getOffsets(offsets)[Dim<TOffsets>::value - 2u];
+    }
+
+    //! \return The offset in z dimension.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOffsets>
+    ALPAKA_FN_HOST_ACC auto getOffsetZ(TOffsets const& offsets = TOffsets()) -> Idx<TOffsets>
+    {
+        return getOffsets(offsets)[Dim<TOffsets>::value - 3u];
+    }
+
+    namespace trait
+    {
+        //! The Vec offset get trait specialization.
+        template<typename TDim, typename TVal>
+        struct GetOffsets<Vec<TDim, TVal>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC constexpr auto operator()(Vec<TDim, TVal> const& offsets) const -> Vec<TDim, TVal>
+            {
+                return offsets;
+            }
+        };
+
+        //! The integral offsets get trait specialization (a single integral is returned as a one-element Vec).
+        template<typename TIntegral>
+        struct GetOffsets<TIntegral, std::enable_if_t<std::is_integral_v<TIntegral>>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC constexpr auto operator()(TIntegral const& i) const
+            {
+                return Vec{i};
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/platform/PlatformCpu.hpp b/include/alpaka/platform/PlatformCpu.hpp
new file mode 100644
index 0000000..c431fd4
--- /dev/null
+++ b/include/alpaka/platform/PlatformCpu.hpp
@@ -0,0 +1,69 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+#include <sstream>
+#include <vector>
+
+namespace alpaka
+{
+    //! The CPU device platform.
+    struct PlatformCpu : concepts::Implements<ConceptPlatform, PlatformCpu>
+    {
+#if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0)                                     \
+    && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0)
+        // This is a workaround for g++-11 bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96295
+        // g++-11 complains in *all* places where a PlatformCpu is used, that it "may be used uninitialized"
+        char c = {};
+#endif
+    };
+
+    namespace trait
+    {
+        //! The CPU platform device type trait specialization.
+        template<>
+        struct DevType<PlatformCpu>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU platform device count get trait specialization.
+        template<>
+        struct GetDevCount<PlatformCpu>
+        {
+            ALPAKA_FN_HOST static auto getDevCount(PlatformCpu const&) -> std::size_t
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                return 1;
+            }
+        };
+
+        //! The CPU platform device get trait specialization.
+        template<>
+        struct GetDevByIdx<PlatformCpu>
+        {
+            ALPAKA_FN_HOST static auto getDevByIdx(PlatformCpu const& platform, std::size_t const& devIdx) -> DevCpu
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                std::size_t const devCount = getDevCount(platform);
+                if(devIdx >= devCount)
+                {
+                    std::stringstream ssErr;
+                    ssErr << "Unable to return device handle for CPU device with index " << devIdx
+                          << " because there are only " << devCount << " devices!";
+                    throw std::runtime_error(ssErr.str());
+                }
+
+                return {};
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/platform/PlatformCpuSycl.hpp b/include/alpaka/platform/PlatformCpuSycl.hpp
new file mode 100644
index 0000000..4fdda8d
--- /dev/null
+++ b/include/alpaka/platform/PlatformCpuSycl.hpp
@@ -0,0 +1,33 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/platform/PlatformGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        template<>
+        struct SYCLDeviceSelector<TagCpuSycl>
+        {
+            auto operator()(sycl::device const& dev) const -> int
+            {
+                return dev.is_cpu() ? 1 : -1;
+            }
+        };
+    } // namespace detail
+
+    //! The SYCL device manager.
+    using PlatformCpuSycl = PlatformGenericSycl<TagCpuSycl>;
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/platform/PlatformCudaRt.hpp b/include/alpaka/platform/PlatformCudaRt.hpp
new file mode 100644
index 0000000..9bf76fa
--- /dev/null
+++ b/include/alpaka/platform/PlatformCudaRt.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiCudaRt.hpp"
+#include "alpaka/platform/PlatformUniformCudaHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+namespace alpaka
+{
+    //! The CUDA RT platform.
+    using PlatformCudaRt = PlatformUniformCudaHipRt<ApiCudaRt>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/platform/PlatformFpgaSyclIntel.hpp b/include/alpaka/platform/PlatformFpgaSyclIntel.hpp
new file mode 100644
index 0000000..a3a7342
--- /dev/null
+++ b/include/alpaka/platform/PlatformFpgaSyclIntel.hpp
@@ -0,0 +1,51 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/platform/PlatformGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        // Prevent clang from annoying us with warnings about emitting too many vtables. These are discarded by the
+        // linker anyway.
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wweak-vtables"
+#    endif
+        template<>
+        struct SYCLDeviceSelector<TagFpgaSyclIntel>
+        {
+#    ifdef ALPAKA_FPGA_EMULATION
+            static constexpr auto platform_name = "Intel(R) FPGA Emulation Platform for OpenCL(TM)";
+#    else
+            static constexpr auto platform_name = "Intel(R) FPGA SDK for OpenCL(TM)";
+#    endif
+
+            auto operator()(sycl::device const& dev) const -> int
+            {
+                auto const& platform = dev.get_platform().get_info<sycl::info::platform::name>();
+                auto const is_intel_fpga = dev.is_accelerator() && (platform == platform_name);
+
+                return is_intel_fpga ? 1 : -1;
+            }
+        };
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+    } // namespace detail
+
+    //! The SYCL device manager.
+    using PlatformFpgaSyclIntel = PlatformGenericSycl<TagFpgaSyclIntel>;
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/platform/PlatformGenericSycl.hpp b/include/alpaka/platform/PlatformGenericSycl.hpp
new file mode 100644
index 0000000..12e00fc
--- /dev/null
+++ b/include/alpaka/platform/PlatformGenericSycl.hpp
@@ -0,0 +1,746 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Sycl.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/platform/Traits.hpp"
+
+#include <cstddef>
+#include <exception>
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+#    include <iostream>
+#endif
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wswitch-default"
+#    endif
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        template<typename TTag>
+        struct SYCLDeviceSelector; // Specialized per tag; passed to the sycl::platform constructor below.
+    } // namespace detail
+
+    //! The SYCL platform: holds the selected sycl::platform, its devices and one context created over them.
+    template<typename TTag>
+    struct PlatformGenericSycl : concepts::Implements<ConceptPlatform, PlatformGenericSycl<TTag>>
+    {
+        PlatformGenericSycl()
+            : m_platform{detail::SYCLDeviceSelector<TTag>{}}
+            , m_devices(m_platform.get_devices())
+            , m_context{sycl::context{
+                  m_devices,
+                  [](sycl::exception_list asyncErrors)
+                  {
+                      auto report = std::stringstream{};
+                      report << "Caught asynchronous SYCL exception(s):\n";
+                      for(std::exception_ptr pending : asyncErrors)
+                      {
+                          try
+                          {
+                              std::rethrow_exception(pending);
+                          }
+                          catch(sycl::exception const& syclErr)
+                          {
+                              report << syclErr.what() << " (" << syclErr.code() << ")\n";
+                          }
+                      }
+                      throw std::runtime_error(report.str());
+                  }}}
+        {
+        }
+        //! \return the sycl::platform chosen by detail::SYCLDeviceSelector<TTag>.
+        [[nodiscard]] auto syclPlatform() -> sycl::platform&
+        {
+            return m_platform;
+        }
+        //! \return the sycl::platform chosen by detail::SYCLDeviceSelector<TTag> (const overload).
+        [[nodiscard]] auto syclPlatform() const -> sycl::platform const&
+        {
+            return m_platform;
+        }
+        //! \return the devices obtained from the platform at construction.
+        [[nodiscard]] auto syclDevices() -> std::vector<sycl::device>&
+        {
+            return m_devices;
+        }
+        //! \return the devices obtained from the platform at construction (const overload).
+        [[nodiscard]] auto syclDevices() const -> std::vector<sycl::device> const&
+        {
+            return m_devices;
+        }
+        //! \return the context created over all devices of this platform.
+        [[nodiscard]] auto syclContext() -> sycl::context&
+        {
+            return m_context;
+        }
+        //! \return the context created over all devices of this platform (const overload).
+        [[nodiscard]] auto syclContext() const -> sycl::context const&
+        {
+            return m_context;
+        }
+    // Declaration order matters: m_devices is initialized from m_platform, m_context from m_devices.
+    private:
+        sycl::platform m_platform;
+        std::vector<sycl::device> m_devices;
+        sycl::context m_context;
+    };
+
+    namespace trait
+    {
+        //! The SYCL platform device type trait specialization: devices of this platform are DevGenericSycl<TTag>.
+        template<typename TTag>
+        struct DevType<PlatformGenericSycl<TTag>>
+        {
+            using type = DevGenericSycl<TTag>;
+        };
+
+        //! Device count trait specialization: reports how many SYCL devices the platform holds.
+        template<typename TTag>
+        struct GetDevCount<PlatformGenericSycl<TTag>>
+        {
+            static auto getDevCount(PlatformGenericSycl<TTag> const& platform) -> std::size_t
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                auto const& syclDevs = platform.syclDevices();
+                return syclDevs.size();
+            }
+        };
+
+        //! The SYCL platform device get trait specialization.
+        template<typename TTag>
+        struct GetDevByIdx<PlatformGenericSycl<TTag>>
+        {
+            //! \return the device at index \p devIdx; throws std::runtime_error if \p devIdx is out of range.
+            static auto getDevByIdx(PlatformGenericSycl<TTag> const& platform, std::size_t const& devIdx)
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                auto const& devices = platform.syclDevices();
+                if(devIdx >= devices.size())
+                {
+                    auto ss_err = std::stringstream{};
+                    ss_err << "Unable to return device handle for device " << devIdx << ". There are only "
+                           << devices.size() << " SYCL devices!";
+                    throw std::runtime_error(ss_err.str());
+                }
+
+                auto sycl_dev = devices.at(devIdx);
+
+                // Log this device. The ": " keeps the function name from running into the device name.
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDeviceProperties(sycl_dev);
+#    elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                std::cout << __func__ << ": " << sycl_dev.template get_info<sycl::info::device::name>() << '\n';
+#    endif
+                using SyclPlatform = alpaka::PlatformGenericSycl<TTag>;
+                return typename DevType<SyclPlatform>::type{sycl_dev, platform.syclContext()};
+            }
+
+        private:
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            //! Prints the properties queried from \p device to std::cout and flushes the stream afterwards.
+            static auto printDeviceProperties(sycl::device const& device) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                constexpr auto KiB = std::size_t{1024};
+                constexpr auto MiB = KiB * KiB;
+
+                std::cout << "Device type: ";
+                switch(device.get_info<sycl::info::device::device_type>())
+                {
+                case sycl::info::device_type::cpu:
+                    std::cout << "CPU";
+                    break;
+
+                case sycl::info::device_type::gpu:
+                    std::cout << "GPU";
+                    break;
+
+                case sycl::info::device_type::accelerator:
+                    std::cout << "Accelerator";
+                    break;
+
+                case sycl::info::device_type::custom:
+                    std::cout << "Custom";
+                    break;
+
+                case sycl::info::device_type::automatic:
+                    std::cout << "Automatic";
+                    break;
+
+                case sycl::info::device_type::host:
+                    std::cout << "Host";
+                    break;
+
+                // The SYCL spec forbids the return of device_type::all
+                // Including this here to prevent warnings because of
+                // missing cases
+                case sycl::info::device_type::all:
+                    std::cout << "All";
+                    break;
+                }
+                std::cout << '\n';
+
+                std::cout << "Name: " << device.get_info<sycl::info::device::name>() << '\n';
+
+                std::cout << "Vendor: " << device.get_info<sycl::info::device::vendor>() << '\n';
+
+                std::cout << "Vendor ID: " << device.get_info<sycl::info::device::vendor_id>() << '\n';
+
+                std::cout << "Driver version: " << device.get_info<sycl::info::device::driver_version>() << '\n';
+
+                std::cout << "SYCL version: " << device.get_info<sycl::info::device::version>() << '\n';
+
+#        if !defined(BOOST_COMP_ICPX)
+                // Not defined by Level Zero back-end
+                std::cout << "Backend version: " << device.get_info<sycl::info::device::backend_version>() << '\n';
+#        endif
+
+                std::cout << "Aspects: " << '\n';
+
+#        if defined(BOOST_COMP_ICPX)
+#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
+                // These aspects are missing from oneAPI versions < 2023.2.0
+                if(device.has(sycl::aspect::emulated))
+                    std::cout << "\t* emulated\n";
+
+                if(device.has(sycl::aspect::host_debuggable))
+                    std::cout << "\t* debuggable using standard debuggers\n";
+#            endif
+#        endif
+
+                if(device.has(sycl::aspect::fp16))
+                    std::cout << "\t* supports sycl::half precision\n";
+
+                if(device.has(sycl::aspect::fp64))
+                    std::cout << "\t* supports double precision\n";
+
+                if(device.has(sycl::aspect::atomic64))
+                    std::cout << "\t* supports 64-bit atomics\n";
+
+                if(device.has(sycl::aspect::image))
+                    std::cout << "\t* supports images\n";
+
+                if(device.has(sycl::aspect::online_compiler))
+                    std::cout << "\t* supports online compilation of device code\n";
+
+                if(device.has(sycl::aspect::online_linker))
+                    std::cout << "\t* supports online linking of device code\n";
+
+                if(device.has(sycl::aspect::queue_profiling))
+                    std::cout << "\t* supports queue profiling\n";
+
+                if(device.has(sycl::aspect::usm_device_allocations))
+                    std::cout << "\t* supports explicit USM allocations\n";
+
+                if(device.has(sycl::aspect::usm_host_allocations))
+                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n";
+
+                if(device.has(sycl::aspect::usm_atomic_host_allocations))
+                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n";
+
+                if(device.has(sycl::aspect::usm_shared_allocations))
+                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n";
+
+                if(device.has(sycl::aspect::usm_atomic_shared_allocations))
+                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n";
+
+                if(device.has(sycl::aspect::usm_system_allocations))
+                    std::cout << "\t* can access memory allocated by the system allocator\n";
+
+                std::cout << "Available compute units: " << device.get_info<sycl::info::device::max_compute_units>()
+                          << '\n';
+
+                std::cout << "Maximum work item dimensions: ";
+                auto dims = device.get_info<sycl::info::device::max_work_item_dimensions>();
+                std::cout << dims << std::endl;
+
+                std::cout << "Maximum number of work items:\n";
+                auto const wi_1D = device.get_info<sycl::info::device::max_work_item_sizes<1>>();
+                auto const wi_2D = device.get_info<sycl::info::device::max_work_item_sizes<2>>();
+                auto const wi_3D = device.get_info<sycl::info::device::max_work_item_sizes<3>>();
+                std::cout << "\t* 1D: (" << wi_1D.get(0) << ")\n";
+                std::cout << "\t* 2D: (" << wi_2D.get(0) << ", " << wi_2D.get(1) << ")\n";
+                std::cout << "\t* 3D: (" << wi_3D.get(0) << ", " << wi_3D.get(1) << ", " << wi_3D.get(2) << ")\n";
+
+                std::cout << "Maximum number of work items per work-group: "
+                          << device.get_info<sycl::info::device::max_work_group_size>() << '\n';
+
+                std::cout << "Maximum number of sub-groups per work-group: "
+                          << device.get_info<sycl::info::device::max_num_sub_groups>() << '\n';
+
+                std::cout << "Supported sub-group sizes: ";
+                auto const sg_sizes = device.get_info<sycl::info::device::sub_group_sizes>();
+                for(auto const& sz : sg_sizes)
+                    std::cout << sz << ", ";
+                std::cout << '\n';
+
+                std::cout << "Preferred native vector width (char): "
+                          << device.get_info<sycl::info::device::preferred_vector_width_char>() << '\n';
+
+                std::cout << "Native ISA vector width (char): "
+                          << device.get_info<sycl::info::device::native_vector_width_char>() << '\n';
+
+                std::cout << "Preferred native vector width (short): "
+                          << device.get_info<sycl::info::device::preferred_vector_width_short>() << '\n';
+
+                std::cout << "Native ISA vector width (short): "
+                          << device.get_info<sycl::info::device::native_vector_width_short>() << '\n';
+
+                std::cout << "Preferred native vector width (int): "
+                          << device.get_info<sycl::info::device::preferred_vector_width_int>() << '\n';
+
+                std::cout << "Native ISA vector width (int): "
+                          << device.get_info<sycl::info::device::native_vector_width_int>() << '\n';
+
+                std::cout << "Preferred native vector width (long): "
+                          << device.get_info<sycl::info::device::preferred_vector_width_long>() << '\n';
+
+                std::cout << "Native ISA vector width (long): "
+                          << device.get_info<sycl::info::device::native_vector_width_long>() << '\n';
+
+                std::cout << "Preferred native vector width (float): "
+                          << device.get_info<sycl::info::device::preferred_vector_width_float>() << '\n';
+
+                std::cout << "Native ISA vector width (float): "
+                          << device.get_info<sycl::info::device::native_vector_width_float>() << '\n';
+                // Double and half vector widths are only queried if the device reports the matching aspect.
+                if(device.has(sycl::aspect::fp64))
+                {
+                    std::cout << "Preferred native vector width (double): "
+                              << device.get_info<sycl::info::device::preferred_vector_width_double>() << '\n';
+
+                    std::cout << "Native ISA vector width (double): "
+                              << device.get_info<sycl::info::device::native_vector_width_double>() << '\n';
+                }
+
+                if(device.has(sycl::aspect::fp16))
+                {
+                    std::cout << "Preferred native vector width (half): "
+                              << device.get_info<sycl::info::device::preferred_vector_width_half>() << '\n';
+
+                    std::cout << "Native ISA vector width (half): "
+                              << device.get_info<sycl::info::device::native_vector_width_half>() << '\n';
+                }
+
+                std::cout << "Maximum clock frequency: " << device.get_info<sycl::info::device::max_clock_frequency>()
+                          << " MHz\n";
+
+                std::cout << "Address space size: " << device.get_info<sycl::info::device::address_bits>() << "-bit\n";
+
+                std::cout << "Maximum size of memory object allocation: "
+                          << device.get_info<sycl::info::device::max_mem_alloc_size>() << " bytes\n";
+                // Image limits are only queried if the device reports the image aspect.
+                if(device.has(sycl::aspect::image))
+                {
+                    std::cout << "Maximum number of simultaneous image object reads per kernel: "
+                              << device.get_info<sycl::info::device::max_read_image_args>() << '\n';
+
+                    std::cout << "Maximum number of simultaneous image writes per kernel: "
+                              << device.get_info<sycl::info::device::max_write_image_args>() << '\n';
+
+                    std::cout << "Maximum 1D/2D image width: "
+                              << device.get_info<sycl::info::device::image2d_max_width>() << " px\n";
+
+                    std::cout << "Maximum 2D image height: "
+                              << device.get_info<sycl::info::device::image2d_max_height>() << " px\n";
+
+                    std::cout << "Maximum 3D image width: " << device.get_info<sycl::info::device::image3d_max_width>()
+                              << " px\n";
+
+                    std::cout << "Maximum 3D image height: "
+                              << device.get_info<sycl::info::device::image3d_max_height>() << " px\n";
+
+                    std::cout << "Maximum 3D image depth: " << device.get_info<sycl::info::device::image3d_max_depth>()
+                              << " px\n";
+
+                    std::cout << "Maximum number of samplers per kernel: "
+                              << device.get_info<sycl::info::device::max_samplers>() << '\n';
+                }
+
+                std::cout << "Maximum kernel argument size: "
+                          << device.get_info<sycl::info::device::max_parameter_size>() << " bytes\n";
+
+                std::cout << "Memory base address alignment: "
+                          << device.get_info<sycl::info::device::mem_base_addr_align>() << " bit\n";
+
+                auto print_fp_config = [](std::string const& fp, std::vector<sycl::info::fp_config> const& conf)
+                {
+                    std::cout << fp << " precision floating-point capabilities:\n";
+                    // Prints "Yes" if val occurs in conf and "No" otherwise, followed by a newline.
+                    auto find_and_print = [&](sycl::info::fp_config val)
+                    {
+                        auto it = std::find(begin(conf), end(conf), val);
+                        std::cout << (it == std::end(conf) ? "No" : "Yes") << '\n';
+                    };
+
+                    std::cout << "\t* denorm support: ";
+                    find_and_print(sycl::info::fp_config::denorm);
+
+                    std::cout << "\t* INF & quiet NaN support: ";
+                    find_and_print(sycl::info::fp_config::inf_nan);
+
+                    std::cout << "\t* round to nearest even support: ";
+                    find_and_print(sycl::info::fp_config::round_to_nearest);
+
+                    std::cout << "\t* round to zero support: ";
+                    find_and_print(sycl::info::fp_config::round_to_zero);
+
+                    std::cout << "\t* round to infinity support: ";
+                    find_and_print(sycl::info::fp_config::round_to_inf);
+
+                    std::cout << "\t* IEEE754-2008 FMA support: ";
+                    find_and_print(sycl::info::fp_config::fma);
+
+                    std::cout << "\t* correctly rounded divide/sqrt support: ";
+                    find_and_print(sycl::info::fp_config::correctly_rounded_divide_sqrt);
+
+                    std::cout << "\t* software-implemented floating point operations: ";
+                    find_and_print(sycl::info::fp_config::soft_float);
+                };
+
+                if(device.has(sycl::aspect::fp16))
+                {
+                    auto const fp16_conf = device.get_info<sycl::info::device::half_fp_config>();
+                    print_fp_config("Half", fp16_conf);
+                }
+
+                auto const fp32_conf = device.get_info<sycl::info::device::single_fp_config>();
+                print_fp_config("Single", fp32_conf);
+
+                if(device.has(sycl::aspect::fp64))
+                {
+                    auto const fp64_conf = device.get_info<sycl::info::device::double_fp_config>();
+                    print_fp_config("Double", fp64_conf);
+                }
+
+                std::cout << "Global memory cache type: ";
+                auto has_global_mem_cache = false;
+                switch(device.get_info<sycl::info::device::global_mem_cache_type>())
+                {
+                case sycl::info::global_mem_cache_type::none:
+                    std::cout << "none";
+                    break;
+
+                case sycl::info::global_mem_cache_type::read_only:
+                    std::cout << "read-only";
+                    has_global_mem_cache = true;
+                    break;
+
+                case sycl::info::global_mem_cache_type::read_write:
+                    std::cout << "read-write";
+                    has_global_mem_cache = true;
+                    break;
+                }
+                std::cout << '\n';
+                // Cache line size and cache size are only printed for read-only or read-write caches.
+                if(has_global_mem_cache)
+                {
+                    std::cout << "Global memory cache line size: "
+                              << device.get_info<sycl::info::device::global_mem_cache_line_size>() << " bytes\n";
+
+                    std::cout << "Global memory cache size: "
+                              << device.get_info<sycl::info::device::global_mem_cache_size>() / KiB << " KiB\n";
+                }
+
+                std::cout << "Global memory size: " << device.get_info<sycl::info::device::global_mem_size>() / MiB
+                          << " MiB" << std::endl;
+
+                std::cout << "Local memory type: ";
+                auto has_local_memory = false;
+                switch(device.get_info<sycl::info::device::local_mem_type>())
+                {
+                case sycl::info::local_mem_type::none:
+                    std::cout << "none";
+                    break;
+
+                case sycl::info::local_mem_type::local:
+                    std::cout << "local";
+                    has_local_memory = true;
+                    break;
+
+                case sycl::info::local_mem_type::global:
+                    std::cout << "global";
+                    has_local_memory = true;
+                    break;
+                }
+                std::cout << '\n';
+
+                if(has_local_memory)
+                    std::cout << "Local memory size: " << device.get_info<sycl::info::device::local_mem_size>() / KiB
+                              << " KiB\n";
+
+                std::cout << "Error correction support: "
+                          << (device.get_info<sycl::info::device::error_correction_support>() ? "Yes" : "No") << '\n';
+                // Prints each listed memory order followed by ", " and then ends the line.
+                auto print_memory_orders = [](std::vector<sycl::memory_order> const& mem_orders)
+                {
+                    for(auto const& cap : mem_orders)
+                    {
+                        switch(cap)
+                        {
+                        case sycl::memory_order::relaxed:
+                            std::cout << "relaxed";
+                            break;
+
+                        case sycl::memory_order::acquire:
+                            std::cout << "acquire";
+                            break;
+
+                        case sycl::memory_order::release:
+                            std::cout << "release";
+                            break;
+
+                        case sycl::memory_order::acq_rel:
+                            std::cout << "acq_rel";
+                            break;
+
+                        case sycl::memory_order::seq_cst:
+                            std::cout << "seq_cst";
+                            break;
+#        if defined(BOOST_COMP_ICPX)
+                        // Stop icpx from complaining about its own internals.
+                        case sycl::memory_order::__consume_unsupported:
+                            break;
+#        endif
+                        }
+                        std::cout << ", ";
+                    }
+                    std::cout << '\n';
+                };
+
+                std::cout << "Supported memory orderings for atomic operations: ";
+                auto const mem_orders = device.get_info<sycl::info::device::atomic_memory_order_capabilities>();
+                print_memory_orders(mem_orders);
+
+#        if defined(BOOST_COMP_ICPX)
+#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
+                // Not implemented in oneAPI < 2023.2.0
+                std::cout << "Supported memory orderings for sycl::atomic_fence: ";
+                auto const fence_orders = device.get_info<sycl::info::device::atomic_fence_order_capabilities>();
+                print_memory_orders(fence_orders);
+#            endif
+#        endif
+                // Prints each listed memory scope followed by ", " and then ends the line.
+                auto print_memory_scopes = [](std::vector<sycl::memory_scope> const& mem_scopes)
+                {
+                    for(auto const& cap : mem_scopes)
+                    {
+                        switch(cap)
+                        {
+                        case sycl::memory_scope::work_item:
+                            std::cout << "work-item";
+                            break;
+
+                        case sycl::memory_scope::sub_group:
+                            std::cout << "sub-group";
+                            break;
+
+                        case sycl::memory_scope::work_group:
+                            std::cout << "work-group";
+                            break;
+
+                        case sycl::memory_scope::device:
+                            std::cout << "device";
+                            break;
+
+                        case sycl::memory_scope::system:
+                            std::cout << "system";
+                            break;
+                        }
+                        std::cout << ", ";
+                    }
+                    std::cout << '\n';
+                };
+
+                std::cout << "Supported memory scopes for atomic operations: ";
+                auto const mem_scopes = device.get_info<sycl::info::device::atomic_memory_scope_capabilities>();
+                print_memory_scopes(mem_scopes);
+
+#        if defined(BOOST_COMP_ICPX)
+#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
+                // Not implemented in oneAPI < 2023.2.0
+                std::cout << "Supported memory scopes for sycl::atomic_fence: ";
+                auto const fence_scopes = device.get_info<sycl::info::device::atomic_fence_scope_capabilities>();
+                print_memory_scopes(fence_scopes);
+#            endif
+#        endif
+
+                std::cout << "Device timer resolution: "
+                          << device.get_info<sycl::info::device::profiling_timer_resolution>() << " ns\n";
+
+                std::cout << "Built-in kernels: ";
+                auto const builtins = device.get_info<sycl::info::device::built_in_kernel_ids>();
+                for(auto const& b : builtins)
+                    std::cout << b.get_name() << ", ";
+                std::cout << '\n';
+
+                std::cout << "Maximum number of subdevices: ";
+                auto const max_subs = device.get_info<sycl::info::device::partition_max_sub_devices>();
+                std::cout << max_subs << '\n';
+                // Partitioning details are only printed if the device reports more than one possible sub-device.
+                if(max_subs > 1)
+                {
+                    std::cout << "Supported partition properties: ";
+                    auto const part_props = device.get_info<sycl::info::device::partition_properties>();
+                    auto has_affinity_domains = false;
+                    for(auto const& prop : part_props)
+                    {
+                        switch(prop)
+                        {
+                        case sycl::info::partition_property::no_partition:
+                            std::cout << "no partition";
+                            break;
+
+                        case sycl::info::partition_property::partition_equally:
+                            std::cout << "equally";
+                            break;
+
+                        case sycl::info::partition_property::partition_by_counts:
+                            std::cout << "by counts";
+                            break;
+
+                        case sycl::info::partition_property::partition_by_affinity_domain:
+                            std::cout << "by affinity domain";
+                            has_affinity_domains = true;
+                            break;
+#        if defined(BOOST_COMP_ICPX)
+                        case sycl::info::partition_property::ext_intel_partition_by_cslice:
+                            std::cout << "by compute slice (Intel extension; deprecated)";
+                            break;
+#        endif
+                        }
+                        std::cout << ", ";
+                    }
+                    std::cout << '\n';
+
+                    if(has_affinity_domains)
+                    {
+                        std::cout << "Supported partition affinity domains: ";
+                        auto const aff_doms = device.get_info<sycl::info::device::partition_affinity_domains>();
+                        for(auto const& dom : aff_doms)
+                        {
+                            switch(dom)
+                            {
+                            case sycl::info::partition_affinity_domain::not_applicable:
+                                std::cout << "not applicable";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::numa:
+                                std::cout << "NUMA";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::L4_cache:
+                                std::cout << "L4 cache";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::L3_cache:
+                                std::cout << "L3 cache";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::L2_cache:
+                                std::cout << "L2 cache";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::L1_cache:
+                                std::cout << "L1 cache";
+                                break;
+
+                            case sycl::info::partition_affinity_domain::next_partitionable:
+                                std::cout << "next partitionable";
+                                break;
+                            }
+                            std::cout << ", ";
+                        }
+                        std::cout << '\n';
+                    }
+
+                    std::cout << "Current partition property: ";
+                    switch(device.get_info<sycl::info::device::partition_type_property>())
+                    {
+                    case sycl::info::partition_property::no_partition:
+                        std::cout << "no partition";
+                        break;
+
+                    case sycl::info::partition_property::partition_equally:
+                        std::cout << "partitioned equally";
+                        break;
+
+                    case sycl::info::partition_property::partition_by_counts:
+                        std::cout << "partitioned by counts";
+                        break;
+
+                    case sycl::info::partition_property::partition_by_affinity_domain:
+                        std::cout << "partitioned by affinity domain";
+                        break;
+
+#        if defined(BOOST_COMP_ICPX)
+                    case sycl::info::partition_property::ext_intel_partition_by_cslice:
+                        std::cout << "partitioned by compute slice (Intel extension; deprecated)";
+                        break;
+#        endif
+                    }
+                    std::cout << '\n';
+
+                    std::cout << "Current partition affinity domain: ";
+                    switch(device.get_info<sycl::info::device::partition_type_affinity_domain>())
+                    {
+                    case sycl::info::partition_affinity_domain::not_applicable:
+                        std::cout << "not applicable";
+                        break;
+
+                    case sycl::info::partition_affinity_domain::numa:
+                        std::cout << "NUMA";
+                        break;
+
+                    case sycl::info::partition_affinity_domain::L4_cache:
+                        std::cout << "L4 cache";
+                        break;
+
+                    case sycl::info::partition_affinity_domain::L3_cache:
+                        std::cout << "L3 cache";
+                        break;
+
+                    case sycl::info::partition_affinity_domain::L2_cache:
+                        std::cout << "L2 cache";
+                        break;
+
+                    case sycl::info::partition_affinity_domain::L1_cache:
+                        std::cout << "L1 cache";
+                        break;
+
+                    case sycl::info::partition_affinity_domain::next_partitionable:
+                        std::cout << "next partitionable";
+                        break;
+                    }
+                    std::cout << '\n';
+                }
+
+                std::cout.flush();
+            }
+#    endif
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+
+#endif
diff --git a/include/alpaka/platform/PlatformGpuSyclIntel.hpp b/include/alpaka/platform/PlatformGpuSyclIntel.hpp
new file mode 100644
index 0000000..d49695a
--- /dev/null
+++ b/include/alpaka/platform/PlatformGpuSyclIntel.hpp
@@ -0,0 +1,36 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/platform/PlatformGenericSycl.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        template<>
+        struct SYCLDeviceSelector<TagGpuSyclIntel>
+        {
+            auto operator()(sycl::device const& dev) const -> int
+            {
+                // Score 1 for a GPU whose vendor string names Intel, -1 for every other device.
+                auto const vendorName = dev.get_info<sycl::info::device::vendor>();
+                bool const vendorIsIntel = vendorName.find("Intel(R) Corporation") != std::string::npos;
+                return (dev.is_gpu() && vendorIsIntel) ? 1 : -1;
+            }
+        };
+    } // namespace detail
+
+    //! The SYCL platform for Intel GPUs.
+    using PlatformGpuSyclIntel = PlatformGenericSycl<TagGpuSyclIntel>;
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/platform/PlatformHipRt.hpp b/include/alpaka/platform/PlatformHipRt.hpp
new file mode 100644
index 0000000..25303ae
--- /dev/null
+++ b/include/alpaka/platform/PlatformHipRt.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiHipRt.hpp"
+#include "alpaka/platform/PlatformUniformCudaHipRt.hpp"
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+namespace alpaka
+{
+    //! The HIP RT platform.
+    using PlatformHipRt = PlatformUniformCudaHipRt<ApiHipRt>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/platform/PlatformUniformCudaHipRt.hpp b/include/alpaka/platform/PlatformUniformCudaHipRt.hpp
new file mode 100644
index 0000000..a3ae0ef
--- /dev/null
+++ b/include/alpaka/platform/PlatformUniformCudaHipRt.hpp
@@ -0,0 +1,265 @@
+/* Copyright 2022 Benjamin Worpitz, René Widera, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato,
+ *                Christian Kaever
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+#include "alpaka/dev/Traits.hpp"
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <tuple>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    // Forward declarations.
+    struct ApiCudaRt;
+    struct ApiHipRt;
+
+    //! The CUDA/HIP RT platform, parameterised on the runtime API type (TApi).
+    template<typename TApi>
+    struct PlatformUniformCudaHipRt : concepts::Implements<ConceptPlatform, PlatformUniformCudaHipRt<TApi>>
+    {
+#    if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0)                                 \
+        && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0)
+        // This is a workaround for g++-11 bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96295
+        // g++-11 warns wherever this otherwise empty platform is used that it "may be used uninitialized".
+        char c = {};
+#    endif
+    };
+
+    namespace trait
+    {
+        //! The CUDA/HIP RT platform device type trait specialization.
+        template<typename TApi>
+        struct DevType<PlatformUniformCudaHipRt<TApi>>
+        {
+            using type = DevUniformCudaHipRt<TApi>;
+        };
+
+        //! The CUDA/HIP RT platform device count get trait specialization.
+        //! A failing runtime query is reported as zero devices rather than as an error.
+        template<typename TApi>
+        struct GetDevCount<PlatformUniformCudaHipRt<TApi>>
+        {
+            ALPAKA_FN_HOST static auto getDevCount(PlatformUniformCudaHipRt<TApi> const&) -> std::size_t
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                int deviceCount = 0;
+                bool const querySucceeded = (TApi::getDeviceCount(&deviceCount) == TApi::success);
+
+                // Discard whatever the runtime wrote into the counter when the query itself failed.
+                return static_cast<std::size_t>(querySucceeded ? deviceCount : 0);
+            }
+        };
+
+        //! The CUDA/HIP RT platform device get trait specialization.
+        template<typename TApi>
+        struct GetDevByIdx<PlatformUniformCudaHipRt<TApi>>
+        {
+            //! \return The handle of the device with index devIdx.
+            //! Throws std::runtime_error if devIdx is out of range or the device is not usable.
+            ALPAKA_FN_HOST static auto getDevByIdx(
+                PlatformUniformCudaHipRt<TApi> const& platform,
+                std::size_t const& devIdx) -> DevUniformCudaHipRt<TApi>
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                std::size_t const devCount = getDevCount(platform);
+                if(devIdx >= devCount)
+                {
+                    std::stringstream ssErr;
+                    ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount
+                          << " devices!";
+                    throw std::runtime_error(ssErr.str());
+                }
+
+                if(!isDevUsable(devIdx))
+                {
+                    std::stringstream ssErr;
+                    ssErr << "Unable to return device handle for device " << devIdx << ". It is not accessible!";
+                    throw std::runtime_error(ssErr.str());
+                }
+
+                DevUniformCudaHipRt<TApi> dev(static_cast<int>(devIdx));
+                // Log this device.
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                typename TApi::DeviceProp_t devProp;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, dev.getNativeHandle()));
+#    endif
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDeviceProperties(devProp);
+#    elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+                // Separate the function name from the device name so the log line stays readable.
+                std::cout << __func__ << ": " << devProp.name << std::endl;
+#    endif
+                return dev;
+            }
+
+        private:
+            //! \return true if the device could be selected and a stream could be created on it.
+            ALPAKA_FN_HOST static auto isDevUsable(std::size_t iDevice) -> bool
+            {
+                typename TApi::Error_t rc = TApi::setDevice(static_cast<int>(iDevice));
+                typename TApi::Stream_t queue = {};
+                // Create a dummy queue to check if the device is already used by another process.
+                // cuda/hip-SetDevice never returns an error if another process already uses the selected device and
+                // gpu compute mode is set "process exclusive". \TODO: Check if this workaround is needed!
+                if(rc == TApi::success)
+                {
+                    rc = TApi::streamCreate(&queue);
+                }
+
+                if(rc == TApi::success)
+                {
+                    // Destroy the dummy queue.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamDestroy(queue));
+                    return true;
+                }
+                else
+                {
+                    // Report the error returned by setDevice or, if that succeeded, by streamCreate.
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(rc);
+                    // Reset the error state. NOTE(review): unreachable if the check macro above throws - confirm.
+                    std::ignore = TApi::getLastError();
+                    return false;
+                }
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            //! Prints the device properties to std::cout: the common fields first, then the CUDA- or HIP-only ones.
+            ALPAKA_FN_HOST static auto printDeviceProperties(typename TApi::DeviceProp_t const& devProp) -> void
+            {
+                ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
+                constexpr auto KiB = std::size_t{1024};
+                constexpr auto MiB = KiB * KiB;
+                std::cout << "name: " << devProp.name << std::endl;
+                std::cout << "totalGlobalMem: " << devProp.totalGlobalMem / MiB << " MiB" << std::endl;
+                std::cout << "sharedMemPerBlock: " << devProp.sharedMemPerBlock / KiB << " KiB" << std::endl;
+                std::cout << "regsPerBlock: " << devProp.regsPerBlock << std::endl;
+                std::cout << "warpSize: " << devProp.warpSize << std::endl;
+                std::cout << "maxThreadsPerBlock: " << devProp.maxThreadsPerBlock << std::endl;
+                std::cout << "maxThreadsDim[3]: (" << devProp.maxThreadsDim[0] << ", " << devProp.maxThreadsDim[1]
+                          << ", " << devProp.maxThreadsDim[2] << ")" << std::endl;
+                std::cout << "maxGridSize[3]: (" << devProp.maxGridSize[0] << ", " << devProp.maxGridSize[1] << ", "
+                          << devProp.maxGridSize[2] << ")" << std::endl;
+                std::cout << "clockRate: " << devProp.clockRate << " kHz" << std::endl;
+                std::cout << "totalConstMem: " << devProp.totalConstMem / KiB << " KiB" << std::endl;
+                std::cout << "major: " << devProp.major << std::endl;
+                std::cout << "minor: " << devProp.minor << std::endl;
+
+                // std::cout << "deviceOverlap: " << devProp.deviceOverlap << std::endl;    // Deprecated
+                std::cout << "multiProcessorCount: " << devProp.multiProcessorCount << std::endl;
+                std::cout << "integrated: " << devProp.integrated << std::endl;
+                std::cout << "canMapHostMemory: " << devProp.canMapHostMemory << std::endl;
+                std::cout << "computeMode: " << devProp.computeMode << std::endl;
+                std::cout << "concurrentKernels: " << devProp.concurrentKernels << std::endl;
+                std::cout << "pciBusID: " << devProp.pciBusID << std::endl;
+                std::cout << "pciDeviceID: " << devProp.pciDeviceID << std::endl;
+                std::cout << "pciDomainID: " << devProp.pciDomainID << std::endl;
+                std::cout << "memoryClockRate: " << devProp.memoryClockRate << " kHz" << std::endl;
+                std::cout << "memoryBusWidth: " << devProp.memoryBusWidth << " b" << std::endl;
+                std::cout << "l2CacheSize: " << devProp.l2CacheSize << " B" << std::endl;
+                std::cout << "maxThreadsPerMultiProcessor: " << devProp.maxThreadsPerMultiProcessor << std::endl;
+                std::cout << "isMultiGpuBoard: " << devProp.isMultiGpuBoard << std::endl;
+                if constexpr(std::is_same_v<TApi, ApiCudaRt>)
+                {
+                    std::cout << "memPitch: " << devProp.memPitch << " B" << std::endl;
+                    std::cout << "textureAlignment: " << devProp.textureAlignment << std::endl;
+                    std::cout << "texturePitchAlignment: " << devProp.texturePitchAlignment << std::endl;
+                    std::cout << "kernelExecTimeoutEnabled: " << devProp.kernelExecTimeoutEnabled << std::endl;
+                    std::cout << "unifiedAddressing: " << devProp.unifiedAddressing << std::endl;
+                    std::cout << "multiGpuBoardGroupID: " << devProp.multiGpuBoardGroupID << std::endl;
+                    std::cout << "singleToDoublePrecisionPerfRatio: " << devProp.singleToDoublePrecisionPerfRatio
+                              << std::endl;
+                    std::cout << "pageableMemoryAccess: " << devProp.pageableMemoryAccess << std::endl;
+                    std::cout << "concurrentManagedAccess: " << devProp.concurrentManagedAccess << std::endl;
+                    std::cout << "computePreemptionSupported: " << devProp.computePreemptionSupported << std::endl;
+                    std::cout << "canUseHostPointerForRegisteredMem: " << devProp.canUseHostPointerForRegisteredMem
+                              << std::endl;
+                    std::cout << "cooperativeLaunch: " << devProp.cooperativeLaunch << std::endl;
+                    std::cout << "cooperativeMultiDeviceLaunch: " << devProp.cooperativeMultiDeviceLaunch << std::endl;
+                    std::cout << "maxTexture1D: " << devProp.maxTexture1D << std::endl;
+                    std::cout << "maxTexture1DLinear: " << devProp.maxTexture1DLinear << std::endl;
+                    std::cout << "maxTexture2D[2]: " << devProp.maxTexture2D[0] << "x" << devProp.maxTexture2D[1]
+                              << std::endl;
+                    std::cout << "maxTexture2DLinear[3]: " << devProp.maxTexture2DLinear[0] << "x"
+                              << devProp.maxTexture2DLinear[1] << "x" << devProp.maxTexture2DLinear[2] << std::endl;
+                    std::cout << "maxTexture2DGather[2]: " << devProp.maxTexture2DGather[0] << "x"
+                              << devProp.maxTexture2DGather[1] << std::endl;
+                    std::cout << "maxTexture3D[3]: " << devProp.maxTexture3D[0] << "x" << devProp.maxTexture3D[1]
+                              << "x" << devProp.maxTexture3D[2] << std::endl;
+                    std::cout << "maxTextureCubemap: " << devProp.maxTextureCubemap << std::endl;
+                    std::cout << "maxTexture1DLayered[2]: " << devProp.maxTexture1DLayered[0] << "x"
+                              << devProp.maxTexture1DLayered[1] << std::endl;
+                    std::cout << "maxTexture2DLayered[3]: " << devProp.maxTexture2DLayered[0] << "x"
+                              << devProp.maxTexture2DLayered[1] << "x" << devProp.maxTexture2DLayered[2] << std::endl;
+                    std::cout << "maxTextureCubemapLayered[2]: " << devProp.maxTextureCubemapLayered[0] << "x"
+                              << devProp.maxTextureCubemapLayered[1] << std::endl;
+                    std::cout << "maxSurface1D: " << devProp.maxSurface1D << std::endl;
+                    std::cout << "maxSurface2D[2]: " << devProp.maxSurface2D[0] << "x" << devProp.maxSurface2D[1]
+                              << std::endl;
+                    std::cout << "maxSurface3D[3]: " << devProp.maxSurface3D[0] << "x" << devProp.maxSurface3D[1]
+                              << "x" << devProp.maxSurface3D[2] << std::endl;
+                    std::cout << "maxSurface1DLayered[2]: " << devProp.maxSurface1DLayered[0] << "x"
+                              << devProp.maxSurface1DLayered[1] << std::endl;
+                    std::cout << "maxSurface2DLayered[3]: " << devProp.maxSurface2DLayered[0] << "x"
+                              << devProp.maxSurface2DLayered[1] << "x" << devProp.maxSurface2DLayered[2] << std::endl;
+                    std::cout << "maxSurfaceCubemap: " << devProp.maxSurfaceCubemap << std::endl;
+                    std::cout << "maxSurfaceCubemapLayered[2]: " << devProp.maxSurfaceCubemapLayered[0] << "x"
+                              << devProp.maxSurfaceCubemapLayered[1] << std::endl;
+                    std::cout << "surfaceAlignment: " << devProp.surfaceAlignment << std::endl;
+                    std::cout << "ECCEnabled: " << devProp.ECCEnabled << std::endl;
+                    std::cout << "tccDriver: " << devProp.tccDriver << std::endl;
+                    std::cout << "asyncEngineCount: " << devProp.asyncEngineCount << std::endl;
+                    std::cout << "streamPrioritiesSupported: " << devProp.streamPrioritiesSupported << std::endl;
+                    std::cout << "globalL1CacheSupported: " << devProp.globalL1CacheSupported << std::endl;
+                    std::cout << "localL1CacheSupported: " << devProp.localL1CacheSupported << std::endl;
+                    std::cout << "sharedMemPerMultiprocessor: " << devProp.sharedMemPerMultiprocessor << std::endl;
+                    std::cout << "regsPerMultiprocessor: " << devProp.regsPerMultiprocessor << std::endl;
+                    std::cout << "managedMemory: " << devProp.managedMemory << std::endl;
+                }
+                else
+                { // ApiHipRt
+                    std::cout << "clockInstructionRate: " << devProp.clockInstructionRate << " kHz" << std::endl;
+                    std::cout << "maxSharedMemoryPerMultiProcessor: " << devProp.maxSharedMemoryPerMultiProcessor / KiB
+                              << " KiB" << std::endl;
+                    std::cout << "gcnArchName: " << devProp.gcnArchName << std::endl;
+                    std::cout << "arch: " << std::endl;
+                    std::cout << "    hasGlobalInt32Atomics: " << devProp.arch.hasGlobalInt32Atomics << std::endl;
+                    std::cout << "    hasGlobalFloatAtomicExch: " << devProp.arch.hasGlobalFloatAtomicExch
+                              << std::endl;
+                    std::cout << "    hasSharedInt32Atomics: " << devProp.arch.hasSharedInt32Atomics << std::endl;
+                    std::cout << "    hasSharedFloatAtomicExch: " << devProp.arch.hasSharedFloatAtomicExch
+                              << std::endl;
+                    std::cout << "    hasFloatAtomicAdd: " << devProp.arch.hasFloatAtomicAdd << std::endl;
+                    std::cout << "    hasGlobalInt64Atomics: " << devProp.arch.hasGlobalInt64Atomics << std::endl;
+                    std::cout << "    hasSharedInt64Atomics: " << devProp.arch.hasSharedInt64Atomics << std::endl;
+                    std::cout << "    hasDoubles: " << devProp.arch.hasDoubles << std::endl;
+                    std::cout << "    hasWarpVote: " << devProp.arch.hasWarpVote << std::endl;
+                    std::cout << "    hasWarpBallot: " << devProp.arch.hasWarpBallot << std::endl;
+                    std::cout << "    hasWarpShuffle: " << devProp.arch.hasWarpShuffle << std::endl;
+                    std::cout << "    hasFunnelShift: " << devProp.arch.hasFunnelShift << std::endl;
+                    std::cout << "    hasThreadFenceSystem: " << devProp.arch.hasThreadFenceSystem << std::endl;
+                    std::cout << "    hasSyncThreadsExt: " << devProp.arch.hasSyncThreadsExt << std::endl;
+                    std::cout << "    hasSurfaceFuncs: " << devProp.arch.hasSurfaceFuncs << std::endl;
+                    std::cout << "    has3dGrid: " << devProp.arch.has3dGrid << std::endl;
+                    std::cout << "    hasDynamicParallelism: " << devProp.arch.hasDynamicParallelism << std::endl;
+                }
+            }
+#    endif
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/platform/Traits.hpp b/include/alpaka/platform/Traits.hpp
new file mode 100644
index 0000000..5c094c3
--- /dev/null
+++ b/include/alpaka/platform/Traits.hpp
@@ -0,0 +1,94 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/queue/Traits.hpp"
+
+#include <type_traits>
+#include <vector>
+
+namespace alpaka
+{
+    struct ConceptPlatform
+    {
+    };
+
+    //! True if TPlatform is a platform, i.e. if it implements the ConceptPlatform concept.
+    template<typename TPlatform>
+    inline constexpr bool isPlatform = concepts::ImplementsConcept<ConceptPlatform, TPlatform>::value;
+
+    //! The platform traits.
+    namespace trait
+    {
+        //! The platform type trait.
+        template<typename T, typename TSfinae = void>
+        struct PlatformType;
+
+        //! Platform type trait for every type implementing ConceptPlatform: the result is whatever
+        //! concepts::ImplementationBase reports for that same concept (not for ConceptDev).
+        template<typename TPlatform>
+        struct PlatformType<TPlatform, std::enable_if_t<isPlatform<TPlatform>>>
+        {
+            using type = typename concepts::ImplementationBase<ConceptPlatform, TPlatform>;
+        };
+
+        //! The device count get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetDevCount;
+
+        //! The device get trait.
+        template<typename T, typename TSfinae = void>
+        struct GetDevByIdx;
+    } // namespace trait
+
+    //! The platform type trait alias template to remove the ::type.
+    template<typename T>
+    using Platform = typename trait::PlatformType<T>::type;
+
+    //! \return The device count reported by the platform's GetDevCount trait.
+    template<typename TPlatform>
+    ALPAKA_FN_HOST auto getDevCount(TPlatform const& platform)
+    {
+        return trait::GetDevCount<TPlatform>::getDevCount(platform);
+    }
+
+    //! \return The device identified by its index.
+    template<typename TPlatform>
+    ALPAKA_FN_HOST auto getDevByIdx(TPlatform const& platform, std::size_t const& devIdx) -> Dev<TPlatform>
+    {
+        return trait::GetDevByIdx<TPlatform>::getDevByIdx(platform, devIdx);
+    }
+
+    //! \return A vector holding every device of the platform, ordered by device index.
+    template<typename TPlatform>
+    ALPAKA_FN_HOST auto getDevs(TPlatform const& platform) -> std::vector<Dev<TPlatform>>
+    {
+        std::size_t const numDevs = getDevCount(platform);
+
+        std::vector<Dev<TPlatform>> allDevs;
+        allDevs.reserve(numDevs);
+        for(auto idx = std::size_t{0}; idx != numDevs; ++idx)
+        {
+            allDevs.push_back(getDevByIdx(platform, idx));
+        }
+
+        return allDevs;
+    }
+
+    namespace trait
+    {
+        template<typename TPlatform, typename TProperty>
+        struct QueueType<
+            TPlatform,
+            TProperty,
+            std::enable_if_t<concepts::ImplementsConcept<ConceptPlatform, TPlatform>::value>>
+        {
+            using type = typename QueueType<typename alpaka::trait::DevType<TPlatform>::type, TProperty>::type;
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/queue/Properties.hpp b/include/alpaka/queue/Properties.hpp
new file mode 100644
index 0000000..d3e3b55
--- /dev/null
+++ b/include/alpaka/queue/Properties.hpp
@@ -0,0 +1,20 @@
+/* Copyright 2020 Rene Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+namespace alpaka
+{
+    //! Properties to define queue behavior
+    namespace property
+    {
+        //! The caller is waiting until the enqueued task is finished
+        struct Blocking;
+
+        //! The caller is NOT waiting until the enqueued task is finished
+        struct NonBlocking;
+    } // namespace property
+
+    using namespace property;
+} // namespace alpaka
diff --git a/include/alpaka/queue/QueueCpuBlocking.hpp b/include/alpaka/queue/QueueCpuBlocking.hpp
new file mode 100644
index 0000000..8cf4746
--- /dev/null
+++ b/include/alpaka/queue/QueueCpuBlocking.hpp
@@ -0,0 +1,13 @@
+/* Copyright 2020 Jeffrey Kelling, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/event/EventCpu.hpp"
+#include "alpaka/queue/QueueGenericThreadsBlocking.hpp"
+
+namespace alpaka
+{
+    using QueueCpuBlocking = QueueGenericThreadsBlocking<DevCpu>; //!< The blocking queue type for DevCpu.
+} // namespace alpaka
diff --git a/include/alpaka/queue/QueueCpuNonBlocking.hpp b/include/alpaka/queue/QueueCpuNonBlocking.hpp
new file mode 100644
index 0000000..78eb028
--- /dev/null
+++ b/include/alpaka/queue/QueueCpuNonBlocking.hpp
@@ -0,0 +1,13 @@
+/* Copyright 2020 Jeffrey Kelling, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/event/EventCpu.hpp"
+#include "alpaka/queue/QueueGenericThreadsNonBlocking.hpp"
+
+namespace alpaka
+{
+    using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking<DevCpu>; //!< The non-blocking queue type for DevCpu.
+} // namespace alpaka
diff --git a/include/alpaka/queue/QueueCpuSyclBlocking.hpp b/include/alpaka/queue/QueueCpuSyclBlocking.hpp
new file mode 100644
index 0000000..392740a
--- /dev/null
+++ b/include/alpaka/queue/QueueCpuSyclBlocking.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
+
+namespace alpaka
+{
+    using QueueCpuSyclBlocking = QueueGenericSyclBlocking<TagCpuSycl>; //!< Blocking SYCL queue for TagCpuSycl.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
new file mode 100644
index 0000000..19904ba
--- /dev/null
+++ b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
+
+namespace alpaka
+{
+    using QueueCpuSyclNonBlocking = QueueGenericSyclNonBlocking<TagCpuSycl>; //!< Non-blocking SYCL queue for TagCpuSycl.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/QueueCudaRtBlocking.hpp b/include/alpaka/queue/QueueCudaRtBlocking.hpp
new file mode 100644
index 0000000..c54a618
--- /dev/null
+++ b/include/alpaka/queue/QueueCudaRtBlocking.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiCudaRt.hpp"
+#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+namespace alpaka
+{
+    //! The CUDA RT blocking queue.
+    using QueueCudaRtBlocking = QueueUniformCudaHipRtBlocking<ApiCudaRt>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/queue/QueueCudaRtNonBlocking.hpp b/include/alpaka/queue/QueueCudaRtNonBlocking.hpp
new file mode 100644
index 0000000..a7180d6
--- /dev/null
+++ b/include/alpaka/queue/QueueCudaRtNonBlocking.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiCudaRt.hpp"
+#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+namespace alpaka
+{
+    //! The CUDA RT non-blocking queue.
+    using QueueCudaRtNonBlocking = QueueUniformCudaHipRtNonBlocking<ApiCudaRt>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp b/include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp
new file mode 100644
index 0000000..7c2f791
--- /dev/null
+++ b/include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
+
+namespace alpaka
+{
+    using QueueFpgaSyclIntelBlocking = QueueGenericSyclBlocking<TagFpgaSyclIntel>; //!< Blocking SYCL FPGA queue.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp b/include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp
new file mode 100644
index 0000000..de1d7a6
--- /dev/null
+++ b/include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
+
+namespace alpaka
+{
+    using QueueFpgaSyclIntelNonBlocking = QueueGenericSyclNonBlocking<TagFpgaSyclIntel>; //!< Non-blocking FPGA queue.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/QueueGenericSyclBlocking.hpp b/include/alpaka/queue/QueueGenericSyclBlocking.hpp
new file mode 100644
index 0000000..44dfb14
--- /dev/null
+++ b/include/alpaka/queue/QueueGenericSyclBlocking.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/queue/sycl/QueueGenericSyclBase.hpp"
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+namespace alpaka
+{
+    template<typename TTag>
+    using QueueGenericSyclBlocking = detail::QueueGenericSyclBase<TTag, true>; // `true` presumably = blocking; confirm
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/QueueGenericSyclNonBlocking.hpp b/include/alpaka/queue/QueueGenericSyclNonBlocking.hpp
new file mode 100644
index 0000000..22615ca
--- /dev/null
+++ b/include/alpaka/queue/QueueGenericSyclNonBlocking.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/queue/sycl/QueueGenericSyclBase.hpp"
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+namespace alpaka
+{
+    template<typename TTag>
+    using QueueGenericSyclNonBlocking = detail::QueueGenericSyclBase<TTag, false>; // `false` presumably = non-blocking
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/QueueGenericThreadsBlocking.hpp b/include/alpaka/queue/QueueGenericThreadsBlocking.hpp
new file mode 100644
index 0000000..65361bd
--- /dev/null
+++ b/include/alpaka/queue/QueueGenericThreadsBlocking.hpp
@@ -0,0 +1,166 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/event/Traits.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/queue/cpu/IGenericThreadsQueue.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+
+namespace alpaka
+{
+    template<typename TDev>
+    class EventGenericThreads;
+
+    namespace generic
+    {
+        namespace detail
+        {
+#if BOOST_COMP_CLANG
+// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
+// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+            //! The implementation object that a blocking generic-threads queue holds through a shared pointer.
+            template<typename TDev>
+            class QueueGenericThreadsBlockingImpl final : public IGenericThreadsQueue<TDev>
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+            {
+            public:
+                explicit QueueGenericThreadsBlockingImpl(TDev dev) noexcept
+                    : m_dev(std::move(dev))
+                    , m_bCurrentlyExecutingTask(false)
+                {
+                }
+
+                QueueGenericThreadsBlockingImpl(QueueGenericThreadsBlockingImpl<TDev> const&) = delete;
+                auto operator=(QueueGenericThreadsBlockingImpl<TDev> const&)
+                    -> QueueGenericThreadsBlockingImpl<TDev>& = delete;
+
+                void enqueue(EventGenericThreads<TDev>& ev) final
+                {
+                    alpaka::enqueue(*this, ev);
+                }
+
+                void wait(EventGenericThreads<TDev> const& ev) final
+                {
+                    alpaka::wait(*this, ev);
+                }
+
+            public:
+                TDev const m_dev; //!< The device this queue is bound to.
+                std::mutex mutable m_mutex; //!< Held while a task runs; also locked by currentThreadWaitFor.
+                std::atomic<bool> m_bCurrentlyExecutingTask; //!< True while the enqueue trait is running a task.
+            };
+        } // namespace detail
+    } // namespace generic
+
+    //! The blocking queue for generic-threads devices. Copies share one implementation object.
+    template<typename TDev>
+    class QueueGenericThreadsBlocking final
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueGenericThreadsBlocking<TDev>>
+        , public concepts::Implements<ConceptQueue, QueueGenericThreadsBlocking<TDev>>
+        , public concepts::Implements<ConceptGetDev, QueueGenericThreadsBlocking<TDev>>
+    {
+    public:
+        explicit QueueGenericThreadsBlocking(TDev const& dev)
+            : m_spQueueImpl(std::make_shared<generic::detail::QueueGenericThreadsBlockingImpl<TDev>>(dev))
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+            dev.registerQueue(m_spQueueImpl);
+        }
+
+        //! Two queues compare equal when they refer to the same implementation object.
+        auto operator==(QueueGenericThreadsBlocking<TDev> const& other) const -> bool
+        {
+            return other.m_spQueueImpl == m_spQueueImpl;
+        }
+
+        auto operator!=(QueueGenericThreadsBlocking<TDev> const& other) const -> bool
+        {
+            return m_spQueueImpl != other.m_spQueueImpl;
+        }
+
+    public:
+        std::shared_ptr<generic::detail::QueueGenericThreadsBlockingImpl<TDev>> m_spQueueImpl;
+    };
+
+    namespace trait
+    {
+        //! The CPU blocking device queue device type trait specialization.
+        template<typename TDev>
+        struct DevType<QueueGenericThreadsBlocking<TDev>>
+        {
+            using type = TDev;
+        };
+
+        //! The CPU blocking device queue device get trait specialization.
+        template<typename TDev>
+        struct GetDev<QueueGenericThreadsBlocking<TDev>>
+        {
+            ALPAKA_FN_HOST static auto getDev(QueueGenericThreadsBlocking<TDev> const& queue) -> TDev
+            {
+                return queue.m_spQueueImpl->m_dev;
+            }
+        };
+
+        //! The CPU blocking device queue event type trait specialization.
+        template<typename TDev>
+        struct EventType<QueueGenericThreadsBlocking<TDev>>
+        {
+            using type = EventGenericThreads<TDev>;
+        };
+
+        //! The CPU blocking device queue enqueue trait specialization.
+        //! Invokes the task on the calling thread while holding the queue mutex.
+        template<typename TDev, typename TTask>
+        struct Enqueue<QueueGenericThreadsBlocking<TDev>, TTask>
+        {
+            ALPAKA_FN_HOST static auto enqueue(QueueGenericThreadsBlocking<TDev>& queue, TTask const& task) -> void
+            {
+                std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+
+                // Clear the flag on every exit path: a throwing task must not leave the queue reported as busy.
+                struct BusyReset { std::atomic<bool>& flag; ~BusyReset() { flag = false; } };
+                queue.m_spQueueImpl->m_bCurrentlyExecutingTask = true;
+                BusyReset const busyReset{queue.m_spQueueImpl->m_bCurrentlyExecutingTask};
+                task();
+            }
+        };
+
+        //! The CPU blocking device queue test trait specialization.
+        template<typename TDev>
+        struct Empty<QueueGenericThreadsBlocking<TDev>>
+        {
+            ALPAKA_FN_HOST static auto empty(QueueGenericThreadsBlocking<TDev> const& queue) -> bool
+            {
+                return !queue.m_spQueueImpl->m_bCurrentlyExecutingTask;
+            }
+        };
+
+        //! The CPU blocking device queue thread wait trait specialization.
+        //!
+        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
+        //! tasks (kernels, data copies, ...)
+        template<typename TDev>
+        struct CurrentThreadWaitFor<QueueGenericThreadsBlocking<TDev>>
+        {
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(QueueGenericThreadsBlocking<TDev> const& queue) -> void
+            {
+                std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#include "alpaka/event/EventGenericThreads.hpp"
diff --git a/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp b/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp
new file mode 100644
index 0000000..4e02a91
--- /dev/null
+++ b/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp
@@ -0,0 +1,156 @@
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber, Jeffrey Kelling
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/CallbackThread.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/event/Traits.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/queue/cpu/IGenericThreadsQueue.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <future>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <tuple>
+#include <type_traits>
+
+namespace alpaka
+{
+    template<typename TDev>
+    class EventGenericThreads;
+
+    namespace generic
+    {
+        namespace detail
+        {
+#if BOOST_COMP_CLANG
+// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
+// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+            //! The CPU non-blocking queue implementation: the bound device and the worker thread running its tasks.
+            template<typename TDev>
+            class QueueGenericThreadsNonBlockingImpl final : public IGenericThreadsQueue<TDev>
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+            {
+                using Self = QueueGenericThreadsNonBlockingImpl;
+            public:
+                //! Binds the implementation to \p dev.
+                explicit QueueGenericThreadsNonBlockingImpl(TDev dev) : m_dev(std::move(dev))
+                {
+                }
+
+                // Neither copyable nor movable.
+                QueueGenericThreadsNonBlockingImpl(Self const&) = delete;
+                QueueGenericThreadsNonBlockingImpl(Self&&) = delete;
+                auto operator=(Self const&) -> Self& = delete;
+                auto operator=(Self&&) -> Self& = delete;
+
+                ~QueueGenericThreadsNonBlockingImpl() override = default;
+
+                //! Forwards to alpaka::enqueue with this implementation as the queue.
+                void enqueue(EventGenericThreads<TDev>& ev) final
+                {
+                    alpaka::enqueue(*this, ev);
+                }
+
+                //! Forwards to alpaka::wait with this implementation as the waiter.
+                void wait(EventGenericThreads<TDev> const& ev) final
+                {
+                    alpaka::wait(*this, ev);
+                }
+
+                TDev const m_dev; //!< The device this queue is bound to.
+                core::CallbackThread m_workerThread; //!< The thread that tasks enqueued into this queue are submitted to.
+            };
+        } // namespace detail
+    } // namespace generic
+
+    //! The CPU non-blocking device queue. Copies of a queue share one implementation object.
+    template<typename TDev>
+    class QueueGenericThreadsNonBlocking final
+        : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueGenericThreadsNonBlocking<TDev>>
+        , public concepts::Implements<ConceptQueue, QueueGenericThreadsNonBlocking<TDev>>
+        , public concepts::Implements<ConceptGetDev, QueueGenericThreadsNonBlocking<TDev>>
+    {
+        using Impl = generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>;
+    public:
+        //! Creates the implementation for \p dev and registers it with the device.
+        explicit QueueGenericThreadsNonBlocking(TDev const& dev) : m_spQueueImpl(std::make_shared<Impl>(dev))
+        {
+            ALPAKA_DEBUG_FULL_LOG_SCOPE;
+            dev.registerQueue(m_spQueueImpl);
+        }
+
+        //! Two queues are equal if they refer to the same implementation object.
+        auto operator==(QueueGenericThreadsNonBlocking const& rhs) const -> bool
+        {
+            return m_spQueueImpl == rhs.m_spQueueImpl;
+        }
+
+        auto operator!=(QueueGenericThreadsNonBlocking const& rhs) const -> bool
+        {
+            return m_spQueueImpl != rhs.m_spQueueImpl;
+        }
+
+        std::shared_ptr<Impl> m_spQueueImpl;
+    };
+
+    namespace trait
+    {
+        //! The CPU non-blocking device queue device type trait specialization.
+        template<typename TDev>
+        struct DevType<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            using type = TDev; //!< The device type the queue is templated on.
+        };
+
+        //! The CPU non-blocking device queue device get trait specialization.
+        template<typename TDev>
+        struct GetDev<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            ALPAKA_FN_HOST static auto getDev(QueueGenericThreadsNonBlocking<TDev> const& queue) -> TDev
+            {
+                return queue.m_spQueueImpl->m_dev; // returned by value: a copy of the stored device
+            }
+        };
+
+        //! The CPU non-blocking device queue event type trait specialization.
+        template<typename TDev>
+        struct EventType<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            using type = EventGenericThreads<TDev>; //!< The event type associated with this queue type.
+        };
+
+        //! The CPU non-blocking device queue enqueue trait specialization.
+        //! This default implementation for all tasks submits the task to the worker thread of the queue.
+        template<typename TDev, typename TTask>
+        struct Enqueue<QueueGenericThreadsNonBlocking<TDev>, TTask>
+        {
+            ALPAKA_FN_HOST static auto enqueue(QueueGenericThreadsNonBlocking<TDev>& queue, TTask const& task) -> void
+            {
+                queue.m_spQueueImpl->m_workerThread.submit(task);
+            }
+        };
+
+        //! The CPU non-blocking device queue empty-test trait specialization.
+        template<typename TDev>
+        struct Empty<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            ALPAKA_FN_HOST static auto empty(QueueGenericThreadsNonBlocking<TDev> const& queue) -> bool
+            {
+                return queue.m_spQueueImpl->m_workerThread.empty(); // delegates to the worker thread's own state
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#include "alpaka/event/EventGenericThreads.hpp"
diff --git a/include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp b/include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp
new file mode 100644
index 0000000..37d4bda
--- /dev/null
+++ b/include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
+
+namespace alpaka
+{
+    using QueueGpuSyclIntelBlocking = QueueGenericSyclBlocking<TagGpuSyclIntel>; //!< Blocking Intel GPU SYCL queue.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp b/include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp
new file mode 100644
index 0000000..a50299e
--- /dev/null
+++ b/include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp
@@ -0,0 +1,17 @@
+/* Copyright 2024 Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Tag.hpp"
+#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
+
+namespace alpaka
+{
+    using QueueGpuSyclIntelNonBlocking = QueueGenericSyclNonBlocking<TagGpuSyclIntel>; //!< Non-blocking Intel GPU SYCL queue.
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/QueueHipRtBlocking.hpp b/include/alpaka/queue/QueueHipRtBlocking.hpp
new file mode 100644
index 0000000..cdb1dfb
--- /dev/null
+++ b/include/alpaka/queue/QueueHipRtBlocking.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiHipRt.hpp"
+#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+namespace alpaka
+{
+    //! The HIP RT blocking queue: the uniform CUDA/HIP blocking queue instantiated for the HIP API.
+    using QueueHipRtBlocking = QueueUniformCudaHipRtBlocking<ApiHipRt>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/queue/QueueHipRtNonBlocking.hpp b/include/alpaka/queue/QueueHipRtNonBlocking.hpp
new file mode 100644
index 0000000..732609e
--- /dev/null
+++ b/include/alpaka/queue/QueueHipRtNonBlocking.hpp
@@ -0,0 +1,18 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/ApiHipRt.hpp"
+#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+namespace alpaka
+{
+    //! The HIP RT non-blocking queue: the uniform CUDA/HIP non-blocking queue instantiated for the HIP API.
+    using QueueHipRtNonBlocking = QueueUniformCudaHipRtNonBlocking<ApiHipRt>;
+} // namespace alpaka
+
+#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp b/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp
new file mode 100644
index 0000000..5add0ef
--- /dev/null
+++ b/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp
@@ -0,0 +1,19 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    //! The CUDA/HIP RT blocking queue: QueueUniformCudaHipRt with its blocking flag set to true.
+    template<typename TApi>
+    using QueueUniformCudaHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, true>;
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp b/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp
new file mode 100644
index 0000000..62b0b0f
--- /dev/null
+++ b/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp
@@ -0,0 +1,19 @@
+/* Copyright 2022 Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    //! The CUDA/HIP RT non-blocking queue: QueueUniformCudaHipRt with its blocking flag set to false.
+    template<typename TApi>
+    using QueueUniformCudaHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, false>;
+
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/Traits.hpp b/include/alpaka/queue/Traits.hpp
new file mode 100644
index 0000000..71d3ec9
--- /dev/null
+++ b/include/alpaka/queue/Traits.hpp
@@ -0,0 +1,71 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <type_traits>
+#include <utility>
+
+namespace alpaka
+{
+    struct ConceptQueue; //!< Tag type naming the queue concept.
+
+    //! True if TQueue, after std::decay_t has been applied, implements the ConceptQueue concept.
+    template<typename TQueue>
+    inline constexpr bool isQueue = concepts::ImplementsConcept<ConceptQueue, std::decay_t<TQueue>>::value;
+
+    //! The queue traits.
+    namespace trait
+    {
+        //! The queue enqueue trait; alpaka::enqueue calls the static enqueue(queue, task) of a specialization.
+        template<typename TQueue, typename TTask, typename TSfinae = void>
+        struct Enqueue;
+
+        //! The queue empty trait; alpaka::empty calls the static empty(queue) of a specialization.
+        template<typename TQueue, typename TSfinae = void>
+        struct Empty;
+
+        //! The queue type trait; alpaka::Queue reads the queue type from the member ::type of a specialization.
+        template<typename TAcc, typename TProperty, typename TSfinae = void>
+        struct QueueType;
+    } // namespace trait
+
+    //! Enqueues \p task into \p queue by dispatching to trait::Enqueue for the queue and the decayed task type.
+    //!
+    //! Special handling for events:
+    //!   Enqueuing an event that has been enqueued before overwrites any existing state of that event; later
+    //!   status queries on the event only observe the completion of this most recent enqueue call.
+    //!   A queue waiting for an event is released based on the event's state at the time wait() was called.
+    template<typename TQueue, typename TTask>
+    ALPAKA_FN_HOST auto enqueue(TQueue& queue, TTask&& task) -> void
+    {
+        using Task = std::decay_t<TTask>;
+        using Impl = trait::Enqueue<TQueue, Task>;
+        Impl::enqueue(queue, std::forward<TTask>(task));
+    }
+
+    //! Tests if the queue is empty (all ops in the given queue have been completed).
+    //!
+    //! \warning This function is allowed to return false negatives: an empty queue can be reported as
+    //! non-empty because the status information is not fully propagated by the alpaka backend in use.
+    //! \return true if the queue is empty, false otherwise.
+    template<typename TQueue>
+    ALPAKA_FN_HOST auto empty(TQueue const& queue) -> bool
+    {
+        // Dispatch on concepts::ImplementationBase<ConceptQueue, TQueue> rather than on TQueue directly.
+        return trait::Empty<concepts::ImplementationBase<ConceptQueue, TQueue>>::empty(queue);
+    }
+
+    //! Queue type selected by an environment and a property.
+    //!
+    //! \tparam TEnv Environment type, e.g. an accelerator, a device or a platform.
+    //!              trait::QueueType must be specialized for TEnv.
+    //! \tparam TProperty Property to define the behavior of TEnv.
+    template<typename TEnv, typename TProperty>
+    using Queue = typename trait::QueueType<TEnv, TProperty>::type;
+} // namespace alpaka
diff --git a/include/alpaka/queue/cpu/ICpuQueue.hpp b/include/alpaka/queue/cpu/ICpuQueue.hpp
new file mode 100644
index 0000000..cd71072
--- /dev/null
+++ b/include/alpaka/queue/cpu/ICpuQueue.hpp
@@ -0,0 +1,14 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/queue/cpu/IGenericThreadsQueue.hpp"
+
+namespace alpaka::cpu
+{
+    //! The CPU queue interface: the generic threads queue interface bound to DevCpu.
+    using ICpuQueue = IGenericThreadsQueue<DevCpu>;
+} // namespace alpaka::cpu
diff --git a/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp b/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp
new file mode 100644
index 0000000..3d82a9c
--- /dev/null
+++ b/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp
@@ -0,0 +1,35 @@
+/* Copyright 2020 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+
+namespace alpaka
+{
+    template<typename TDev>
+    class EventGenericThreads;
+
+#if BOOST_COMP_CLANG
+// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
+// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+
+    //! The queue interface for generic-threads devices, through which events are enqueued and waited for.
+    template<typename TDev>
+    class IGenericThreadsQueue
+    {
+    public:
+        //! Enqueues the given event.
+        virtual void enqueue(EventGenericThreads<TDev>&) = 0;
+        //! Waits for the given event.
+        virtual void wait(EventGenericThreads<TDev> const&) = 0;
+        virtual ~IGenericThreadsQueue() = default; //!< Virtual destructor of the polymorphic interface.
+    };
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+} // namespace alpaka
diff --git a/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp b/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp
new file mode 100644
index 0000000..3a85fac
--- /dev/null
+++ b/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp
@@ -0,0 +1,245 @@
+/* Copyright 2022 Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
+ * Antonio Di Pilato
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/CallbackThread.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/event/Traits.hpp"
+#include "alpaka/meta/DependentFalseType.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/traits/Traits.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <thread>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    template<typename TApi>
+    class EventUniformCudaHipRt;
+
+    template<typename TApi>
+    class DevUniformCudaHipRt;
+
+    namespace uniform_cuda_hip::detail
+    {
+        //! The CUDA/HIP RT queue implementation.
+        //! Owns the native stream: created in the constructor, synchronized and destroyed in the destructor.
+        template<typename TApi>
+        class QueueUniformCudaHipRtImpl final
+        {
+        public:
+            ALPAKA_FN_HOST QueueUniformCudaHipRtImpl(DevUniformCudaHipRt<TApi> const& dev)
+                : m_dev(dev)
+                , m_UniformCudaHipQueue()
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_dev.getNativeHandle()));
+
+                // - [cuda/hip]StreamDefault: Default queue creation flag.
+                // - [cuda/hip]StreamNonBlocking: Specifies that work running in the created queue may run
+                // concurrently with work in queue 0 (the NULL queue),
+                //   and that the created queue should perform no implicit synchronization with queue 0.
+                // Create the queue on the current device.
+                // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantic implemented in the alpaka
+                // CPU queue. It would be too much work to implement implicit default queue synchronization on CPU.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    TApi::streamCreateWithFlags(&m_UniformCudaHipQueue, TApi::streamNonBlocking));
+            }
+
+            // Not movable: the destructor always destroys the stream, so a moved-from object would destroy it again.
+            QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl&&) = delete;
+            auto operator=(QueueUniformCudaHipRtImpl&&) -> QueueUniformCudaHipRtImpl& = delete;
+
+            ALPAKA_FN_HOST ~QueueUniformCudaHipRtImpl()
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Make sure all pending async work is finished before destroying the stream to guarantee determinism.
+                // This would not be necessary for plain CUDA/HIP operations, but we can have host functions in the
+                // stream, which reference this queue instance and its CallbackThread. Make sure they are done.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamSynchronize(m_UniformCudaHipQueue));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamDestroy(m_UniformCudaHipQueue));
+            }
+
+            [[nodiscard]] auto getNativeHandle() const noexcept
+            {
+                return m_UniformCudaHipQueue;
+            }
+
+        public:
+            DevUniformCudaHipRt<TApi> const m_dev; //!< The device this queue is bound to.
+            core::CallbackThread m_callbackThread; //!< The thread that host tasks enqueued into this queue are submitted to.
+
+        private:
+            typename TApi::Stream_t m_UniformCudaHipQueue; //!< The native stream created and destroyed by this object.
+        };
+
+        //! The CUDA/HIP RT queue; TBlocking selects whether enqueue synchronizes with the stream before returning.
+        template<typename TApi, bool TBlocking>
+        class QueueUniformCudaHipRt
+            : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueUniformCudaHipRt<TApi, TBlocking>>
+            , public concepts::Implements<ConceptQueue, QueueUniformCudaHipRt<TApi, TBlocking>>
+            , public concepts::Implements<ConceptGetDev, QueueUniformCudaHipRt<TApi, TBlocking>>
+        {
+        public:
+            ALPAKA_FN_HOST QueueUniformCudaHipRt(DevUniformCudaHipRt<TApi> const& dev)
+                : m_spQueueImpl(std::make_shared<QueueUniformCudaHipRtImpl<TApi>>(dev))
+            {
+                dev.registerQueue(m_spQueueImpl);
+            }
+
+            //! Queues compare equal when they share the same implementation object.
+            ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRt const& rhs) const -> bool
+            {
+                return m_spQueueImpl == rhs.m_spQueueImpl;
+            }
+
+            ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRt const& rhs) const -> bool
+            {
+                return m_spQueueImpl != rhs.m_spQueueImpl;
+            }
+
+            [[nodiscard]] auto getNativeHandle() const noexcept
+            {
+                return m_spQueueImpl->getNativeHandle();
+            }
+
+            auto getCallbackThread() -> core::CallbackThread&
+            {
+                return m_spQueueImpl->m_callbackThread;
+            }
+
+            std::shared_ptr<QueueUniformCudaHipRtImpl<TApi>> m_spQueueImpl;
+        };
+    } // namespace uniform_cuda_hip::detail
+
+    namespace trait
+    {
+        //! The CUDA/HIP RT queue device get trait specialization.
+        template<typename TApi, bool TBlocking>
+        struct GetDev<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
+        {
+            ALPAKA_FN_HOST static auto getDev(
+                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue)
+                -> DevUniformCudaHipRt<TApi>
+            {
+                return queue.m_spQueueImpl->m_dev; // a copy of the device the implementation is bound to
+            }
+        };
+
+        //! The CUDA/HIP RT queue empty-test trait specialization.
+        template<typename TApi, bool TBlocking>
+        struct Empty<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
+        {
+            //! \return true if the stream query reports success; errorNotReady is passed as the error to ignore.
+            ALPAKA_FN_HOST static auto empty(
+                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue) -> bool
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Query is allowed even for queues on non current device.
+                auto const stream = queue.getNativeHandle();
+                typename TApi::Error_t status = TApi::success;
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(status = TApi::streamQuery(stream), TApi::errorNotReady);
+                return status == TApi::success;
+            }
+        };
+
+        //! The CUDA/HIP RT queue thread wait trait specialization.
+        //!
+        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
+        //! tasks (kernels, data copies, ...) by synchronizing with the native stream.
+        template<typename TApi, bool TBlocking>
+        struct CurrentThreadWaitFor<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
+        {
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(
+                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Sync is allowed even for queues on a non-current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
+            }
+        };
+
+        //! The CUDA/HIP RT queue device type trait specialization (blocking and non-blocking).
+        template<typename TApi, bool TBlocking>
+        struct DevType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
+        {
+            using type = DevUniformCudaHipRt<TApi>;
+        };
+
+        //! The CUDA/HIP RT queue event type trait specialization (blocking and non-blocking).
+        template<typename TApi, bool TBlocking>
+        struct EventType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
+        {
+            using type = EventUniformCudaHipRt<TApi>;
+        };
+
+        //! The CUDA/HIP RT queue enqueue trait specialization (blocking and non-blocking).
+        template<typename TApi, bool TBlocking, typename TTask>
+        struct Enqueue<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>, TTask>
+        {
+            using QueueImpl = uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl<TApi>;
+
+            //! Payload handed to the stream's host function: the queue implementation and a copy of the task.
+            struct HostFuncData
+            {
+                // The queue is referenced, not kept alive: its dtor synchronizes with the CUDA/HIP stream and
+                // thereby waits until all host functions and the CallbackThread are done. Copying the queue into
+                // the host function would actually be an error. Destroying it here would call CUDA/HIP APIs from
+                // the host function. Passing it on to the CallbackThread would make that thread hold a task
+                // containing the queue that owns the CallbackThread itself; destroying the task while no other
+                // queue instance exists would make the CallbackThread join itself and crash.
+                QueueImpl& q;
+                TTask t;
+            };
+
+            ALPAKA_FN_HOST static void uniformCudaHipRtHostFunc(void* arg)
+            {
+                std::unique_ptr<HostFuncData> payload{static_cast<HostFuncData*>(arg)};
+                auto& callbackThread = payload->q.m_callbackThread;
+                auto done = callbackThread.submit([p = std::move(payload)] { p->t(); });
+                done.wait();
+            }
+
+            ALPAKA_FN_HOST static auto enqueue(
+                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
+                TTask const& task) -> void
+            {
+                auto const stream = queue.getNativeHandle();
+                auto* const payload = new HostFuncData{*queue.m_spQueueImpl, task};
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::launchHostFunc(stream, uniformCudaHipRtHostFunc, payload));
+                if constexpr(TBlocking)
+                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(stream));
+            }
+        };
+
+        //! The CUDA/HIP RT queue native handle trait specialization (blocking and non-blocking).
+        template<typename TApi, bool TBlocking>
+        struct NativeHandle<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
+        {
+            [[nodiscard]] static auto getNativeHandle(
+                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue)
+            {
+                return queue.getNativeHandle(); // forwards to the queue's own accessor
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#endif
diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
new file mode 100644
index 0000000..abf5763
--- /dev/null
+++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
@@ -0,0 +1,289 @@
+/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/event/Traits.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/traits/Traits.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <algorithm>
+#include <exception>
+#include <memory>
+#include <mutex>
+#include <shared_mutex>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    template<typename TTag>
+    class DevGenericSycl;
+
+    template<typename TTag>
+    class EventGenericSycl;
+
+    namespace detail
+    {
+        template<typename T, typename = void>
+        inline constexpr auto is_sycl_task = false; //!< Fallback when T::is_sycl_task is not a valid expression.
+        // Chosen when T::is_sycl_task is a valid expression.
+        template<typename T>
+        inline constexpr auto is_sycl_task<T, std::void_t<decltype(T::is_sycl_task)>> = true;
+
+        template<typename T, typename = void>
+        inline constexpr auto is_sycl_kernel = false; //!< Fallback when T::is_sycl_kernel is not a valid expression.
+        // Chosen when T::is_sycl_kernel is a valid expression.
+        template<typename T>
+        inline constexpr auto is_sycl_kernel<T, std::void_t<decltype(T::is_sycl_kernel)>> = true;
+
+        //! The SYCL queue implementation: wraps a sycl::queue created with the in_order and enable_profiling properties.
+        class QueueGenericSyclImpl
+        {
+        public:
+            QueueGenericSyclImpl(sycl::context context, sycl::device device)
+                : m_queue{
+                    std::move(context), // This is important. In SYCL a device can belong to multiple contexts.
+                    std::move(device),
+                    {sycl::property::queue::enable_profiling{}, sycl::property::queue::in_order{}}}
+            {
+            }
+
+            // This class will only exist as a pointer. We don't care about copy and move semantics.
+            QueueGenericSyclImpl(QueueGenericSyclImpl const& other) = delete;
+            auto operator=(QueueGenericSyclImpl const& rhs) -> QueueGenericSyclImpl& = delete;
+
+            QueueGenericSyclImpl(QueueGenericSyclImpl&& other) noexcept = delete;
+            auto operator=(QueueGenericSyclImpl&& rhs) noexcept -> QueueGenericSyclImpl& = delete;
+
+            ~QueueGenericSyclImpl()
+            {
+                try
+                {
+                    m_queue.wait_and_throw();
+                }
+                catch(sycl::exception const& err)
+                {
+                    std::cerr << "Caught SYCL exception while destructing a SYCL queue: " << err.what() << " ("
+                              << err.code() << ')' << std::endl;
+                }
+                catch(std::exception const& err)
+                {
+                    std::cerr << "The following runtime error(s) occurred while destructing a SYCL queue: "
+                              << err.what() << std::endl;
+                }
+            }
+
+            // Don't call this without locking first!
+            auto clean_dependencies() -> void
+            {
+                // Clean up completed events (the predicate takes each event by reference instead of copying it)
+                auto const start = std::begin(m_dependencies);
+                auto const old_end = std::end(m_dependencies);
+                auto const new_end = std::remove_if(
+                    start,
+                    old_end,
+                    [](sycl::event const& ev) {
+                        return ev.get_info<sycl::info::event::command_execution_status>()
+                               == sycl::info::event_command_status::complete;
+                    });
+
+                m_dependencies.erase(new_end, old_end);
+            }
+
+            //! Adds \p event to the events the next enqueued task has to wait for.
+            auto register_dependency(sycl::event event) -> void
+            {
+                std::lock_guard<std::shared_mutex> lock{m_mutex};
+                clean_dependencies();
+                m_dependencies.push_back(std::move(event));
+            }
+
+            auto empty() const -> bool
+            {
+                std::shared_lock<std::shared_mutex> lock{m_mutex};
+                return m_last_event.get_info<sycl::info::event::command_execution_status>()
+                       == sycl::info::event_command_status::complete;
+            }
+
+            auto wait() -> void
+            {
+                // SYCL queues are thread-safe.
+                m_queue.wait_and_throw();
+            }
+
+            auto get_last_event() const -> sycl::event
+            {
+                std::shared_lock<std::shared_mutex> lock{m_mutex};
+                return m_last_event;
+            }
+
+            //! Submits \p task so that it depends on all registered dependencies; waits for the queue if TBlocking.
+            template<bool TBlocking, typename TTask>
+            auto enqueue(TTask const& task) -> void
+            {
+                {
+                    std::lock_guard<std::shared_mutex> lock{m_mutex};
+                    clean_dependencies();
+                    // Execute task
+                    if constexpr(is_sycl_task<TTask> && !is_sycl_kernel<TTask>) // Copy / Fill
+                    {
+                        m_last_event = task(m_queue, m_dependencies); // Will call queue.{copy, fill} internally
+                    }
+                    else
+                    {
+                        m_last_event = m_queue.submit(
+                            [this, &task](sycl::handler& cgh)
+                            {
+                                if(!m_dependencies.empty())
+                                    cgh.depends_on(m_dependencies);
+
+                                if constexpr(is_sycl_kernel<TTask>) // Kernel
+                                    task(cgh); // Will call cgh.parallel_for internally
+                                else // Host
+                                    cgh.host_task(task);
+                            });
+                    }
+
+                    m_dependencies.clear();
+                }
+
+                if constexpr(TBlocking)
+                    wait();
+            }
+
+            [[nodiscard]] auto getNativeHandle() const noexcept
+            {
+                return m_queue;
+            }
+
+            std::vector<sycl::event> m_dependencies; //!< Events the next enqueued task has to wait for.
+            sycl::event m_last_event; //!< Event returned by the most recently enqueued task.
+            std::shared_mutex mutable m_mutex; //!< Guards m_dependencies and m_last_event.
+
+        private:
+            sycl::queue m_queue;
+        };
+
+        //! SYCL queue front end parameterized on blocking behavior; copies share one implementation object.
+        template<typename TTag, bool TBlocking>
+        class QueueGenericSyclBase
+            : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueGenericSyclBase<TTag, TBlocking>>
+            , public concepts::Implements<ConceptQueue, QueueGenericSyclBase<TTag, TBlocking>>
+            , public concepts::Implements<ConceptGetDev, QueueGenericSyclBase<TTag, TBlocking>>
+        {
+        public:
+            QueueGenericSyclBase(DevGenericSycl<TTag> const& dev)
+                : m_dev{dev}
+                , m_spQueueImpl{
+                      std::make_shared<QueueGenericSyclImpl>(dev.getNativeHandle().second, dev.getNativeHandle().first)}
+            {
+                m_dev.m_impl->register_queue(m_spQueueImpl);
+            }
+
+            friend auto operator==(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
+            {
+                return lhs.m_dev == rhs.m_dev && lhs.m_spQueueImpl == rhs.m_spQueueImpl;
+            }
+
+            friend auto operator!=(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
+            {
+                return !(lhs == rhs);
+            }
+
+            [[nodiscard]] auto getNativeHandle() const noexcept
+            {
+                return m_spQueueImpl->getNativeHandle();
+            }
+
+            DevGenericSycl<TTag> m_dev; //!< The device this queue was created for.
+            std::shared_ptr<QueueGenericSyclImpl> m_spQueueImpl; //!< The implementation registered with the device.
+        };
+    } // namespace detail
+
+    namespace trait
+    {
+        //! The SYCL queue device type trait specialization (covers every value of TBlocking).
+        template<typename TTag, bool TBlocking>
+        struct DevType<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
+        {
+            using type = DevGenericSycl<TTag>;
+        };
+
+        //! The SYCL queue device get trait specialization: returns a copy of the device stored in the queue.
+        template<typename TTag, bool TBlocking>
+        struct GetDev<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
+        {
+            static auto getDev(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue)
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                return queue.m_dev;
+            }
+        };
+
+        //! The SYCL queue event type trait specialization (covers every value of TBlocking).
+        template<typename TTag, bool TBlocking>
+        struct EventType<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
+        {
+            using type = EventGenericSycl<TTag>;
+        };
+
+        //! The SYCL queue enqueue trait specialization: forwards the task, with TBlocking, to the shared implementation.
+        template<typename TTag, bool TBlocking, typename TTask>
+        struct Enqueue<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>, TTask>
+        {
+            static auto enqueue(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>& queue, TTask const& task)
+                -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                queue.m_spQueueImpl->template enqueue<TBlocking>(task);
+            }
+        };
+
+        //! The SYCL queue test trait specialization: delegates to the empty() of the shared implementation.
+        template<typename TTag, bool TBlocking>
+        struct Empty<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
+        {
+            static auto empty(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue) -> bool
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                return queue.m_spQueueImpl->empty();
+            }
+        };
+
+        //! The SYCL queue thread wait trait specialization (covers every value of TBlocking).
+        //!
+        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
+        //! tasks (kernels, data copies, ...).
+        template<typename TTag, bool TBlocking>
+        struct CurrentThreadWaitFor<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
+        {
+            static auto currentThreadWaitFor(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue)
+                -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+                queue.m_spQueueImpl->wait();
+            }
+        };
+
+        //! The SYCL queue native handle trait specialization: forwards to the queue's own getNativeHandle().
+        template<typename TTag, bool TBlocking>
+        struct NativeHandle<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
+        {
+            [[nodiscard]] static auto getNativeHandle(
+                alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue)
+            {
+                return queue.getNativeHandle();
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+#endif
diff --git a/include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp b/include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp
new file mode 100644
index 0000000..e0c0361
--- /dev/null
+++ b/include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp
@@ -0,0 +1,43 @@
+/* Copyright 2023 Jiří Vyskočil, Bernhard Manfred Gruber, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+#include <cstdint>
+
+namespace alpaka::rand
+{
+    /// Get the most significant 32 bits (bits 32..63) of a 64-bit number
+    ALPAKA_FN_HOST_ACC inline constexpr auto high32Bits(std::uint64_t const x) -> std::uint32_t
+    {
+        return static_cast<std::uint32_t>(x >> 32);
+    }
+
+    /// Truncate a 64-bit number to its least significant 32 bits (bits 0..31)
+    ALPAKA_FN_HOST_ACC inline constexpr auto low32Bits(std::uint64_t const x) -> std::uint32_t
+    {
+        return static_cast<std::uint32_t>(x); // unsigned narrowing is modulo 2^32, i.e. it keeps the low half
+    }
+
+    /** Multiply two numbers and split the 64-bit product into its high and low 32-bit halves ("mulhilo32")
+     *
+     * @param a first factor; the product wraps at 64 bits, so it is exact only while a*b fits (e.g. 32-bit inputs)
+     * @param b second factor
+     * @param resultHigh receives bits 32..63 of the product a*b
+     * @param resultLow receives bits 0..31 of the product a*b
+     */
+    // TODO: See single-instruction implementations in original Philox source code
+    ALPAKA_FN_HOST_ACC inline constexpr void multiplyAndSplit64to32(
+        std::uint64_t const a,
+        std::uint64_t const b,
+        std::uint32_t& resultHigh,
+        std::uint32_t& resultLow)
+    {
+        auto const product = a * b;
+        resultLow = low32Bits(product);
+        resultHigh = high32Bits(product);
+    }
+} // namespace alpaka::rand
diff --git a/include/alpaka/rand/Philox/PhiloxBaseCommon.hpp b/include/alpaka/rand/Philox/PhiloxBaseCommon.hpp
new file mode 100644
index 0000000..e80d8a1
--- /dev/null
+++ b/include/alpaka/rand/Philox/PhiloxBaseCommon.hpp
@@ -0,0 +1,92 @@
+/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber, Jeffrey Kelling
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/rand/Philox/PhiloxStateless.hpp"
+
+#include <utility>
+
+namespace alpaka::rand::engine
+{
+    /** Common class for Philox family engines
+     *
+     * Relies on `PhiloxStateless` to provide the PRNG and adds state to handle the counting.
+     *
+     * @tparam TParams Philox algorithm parameters \sa PhiloxParams
+     * @tparam TImpl engine type implementation (CRTP)
+     *
+     * static const data members are transformed into functions, because GCC
+     * assumes types with static data members to be not mappable and makes no
+     * exception for constexpr ones. This is a valid interpretation of the
+     * OpenMP <= 4.5 standard. In OpenMP >= 5.0 types with any kind of static
+     * data member are mappable.
+     */
+    template<typename TParams, typename TImpl>
+    class PhiloxBaseCommon : public PhiloxStateless<TParams>
+    {
+    public:
+        using Counter = typename PhiloxStateless<TParams>::Counter;
+        using Key = typename PhiloxStateless<TParams>::Key;
+
+        /// Distribution container type
+        template<typename TDistributionResultScalar>
+        using ResultContainer = typename alpaka::Vec<alpaka::DimInt<TParams::counterSize>, TDistributionResultScalar>;
+
+    protected:
+        /** Advance the \a counter to the next state
+         *
+         * Increments the passed-in \a counter by one with a 128-bit carry (element 0 is the least significant limb).
+         *
+         * @param counter reference to the counter which is to be advanced
+         */
+        ALPAKA_FN_HOST_ACC void advanceCounter(Counter& counter)
+        {
+            counter[0]++;
+            /* 128-bit carry */
+            if(counter[0] == 0)
+            {
+                counter[1]++;
+                if(counter[1] == 0)
+                {
+                    counter[2]++;
+                    if(counter[2] == 0)
+                    {
+                        counter[3]++;
+                    }
+                }
+            }
+        }
+
+        /** Advance this->state.counter by \a offset N-vectors (N = counter size), treating the four 32-bit
+         *  elements as one 128-bit integer (element 0 least significant); every limb is added in 64-bit
+         *  arithmetic so that the carry of each limb is passed on to the next one without being lost.
+         * @param offset number of N-vectors to skip
+         */
+        ALPAKA_FN_HOST_ACC void skip4(uint64_t offset)
+        {
+            Counter& counter = static_cast<TImpl*>(this)->state.counter;
+            std::uint64_t sum = static_cast<std::uint64_t>(counter[0]) + low32Bits(offset);
+            counter[0] = low32Bits(sum);
+            sum = static_cast<std::uint64_t>(counter[1]) + high32Bits(offset) + high32Bits(sum);
+            counter[1] = low32Bits(sum);
+            sum = static_cast<std::uint64_t>(counter[2]) + high32Bits(sum);
+            counter[2] = low32Bits(sum);
+            counter[3] += high32Bits(sum); // a carry out of the top limb wraps, as in advanceCounter()
+        }
+
+        /** Advance the counter by the length of \a subsequence
+         *
+         * Adds \a subsequence to the upper 64 bits (elements 2 and 3) of this->state.counter.
+         *
+         * @param subsequence number of subsequences to skip
+         */
+        ALPAKA_FN_HOST_ACC void skipSubsequence(uint64_t subsequence)
+        {
+            Counter& counter = static_cast<TImpl*>(this)->state.counter;
+            Counter temp = counter;
+            counter[2] += low32Bits(subsequence);
+            counter[3] += high32Bits(subsequence) + (counter[2] < temp[2] ? 1 : 0); // wrap of element 2 => carry
+        }
+    };
+} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/Philox/PhiloxConstants.hpp b/include/alpaka/rand/Philox/PhiloxConstants.hpp
new file mode 100644
index 0000000..831a1de
--- /dev/null
+++ b/include/alpaka/rand/Philox/PhiloxConstants.hpp
@@ -0,0 +1,70 @@
+/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
+
+#include <cstdint>
+#include <utility>
+
+namespace alpaka::rand::engine
+{
+    /** Constants used in the Philox algorithm
+     *
+     * The numbers are taken from the reference Philox implementation:
+     *
+     * J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
+     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking,
+     * Storage and Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
+     *
+     * @tparam TParams basic Philox algorithm parameters
+     *
+     * static const data members are transformed into functions, because GCC
+     * assumes types with static data members to be not mappable and makes no
+     * exception for constexpr ones. This is a valid interpretation of the
+     * OpenMP <= 4.5 standard. In OpenMP >= 5.0 types with any kind of static
+     * data member are mappable.
+     */
+    template<typename TParams>
+    class PhiloxConstants
+    {
+    public:
+        /// First Weyl sequence parameter: the golden ratio
+        static constexpr std::uint64_t WEYL_64_0()
+        {
+            return 0x9E37'79B9'7F4A'7C15;
+        }
+
+        /// Second Weyl sequence parameter: \f$ \sqrt{3}-1 \f$
+        static constexpr std::uint64_t WEYL_64_1()
+        {
+            return 0xBB67'AE85'84CA'A73B;
+        }
+
+        /// 1st Weyl sequence parameter, 32 bits (the upper half of WEYL_64_0())
+        static constexpr std::uint32_t WEYL_32_0()
+        {
+            return high32Bits(WEYL_64_0());
+        }
+
+        /// 2nd Weyl sequence parameter, 32 bits (the upper half of WEYL_64_1())
+        static constexpr std::uint32_t WEYL_32_1()
+        {
+            return high32Bits(WEYL_64_1());
+        }
+
+        /// First Philox S-box multiplier (the misspelt "MULTIPLITER" is the public name; renaming breaks callers)
+        static constexpr std::uint32_t MULTIPLITER_4x32_0()
+        {
+            return 0xCD9E'8D57;
+        }
+
+        /// Second Philox S-box multiplier (the misspelt "MULTIPLITER" is the public name; renaming breaks callers)
+        static constexpr std::uint32_t MULTIPLITER_4x32_1()
+        {
+            return 0xD251'1F53;
+        }
+    };
+} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/Philox/PhiloxSingle.hpp b/include/alpaka/rand/Philox/PhiloxSingle.hpp
new file mode 100644
index 0000000..3f7b6ff
--- /dev/null
+++ b/include/alpaka/rand/Philox/PhiloxSingle.hpp
@@ -0,0 +1,148 @@
+/* Copyright 2022 Jiri Vyskocil, Rene Widera, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
+#include "alpaka/rand/Philox/PhiloxBaseCommon.hpp"
+
+#include <utility>
+
+namespace alpaka::rand::engine
+{
+    /** Philox state for single value engine
+     *
+     * @tparam TCounter Type of the Counter array
+     * @tparam TKey Type of the Key array
+     */
+    template<typename TCounter, typename TKey>
+    struct PhiloxStateSingle
+    {
+        using Counter = TCounter;
+        using Key = TKey;
+
+        /// Counter array
+        Counter counter;
+        /// Key array
+        Key key;
+        /// Intermediate result array
+        Counter result;
+        /// Count of elements of \a result already handed out (an integer counter, not an actual pointer)
+        std::uint32_t position;
+        // TODO: Box-Muller states
+    };
+
+    /** Philox engine generating a single number
+     *
+     * This engine's operator() will return a single number. Since the result is the same size as the counter,
+     * and so it contains more than one number, it has to be stored between individual invocations of
+     * operator(). Additionally a position counter has to be stored indicating how many numbers of the result
+     * array have already been returned.
+     *
+     * @tparam TParams Basic parameters for the Philox algorithm
+     */
+    template<typename TParams>
+    class PhiloxSingle : public PhiloxBaseCommon<TParams, PhiloxSingle<TParams>>
+    {
+    public:
+        using Base = PhiloxBaseCommon<TParams, PhiloxSingle<TParams>>;
+
+        /// Counter type
+        using Counter = typename Base::Counter;
+        /// Key type
+        using Key = typename Base::Key;
+        /// State type
+        using State = PhiloxStateSingle<Counter, Key>;
+
+        /// Internal engine state
+        State state;
+
+    protected:
+        /** Advance internal counter to the next value
+         *
+         * Advances the full internal counter array, resets the position counter and stores the intermediate
+         * result to be recalled when the user requests a number.
+         */
+        ALPAKA_FN_HOST_ACC void advanceState()
+        {
+            this->advanceCounter(state.counter);
+            state.result = this->nRounds(state.counter, state.key);
+            state.position = 0;
+        }
+
+        /** Get the next random number and advance internal state
+         *
+         * The intermediate result stores N = TParams::counterSize numbers. Check if we've already given out
+         * all of them. If so, generate a new intermediate result (this also resets the position counter).
+         * Otherwise shift the remaining numbers down by one so that element zero holds the next number.
+         *
+         * @return The next random number
+         */
+        ALPAKA_FN_HOST_ACC auto nextNumber()
+        {
+            // Element zero will always contain the next valid random number.
+            auto result = state.result[0];
+            state.position++;
+            if(state.position == TParams::counterSize)
+            {
+                advanceState();
+            }
+            else
+            {
+                // Shift state results to allow hard coded access to element zero.
+                // This will avoid high register usage on NVIDIA devices.
+                // \todo Check if this shifting of the result vector is decreasing CPU performance.
+                //       If so this optimization for GPUs (mostly NVIDIA) should be moved into
+                //       PhiloxBaseCudaArray.
+                state.result[0] = state.result[1];
+                state.result[1] = state.result[2];
+                state.result[2] = state.result[3];
+            }
+
+            return result;
+        }
+
+        /// Skips the next \a offset numbers
+        ALPAKA_FN_HOST_ACC void skip(uint64_t offset)
+        {
+            static_assert(TParams::counterSize == 4, "Only counterSize == 4 is supported.");
+            state.position = static_cast<decltype(state.position)>(state.position + (offset & 3));
+            offset += state.position < 4 ? 0 : 4;
+            state.position -= state.position < 4 ? 0 : 4u;
+            for(auto numShifts = state.position; numShifts > 0; --numShifts)
+            {
+                // Shift state results to allow hard coded access to element zero.
+                // This will avoid high register usage on NVIDIA devices.
+                state.result[0] = state.result[1];
+                state.result[1] = state.result[2];
+                state.result[2] = state.result[3];
+            }
+            this->skip4(offset / 4);
+        }
+
+    public:
+        /** Construct a new Philox engine with single-value output
+         *
+         * @param seed Set the Philox generator key
+         * @param subsequence Select a subsequence of size 2^64
+         * @param offset Skip \a offset numbers from the start of the subsequence
+         */
+        ALPAKA_FN_HOST_ACC PhiloxSingle(uint64_t seed = 0, uint64_t subsequence = 0, uint64_t offset = 0)
+            : state{{0, 0, 0, 0}, {low32Bits(seed), high32Bits(seed)}, {0, 0, 0, 0}, 0}
+        {
+            this->skipSubsequence(subsequence);
+            skip(offset);
+            advanceState(); // NOTE(review): resets position/result, so skip()'s (offset & 3) part looks lost - confirm.
+        }
+
+        /** Get the next random number
+         *
+         * @return The next random number
+         */
+        ALPAKA_FN_HOST_ACC auto operator()()
+        {
+            return nextNumber();
+        }
+    };
+} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/Philox/PhiloxStateless.hpp b/include/alpaka/rand/Philox/PhiloxStateless.hpp
new file mode 100644
index 0000000..3011d44
--- /dev/null
+++ b/include/alpaka/rand/Philox/PhiloxStateless.hpp
@@ -0,0 +1,125 @@
+/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber, Jeffrey Kelling
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Unroll.hpp"
+#include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
+#include "alpaka/rand/Philox/PhiloxConstants.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <utility>
+
+namespace alpaka::rand::engine
+{
+    /** Philox algorithm parameters
+     *
+     * @tparam TCounterSize number of elements in the counter
+     * @tparam TWidth width of one counter element (in bits)
+     * @tparam TRounds number of S-box rounds
+     */
+    template<unsigned TCounterSize, unsigned TWidth, unsigned TRounds>
+    struct PhiloxParams
+    {
+        static constexpr unsigned counterSize = TCounterSize; // read by PhiloxStateless::vectorSize()
+        static constexpr unsigned width = TWidth; // read by PhiloxStateless::numberWidth()
+        static constexpr unsigned rounds = TRounds; // read by PhiloxStateless::numRounds()
+    };
+
+    /** Basic counter-based PRNG of the Philox family
+     *
+     * Checks the validity of passed-in parameters and calls the backend methods to perform N rounds of the
+     * Philox shuffle.
+     *
+     * @tparam TParams Philox algorithm parameters \sa PhiloxParams
+     */
+    template<typename TParams>
+    class PhiloxStateless : public PhiloxConstants<TParams>
+    {
+        static constexpr unsigned numRounds()
+        {
+            return TParams::rounds;
+        }
+
+        static constexpr unsigned vectorSize()
+        {
+            return TParams::counterSize;
+        }
+
+        static constexpr unsigned numberWidth()
+        {
+            return TParams::width;
+        }
+
+        static_assert(numRounds() > 0, "Number of Philox rounds must be > 0.");
+        static_assert(vectorSize() % 2 == 0, "Philox counter size must be an even number.");
+        static_assert(vectorSize() <= 16, "Philox SP network is not specified for sizes > 16.");
+        static_assert(numberWidth() % 8 == 0, "Philox number width in bits must be a multiple of 8.");
+
+        static_assert(numberWidth() == 32, "Philox implemented only for 32 bit numbers.");
+
+    public:
+        using Counter = alpaka::Vec<alpaka::DimInt<TParams::counterSize>, std::uint32_t>;
+        using Key = alpaka::Vec<alpaka::DimInt<TParams::counterSize / 2>, std::uint32_t>;
+        using Constants = PhiloxConstants<TParams>;
+
+    protected:
+        /** Single round of the Philox shuffle (reads exactly counter[0..3] and key[0..1])
+         *
+         * @param counter state of the counter
+         * @param key value of the key
+         * @return shuffled counter
+         */
+        static ALPAKA_FN_HOST_ACC auto singleRound(Counter const& counter, Key const& key)
+        {
+            std::uint32_t H0, L0, H1, L1;
+            multiplyAndSplit64to32(counter[0], Constants::MULTIPLITER_4x32_0(), H0, L0);
+            multiplyAndSplit64to32(counter[2], Constants::MULTIPLITER_4x32_1(), H1, L1);
+            return Counter{H1 ^ counter[1] ^ key[0], L1, H0 ^ counter[3] ^ key[1], L0};
+        }
+
+        /** Bump the \a key by the Weyl sequence step parameter
+         *
+         * @param key the key to be bumped
+         * @return the bumped key
+         */
+        static ALPAKA_FN_HOST_ACC auto bumpKey(Key const& key)
+        {
+            return Key{key[0] + Constants::WEYL_32_0(), key[1] + Constants::WEYL_32_1()};
+        }
+
+        /** Performs the rounds of the Philox shuffle
+         * NOTE(review): one round runs before the loop, so numRounds() + 1 single rounds are applied - confirm.
+         * @param counter_in initial state of the counter
+         * @param key_in initial state of the key
+         * @return result of the PRNG shuffle; has the same size as the counter
+         */
+        static ALPAKA_FN_HOST_ACC auto nRounds(Counter const& counter_in, Key const& key_in) -> Counter
+        {
+            Key key{key_in};
+            Counter counter = singleRound(counter_in, key);
+
+            ALPAKA_UNROLL(numRounds())
+            for(unsigned int n = 0; n < numRounds(); ++n)
+            {
+                key = bumpKey(key);
+                counter = singleRound(counter, key);
+            }
+
+            return counter;
+        }
+
+    public:
+        /** Generates a random number (\p TCounterSize x32-bit)
+         *
+         * @param counter initial state of the counter
+         * @param key initial state of the key
+         * @return result of the PRNG shuffle; has the same size as the counter
+         */
+        static ALPAKA_FN_HOST_ACC auto generate(Counter const& counter, Key const& key) -> Counter
+        {
+            return nRounds(counter, key);
+        }
+    };
+} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp b/include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp
new file mode 100644
index 0000000..bb6795b
--- /dev/null
+++ b/include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp
@@ -0,0 +1,36 @@
+/* Copyright 2022 Jeffrey Kelling
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/rand/Philox/PhiloxStateless.hpp"
+
+namespace alpaka::rand::engine
+{
+    /** Stateless Philox generator bound to a fixed key
+     *
+     * Stores the key passed to the constructor; operator() forwards a counter together with that key to
+     * PhiloxStateless::generate().
+     *
+     * @tparam TParams Philox algorithm parameters \sa PhiloxParams
+     */
+    template<typename TParams>
+    struct PhiloxStatelessKeyedBase : public PhiloxStateless<TParams>
+    {
+    public:
+        using Counter = typename PhiloxStateless<TParams>::Counter;
+        using Key = typename PhiloxStateless<TParams>::Key;
+
+        Key const m_key; // fixed at construction; const, so the type is not assignable
+
+        PhiloxStatelessKeyedBase(Key&& key) : m_key(std::move(key)) // NOTE(review): no ALPAKA_FN_HOST_ACC - intended?
+        {
+        }
+
+        ALPAKA_FN_HOST_ACC auto operator()(Counter const& counter) const
+        {
+            return this->generate(counter, m_key);
+        }
+    };
+} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/Philox/PhiloxVector.hpp b/include/alpaka/rand/Philox/PhiloxVector.hpp
new file mode 100644
index 0000000..64c89b4
--- /dev/null
+++ b/include/alpaka/rand/Philox/PhiloxVector.hpp
@@ -0,0 +1,102 @@
+/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
+#include "alpaka/rand/Philox/PhiloxBaseCommon.hpp"
+
+#include <utility>
+
+namespace alpaka::rand::engine
+{
+    /** Philox state for vector generator (counter and key only, no cached results)
+     *
+     * @tparam TCounter Type of the Counter array
+     * @tparam TKey Type of the Key array
+     */
+    template<typename TCounter, typename TKey>
+    struct PhiloxStateVector
+    {
+        using Counter = TCounter;
+        using Key = TKey;
+
+        /// Counter array
+        Counter counter;
+        /// Key array
+        Key key;
+    };
+
+    /** Philox engine generating a vector of numbers
+     *
+     * This engine's operator() will return a vector of numbers corresponding to the full size of its counter.
+     * This is a convenience vs. memory size tradeoff since the user has to deal with the output array
+     * themselves, but the internal state consists only of a single counter and a key.
+     *
+     * @tparam TParams Basic parameters for the Philox algorithm
+     */
+    template<typename TParams>
+    class PhiloxVector : public PhiloxBaseCommon<TParams, PhiloxVector<TParams>>
+    {
+    public:
+        using Base = PhiloxBaseCommon<TParams, PhiloxVector<TParams>>;
+
+        /// Counter type
+        using Counter = typename Base::Counter;
+        /// Key type
+        using Key = typename Base::Key;
+        /// State type
+        using State = PhiloxStateVector<Counter, Key>;
+
+        template<typename TDistributionResultScalar>
+        using ResultContainer = typename Base::template ResultContainer<TDistributionResultScalar>;
+
+        State state;
+
+    protected:
+        /** Get the next array of random numbers and advance internal state
+         *
+         * @return The next array of random numbers
+         */
+        ALPAKA_FN_HOST_ACC auto nextVector()
+        {
+            this->advanceCounter(state.counter);
+            return this->nRounds(state.counter, state.key);
+        }
+
+        /** Skips the next \a offset vectors
+         *
+         * Unlike its counterpart in \a PhiloxSingle, this function advances the state in multiples of the
+         * counter size thus skipping the entire array of numbers.
+         */
+        ALPAKA_FN_HOST_ACC void skip(uint64_t offset)
+        {
+            this->skip4(offset);
+        }
+
+    public:
+        /** Construct a new Philox engine with vector output
+         *
+         * @param seed Set the Philox generator key
+         * @param subsequence Select a subsequence of size 2^64
+         * @param offset Skip \a offset vectors (see skip()) from the start of the subsequence
+         */
+        ALPAKA_FN_HOST_ACC PhiloxVector(uint64_t seed = 0, uint64_t subsequence = 0, uint64_t offset = 0)
+            : state{{0, 0, 0, 0}, {low32Bits(seed), high32Bits(seed)}}
+        {
+            this->skipSubsequence(subsequence);
+            skip(offset);
+            nextVector(); // The returned vector is discarded; only the counter increment is kept.
+        }
+
+        /** Get the next vector of random numbers
+         *
+         * @return The next vector of random numbers
+         */
+        ALPAKA_FN_HOST_ACC auto operator()()
+        {
+            return nextVector();
+        }
+    };
+} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/RandDefault.hpp b/include/alpaka/rand/RandDefault.hpp
new file mode 100644
index 0000000..bbe763c
--- /dev/null
+++ b/include/alpaka/rand/RandDefault.hpp
@@ -0,0 +1,216 @@
+/* Copyright 2022 Jeffrey Kelling, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/math/Traits.hpp"
+#include "alpaka/rand/RandPhilox.hpp"
+#include "alpaka/rand/Traits.hpp"
+
+#include <algorithm>
+#include <limits>
+#include <type_traits>
+
+namespace alpaka::rand
+{
+    class RandDefault : public concepts::Implements<ConceptRand, RandDefault> // Empty type implementing ConceptRand.
+    {
+    };
+
+    namespace distribution::gpu
+    {
+        namespace detail
+        {
+            template<typename TFloat> // Maps a floating-point type to the unsigned integer type UniformReal draws.
+            struct BitsType;
+
+            template<>
+            struct BitsType<float>
+            {
+                using type = std::uint32_t; // float -> 32 random bits
+            };
+
+            template<>
+            struct BitsType<double>
+            {
+                using type = std::uint64_t; // double -> 64 random bits
+            };
+        } // namespace detail
+
+        //! The GPU random number uniform integer distribution.
+        template<typename T>
+        class UniformUint
+        {
+            static_assert(std::is_integral_v<T>, "Return type of UniformUint must be integral.");
+
+        public:
+            UniformUint() = default;
+
+            template<typename TEngine> // Fills T with random bits; one engine call yields one TEngine::result_type.
+            ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> T
+            {
+                using BitsT = typename TEngine::result_type;
+                constexpr auto bitsPerDraw = sizeof(BitsT) * std::numeric_limits<unsigned char>::digits;
+                // A T narrower than BitsT takes one (truncated) draw; sizeof(T) / sizeof(BitsT) would be 0 draws.
+                constexpr auto N = sizeof(T) < sizeof(BitsT) ? 1 : sizeof(T) / sizeof(BitsT);
+                T ret = 0;
+                for(unsigned int a = 0; a < N; ++a)
+                {
+                    ret ^= static_cast<T>(static_cast<T>(engine()) << (bitsPerDraw * a)); // a-th draw -> a-th slot
+                }
+                return ret;
+            }
+        };
+
+        //! The GPU random number uniform real distribution; operator() returns a value in [0, 1).
+        template<typename T>
+        class UniformReal
+        {
+            static_assert(std::is_floating_point_v<T>, "Return type of UniformReal must be floating point.");
+
+            using BitsT = typename detail::BitsType<T>::type;
+
+        public:
+            UniformReal() = default;
+
+            template<typename TEngine>
+            ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> T
+            {
+                constexpr BitsT range = BitsT{1} << std::numeric_limits<T>::digits; // 2^(mantissa digits of T)
+                UniformUint<BitsT> drawBits;
+                BitsT const mantissaBits = drawBits(engine) & (range - 1); // keep only as many bits as T can hold
+                return static_cast<T>(mantissaBits) / range;
+            }
+        };
+
+        /*! The GPU random number normal distribution.
+         *
+         * \note
+         * This type contains state and is not thread-safe: To be used
+         * per thread, not shared.
+         *
+         * \note When reproducibility is a concern, each instance of
+         * this class should be used with only one random engine
+         * instance, or two consecutive numbers should be generated with
+         * each engine used. This is due to the implicit caching of one
+         * Gaussian random number.
+         */
+        template<typename Acc, typename T>
+        class NormalReal
+        {
+            static_assert(std::is_floating_point_v<T>, "Return type of NormalReal must be floating point.");
+
+            Acc const* m_acc;
+            T m_cache = std::numeric_limits<T>::quiet_NaN(); // NaN marks "no cached value"
+
+        public:
+            /*! \warning Retains a pointer to \p acc, thus must not outlive it.
+             */
+            ALPAKA_FN_HOST_ACC constexpr NormalReal(Acc const& acc) : m_acc(&acc)
+            {
+            }
+
+            // All copy operations (and thus also move since we don't declare those and they fall back to copy) do NOT
+            // copy m_cache. This way we can ensure that the following holds:
+            // NormalReal<Acc> a(acc), b(acc);
+            // Engine<Acc> e(acc);
+            // assert(a(e) != b(e)); // because of two engine invocations
+            // b = a;
+            // assert(a(e) != b(e)); // because of two engine invocations
+
+            ALPAKA_FN_HOST_ACC constexpr NormalReal(NormalReal const& other) : m_acc(other.m_acc)
+            {
+            }
+
+            ALPAKA_FN_HOST_ACC constexpr auto operator=(NormalReal const& other) -> NormalReal&
+            {
+                m_acc = other.m_acc;
+                return *this;
+            }
+
+            template<typename TEngine>
+            ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> T
+            {
+                constexpr auto sigma = T{1};
+                constexpr auto mu = T{0};
+                if(math::isnan(*m_acc, m_cache))
+                {
+                    UniformReal<T> uni;
+
+                    T u1, u2;
+                    do
+                    {
+                        u1 = uni(engine);
+                        u2 = uni(engine);
+                    } while(u1 <= std::numeric_limits<T>::epsilon()); // redraw while u1 is too small for the log below
+
+                    // compute z0 and z1
+                    T const mag = sigma * math::sqrt(*m_acc, static_cast<T>(-2.) * math::log(*m_acc, u1));
+                    constexpr T twoPi = static_cast<T>(2. * math::constants::pi);
+                    // getting two normal numbers out of this, store one for later
+                    m_cache = mag * static_cast<T>(math::cos(*m_acc, twoPi * u2)) + mu;
+
+                    return mag * static_cast<T>(math::sin(*m_acc, twoPi * u2)) + mu;
+                }
+
+                T const ret = m_cache;
+                m_cache = std::numeric_limits<T>::quiet_NaN();
+                return ret;
+            }
+        };
+    } // namespace distribution::gpu
+
+    namespace distribution::trait
+    {
+        //! The GPU device random number float normal distribution get trait specialization.
+        template<typename T>
+        struct CreateNormalReal<RandDefault, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            template<typename TAcc>
+            ALPAKA_FN_HOST_ACC static auto createNormalReal(TAcc const& acc) -> gpu::NormalReal<TAcc, T>
+            {
+                return {acc};
+            }
+        };
+
+        //! The GPU device random number float uniform distribution get trait specialization.
+        template<typename T>
+        struct CreateUniformReal<RandDefault, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            ALPAKA_FN_HOST_ACC static auto createUniformReal(RandDefault const& /* rand */) -> gpu::UniformReal<T>
+            {
+                return {};
+            }
+        };
+
+        //! The GPU device random number integer uniform distribution get trait specialization.
+        template<typename T>
+        struct CreateUniformUint<RandDefault, T, std::enable_if_t<std::is_integral_v<T>>>
+        {
+            ALPAKA_FN_HOST_ACC static auto createUniformUint(RandDefault const& /* rand */) -> gpu::UniformUint<T>
+            {
+                return {};
+            }
+        };
+    } // namespace distribution::trait
+
+    namespace engine::trait
+    {
+        //! The GPU device random number default generator get trait specialization.
+        template<>
+        struct CreateDefault<RandDefault>
+        {
+            template<typename TAcc>
+            ALPAKA_FN_HOST_ACC static auto createDefault(
+                TAcc const& /* acc */,
+                std::uint32_t const& seed,
+                std::uint32_t const& subsequence,
+                std::uint32_t const& offset) -> Philox4x32x10
+            {
+                return {seed, subsequence, offset};
+            }
+        };
+    } // namespace engine::trait
+} // namespace alpaka::rand
diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp
new file mode 100644
index 0000000..c114a4f
--- /dev/null
+++ b/include/alpaka/rand/RandGenericSycl.hpp
@@ -0,0 +1,198 @@
+/* Copyright 2023 Luca Ferragina, Aurora Perego, Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/rand/Traits.hpp"
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && !defined(ALPAKA_DISABLE_VENDOR_RNG)
+
+// Backend specific imports.
+#    include <sycl/sycl.hpp>
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wcast-align"
+#        pragma clang diagnostic ignored "-Wcast-qual"
+#        pragma clang diagnostic ignored "-Wextra-semi"
+#        pragma clang diagnostic ignored "-Wfloat-equal"
+#        pragma clang diagnostic ignored "-Wold-style-cast"
+#        pragma clang diagnostic ignored "-Wreserved-identifier"
+#        pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#        pragma clang diagnostic ignored "-Wsign-compare"
+#        pragma clang diagnostic ignored "-Wundef"
+#    endif
+#    include <oneapi/dpl/random>
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+
+#    include <type_traits>
+
+namespace alpaka::rand
+{
+    //! The SYCL rand implementation.
+    template<typename TDim>
+    struct RandGenericSycl : concepts::Implements<ConceptRand, RandGenericSycl<TDim>>
+    {
+        explicit RandGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_rand{my_item}
+        {
+        }
+
+        sycl::nd_item<TDim::value> m_item_rand;
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+    namespace distribution::sycl_rand
+    {
+        //! The SYCL random number floating point normal distribution.
+        template<typename T>
+        struct NormalReal;
+
+        //! The SYCL random number uniform distribution.
+        template<typename T>
+        struct Uniform;
+    } // namespace distribution::sycl_rand
+
+    namespace engine::sycl_rand
+    {
+        //! The SYCL linear congruential random number generator engine.
+        template<typename TDim>
+        class Minstd
+        {
+        public:
+            // A default-constructed instance is not validly initialized and needs to be
+            // overwritten with a properly constructed object before use.
+            Minstd() = default;
+
+            Minstd(RandGenericSycl<TDim> rand, std::uint32_t const& seed)
+            {
+                oneapi::dpl::minstd_rand engine(seed, rand.m_item_rand.get_global_linear_id()); // 2nd arg: item id
+                rng_engine = engine;
+            }
+
+        private:
+            template<typename T>
+            friend struct distribution::sycl_rand::NormalReal;
+            template<typename T>
+            friend struct distribution::sycl_rand::Uniform;
+
+            oneapi::dpl::minstd_rand rng_engine;
+
+        public:
+            using result_type = float;
+
+            ALPAKA_FN_HOST_ACC static result_type min()
+            {
+                return std::numeric_limits<result_type>::min(); // smallest positive normalized float
+            }
+
+            ALPAKA_FN_HOST_ACC static result_type max()
+            {
+                return std::numeric_limits<result_type>::max(); // largest finite float
+            }
+
+            result_type operator()()
+            {
+                oneapi::dpl::uniform_real_distribution<float> distr; // NOTE(review): range vs. min()/max() - verify
+                return distr(rng_engine);
+            }
+        };
+    } // namespace engine::sycl_rand
+
+    namespace distribution::sycl_rand
+    {
+
+        //! The SYCL random number floating point normal distribution.
+        template<typename F>
+        struct NormalReal
+        {
+            static_assert(std::is_floating_point_v<F>);
+
+            template<typename TEngine>
+            auto operator()(TEngine& engine) -> F
+            {
+                oneapi::dpl::normal_distribution<F> distr;
+                return distr(engine.rng_engine);
+            }
+        };
+
+        //! The SYCL random number uniform distribution for floating point and unsigned integer types.
+        template<typename T>
+        struct Uniform
+        {
+            static_assert(std::is_floating_point_v<T> || std::is_unsigned_v<T>);
+
+            template<typename TEngine>
+            auto operator()(TEngine& engine) -> T
+            {
+                if constexpr(std::is_floating_point_v<T>)
+                {
+                    oneapi::dpl::uniform_real_distribution<T> distr;
+                    return distr(engine.rng_engine);
+                }
+                else
+                {
+                    oneapi::dpl::uniform_int_distribution<T> distr;
+                    return distr(engine.rng_engine);
+                }
+            }
+        };
+    } // namespace distribution::sycl_rand
+
+    namespace distribution::trait
+    {
+        //! The SYCL random number float normal distribution get trait specialization.
+        template<typename TDim, typename T>
+        struct CreateNormalReal<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static auto createNormalReal(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::NormalReal<T>
+            {
+                return {};
+            }
+        };
+
+        //! The SYCL random number float uniform distribution get trait specialization.
+        template<typename TDim, typename T>
+        struct CreateUniformReal<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static auto createUniformReal(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::Uniform<T>
+            {
+                return {};
+            }
+        };
+
+        //! SYCL integer uniform distribution trait. NOTE: matches any integral T; Uniform<T> asserts unsigned T.
+        template<typename TDim, typename T>
+        struct CreateUniformUint<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_integral_v<T>>>
+        {
+            static auto createUniformUint(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::Uniform<T>
+            {
+                return {};
+            }
+        };
+    } // namespace distribution::trait
+
+    namespace engine::trait
+    {
+        //! The SYCL default engine trait specialization; only seed is used, subsequence and offset are ignored.
+        template<typename TDim>
+        struct CreateDefault<RandGenericSycl<TDim>>
+        {
+            static auto createDefault(
+                RandGenericSycl<TDim> const& rand,
+                std::uint32_t const& seed = 0,
+                std::uint32_t const& /* subsequence */ = 0,
+                std::uint32_t const& /* offset */ = 0) -> sycl_rand::Minstd<TDim>
+            {
+                return {rand, seed};
+            }
+        };
+    } // namespace engine::trait
+#    endif
+} // namespace alpaka::rand
+
+#endif
diff --git a/include/alpaka/rand/RandPhilox.hpp b/include/alpaka/rand/RandPhilox.hpp
new file mode 100644
index 0000000..d11cacb
--- /dev/null
+++ b/include/alpaka/rand/RandPhilox.hpp
@@ -0,0 +1,201 @@
+/* Copyright 2022 Jiří Vyskočil, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/meta/IsArrayOrVector.hpp"
+#include "alpaka/rand/Philox/PhiloxSingle.hpp"
+#include "alpaka/rand/Philox/PhiloxVector.hpp"
+#include "alpaka/rand/Traits.hpp"
+
+#include <cstdint>
+#include <limits>
+#include <random>
+#include <type_traits>
+
+namespace alpaka::rand
+{
+    /** Most common Philox engine variant, outputs a single number
+     *
+     * This is a variant of the Philox engine generator which outputs a single 32-bit unsigned integer. The counter
+     * size is \f$4 \times 32 = 128\f$ bits. Since the engine returns a single number, the generated result, which has
+     * the same size as the counter, has to be stored between invocations. Additionally a 32 bit pointer is stored.
+     * The total size of the state is 352 bits = 44 bytes.
+     *
+     * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
+     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
+     * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
+     */
+    class Philox4x32x10 : public concepts::Implements<ConceptRand, Philox4x32x10>
+    {
+    public:
+        /// Philox algorithm: 10 rounds, 4 numbers of size 32.
+        using EngineParams = engine::PhiloxParams<4, 32, 10>;
+        /// Engine outputs a single number
+        using EngineVariant = engine::PhiloxSingle<EngineParams>;
+
+        /** Initialize a new Philox engine
+         *
+         * @param seed Set the Philox generator key
+         * @param subsequence Select a subsequence of size 2^64
+         * @param offset Skip \a offset numbers from the start of the subsequence
+         */
+        ALPAKA_FN_HOST_ACC Philox4x32x10(
+            std::uint64_t const seed = 0,
+            std::uint64_t const subsequence = 0,
+            std::uint64_t const offset = 0)
+            : engineVariant(seed, subsequence, offset)
+        {
+        }
+
+        // STL UniformRandomBitGenerator requirements: result_type plus static constexpr min()/max(), see
+        // https://en.cppreference.com/w/cpp/named_req/UniformRandomBitGenerator
+        using result_type = std::uint32_t;
+
+        ALPAKA_FN_HOST_ACC static constexpr auto min() -> result_type
+        {
+            return 0;
+        }
+
+        ALPAKA_FN_HOST_ACC static constexpr auto max() -> result_type
+        {
+            return std::numeric_limits<result_type>::max();
+        }
+
+        ALPAKA_FN_HOST_ACC auto operator()() -> result_type
+        {
+            return engineVariant();
+        }
+
+    private:
+        EngineVariant engineVariant;
+    };
+
+    /** Most common Philox engine variant, outputs a 4-vector of floats
+     *
+     * This is a variant of the Philox engine generator which outputs a vector containing 4 floats. The counter
+     * size is \f$4 \times 32 = 128\f$ bits. Since the engine returns the whole generated vector, it is up to the
+     * user to extract individual floats as they need. The benefit is smaller state size since the state does not
+     * contain the intermediate results. The total size of the state is 192 bits = 24 bytes.
+     *
+     * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
+     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
+     * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
+     */
+    class Philox4x32x10Vector : public concepts::Implements<ConceptRand, Philox4x32x10Vector>
+    {
+    public:
+        using EngineParams = engine::PhiloxParams<4, 32, 10>;
+        using EngineVariant = engine::PhiloxVector<EngineParams>;
+
+        /** Initialize a new Philox engine
+         *
+         * @param seed Set the Philox generator key
+         * @param subsequence Select a subsequence of size 2^64
+         * @param offset Number of numbers to skip from the start of the subsequence.
+         */
+        ALPAKA_FN_HOST_ACC Philox4x32x10Vector(
+            std::uint32_t const seed = 0,
+            std::uint32_t const subsequence = 0,
+            std::uint32_t const offset = 0)
+            : engineVariant(seed, subsequence, offset)
+        {
+        }
+
+        template<typename TScalar>
+        using ResultContainer = typename EngineVariant::template ResultContainer<TScalar>;
+
+        using ResultInt = std::uint32_t; // type of the static min()/max() bounds
+        using ResultVec = decltype(std::declval<EngineVariant>()());
+
+        ALPAKA_FN_HOST_ACC static constexpr auto min() -> ResultInt
+        {
+            return 0;
+        }
+
+        ALPAKA_FN_HOST_ACC static constexpr auto max() -> ResultInt
+        {
+            return std::numeric_limits<ResultInt>::max();
+        }
+
+        ALPAKA_FN_HOST_ACC auto operator()() -> ResultVec
+        {
+            return engineVariant();
+        }
+
+    private:
+        EngineVariant engineVariant;
+    };
+
+    // The following exists because you "cannot call __device__ function from a __host__ __device__ function"
+    // directly, but wrapping that call in a struct is just fine.
+    template<typename TEngine>
+    struct EngineCallHostAccProxy
+    {
+        ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> decltype(engine())
+        {
+            return engine();
+        }
+    };
+
+    /// TEMP: Distributions to be decided on later. The generator should be compatible with STL as of now.
+    template<typename TResult, typename TSfinae = void>
+    class UniformReal : public concepts::Implements<ConceptRand, UniformReal<TResult>>
+    {
+        template<typename TRes, typename TEnable = void>
+        struct ResultType
+        {
+            using type = TRes;
+        };
+
+        template<typename TRes>
+        struct ResultType<TRes, std::enable_if_t<meta::IsArrayOrVector<TRes>::value>>
+        {
+            using type = typename TRes::value_type;
+        };
+
+        using T = typename ResultType<TResult>::type; // scalar type: TResult itself or its value_type
+        static_assert(std::is_floating_point_v<T>, "Only floating-point types are supported");
+
+    public:
+        ALPAKA_FN_HOST_ACC UniformReal() : UniformReal(0, 1) // default bounds: min = 0, max = 1
+        {
+        }
+
+        ALPAKA_FN_HOST_ACC UniformReal(T min, T max) : _min(min), _max(max), _range(_max - _min)
+        {
+        }
+
+        template<typename TEngine>
+        ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> TResult
+        {
+            if constexpr(meta::IsArrayOrVector<TResult>::value)
+            {
+                auto result = engine(); // one engine call supplies the four elements indexed below
+                T scale = static_cast<T>(1) / static_cast<T>(engine.max()) * _range; // engine.max() maps to _range
+                TResult ret{
+                    static_cast<T>(result[0]) * scale + _min,
+                    static_cast<T>(result[1]) * scale + _min,
+                    static_cast<T>(result[2]) * scale + _min,
+                    static_cast<T>(result[3]) * scale + _min};
+                return ret;
+            }
+            else
+            {
+                // Since it's possible to get a host-only engine here, the call has to go through proxy
+                return static_cast<T>(EngineCallHostAccProxy<TEngine>{}(engine)) / static_cast<T>(engine.max())
+                           * _range
+                       + _min;
+            }
+
+            ALPAKA_UNREACHABLE(TResult{}); // both if-constexpr branches above return
+        }
+
+    private:
+        T const _min;
+        T const _max;
+        T const _range;
+    };
+} // namespace alpaka::rand
diff --git a/include/alpaka/rand/RandPhiloxStateless.hpp b/include/alpaka/rand/RandPhiloxStateless.hpp
new file mode 100644
index 0000000..b2530d1
--- /dev/null
+++ b/include/alpaka/rand/RandPhiloxStateless.hpp
@@ -0,0 +1,30 @@
+/* Copyright 2022 Jeffrey Kelling
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/rand/Philox/PhiloxStateless.hpp"
+#include "alpaka/rand/Traits.hpp"
+
+namespace alpaka::rand
+{
+    /** Most common Philox engine variant, stateless, outputs a 4-vector of floats
+     *
+     * This is a variant of the Philox engine generator which outputs a vector containing 4 floats. The counter
+     * size is \f$4 \times 32 = 128\f$ bits. Since the engine returns the whole generated vector, it is up to the
+     * user to extract individual floats as they need. The benefit is smaller state size since the state does not
+     * contain the intermediate results. The total size of the state is 192 bits = 24 bytes.
+     *
+     * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
+     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
+     * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
+     */
+    class PhiloxStateless4x32x10Vector
+        : public alpaka::rand::engine::PhiloxStateless<engine::PhiloxParams<4, 32, 10>>
+        , public concepts::Implements<ConceptRand, PhiloxStateless4x32x10Vector>
+    {
+    public:
+        using EngineParams = engine::PhiloxParams<4, 32, 10>;
+    };
+} // namespace alpaka::rand
diff --git a/include/alpaka/rand/RandStdLib.hpp b/include/alpaka/rand/RandStdLib.hpp
new file mode 100644
index 0000000..ec507e0
--- /dev/null
+++ b/include/alpaka/rand/RandStdLib.hpp
@@ -0,0 +1,279 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/rand/TinyMT/Engine.hpp"
+#include "alpaka/rand/Traits.hpp"
+
+#include <cstdint>
+#include <limits>
+#include <random>
+#include <type_traits>
+
+namespace alpaka::rand
+{
+    //! "Tiny" state mersenne twister implementation
+    class TinyMersenneTwister : public concepts::Implements<ConceptRand, TinyMersenneTwister>
+    {
+    };
+
+    using RandStdLib = TinyMersenneTwister;
+
+    //! The standard library mersenne twister implementation.
+    class MersenneTwister : public concepts::Implements<ConceptRand, MersenneTwister>
+    {
+    };
+
+    //! The standard library rand device implementation.
+    class RandomDevice : public concepts::Implements<ConceptRand, RandomDevice>
+    {
+    };
+
+    namespace engine::cpu
+    {
+        //! The standard library mersenne twister random number generator.
+        //!
+        //! size of state: 19937 bits (624 32-bit words, about 2.5 KB)
+        class MersenneTwister
+        {
+            std::mt19937 state;
+
+        public:
+            MersenneTwister() = default;
+
+            ALPAKA_FN_HOST MersenneTwister(
+                std::uint32_t const& seed,
+                std::uint32_t const& subsequence = 0,
+                std::uint32_t const& offset = 0)
+                : // NOTE: engine seed = (seed XOR subsequence) + offset; distinct inputs may collide.
+                state((seed ^ subsequence) + offset)
+            {
+            }
+
+            // STL UniformRandomBitGenerator concept interface
+            using result_type = std::mt19937::result_type;
+
+            ALPAKA_FN_HOST static constexpr auto min() -> result_type
+            {
+                return std::mt19937::min();
+            }
+
+            ALPAKA_FN_HOST static constexpr auto max() -> result_type
+            {
+                return std::mt19937::max();
+            }
+
+            ALPAKA_FN_HOST auto operator()() -> result_type
+            {
+                return state();
+            }
+        };
+
+        //! "Tiny" state mersenne twister implementation
+        //!
+        //! repository: github.com/MersenneTwister-Lab/TinyMT
+        //!
+        //! license: 3-clause BSD
+        //!
+        //! @author Mutsuo Saito (Hiroshima University)Tokio University.
+        //! @author Makoto Matsumoto (The University of Tokyo)
+        //!
+        //! size of state: 28 bytes (127 bits?!)
+        class TinyMersenneTwister
+        {
+            TinyMTengine state;
+
+        public:
+            TinyMersenneTwister() = default;
+
+            ALPAKA_FN_HOST TinyMersenneTwister(
+                std::uint32_t const& seed,
+                std::uint32_t const& subsequence = 0,
+                std::uint32_t const& offset = 0)
+                : // NOTE: engine seed = (seed XOR subsequence) + offset; distinct inputs may collide.
+                state((seed ^ subsequence) + offset)
+            {
+            }
+
+            // STL UniformRandomBitGenerator concept interface
+            using result_type = TinyMTengine::result_type;
+
+            ALPAKA_FN_HOST static constexpr auto min() -> result_type
+            {
+                return TinyMTengine::min();
+            }
+
+            ALPAKA_FN_HOST static constexpr auto max() -> result_type
+            {
+                return TinyMTengine::max();
+            }
+
+            ALPAKA_FN_HOST auto operator()() -> result_type
+            {
+                return state();
+            }
+        };
+
+        //! The standard library's random device based on the local entropy pool; constructor arguments are ignored.
+        //!
+        //! Warning: the entropy pool on many devices degrades quickly and performance
+        //!          will drop significantly when this point occurs.
+        //!
+        //! size of state: 1 byte
+        class RandomDevice
+        {
+            std::random_device state;
+
+        public:
+            RandomDevice() = default;
+
+            ALPAKA_FN_HOST RandomDevice(std::uint32_t const&, std::uint32_t const& = 0, std::uint32_t const& = 0)
+            {
+            }
+
+            // STL UniformRandomBitGenerator concept interface
+            using result_type = std::random_device::result_type;
+
+            ALPAKA_FN_HOST static constexpr auto min() -> result_type
+            {
+                return std::random_device::min();
+            }
+
+            ALPAKA_FN_HOST static constexpr auto max() -> result_type
+            {
+                return std::random_device::max();
+            }
+
+            ALPAKA_FN_HOST auto operator()() -> result_type
+            {
+                return state();
+            }
+        };
+    } // namespace engine::cpu
+
+    namespace distribution::cpu
+    {
+        //! The CPU random number normal distribution.
+        template<typename T>
+        struct NormalReal
+        {
+            template<typename TEngine>
+            ALPAKA_FN_HOST auto operator()(TEngine& engine) -> T
+            {
+                return m_dist(engine);
+            }
+
+        private:
+            std::normal_distribution<T> m_dist; // default parameters: mean 0, standard deviation 1
+        };
+
+        //! The CPU random number uniform distribution.
+        template<typename T>
+        struct UniformReal
+        {
+            template<typename TEngine>
+            ALPAKA_FN_HOST auto operator()(TEngine& engine) -> T
+            {
+                return m_dist(engine);
+            }
+
+        private:
+            std::uniform_real_distribution<T> m_dist; // default parameters: range [0, 1)
+        };
+
+        //! The CPU random number uniform integer distribution over [0, std::numeric_limits<T>::max()].
+        template<typename T>
+        struct UniformUint
+        {
+            template<typename TEngine>
+            ALPAKA_FN_HOST auto operator()(TEngine& engine) -> T
+            {
+                return m_dist(engine);
+            }
+
+        private:
+            std::uniform_int_distribution<T> m_dist{
+                0, // For signed integer: std::numeric_limits<T>::lowest()
+                std::numeric_limits<T>::max()};
+        };
+    } // namespace distribution::cpu
+
+    namespace distribution::trait
+    {
+        //! The CPU device random number float normal distribution get trait specialization.
+        template<typename T>
+        struct CreateNormalReal<RandStdLib, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            ALPAKA_FN_HOST static auto createNormalReal(RandStdLib const& /* rand */) -> cpu::NormalReal<T>
+            {
+                return {};
+            }
+        };
+
+        //! The CPU device random number float uniform distribution get trait specialization.
+        template<typename T>
+        struct CreateUniformReal<RandStdLib, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            ALPAKA_FN_HOST static auto createUniformReal(RandStdLib const& /* rand */) -> cpu::UniformReal<T>
+            {
+                return {};
+            }
+        };
+
+        //! The CPU device random number integer uniform distribution get trait specialization.
+        template<typename T>
+        struct CreateUniformUint<RandStdLib, T, std::enable_if_t<std::is_integral_v<T>>>
+        {
+            ALPAKA_FN_HOST static auto createUniformUint(RandStdLib const& /* rand */) -> cpu::UniformUint<T>
+            {
+                return {};
+            }
+        };
+    } // namespace distribution::trait
+
+    namespace engine::trait
+    {
+        //! The CPU device random number default generator get trait specialization.
+        template<>
+        struct CreateDefault<TinyMersenneTwister>
+        {
+            ALPAKA_FN_HOST static auto createDefault(
+                TinyMersenneTwister const& /* rand */,
+                std::uint32_t const& seed = 0,
+                std::uint32_t const& subsequence = 0,
+                std::uint32_t const& offset = 0) -> cpu::TinyMersenneTwister
+            {
+                return {seed, subsequence, offset};
+            }
+        };
+
+        template<>
+        struct CreateDefault<MersenneTwister>
+        {
+            ALPAKA_FN_HOST static auto createDefault(
+                MersenneTwister const& /* rand */,
+                std::uint32_t const& seed = 0,
+                std::uint32_t const& subsequence = 0,
+                std::uint32_t const& offset = 0) -> cpu::MersenneTwister
+            {
+                return {seed, subsequence, offset};
+            }
+        };
+
+        template<>
+        struct CreateDefault<RandomDevice>
+        {
+            ALPAKA_FN_HOST static auto createDefault(
+                RandomDevice const& /* rand */,
+                std::uint32_t const& seed = 0,
+                std::uint32_t const& subsequence = 0,
+                std::uint32_t const& offset = 0) -> cpu::RandomDevice
+            {
+                return {seed, subsequence, offset};
+            }
+        };
+    } // namespace engine::trait
+} // namespace alpaka::rand
diff --git a/include/alpaka/rand/RandUniformCudaHipRand.hpp b/include/alpaka/rand/RandUniformCudaHipRand.hpp
new file mode 100644
index 0000000..63ffea9
--- /dev/null
+++ b/include/alpaka/rand/RandUniformCudaHipRand.hpp
@@ -0,0 +1,283 @@
+/* Copyright 2022 Benjamin Worpitz, René Widera, Andrea Bocci, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/dev/DevUniformCudaHipRt.hpp"
+#include "alpaka/rand/Traits.hpp"
+
+#include <type_traits>
+
+#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)) && !defined(ALPAKA_DISABLE_VENDOR_RNG)
+
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#        include <curand_kernel.h>
+#    elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic push
+#            pragma clang diagnostic ignored "-Wduplicate-decl-specifier"
+#        endif
+
+#        if HIP_VERSION >= 50'200'000
+#            include <hiprand/hiprand_kernel.h>
+#        else
+#            include <hiprand_kernel.h>
+#        endif
+
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic pop
+#        endif
+#    endif
+
+namespace alpaka::rand
+{
+    //! The CUDA/HIP rand implementation: an empty tag type per API \c TApi that registers itself for ConceptRand.
+    template<typename TApi>
+    class RandUniformCudaHipRand : public concepts::Implements<ConceptRand, RandUniformCudaHipRand<TApi>>
+    {
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+    namespace distribution::uniform_cuda_hip
+    {
+        //! The CUDA/HIP random number floating point normal distribution.
+        template<typename T>
+        class NormalReal;
+
+        //! The CUDA/HIP random number floating point uniform distribution.
+        template<typename T>
+        class UniformReal;
+
+        //! The CUDA/HIP random number integer uniform distribution.
+        template<typename T>
+        class UniformUint;
+    } // namespace distribution::uniform_cuda_hip
+
+    namespace engine::uniform_cuda_hip
+    {
+        //! The CUDA/HIP Xor random number generator engine (wraps the vendor XORWOW state).
+        class Xor
+        {
+        public:
+            // After calling this constructor the instance is not validly initialized (no seed has been
+            // applied) and needs to be overwritten with a valid object.
+            Xor() = default;
+
+            __device__ Xor( // seeds the state through curand_init / hiprand_init
+                std::uint32_t const& seed,
+                std::uint32_t const& subsequence = 0,
+                std::uint32_t const& offset = 0)
+            {
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                curand_init(seed, subsequence, offset, &state);
+#        else
+                hiprand_init(seed, subsequence, offset, &state);
+#        endif
+            }
+
+        private: // the befriended distributions below read and advance `state` directly
+            template<typename T>
+            friend class distribution::uniform_cuda_hip::NormalReal;
+            template<typename T>
+            friend class distribution::uniform_cuda_hip::UniformReal;
+            template<typename T>
+            friend class distribution::uniform_cuda_hip::UniformUint;
+
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+            curandStateXORWOW_t state = curandStateXORWOW_t{};
+#        else
+            hiprandStateXORWOW_t state = hiprandStateXORWOW_t{};
+#        endif
+
+        public:
+            // STL UniformRandomBitGenerator concept. This is not strictly necessary as the distributions
+            // contained in this file are aware of the API specifics of the CUDA/HIP XORWOW engine and STL
+            // distributions might not work on the device, but it serves as a compatibility bridge to other
+            // potentially compatible alpaka distributions.
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+            using result_type = decltype(curand(&state));
+#        else
+            using result_type = decltype(hiprand(&state));
+#        endif
+            ALPAKA_FN_HOST_ACC static constexpr result_type min()
+            {
+                return std::numeric_limits<result_type>::min();
+            }
+
+            ALPAKA_FN_HOST_ACC static constexpr result_type max()
+            {
+                return std::numeric_limits<result_type>::max();
+            }
+
+            __device__ result_type operator()() // draws the next raw value from the vendor generator
+            {
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                return curand(&state);
+#        else
+                return hiprand(&state);
+#        endif
+            }
+        };
+    } // namespace engine::uniform_cuda_hip
+
+    namespace distribution::uniform_cuda_hip
+    {
+        //! The CUDA/HIP random number float normal distribution (draws via curand_normal / hiprand_normal).
+        template<>
+        class NormalReal<float>
+        {
+        public:
+            template<typename TEngine>
+            __device__ auto operator()(TEngine& engine) -> float // reads and advances engine.state
+            {
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                return curand_normal(&engine.state);
+#        else
+                return hiprand_normal(&engine.state);
+#        endif
+            }
+        };
+
+        //! The CUDA/HIP random number double normal distribution (curand_normal_double / hiprand_normal_double).
+        template<>
+        class NormalReal<double>
+        {
+        public:
+            template<typename TEngine>
+            __device__ auto operator()(TEngine& engine) -> double // reads and advances engine.state
+            {
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                return curand_normal_double(&engine.state);
+#        else
+                return hiprand_normal_double(&engine.state);
+#        endif
+            }
+        };
+
+        //! The CUDA/HIP random number float uniform distribution over [0.f, 1.0f).
+        template<>
+        class UniformReal<float>
+        {
+        public:
+            template<typename TEngine>
+            __device__ auto operator()(TEngine& engine) -> float
+            {
+                // The vendor call yields a value in (0.f, 1.0f]
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                float const sample = curand_uniform(&engine.state);
+#        else
+                float const sample = hiprand_uniform(&engine.state);
+#        endif
+                // NOTE: (1.0f - curand_uniform) does not work, because curand_uniform seems to return
+                // denormalized floats around 0.f. Only the value 1.0f is remapped to 0.f: [0.f, 1.0f)
+                return (sample != 1.0f) ? sample : 0.0f;
+            }
+        };
+
+        //! The CUDA/HIP random number double uniform distribution over [0.0, 1.0).
+        template<>
+        class UniformReal<double>
+        {
+        public:
+            template<typename TEngine>
+            __device__ auto operator()(TEngine& engine) -> double
+            {
+                // The vendor call yields a value in (0.0, 1.0]
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                double const sample = curand_uniform_double(&engine.state);
+#        else
+                double const sample = hiprand_uniform_double(&engine.state);
+#        endif
+                // NOTE: (1.0 - curand_uniform_double) does not work, because curand_uniform_double seems to
+                // return denormalized floats around 0.0. Only the value 1.0 is remapped to 0.0: [0.0, 1.0)
+                return (sample != 1.0) ? sample : 0.0;
+            }
+        };
+
+        //! The CUDA/HIP random number unsigned integer uniform distribution (raw curand / hiprand output).
+        template<>
+        class UniformUint<unsigned int>
+        {
+        public:
+            template<typename TEngine>
+            __device__ auto operator()(TEngine& engine) -> unsigned int // reads and advances engine.state
+            {
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                return curand(&engine.state);
+#        else
+                return hiprand(&engine.state);
+#        endif
+            }
+        };
+    } // namespace distribution::uniform_cuda_hip
+
+    namespace distribution::trait
+    {
+        //! Trait specialization creating the CUDA/HIP normal distribution for a floating point type \c T.
+        template<typename TApi, typename T>
+        struct CreateNormalReal<RandUniformCudaHipRand<TApi>, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static __device__ auto createNormalReal(RandUniformCudaHipRand<TApi> const& /*rand*/)
+                -> uniform_cuda_hip::NormalReal<T>
+            {
+                return uniform_cuda_hip::NormalReal<T>{};
+            }
+        };
+
+        //! Trait specialization creating the CUDA/HIP uniform real distribution for a floating point type \c T.
+        template<typename TApi, typename T>
+        struct CreateUniformReal<RandUniformCudaHipRand<TApi>, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static __device__ auto createUniformReal(RandUniformCudaHipRand<TApi> const& /*rand*/)
+                -> uniform_cuda_hip::UniformReal<T>
+            {
+                return uniform_cuda_hip::UniformReal<T>{};
+            }
+        };
+
+        //! Trait specialization creating the CUDA/HIP uniform integer distribution for an integral type \c T.
+        template<typename TApi, typename T>
+        struct CreateUniformUint<RandUniformCudaHipRand<TApi>, T, std::enable_if_t<std::is_integral_v<T>>>
+        {
+            static __device__ auto createUniformUint(RandUniformCudaHipRand<TApi> const& /*rand*/)
+                -> uniform_cuda_hip::UniformUint<T>
+            {
+                return uniform_cuda_hip::UniformUint<T>{};
+            }
+        };
+    } // namespace distribution::trait
+
+    namespace engine::trait
+    {
+        //! Trait specialization creating the default CUDA/HIP engine (Xor) from seed, subsequence and offset.
+        template<typename TApi>
+        struct CreateDefault<RandUniformCudaHipRand<TApi>>
+        {
+            static __device__ auto createDefault(
+                RandUniformCudaHipRand<TApi> const& /*rand*/,
+                std::uint32_t const& seed = 0,
+                std::uint32_t const& subsequence = 0,
+                std::uint32_t const& offset = 0) -> uniform_cuda_hip::Xor
+            {
+                return uniform_cuda_hip::Xor{seed, subsequence, offset};
+            }
+        };
+    } // namespace engine::trait
+#    endif
+} // namespace alpaka::rand
+
+#endif
diff --git a/include/alpaka/rand/TinyMT/Engine.hpp b/include/alpaka/rand/TinyMT/Engine.hpp
new file mode 100644
index 0000000..9f5d05e
--- /dev/null
+++ b/include/alpaka/rand/TinyMT/Engine.hpp
@@ -0,0 +1,66 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/rand/TinyMT/tinymt32.h"
+
+#include <cstdint>
+
+namespace alpaka::rand::engine::cpu
+{
+    //! Implementation of std::UniformRandomBitGenerator for TinyMT32 (32-bit outputs, seedable, with discard).
+    struct TinyMTengine
+    {
+        using result_type = std::uint32_t;
+
+        static constexpr auto default_seed() -> result_type // seed used when the caller supplies none
+        {
+            return 42u;
+        }
+
+        void seed(result_type value = default_seed()) // (re-)initializes parameters and state from value
+        {
+            // parameters from TinyMT/jump/sample.c
+            prng.mat1 = 0x8f70'11ee;
+            prng.mat2 = 0xfc78'ff1f;
+            prng.tmat = 0x3793'fdff;
+
+            tinymt32_init(&prng, value);
+        }
+
+        TinyMTengine(std::uint32_t const& seedValue)
+        {
+            seed(seedValue);
+        }
+
+        TinyMTengine()
+        {
+            seed(default_seed());
+        }
+
+        auto operator()() -> result_type // advances the state by one step and returns the next value
+        {
+            return tinymt32_generate_uint32(&prng);
+        }
+
+        static constexpr auto min() -> result_type
+        {
+            return 0u;
+        }
+
+        static constexpr auto max() -> result_type
+        {
+            return UINT32_MAX;
+        }
+
+        //! Skips \p z outputs. tinymt32_jump is not bundled, so the state is stepped z times (linear cost).
+        void discard(unsigned long long z)
+        {
+            for(; z != 0; --z) tinymt32_next_state(&prng);
+        }
+
+        tinymt32_t prng; // TinyMT32 state words and parameters
+    };
+} // namespace alpaka::rand::engine::cpu
diff --git a/include/alpaka/rand/TinyMT/LICENSE.txt b/include/alpaka/rand/TinyMT/LICENSE.txt
new file mode 100644
index 0000000..88bd896
--- /dev/null
+++ b/include/alpaka/rand/TinyMT/LICENSE.txt
@@ -0,0 +1,38 @@
+/* Copyright 2019 Mutsuo Saito
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+Copyright (c) 2011, 2013 Mutsuo Saito, Makoto Matsumoto,
+Hiroshima University and The University of Tokyo.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+    * Neither the name of the Hiroshima University nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/include/alpaka/rand/TinyMT/tinymt32.h b/include/alpaka/rand/TinyMT/tinymt32.h
new file mode 100644
index 0000000..55a946f
--- /dev/null
+++ b/include/alpaka/rand/TinyMT/tinymt32.h
@@ -0,0 +1,429 @@
+/* Copyright 2011 - 2023 Mutsuo Saito, Makoto Matsumoto, Axel Hübl, Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format off
+#ifndef TINYMT32_H
+#define TINYMT32_H
+/**
+ * @file tinymt32.h
+ *
+ * @brief Tiny Mersenne Twister only 127 bit internal state
+ *
+ * @author Mutsuo Saito (Hiroshima University)
+ * @author Makoto Matsumoto (University of Tokyo)
+ *
+ * Copyright (C) 2011 Mutsuo Saito, Makoto Matsumoto,
+ * Hiroshima University and The University of Tokyo.
+ * All rights reserved.
+ *
+ * The 3-clause BSD License is applied to this software, see
+ * LICENSE.txt
+ */
+
+#include "alpaka/core/BoostPredef.hpp"
+
+#include <cstdint>
+/* work-around for glibc < 2.18 according to bug
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=15366
+ */
+#ifndef UINT32_MAX
+#   define UINT32_MAX ((uint32_t)-1u)
+#endif
+#ifndef UINT32_C
+#   define UINT32_C(value) uint_least32_t(value)
+#endif
+#include <cinttypes>
+
+#if BOOST_COMP_CLANG
+#   pragma clang diagnostic push
+#   pragma clang diagnostic ignored "-Wold-style-cast"
+#   pragma clang diagnostic ignored "-Wunused-function"
+#endif
+#if BOOST_COMP_GNUC
+#   pragma GCC diagnostic push
+#   pragma GCC diagnostic ignored "-Wold-style-cast"
+#endif
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+    #pragma warning(push)
+    #pragma warning(disable: 4100)  // tinymt32.h(60): warning C4100: 'random': unreferenced formal parameter
+#endif
+
+#define TINYMT32_MEXP 127
+#define TINYMT32_SH0 1
+#define TINYMT32_SH1 10
+#define TINYMT32_SH8 8
+#define TINYMT32_MASK UINT32_C(0x7fffffff)
+#define TINYMT32_MUL (1.0f / 16777216.0f)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ * tinymt32 internal state vector and parameters
+ */
+struct TINYMT32_T {
+    uint32_t status[4]; /* state words, rewritten by tinymt32_next_state */
+    uint32_t mat1;      /* parameter conditionally XORed into status[1] by tinymt32_next_state */
+    uint32_t mat2;      /* parameter conditionally XORed into status[2] by tinymt32_next_state */
+    uint32_t tmat;      /* parameter conditionally XORed into the output by the tempering functions */
+};
+
+typedef struct TINYMT32_T tinymt32_t;
+
+inline void tinymt32_init(tinymt32_t * random, uint32_t seed);
+inline void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[],
+                            int key_length);
+
+#if defined(__GNUC__)
+/**
+ * This function always returns TINYMT32_MEXP (127).
+ * @param random not used
+ * @return always 127
+ */
+inline static int tinymt32_get_mexp(
+    tinymt32_t * random  __attribute__((unused))) {
+    return TINYMT32_MEXP;
+}
+#else /* same function without the GCC-specific unused-parameter attribute */
+inline static int tinymt32_get_mexp(tinymt32_t * random) {
+    return TINYMT32_MEXP;
+}
+#endif
+
+/**
+ * This function advances the internal state of tinymt32 by one step.
+ * Users should not call this function directly.
+ * @param random tinymt internal status
+ */
+inline static void tinymt32_next_state(tinymt32_t * random) {
+    uint32_t x;
+    uint32_t y;
+
+    y = random->status[3];
+    x = (random->status[0] & TINYMT32_MASK) /* the top bit of status[0] is ignored */
+        ^ random->status[1]
+        ^ random->status[2];
+    x ^= (x << TINYMT32_SH0);
+    y ^= (y >> TINYMT32_SH0) ^ x;
+    random->status[0] = random->status[1]; /* shift the state words down by one */
+    random->status[1] = random->status[2];
+    random->status[2] = x ^ (y << TINYMT32_SH1);
+    random->status[3] = y;
+    int32_t const a = -((int32_t)(y & 1)) & (int32_t)random->mat1; /* mat1 if y is odd, else 0 */
+    int32_t const b = -((int32_t)(y & 1)) & (int32_t)random->mat2; /* mat2 if y is odd, else 0 */
+    random->status[1] ^= (uint32_t)a;
+    random->status[2] ^= (uint32_t)b;
+}
+
+/**
+ * This function computes the tempered 32-bit output from the current
+ * internal state without modifying it. Users should not call it directly.
+ * @param random tinymt internal status
+ * @return 32-bit unsigned pseudorandom number
+ */
+inline static uint32_t tinymt32_temper(tinymt32_t * random) {
+    uint32_t const last = random->status[3];
+#if defined(LINEARITY_CHECK)
+    uint32_t const mixed = random->status[0]
+        ^ (random->status[2] >> TINYMT32_SH8);
+#else
+    uint32_t const mixed = random->status[0]
+        + (random->status[2] >> TINYMT32_SH8);
+#endif
+    uint32_t const out = last ^ mixed;
+    /* tmat is folded in only when the mixed word is odd */
+    if ((mixed & 1) != 0) {
+        return out ^ random->tmat;
+    }
+    return out;
+}
+
+/**
+ * This function outputs a floating point number from the internal state: 23 tempered bits are ORed with 0x3f800000.
+ * Users should not call this function directly.
+ * @param random tinymt internal status
+ * @return floating point number r (1.0 <= r < 2.0)
+ */
+inline static float tinymt32_temper_conv(tinymt32_t * random) {
+    uint32_t t0, t1;
+    union { /* reinterprets the assembled 32-bit pattern as a float */
+        uint32_t u;
+        float f;
+    } conv;
+
+    t0 = random->status[3];
+#if defined(LINEARITY_CHECK)
+    t1 = random->status[0]
+        ^ (random->status[2] >> TINYMT32_SH8);
+#else
+    t1 = random->status[0]
+        + (random->status[2] >> TINYMT32_SH8);
+#endif
+    t0 ^= t1;
+    if ((t1 & 1) != 0) {
+        conv.u  = ((t0 ^ random->tmat) >> 9) | UINT32_C(0x3f800000);
+    } else {
+        conv.u  = (t0 >> 9) | UINT32_C(0x3f800000);
+    }
+    return conv.f;
+}
+
+/**
+ * This function outputs a floating point number from the internal state; ORing in 0x3f800001 sets the lowest bit.
+ * Users should not call this function directly.
+ * @param random tinymt internal status
+ * @return floating point number r (1.0 < r < 2.0)
+ */
+inline static float tinymt32_temper_conv_open(tinymt32_t * random) {
+    uint32_t t0, t1;
+    union { /* reinterprets the assembled 32-bit pattern as a float */
+        uint32_t u;
+        float f;
+    } conv;
+
+    t0 = random->status[3];
+#if defined(LINEARITY_CHECK)
+    t1 = random->status[0]
+        ^ (random->status[2] >> TINYMT32_SH8);
+#else
+    t1 = random->status[0]
+        + (random->status[2] >> TINYMT32_SH8);
+#endif
+    t0 ^= t1;
+    if ((t1 & 1) != 0) {
+        conv.u  = ((t0 ^ random->tmat) >> 9) | UINT32_C(0x3f800001);
+    } else {
+        conv.u  = (t0 >> 9) | UINT32_C(0x3f800001);
+    }
+    return conv.f;
+}
+
+/**
+ * This function advances the state by one step and outputs the tempered 32-bit unsigned integer.
+ * @param random tinymt internal status
+ * @return 32-bit unsigned integer r (0 <= r < 2^32)
+ */
+inline static uint32_t tinymt32_generate_uint32(tinymt32_t * random) {
+    tinymt32_next_state(random);
+    return tinymt32_temper(random);
+}
+
+/**
+ * This function outputs a floating point number from the internal state.
+ * It is implemented by multiplying the upper 24 bits of the tempered output by (1 / 2^24).
+ * The original authors note that floating point multiplication was faster than the
+ * union trick on their Intel CPU.
+ * @param random tinymt internal status
+ * @return floating point number r (0.0 <= r < 1.0)
+ */
+inline static float tinymt32_generate_float(tinymt32_t * random) {
+    tinymt32_next_state(random);
+    return (float)(tinymt32_temper(random) >> 8) * TINYMT32_MUL;
+}
+
+/**
+ * This function advances the state by one step and outputs a floating point number.
+ * It is implemented using the union trick (see tinymt32_temper_conv).
+ * @param random tinymt internal status
+ * @return floating point number r (1.0 <= r < 2.0)
+ */
+inline static float tinymt32_generate_float12(tinymt32_t * random) {
+    tinymt32_next_state(random);
+    return tinymt32_temper_conv(random);
+}
+
+/**
+ * This function advances the state by one step and outputs a floating point number.
+ * It is implemented using the union trick: the [1.0, 2.0) value of tinymt32_temper_conv minus 1.0.
+ * @param random tinymt internal status
+ * @return floating point number r (0.0 <= r < 1.0)
+ */
+inline static float tinymt32_generate_float01(tinymt32_t * random) {
+    tinymt32_next_state(random);
+    return tinymt32_temper_conv(random) - 1.0f;
+}
+
+/**
+ * This function outputs a floating point number from the internal state.
+ * It may return 1.0 and never returns 0.0. The state advances twice per call: here and in tinymt32_generate_float.
+ * @param random tinymt internal status
+ * @return floating point number r (0.0 < r <= 1.0)
+ */
+inline static float tinymt32_generate_floatOC(tinymt32_t * random) {
+    tinymt32_next_state(random);
+    return 1.0f - tinymt32_generate_float(random);
+}
+
+/**
+ * This function advances the state by one step and outputs a floating point number.
+ * It returns neither 0.0 nor 1.0: the (1.0, 2.0) value of tinymt32_temper_conv_open minus 1.0.
+ * @param random tinymt internal status
+ * @return floating point number r (0.0 < r < 1.0)
+ */
+inline static float tinymt32_generate_floatOO(tinymt32_t * random) {
+    tinymt32_next_state(random);
+    return tinymt32_temper_conv_open(random) - 1.0f;
+}
+
+/**
+ * This function advances the state by one step and outputs a double precision
+ * floating point number. The returned value has 32-bit precision.
+ * In other words, this function makes one double precision floating point
+ * number from one 32-bit unsigned integer by scaling it with 1 / 2^32.
+ * @param random tinymt internal status
+ * @return floating point number r (0.0 <= r < 1.0)
+ */
+inline static double tinymt32_generate_32double(tinymt32_t * random) {
+    tinymt32_next_state(random);
+    return tinymt32_temper(random) * (1.0 / 4294967296.0);
+}
+
+#if defined(__cplusplus)
+}
+#endif
+
+#define MIN_LOOP 8
+#define PRE_LOOP 8
+
+/**
+ * Mixing function used by tinymt32_init_by_array: (x ^ (x >> 27)) * 1664525, truncated to 32 bits.
+ * Declared inline (not static) because its caller is an inline function with external linkage.
+ * @param x 32-bit integer
+ * @return 32-bit integer
+ */
+inline uint32_t ini_func1(uint32_t x) {
+    return (x ^ (x >> 27)) * UINT32_C(1664525);
+}
+
+/**
+ * Mixing function used by tinymt32_init_by_array: (x ^ (x >> 27)) * 1566083941, truncated to 32 bits.
+ * Declared inline (not static) because its caller is an inline function with external linkage.
+ * @param x 32-bit integer
+ * @return 32-bit integer
+ */
+inline uint32_t ini_func2(uint32_t x) {
+    return (x ^ (x >> 27)) * UINT32_C(1566083941);
+}
+
+/**
+ * This function certifies the period of 2^127-1: a state whose 127 relevant bits are all zero
+ * is replaced by the fixed non-zero state 'T', 'I', 'N', 'Y'.
+ * Declared inline (not static): its callers are inline functions with external linkage.
+ * @param random tinymt state vector.
+ */
+inline void period_certification(tinymt32_t * random) {
+    uint32_t * const st = random->status;
+    if ((st[0] & TINYMT32_MASK) == 0 && st[1] == 0 && st[2] == 0 && st[3] == 0) {
+        st[0] = 'T';
+        st[1] = 'I';
+        st[2] = 'N';
+        st[3] = 'Y';
+    }
+}
+
+/**
+ * This function initializes the internal state array from a 32-bit unsigned
+ * integer seed and the parameters mat1, mat2 and tmat already stored in random.
+ * @param random tinymt state vector.
+ * @param seed a 32-bit unsigned integer used as a seed.
+ */
+void tinymt32_init(tinymt32_t * random, uint32_t seed) {
+    uint32_t * const st = random->status;
+    st[0] = seed;
+    st[1] = random->mat1;
+    st[2] = random->mat2;
+    st[3] = random->tmat;
+    for (unsigned int step = 1; step < MIN_LOOP; ++step) {
+        uint32_t const prev = st[(step - 1) & 3];
+        st[step & 3] ^= step + UINT32_C(1812433253) * (prev ^ (prev >> 30));
+    }
+    period_certification(random);
+    for (unsigned int warm = 0; warm < PRE_LOOP; ++warm) {
+        tinymt32_next_state(random);
+    }
+}
+
+/**
+ * This function initializes the internal state array
+ * with an array of 32-bit unsigned integers used as seeds.
+ * @param random tinymt state vector; mat1, mat2 and tmat must already be set.
+ * @param init_key the array of 32-bit integers, used as a seed.
+ * @param key_length the length of init_key. NOTE(review): presumably non-negative, it is cast to unsigned - confirm.
+ */
+void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[],
+                            int key_length) {
+    const unsigned int lag = 1;
+    const unsigned int mid = 1;
+    const unsigned int size = 4;
+    unsigned int i, j;
+    unsigned int count;
+    uint32_t r;
+    uint32_t * st = &random->status[0];
+
+    st[0] = 0;
+    st[1] = random->mat1;
+    st[2] = random->mat2;
+    st[3] = random->tmat;
+    if (key_length + 1 > MIN_LOOP) { /* run at least MIN_LOOP mixing rounds */
+        count = (unsigned int)key_length + 1;
+    } else {
+        count = MIN_LOOP;
+    }
+    r = ini_func1(st[0] ^ st[mid % size]
+                  ^ st[(size - 1) % size]);
+    st[mid % size] += r;
+    r += (unsigned int)key_length;
+    st[(mid + lag) % size] += r;
+    st[0] = r;
+    count--;
+    for (i = 1, j = 0; (j < count) && (j < (unsigned int)key_length); j++) { /* absorb the key words */
+        r = ini_func1(st[i % size]
+                      ^ st[(i + mid) % size]
+                      ^ st[(i + size - 1) % size]);
+        st[(i + mid) % size] += r;
+        r += init_key[j] + i;
+        st[(i + mid + lag) % size] += r;
+        st[i % size] = r;
+        i = (i + 1) % size;
+    }
+    for (; j < count; j++) { /* remaining rounds without key material */
+        r = ini_func1(st[i % size]
+                      ^ st[(i + mid) % size]
+                      ^ st[(i + size - 1) % size]);
+        st[(i + mid) % size] += r;
+        r += i;
+        st[(i + mid + lag) % size] += r;
+        st[i % size] = r;
+        i = (i + 1) % size;
+    }
+    for (j = 0; j < size; j++) { /* final mixing pass using ini_func2 */
+        r = ini_func2(st[i % size]
+                      + st[(i + mid) % size]
+                      + st[(i + size - 1) % size]);
+        st[(i + mid) % size] ^= r;
+        r -= i;
+        st[(i + mid + lag) % size] ^= r;
+        st[i % size] = r;
+        i = (i + 1) % size;
+    }
+    period_certification(random);
+    for (i = 0; i < PRE_LOOP; i++) { /* discard the first PRE_LOOP states */
+        tinymt32_next_state(random);
+    }
+}
+
+#undef MIN_LOOP
+#undef PRE_LOOP
+
+#if BOOST_COMP_CLANG
+#   pragma clang diagnostic pop
+#endif
+#if BOOST_COMP_GNUC
+#   pragma GCC diagnostic pop
+#endif
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#   pragma warning(pop)
+#endif
+
+#endif
diff --git a/include/alpaka/rand/Traits.hpp b/include/alpaka/rand/Traits.hpp
new file mode 100644
index 0000000..1ccd1ba
--- /dev/null
+++ b/include/alpaka/rand/Traits.hpp
@@ -0,0 +1,100 @@
+/* Copyright 2023 Benjamin Worpitz, Bernhard Manfred Gruber, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#include <cstdint>
+#include <type_traits>
+
+namespace alpaka::rand
+{
+    struct ConceptRand // concept tag passed to concepts::ImplementationBase to look up the rand implementation
+    {
+    };
+
+    //! The random number generator distribution specifics.
+    namespace distribution
+    {
+        //! The random number generator distribution trait.
+        namespace trait
+        {
+            //! The random number float normal distribution get trait.
+            template<typename TRand, typename T, typename TSfinae = void>
+            struct CreateNormalReal;
+
+            //! The random number float uniform distribution get trait.
+            template<typename TRand, typename T, typename TSfinae = void>
+            struct CreateUniformReal;
+
+            //! The random number integer uniform distribution get trait.
+            template<typename TRand, typename T, typename TSfinae = void>
+            struct CreateUniformUint;
+        } // namespace trait
+
+        //! Dispatches to the CreateNormalReal trait of the ConceptRand implementation behind \p rand.
+        //! \return A normal float distribution with mean 0.0f and standard deviation 1.0f.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, typename TRand>
+        ALPAKA_FN_HOST_ACC auto createNormalReal(TRand const& rand)
+        {
+            static_assert(std::is_floating_point_v<T>, "The value type T has to be a floating point type!");
+            using Factory = trait::CreateNormalReal<concepts::ImplementationBase<ConceptRand, TRand>, T>;
+            return Factory::createNormalReal(rand);
+        }
+
+        //! Dispatches to the CreateUniformReal trait of the ConceptRand implementation behind \p rand.
+        //! \return A uniform floating point distribution [0.0, 1.0).
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, typename TRand>
+        ALPAKA_FN_HOST_ACC auto createUniformReal(TRand const& rand)
+        {
+            static_assert(std::is_floating_point_v<T>, "The value type T has to be a floating point type!");
+            using Factory = trait::CreateUniformReal<concepts::ImplementationBase<ConceptRand, TRand>, T>;
+            return Factory::createUniformReal(rand);
+        }
+
+        //! Dispatches to the CreateUniformUint trait of the ConceptRand implementation behind \p rand.
+        //! \return A uniform integer distribution [0, UINT_MAX].
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename T, typename TRand>
+        ALPAKA_FN_HOST_ACC auto createUniformUint(TRand const& rand)
+        {
+            static_assert(
+                std::is_integral_v<T> && std::is_unsigned_v<T>,
+                "The value type T has to be a unsigned integral type!");
+            using Factory = trait::CreateUniformUint<concepts::ImplementationBase<ConceptRand, TRand>, T>;
+            return Factory::createUniformUint(rand);
+        }
+    } // namespace distribution
+
+    //! The random number generator engine specifics.
+    namespace engine
+    {
+        //! The random number generator engine trait.
+        namespace trait
+        {
+            //! The random number default generator engine get trait.
+            template<typename TRand, typename TSfinae = void>
+            struct CreateDefault;
+        } // namespace trait
+
+        //! \return A default random number generator engine. Its type is guaranteed to be trivially copyable,
+        //!         except on the HIP accelerator for HIP versions below 5.2, whose internal state was not
+        //!         trivially copyable. The limitation was discussed in PR #1778.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TRand>
+        ALPAKA_FN_HOST_ACC auto createDefault(
+            TRand const& rand,
+            std::uint32_t const& seed = 0,
+            std::uint32_t const& subsequence = 0,
+            std::uint32_t const& offset = 0)
+        {
+            using Factory = trait::CreateDefault<concepts::ImplementationBase<ConceptRand, TRand>>;
+            return Factory::createDefault(rand, seed, subsequence, offset);
+        }
+    } // namespace engine
+} // namespace alpaka::rand
diff --git a/include/alpaka/standalone/CpuOmp2Blocks.hpp b/include/alpaka/standalone/CpuOmp2Blocks.hpp
new file mode 100644
index 0000000..34c69d5
--- /dev/null
+++ b/include/alpaka/standalone/CpuOmp2Blocks.hpp
@@ -0,0 +1,9 @@
+/* Copyright 2019 Benjamin Worpitz
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#ifndef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+#    define ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+#endif
diff --git a/include/alpaka/standalone/CpuOmp2Threads.hpp b/include/alpaka/standalone/CpuOmp2Threads.hpp
new file mode 100644
index 0000000..b48139a
--- /dev/null
+++ b/include/alpaka/standalone/CpuOmp2Threads.hpp
@@ -0,0 +1,9 @@
+/* Copyright 2019 Benjamin Worpitz
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#ifndef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
+#    define ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
+#endif
diff --git a/include/alpaka/standalone/CpuSerial.hpp b/include/alpaka/standalone/CpuSerial.hpp
new file mode 100644
index 0000000..338a5c0
--- /dev/null
+++ b/include/alpaka/standalone/CpuSerial.hpp
@@ -0,0 +1,9 @@
+/* Copyright 2019 Benjamin Worpitz
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#ifndef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+#    define ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+#endif
diff --git a/include/alpaka/standalone/CpuSycl.hpp b/include/alpaka/standalone/CpuSycl.hpp
new file mode 100644
index 0000000..7e42735
--- /dev/null
+++ b/include/alpaka/standalone/CpuSycl.hpp
@@ -0,0 +1,13 @@
+/* Copyright 2023 Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include "alpaka/standalone/GenericSycl.hpp"
+
+#ifndef ALPAKA_SYCL_ONEAPI_CPU
+#    define ALPAKA_SYCL_ONEAPI_CPU
+#endif
+
+#ifndef ALPAKA_SYCL_TARGET_CPU
+#    define ALPAKA_SYCL_TARGET_CPU
+#endif
diff --git a/include/alpaka/standalone/CpuTbbBlocks.hpp b/include/alpaka/standalone/CpuTbbBlocks.hpp
new file mode 100644
index 0000000..87e7548
--- /dev/null
+++ b/include/alpaka/standalone/CpuTbbBlocks.hpp
@@ -0,0 +1,9 @@
+/* Copyright 2019 Benjamin Worpitz
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#ifndef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+#    define ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+#endif
diff --git a/include/alpaka/standalone/CpuThreads.hpp b/include/alpaka/standalone/CpuThreads.hpp
new file mode 100644
index 0000000..cd28f09
--- /dev/null
+++ b/include/alpaka/standalone/CpuThreads.hpp
@@ -0,0 +1,9 @@
+/* Copyright 2019 Benjamin Worpitz
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#ifndef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+#    define ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+#endif
diff --git a/include/alpaka/standalone/FpgaSyclIntel.hpp b/include/alpaka/standalone/FpgaSyclIntel.hpp
new file mode 100644
index 0000000..35a44bc
--- /dev/null
+++ b/include/alpaka/standalone/FpgaSyclIntel.hpp
@@ -0,0 +1,13 @@
+/* Copyright 2023 Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include "alpaka/standalone/GenericSycl.hpp"
+
+#ifndef ALPAKA_SYCL_ONEAPI_FPGA
+#    define ALPAKA_SYCL_ONEAPI_FPGA
+#endif
+
+#ifndef ALPAKA_SYCL_TARGET_FPGA
+#    define ALPAKA_SYCL_TARGET_FPGA
+#endif
diff --git a/include/alpaka/standalone/GenericSycl.hpp b/include/alpaka/standalone/GenericSycl.hpp
new file mode 100644
index 0000000..c75e0a1
--- /dev/null
+++ b/include/alpaka/standalone/GenericSycl.hpp
@@ -0,0 +1,9 @@
+/* Copyright 2022 Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#ifndef ALPAKA_ACC_SYCL_ENABLED
+#    define ALPAKA_ACC_SYCL_ENABLED
+#endif
diff --git a/include/alpaka/standalone/GpuCudaRt.hpp b/include/alpaka/standalone/GpuCudaRt.hpp
new file mode 100644
index 0000000..eeaae15
--- /dev/null
+++ b/include/alpaka/standalone/GpuCudaRt.hpp
@@ -0,0 +1,21 @@
+/* Copyright 2022 Benjamin Worpitz, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
+#    define ALPAKA_ACC_GPU_CUDA_ENABLED
+#endif
+
+#include "alpaka/core/BoostPredef.hpp"
+
+#if defined(BOOST_COMP_CLANG_CUDA) && (BOOST_COMP_CLANG_CUDA == BOOST_VERSION_NUMBER(14, 0, 0))
+
+#    include <cuda.h>
+
+#    if(CUDART_VERSION == 11030)
+#        error "clang-14 cannot be used as CUDA compiler when using CUDA v11.3. See alpaka GitHub issue 1857."
+#    endif
+
+#endif
diff --git a/include/alpaka/standalone/GpuHipRt.hpp b/include/alpaka/standalone/GpuHipRt.hpp
new file mode 100644
index 0000000..494d3d4
--- /dev/null
+++ b/include/alpaka/standalone/GpuHipRt.hpp
@@ -0,0 +1,9 @@
+/* Copyright 2019 Benjamin Worpitz
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#ifndef ALPAKA_ACC_GPU_HIP_ENABLED
+#    define ALPAKA_ACC_GPU_HIP_ENABLED
+#endif
diff --git a/include/alpaka/standalone/GpuSyclIntel.hpp b/include/alpaka/standalone/GpuSyclIntel.hpp
new file mode 100644
index 0000000..8911e39
--- /dev/null
+++ b/include/alpaka/standalone/GpuSyclIntel.hpp
@@ -0,0 +1,13 @@
+/* Copyright 2023 Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#include "alpaka/standalone/GenericSycl.hpp"
+
+#ifndef ALPAKA_SYCL_ONEAPI_GPU
+#    define ALPAKA_SYCL_ONEAPI_GPU
+#endif
+
+#ifndef ALPAKA_SYCL_TARGET_GPU
+#    define ALPAKA_SYCL_TARGET_GPU
+#endif
diff --git a/include/alpaka/test/Array.hpp b/include/alpaka/test/Array.hpp
new file mode 100644
index 0000000..08cc9f0
--- /dev/null
+++ b/include/alpaka/test/Array.hpp
@@ -0,0 +1,29 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+#include "alpaka/alpaka.hpp"
+
+#include <cstddef>
+
+namespace alpaka::test
+{
+    template<typename TType, std::size_t TSize>
+    struct Array
+    {
+        TType m_data[TSize]; //!< The wrapped fixed-size C array.
+        //! \return A const reference to the element at position idx. No bounds checking is performed.
+        template<typename TIdx>
+        ALPAKA_FN_HOST_ACC auto operator[](TIdx const idx) const -> TType const&
+        {
+            return m_data[idx];
+        }
+
+        //! \return A mutable reference to the element at position idx. No bounds checking is performed.
+        template<typename TIdx>
+        ALPAKA_FN_HOST_ACC auto operator[](TIdx const idx) -> TType&
+        {
+            return m_data[idx];
+        }
+    };
+} // namespace alpaka::test
diff --git a/include/alpaka/test/Check.hpp b/include/alpaka/test/Check.hpp
new file mode 100644
index 0000000..39545e7
--- /dev/null
+++ b/include/alpaka/test/Check.hpp
@@ -0,0 +1,19 @@
+/* Copyright 2023 Benjamin Worpitz, Jan Stephan, Luca Ferragina, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Sycl.hpp"
+
+#include <cstdio>
+//! If \p expression evaluates to false, prints it via printf and assigns false to \p success.
+#define ALPAKA_CHECK(success, expression)                                                                             \
+    do                                                                                                                \
+    {                                                                                                                 \
+        if(!(expression))                                                                                             \
+        {                                                                                                             \
+            printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);                                             \
+            success = false;                                                                                          \
+        }                                                                                                             \
+    } while(0)
diff --git a/include/alpaka/test/Extent.hpp b/include/alpaka/test/Extent.hpp
new file mode 100644
index 0000000..56ccfaf
--- /dev/null
+++ b/include/alpaka/test/Extent.hpp
@@ -0,0 +1,42 @@
+/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/alpaka.hpp"
+
+#include <cstddef>
+
+namespace alpaka::test
+{
+    template<typename TDim, typename TVal>
+    inline constexpr auto extentBuf = []
+    {
+        Vec<TDim, TVal> extent; // components become 11, 10, 9, ... for dimension 0, 1, 2, ...
+        if constexpr(TDim::value > 0)
+            for(TVal dim = 0; dim < TVal{TDim::value}; ++dim)
+                extent[dim] = 11 - dim;
+        return extent;
+    }();
+
+    template<typename TDim, typename TVal>
+    inline constexpr auto extentSubView = []
+    {
+        Vec<TDim, TVal> extent; // components become 8, 6, 4, ... for dimension 0, 1, 2, ...
+        if constexpr(TDim::value > 0)
+            for(TVal dim = 0; dim < TVal{TDim::value}; ++dim)
+                extent[dim] = 8 - dim * 2;
+        return extent;
+    }();
+
+    template<typename TDim, typename TVal>
+    inline constexpr auto offset = []
+    {
+        Vec<TDim, TVal> shift; // components become 2, 3, 4, ... for dimension 0, 1, 2, ...
+        if constexpr(TDim::value > 0)
+            for(TVal dim = 0; dim < TVal{TDim::value}; ++dim)
+                shift[dim] = 2 + dim;
+        return shift;
+    }();
+} // namespace alpaka::test
diff --git a/include/alpaka/test/KernelExecutionFixture.hpp b/include/alpaka/test/KernelExecutionFixture.hpp
new file mode 100644
index 0000000..0e59344
--- /dev/null
+++ b/include/alpaka/test/KernelExecutionFixture.hpp
@@ -0,0 +1,105 @@
+/* Copyright 2024 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/alpaka.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#    error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#endif
+
+#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#    error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#endif
+
+#include "alpaka/test/Check.hpp"
+#include "alpaka/test/queue/Queue.hpp"
+
+#include <utility>
+
+namespace alpaka::test
+{
+    //! The fixture for executing a kernel on a given accelerator.
+    template<typename TAcc>
+    class KernelExecutionFixture
+    {
+    public:
+        using Acc = TAcc;
+        using Dim = alpaka::Dim<Acc>;
+        using Idx = alpaka::Idx<Acc>;
+        using Platform = alpaka::Platform<Acc>;
+        using Device = Dev<Acc>;
+        using Queue = test::DefaultQueue<Device>;
+        using WorkDiv = WorkDivMembers<Dim, Idx>;
+
+        //! Uses device 0 of the accelerator platform, a new queue on it and the given work division.
+        KernelExecutionFixture(WorkDiv workDiv) : m_queue{m_device}, m_workDiv{std::move(workDiv)}
+        {
+        }
+
+        //! Uses device 0 of the accelerator platform; the work division is derived from \p extent on execution.
+        template<typename TExtent>
+        KernelExecutionFixture(TExtent const& extent) : m_queue{m_device}, m_extent{extent}
+        {
+        }
+
+        //! Uses the given queue, the device it belongs to and the given work division.
+        KernelExecutionFixture(Queue queue, WorkDiv workDiv)
+            : m_platform{} // if the platform is not stateless, this is wrong; we ignore it because it is not used
+            , m_device{alpaka::getDev(queue)}
+            , m_queue{std::move(queue)}
+            , m_workDiv{std::move(workDiv)}
+        {
+        }
+
+        //! Uses the given queue and the device it belongs to; the work division is derived from \p extent.
+        template<typename TExtent>
+        KernelExecutionFixture(Queue queue, TExtent const& extent)
+            : m_platform{} // if the platform is not stateless, this is wrong; we ignore it because it is not used
+            , m_device{alpaka::getDev(queue)}
+            , m_queue{std::move(queue)}
+            , m_extent{extent}
+        {
+        }
+
+        //! Runs the kernel with a pointer to a device-side bool (preset to true) as first argument, then \p args.
+        //! \return The value of that bool after the kernel has finished.
+        template<typename TKernelFnObj, typename... TArgs>
+        auto operator()(TKernelFnObj kernelFnObj, TArgs&&... args) -> bool
+        {
+            // Allocate the result value and preset every byte of it to 1 (true).
+            auto bufAccResult = allocBuf<bool, Idx>(m_device, static_cast<Idx>(1u));
+            memset(m_queue, bufAccResult, static_cast<std::uint8_t>(true));
+            alpaka::KernelCfg<Acc> const kernelCfg = {m_extent, Vec<Dim, Idx>::ones()};
+
+            // Derive the work division if none was given. args stay lvalues here so exec can still forward them.
+            if(m_workDiv == WorkDiv{Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0)})
+                m_workDiv = alpaka::getValidWorkDiv(
+                    kernelCfg,
+                    m_device,
+                    kernelFnObj,
+                    getPtrNative(bufAccResult),
+                    args...);
+
+            exec<Acc>(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), std::forward<TArgs>(args)...);
+
+            // Copy the result value to the host and wait until the copy has completed.
+            auto bufHostResult = allocBuf<bool, Idx>(m_devHost, static_cast<Idx>(1u));
+            memcpy(m_queue, bufHostResult, bufAccResult);
+            wait(m_queue);
+            return *getPtrNative(bufHostResult);
+        }
+
+    private:
+        PlatformCpu m_platformHost{};
+        DevCpu m_devHost{getDevByIdx(m_platformHost, 0)};
+        Platform m_platform{};
+        Device m_device{getDevByIdx(m_platform, 0)};
+        Queue m_queue; // declared after m_device because two constructors initialize it from m_device
+        WorkDiv m_workDiv{Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0)}; // all zero: not set
+        Vec<Dim, Idx> m_extent;
+    };
+
+} // namespace alpaka::test
diff --git a/include/alpaka/test/MeasureKernelRunTime.hpp b/include/alpaka/test/MeasureKernelRunTime.hpp
new file mode 100644
index 0000000..8ef4f45
--- /dev/null
+++ b/include/alpaka/test/MeasureKernelRunTime.hpp
@@ -0,0 +1,47 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/alpaka.hpp"
+#include "alpaka/core/DemangleTypeNames.hpp"
+
+#include <type_traits>
+#include <utility>
+
+namespace alpaka::test::integ
+{
+    //! Measures and returns the runtime in ms of the passed callable, timed with the monotonic steady_clock.
+    //! \param callable An object with operator(); it is invoked exactly once.
+    template<typename TCallable>
+    auto measureRunTimeMs(TCallable&& callable) -> std::chrono::milliseconds::rep
+    {
+        auto const start = std::chrono::steady_clock::now();
+        std::forward<TCallable>(callable)();
+        auto const elapsed = std::chrono::steady_clock::now() - start;
+        return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
+    }
+
+    //! Measures the run time of a task that is enqueued into the given queue.
+    //! \return The time in ms between enqueuing the task and the queue having finished it.
+    template<typename TQueue, typename TTask>
+    auto measureTaskRunTimeMs(TQueue& queue, TTask&& task) -> std::chrono::milliseconds::rep
+    {
+#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+        std::cout << "measureKernelRunTime("
+                  << " queue: " << core::demangled<TQueue> << " task: " << core::demangled<std::decay_t<TTask>> << ")"
+                  << std::endl;
+#endif
+        // Drain the queue first so that previously enqueued work is not part of the measurement.
+        alpaka::wait(queue);
+
+        // The timed section enqueues the task and blocks until the queue has executed it.
+        auto const enqueueAndWait = [&]
+        {
+            alpaka::enqueue(queue, std::forward<TTask>(task));
+            alpaka::wait(queue);
+        };
+        return measureRunTimeMs(enqueueAndWait);
+    }
+} // namespace alpaka::test::integ
diff --git a/include/alpaka/test/acc/TestAccs.hpp b/include/alpaka/test/acc/TestAccs.hpp
new file mode 100644
index 0000000..2370fa4
--- /dev/null
+++ b/include/alpaka/test/acc/TestAccs.hpp
@@ -0,0 +1,183 @@
+/* Copyright 2024 Benjamin Worpitz, Erik Zenker, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan,
+ * Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/alpaka.hpp"
+#include "alpaka/test/dim/TestDims.hpp"
+#include "alpaka/test/idx/TestIdxs.hpp"
+
+#include <iosfwd>
+#include <tuple>
+#include <type_traits>
+
+// When compiling the tests with CUDA or HIP enabled (e.g. nvcc or native clang) on the CI infrastructure
+// we have to dramatically reduce the number of tested combinations.
+// Else the log length would be exceeded.
+#if defined(ALPAKA_CI)
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA                                                       \
+        || defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP
+#        define ALPAKA_CUDA_CI
+#    endif
+#endif
+
+namespace alpaka::test
+{
+    //! The detail namespace is used to separate implementation details from user accessible code.
+    namespace detail
+    {
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+        template<typename TDim, typename TIdx>
+        using AccCpuSerialIfAvailableElseInt = AccCpuSerial<TDim, TIdx>;
+#else // accelerator not enabled: int is a placeholder that the std::is_class filter of EnabledAccs removes
+        template<typename TDim, typename TIdx>
+        using AccCpuSerialIfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) && !defined(ALPAKA_CUDA_CI)
+        template<typename TDim, typename TIdx>
+        using AccCpuThreadsIfAvailableElseInt = AccCpuThreads<TDim, TIdx>;
+#else
+        template<typename TDim, typename TIdx>
+        using AccCpuThreadsIfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+        template<typename TDim, typename TIdx>
+        using AccCpuTbbIfAvailableElseInt = AccCpuTbbBlocks<TDim, TIdx>;
+#else
+        template<typename TDim, typename TIdx>
+        using AccCpuTbbIfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
+        template<typename TDim, typename TIdx>
+        using AccCpuOmp2BlocksIfAvailableElseInt = AccCpuOmp2Blocks<TDim, TIdx>;
+#else
+        template<typename TDim, typename TIdx>
+        using AccCpuOmp2BlocksIfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) && !defined(ALPAKA_CUDA_CI)
+        template<typename TDim, typename TIdx>
+        using AccCpuOmp2ThreadsIfAvailableElseInt = AccCpuOmp2Threads<TDim, TIdx>;
+#else
+        template<typename TDim, typename TIdx>
+        using AccCpuOmp2ThreadsIfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && (BOOST_LANG_CUDA || defined(ALPAKA_HOST_ONLY))
+        template<typename TDim, typename TIdx>
+        using AccGpuCudaRtIfAvailableElseInt = AccGpuCudaRt<TDim, TIdx>;
+#else
+        template<typename TDim, typename TIdx>
+        using AccGpuCudaRtIfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && (BOOST_LANG_HIP || defined(ALPAKA_HOST_ONLY))
+        template<typename TDim, typename TIdx>
+        using AccGpuHipRtIfAvailableElseInt = // yields the int placeholder for DimInt<3u>
+            typename std::conditional<std::is_same_v<TDim, DimInt<3u>> == false, AccGpuHipRt<TDim, TIdx>, int>::type;
+#else
+        template<typename TDim, typename TIdx>
+        using AccGpuHipRtIfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_CPU)
+        template<typename TDim, typename TIdx>
+        using AccCpuSyclIfAvailableElseInt = AccCpuSycl<TDim, TIdx>;
+#else
+        template<typename TDim, typename TIdx>
+        using AccCpuSyclIfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_FPGA)
+        template<typename TDim, typename TIdx>
+        using AccFpgaSyclIntelIfAvailableElseInt = AccFpgaSyclIntel<TDim, TIdx>;
+#else
+        template<typename TDim, typename TIdx>
+        using AccFpgaSyclIntelIfAvailableElseInt = int;
+#endif
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_GPU)
+        template<typename TDim, typename TIdx>
+        using AccGpuSyclIntelIfAvailableElseInt = AccGpuSyclIntel<TDim, TIdx>;
+#else
+        template<typename TDim, typename TIdx>
+        using AccGpuSyclIntelIfAvailableElseInt = int;
+#endif
+
+        //! A std::tuple containing every enabled accelerator and an int placeholder for every disabled one.
+        template<typename TDim, typename TIdx>
+        using EnabledAccsElseInt = std::tuple<
+            AccCpuSerialIfAvailableElseInt<TDim, TIdx>,
+            AccCpuThreadsIfAvailableElseInt<TDim, TIdx>,
+            AccCpuTbbIfAvailableElseInt<TDim, TIdx>,
+            AccCpuOmp2BlocksIfAvailableElseInt<TDim, TIdx>,
+            AccCpuOmp2ThreadsIfAvailableElseInt<TDim, TIdx>,
+            AccGpuCudaRtIfAvailableElseInt<TDim, TIdx>,
+            AccGpuHipRtIfAvailableElseInt<TDim, TIdx>,
+            AccCpuSyclIfAvailableElseInt<TDim, TIdx>,
+            AccFpgaSyclIntelIfAvailableElseInt<TDim, TIdx>,
+            AccGpuSyclIntelIfAvailableElseInt<TDim, TIdx>>;
+    } // namespace detail
+
+    //! The enabled accelerators: detail::EnabledAccsElseInt with all non-class (int) entries filtered out.
+    template<typename TDim, typename TIdx>
+    using EnabledAccs = meta::Filter<detail::EnabledAccsElseInt<TDim, TIdx>, std::is_class>;
+
+    namespace detail
+    {
+        //! Function object that streams the name of the accelerator TAcc followed by a single space.
+        struct StreamOutAccName
+        {
+            //! \param os The stream the name is written to.
+            template<typename TAcc>
+            ALPAKA_FN_HOST auto operator()(std::ostream& os) -> void
+            {
+                os << getAccName<TAcc>() << " ";
+            }
+        };
+    } // namespace detail
+
+    //! Writes the names of all accelerators enabled for the given dimension and index type to the given stream.
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto writeEnabledAccs(std::ostream& os) -> void
+    {
+        using Accs = EnabledAccs<TDim, TIdx>;
+
+        os << "Accelerators enabled: ";
+        meta::forEachType<Accs>(detail::StreamOutAccName(), std::ref(os));
+        os << std::endl;
+    }
+
+    namespace detail
+    {
+        //! A std::tuple holding multiple std::tuple consisting of a dimension and an index type,
+        //! built as the cartesian product of NonZeroTestDims and TestIdxs.
+        //! TestDimIdxTuples =
+        //!     tuple<
+        //!         tuple<Dim1,Idx1>,
+        //!         tuple<Dim2,Idx1>,
+        //!         tuple<Dim3,Idx1>,
+        //!         ...,
+        //!         tuple<DimN,IdxN>>
+        using TestDimIdxTuples = meta::CartesianProduct<std::tuple, NonZeroTestDims, TestIdxs>;
+
+        template<typename TList>
+        using ApplyEnabledAccs = meta::Apply<TList, EnabledAccs>; //!< Feeds the types in TList to EnabledAccs.
+
+        //! A std::tuple containing std::tuple with fully instantiated accelerators.
+        //!
+        //! InstantiatedEnabledAccs =
+        //!     tuple<
+        //!         tuple<Acc1<Dim1,Idx1>, ..., AccN<Dim1,Idx1>>,
+        //!         tuple<Acc1<Dim2,Idx1>, ..., AccN<Dim2,Idx1>>,
+        //!         ...,
+        //!         tuple<Acc1<DimN,IdxN>, ..., AccN<DimN,IdxN>>>
+        using InstantiatedEnabledAccs = meta::Transform<TestDimIdxTuples, ApplyEnabledAccs>;
+    } // namespace detail
+
+    //! A std::tuple containing fully instantiated accelerators: detail::InstantiatedEnabledAccs concatenated.
+    //!
+    //! TestAccs =
+    //!     tuple<
+    //!         Acc1<Dim1,Idx1>, ..., AccN<Dim1,Idx1>,
+    //!         Acc1<Dim2,Idx1>, ..., AccN<Dim2,Idx1>,
+    //!         ...,
+    //!         Acc1<DimN,IdxN>, ..., AccN<DimN,IdxN>>
+    using TestAccs = meta::Apply<detail::InstantiatedEnabledAccs, meta::Concatenate>;
+} // namespace alpaka::test
diff --git a/include/alpaka/test/dim/TestDims.hpp b/include/alpaka/test/dim/TestDims.hpp
new file mode 100644
index 0000000..395c97e
--- /dev/null
+++ b/include/alpaka/test/dim/TestDims.hpp
@@ -0,0 +1,34 @@
+/* Copyright 2023 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/meta/Filter.hpp"
+#include "alpaka/meta/NonZero.hpp"
+
+#include <tuple>
+
+namespace alpaka::test
+{
+    //! A std::tuple holding the dimensions to test: 0 to 3, plus 4 if no CUDA, HIP or SYCL back-end is enabled.
+    using TestDims = std::tuple<
+        DimInt<0u>,
+        DimInt<1u>,
+        DimInt<2u>,
+        DimInt<3u>
+    // CUDA, HIP and SYCL accelerators do not support 4D buffers and 4D acceleration.
+#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !defined(ALPAKA_ACC_SYCL_ENABLED)
+        ,
+        DimInt<4u>
+#endif
+        >;
+
+    //! A std::tuple holding non-zero dimensions, i.e. TestDims filtered with meta::NonZero.
+    //!
+    //! NonZeroTestDims = std::tuple<Dim1, Dim2, ... DimN>
+    using NonZeroTestDims = meta::Filter<TestDims, meta::NonZero>;
+
+} // namespace alpaka::test
diff --git a/include/alpaka/test/event/EventHostManualTrigger.hpp b/include/alpaka/test/event/EventHostManualTrigger.hpp
new file mode 100644
index 0000000..653dbbb
--- /dev/null
+++ b/include/alpaka/test/event/EventHostManualTrigger.hpp
@@ -0,0 +1,779 @@
+/* Copyright 2024 Benjamin Worpitz, Matthias Werner, Jan Stephan, Jeffrey Kelling, Andrea Bocci,
+ *                Bernhard Manfred Gruber, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/alpaka.hpp"
+
+#include <condition_variable>
+#include <mutex>
+#include <utility>
+
+namespace alpaka::test
+{
+    namespace trait
+    {
+        template<typename TDev>
+        struct EventHostManualTriggerType; //!< Provides the manually triggerable event type of a device as ::type.
+
+        template<typename TDev>
+        struct IsEventHostManualTriggerSupported; //!< Provides the static isSupported(dev) query for a device.
+    } // namespace trait
+
+    //! The event host manual trigger type trait alias template to remove the ::type.
+    template<typename TDev>
+    using EventHostManualTrigger = typename trait::EventHostManualTriggerType<TDev>::type;
+    //! \return The result of trait::IsEventHostManualTriggerSupported<TDev>::isSupported for the given device.
+    template<typename TDev>
+    ALPAKA_FN_HOST auto isEventHostManualTriggerSupported(TDev const& dev) -> bool
+    {
+        return trait::IsEventHostManualTriggerSupported<TDev>::isSupported(dev);
+    }
+
+    namespace cpu::detail
+    {
+        //! State of a CPU event that can be enqueued into a queue and is completed manually by the host.
+        template<class TDev = DevCpu>
+        class EventHostManualTriggerCpuImpl
+        {
+        public:
+            //! Binds the event to the given device. The event starts out ready with an enqueue count of zero.
+            ALPAKA_FN_HOST EventHostManualTriggerCpuImpl(TDev dev) noexcept : m_dev(std::move(dev))
+            {
+            }
+
+            // Non-copyable.
+            EventHostManualTriggerCpuImpl(EventHostManualTriggerCpuImpl const& other) = delete;
+            auto operator=(EventHostManualTriggerCpuImpl const&) -> EventHostManualTriggerCpuImpl& = delete;
+
+            //! Marks the event as ready, notifies one waiter and then sleeps for 200 ms.
+            void trigger()
+            {
+                {
+                    std::lock_guard<std::mutex> const guard(m_mutex);
+                    m_bIsReady = true;
+                }
+                m_conditionVariable.notify_one();
+                // Give alpaka time to update into the new state, process all events and tasks.
+                std::this_thread::sleep_for(std::chrono::milliseconds(200u));
+            }
+
+            TDev const m_dev; //!< The device this event is bound to.
+
+            mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.
+
+            //! The condition variable that is notified by trigger().
+            mutable std::condition_variable m_conditionVariable;
+
+            //! The number of times this event has been enqueued.
+            std::size_t m_enqueueCount{0u};
+
+            //! True while the event is not waiting within a queue (not enqueued or already completed).
+            bool m_bIsReady{true};
+        };
+    } // namespace cpu::detail
+
+    //! Handle to a CPU event that can be enqueued into a queue and triggered by the host; copies share one state.
+    template<class TDev = DevCpu>
+    class EventHostManualTriggerCpu
+    {
+    public:
+        //! Creates a new event implementation bound to the given device.
+        ALPAKA_FN_HOST EventHostManualTriggerCpu(TDev const& dev)
+            : m_spEventImpl(std::make_shared<cpu::detail::EventHostManualTriggerCpuImpl<TDev>>(dev))
+        {
+        }
+
+        //! \return True if both handles refer to the same implementation object.
+        ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCpu const& rhs) const -> bool
+        {
+            return m_spEventImpl == rhs.m_spEventImpl;
+        }
+
+        //! \return True if the handles refer to different implementation objects.
+        ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCpu const& rhs) const -> bool
+        {
+            return !(*this == rhs);
+        }
+
+        //! Triggers the shared implementation and then sleeps for another 200 ms.
+        //! NOTE(review): the implementation's trigger() already sleeps 200 ms - confirm the second delay is intended.
+        void trigger()
+        {
+            m_spEventImpl->trigger();
+            std::this_thread::sleep_for(std::chrono::milliseconds(200u));
+        }
+
+        std::shared_ptr<cpu::detail::EventHostManualTriggerCpuImpl<TDev>> m_spEventImpl; //!< The shared state.
+    };
+
+    namespace trait
+    {
+        template<>
+        struct EventHostManualTriggerType<DevCpu>
+        {
+            using type = test::EventHostManualTriggerCpu<DevCpu>; //!< The event type used for DevCpu.
+        };
+
+        //! The CPU event host manual trigger support get trait specialization.
+        template<>
+        struct IsEventHostManualTriggerSupported<DevCpu>
+        {
+            ALPAKA_FN_HOST static auto isSupported(DevCpu const&) -> bool
+            {
+                return true; // unconditionally reported as supported for DevCpu
+            }
+        };
+    } // namespace trait
+} // namespace alpaka::test
+
+namespace alpaka::trait
+{
+    //! The CPU device event device get trait specialization.
+    template<typename TDev>
+    struct GetDev<test::EventHostManualTriggerCpu<TDev>>
+    {
+        //! \return A copy of the device stored in the event implementation (its m_dev member).
+        ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerCpu<TDev> const& event) -> TDev
+        {
+            return event.m_spEventImpl->m_dev;
+        }
+    };
+
+    //! The completion query trait specialization for the manually triggered CPU event.
+    template<typename TDev>
+    struct IsComplete<test::EventHostManualTriggerCpu<TDev>>
+    {
+        //! \return True if the event is ready, i.e. not waiting within a queue (not enqueued or already handled).
+        ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerCpu<TDev> const& event) -> bool
+        {
+            auto const& impl = *event.m_spEventImpl;
+            std::lock_guard<std::mutex> const guard(impl.m_mutex); // the flag is read under the event mutex
+            return impl.m_bIsReady;
+        }
+    };
+
+    template<typename TDev>
+    struct Enqueue<QueueGenericThreadsNonBlocking<TDev>, test::EventHostManualTriggerCpu<TDev>>
+    {
+        //! Marks the event as enqueued and submits a task to the queue's worker thread that waits for its trigger.
+        ALPAKA_FN_HOST static auto enqueue(
+            QueueGenericThreadsNonBlocking<TDev>& queue,
+            test::EventHostManualTriggerCpu<TDev>& event) -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+            auto spEventImpl = event.m_spEventImpl;
+
+            // Setting the event state and enqueuing it has to be atomic.
+            std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+
+            // The event should not yet be enqueued.
+            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+
+            // Set its state to enqueued.
+            spEventImpl->m_bIsReady = false;
+
+            // Increment the enqueue counter. This is used to skip waits for events that had already been finished
+            // and re-enqueued which would lead to deadlocks.
+            ++spEventImpl->m_enqueueCount;
+
+            auto const enqueueCount = spEventImpl->m_enqueueCount;
+
+            // Enqueue a task that blocks the worker thread until the event is triggered or has been re-enqueued.
+            queue.m_spQueueImpl->m_workerThread.submit(
+                [spEventImpl, enqueueCount]() mutable
+                {
+                    std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
+                    spEventImpl->m_conditionVariable.wait(
+                        lk2,
+                        [spEventImpl, enqueueCount]
+                        { return (enqueueCount != spEventImpl->m_enqueueCount) || spEventImpl->m_bIsReady; });
+                });
+        }
+    };
+
+    template<typename TDev>
+    struct Enqueue<QueueGenericThreadsBlocking<TDev>, test::EventHostManualTriggerCpu<TDev>>
+    {
+        //! Marks the event as enqueued and blocks the calling thread until it is triggered or enqueued again.
+        ALPAKA_FN_HOST static auto enqueue(
+            QueueGenericThreadsBlocking<TDev>&,
+            test::EventHostManualTriggerCpu<TDev>& event) -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+            auto spEventImpl = event.m_spEventImpl;
+
+            // Setting the event state and enqueuing it has to be atomic.
+            std::unique_lock<std::mutex> lk(spEventImpl->m_mutex);
+
+            // The event should not yet be enqueued.
+            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+
+            // Set its state to enqueued.
+            spEventImpl->m_bIsReady = false;
+
+            // Increment the enqueue counter. This is used to skip waits for events that had already been finished
+            // and re-enqueued which would lead to deadlocks.
+            ++spEventImpl->m_enqueueCount;
+
+            auto const enqueueCount = spEventImpl->m_enqueueCount;
+            // The wait releases the mutex while blocked, so trigger() can acquire it and set the ready flag.
+            spEventImpl->m_conditionVariable.wait(
+                lk,
+                [spEventImpl, enqueueCount]
+                { return (enqueueCount != spEventImpl->m_enqueueCount) || spEventImpl->m_bIsReady; });
+        }
+    };
+} // namespace alpaka::trait
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#    include "alpaka/core/BoostPredef.hpp"
+
+#    include <cuda.h>
+
+#    if !BOOST_LANG_CUDA && !defined(ALPAKA_HOST_ONLY)
+#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#    endif
+
+#    include "alpaka/core/Cuda.hpp"
+
+namespace alpaka::test
+{
+    namespace uniform_cuda_hip::detail
+    {
+        class EventHostManualTriggerCudaImpl final
+        {
+            using TApi = alpaka::ApiCudaRt;
+            // Shared state of a manually triggered CUDA test event: host-side ready state plus a device-side flag.
+        public:
+            ALPAKA_FN_HOST EventHostManualTriggerCudaImpl(DevCudaRt const& dev)
+                : m_dev(dev)
+                , m_mutex()
+                , m_bIsReady(true)
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaSetDevice(m_dev.getNativeHandle()));
+                // Allocate the buffer on this device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaMalloc(&m_devMem, static_cast<size_t>(sizeof(int32_t))));
+                // Zero all bytes of the flag, so it starts below the value the queues wait for.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaMemset(m_devMem, static_cast<int>(0u), static_cast<size_t>(sizeof(int32_t))));
+            }
+
+            EventHostManualTriggerCudaImpl(EventHostManualTriggerCudaImpl const&) = delete;
+            auto operator=(EventHostManualTriggerCudaImpl const&) -> EventHostManualTriggerCudaImpl& = delete;
+
+            ALPAKA_FN_HOST ~EventHostManualTriggerCudaImpl()
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Free the buffer.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cudaFree(m_devMem));
+            }
+            //! Marks the event ready and sets the device flag. m_mutex stays locked during the whole call.
+            void trigger()
+            {
+                std::unique_lock<std::mutex> lock(m_mutex);
+                m_bIsReady = true;
+
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaSetDevice(m_dev.getNativeHandle()));
+                // Set each flag byte to 1: the 32-bit word becomes 0x0101'0101, the value the queues wait for.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    cudaMemset(m_devMem, static_cast<int>(1u), static_cast<size_t>(sizeof(int32_t))));
+                // Give alpaka time to update into the new state, process all events and tasks.
+                std::this_thread::sleep_for(std::chrono::milliseconds(200u));
+            }
+
+        public:
+            DevCudaRt const m_dev; //!< The device this event is bound to.
+
+            mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.
+            void* m_devMem; //!< Device buffer of sizeof(int32_t) bytes holding the trigger flag.
+
+            bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
+                             //!< completed).
+        };
+    } // namespace uniform_cuda_hip::detail
+
+    class EventHostManualTriggerCuda final
+    {
+    public:
+        ALPAKA_FN_HOST EventHostManualTriggerCuda(DevCudaRt const& dev)
+            : m_spEventImpl(std::make_shared<uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl>(dev))
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+        }
+        //! Two events compare equal if they share the same implementation object.
+        ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCuda const& rhs) const -> bool
+        {
+            return (m_spEventImpl == rhs.m_spEventImpl);
+        }
+        //! Negation of operator==.
+        ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCuda const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //! Triggers the shared implementation and then sleeps another 200 ms on the calling thread.
+        void trigger()
+        {
+            m_spEventImpl->trigger();
+            // Give alpaka time to update into the new state, process all events and tasks.
+            std::this_thread::sleep_for(std::chrono::milliseconds(200u));
+        }
+
+    public:
+        std::shared_ptr<uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl> m_spEventImpl;
+    };
+
+    namespace trait
+    {
+        template<>
+        struct EventHostManualTriggerType<DevCudaRt>
+        {
+            using type = test::EventHostManualTriggerCuda;
+        };
+
+        //! The CUDA event host manual trigger support get trait specialization.
+        template<>
+        struct IsEventHostManualTriggerSupported<DevCudaRt>
+        {
+            ALPAKA_FN_HOST static auto isSupported([[maybe_unused]] DevCudaRt const& dev) -> bool
+            {
+#    if CUDA_VERSION < 11070
+                int result = 0;
+                return cuDeviceGetAttribute(&result, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev.getNativeHandle())
+                           == CUDA_SUCCESS && (result != 0); // a failed driver query counts as "not supported"
+#    else
+                return true; // Always enabled as of CUDA 11.7
+#    endif
+            }
+        };
+    } // namespace trait
+} // namespace alpaka::test
+
+namespace alpaka::trait
+{
+    namespace detail
+    {
+        // TODO: Replace with cuStreamWaitValue32 once support for CUDA < 12 is dropped.
+        inline auto streamWaitValue(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags)
+            -> CUresult
+        {
+            // CUDA 11.7 introduced a second generation (v2) of the stream memory ops API whose functions carry
+            // a `_v2` suffix. CUDA 12.0 removed v1 and dropped the suffix from the new functions again. Hence
+            // only the CUDA 11.7 - 11.x range needs the suffixed name; older and newer toolkits share the plain
+            // signature although it internally does different things.
+#    if(CUDA_VERSION >= 11070) && (CUDA_VERSION < 12000)
+            return cuStreamWaitValue32_v2(stream, addr, value, flags);
+#    else
+            return cuStreamWaitValue32(stream, addr, value, flags);
+#    endif
+        }
+    } // namespace detail
+
+    //! The CUDA manual-trigger test event device get trait specialization.
+    template<>
+    struct GetDev<test::EventHostManualTriggerCuda>
+    {
+        ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerCuda const& event) -> DevCudaRt
+        {
+            return event.m_spEventImpl->m_dev;
+        }
+    };
+
+    //! The CUDA manual-trigger test event completion test trait specialization.
+    template<>
+    struct IsComplete<test::EventHostManualTriggerCuda>
+    {
+        //! \return If the event is not waiting within a queue (not enqueued or already handled).
+        ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerCuda const& event) -> bool
+        {
+            std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
+
+            return event.m_spEventImpl->m_bIsReady;
+        }
+    };
+
+    template<>
+    struct Enqueue<QueueCudaRtNonBlocking, test::EventHostManualTriggerCuda>
+    {
+        ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, test::EventHostManualTriggerCuda& event)
+            -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+            auto spEventImpl(event.m_spEventImpl);
+
+            // Setting the event state and enqueuing it has to be atomic.
+            std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+
+            // The event should not yet be enqueued.
+            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+
+            // Set its state to enqueued.
+            spEventImpl->m_bIsReady = false;
+
+            // Make the stream wait until the event's 32-bit device flag is >= 0x0101'0101. trigger() produces that
+            // value by setting every byte of the flag to 1.
+            // Known issue from the PGI Profiler's User Guide (Events and Metrics): in event or metric profiling,
+            // kernel launches are blocking, so kernels waiting on host updates may hang. This includes host-device
+            // synchronization built upon value-based CUDA queue synchronization APIs such as cuStreamWaitValue32()
+            // and cuStreamWriteValue32().
+            ALPAKA_CUDA_DRV_CHECK(detail::streamWaitValue(
+                static_cast<CUstream>(queue.getNativeHandle()),
+                reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
+                0x0101'0101u,
+                CU_STREAM_WAIT_VALUE_GEQ));
+        }
+    };
+
+    template<>
+    struct Enqueue<QueueCudaRtBlocking, test::EventHostManualTriggerCuda>
+    {
+        ALPAKA_FN_HOST static auto enqueue(QueueCudaRtBlocking& queue, test::EventHostManualTriggerCuda& event) -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+            auto spEventImpl(event.m_spEventImpl);
+
+            // Setting the event state and enqueuing it has to be atomic.
+            std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+
+            // The event should not yet be enqueued.
+            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+
+            // Set its state to enqueued.
+            spEventImpl->m_bIsReady = false;
+
+            // Make the stream wait until the event's 32-bit device flag is >= 0x0101'0101. trigger() produces that
+            // value by setting every byte of the flag to 1.
+            // Known issue from the PGI Profiler's User Guide (Events and Metrics): in event or metric profiling,
+            // kernel launches are blocking, so kernels waiting on host updates may hang. This includes host-device
+            // synchronization built upon value-based CUDA queue synchronization APIs such as cuStreamWaitValue32()
+            // and cuStreamWriteValue32().
+            ALPAKA_CUDA_DRV_CHECK(detail::streamWaitValue(
+                static_cast<CUstream>(queue.getNativeHandle()),
+                reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
+                0x0101'0101u,
+                CU_STREAM_WAIT_VALUE_GEQ));
+        }
+    };
+} // namespace alpaka::trait
+#endif
+
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+#    include <hip/hip_runtime.h>
+
+#    if !BOOST_LANG_HIP && !defined(ALPAKA_HOST_ONLY)
+#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#    endif
+
+#    include "alpaka/core/Hip.hpp"
+
+namespace alpaka::test
+{
+    namespace hip::detail
+    {
+        class EventHostManualTriggerHipImpl final
+        {
+            using TApi = alpaka::ApiHipRt;
+            // Shared state of a manually triggered HIP test event: host-side ready state plus a device-side flag.
+        public:
+            ALPAKA_FN_HOST EventHostManualTriggerHipImpl(DevHipRt const& dev) : m_dev(dev), m_mutex(), m_bIsReady(true)
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipSetDevice(m_dev.getNativeHandle()));
+                // Allocate the buffer on this device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipMalloc(&m_devMem, static_cast<size_t>(sizeof(int32_t))));
+                // Zero all bytes of the flag, so it starts below the value the enqueue loops poll for.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    hipMemset(m_devMem, static_cast<int>(0u), static_cast<size_t>(sizeof(int32_t))));
+            }
+
+            EventHostManualTriggerHipImpl(EventHostManualTriggerHipImpl const&) = delete;
+            auto operator=(EventHostManualTriggerHipImpl const&) -> EventHostManualTriggerHipImpl& = delete;
+
+            ALPAKA_FN_HOST ~EventHostManualTriggerHipImpl()
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                // Free the buffer.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(hipFree(m_devMem));
+            }
+            //! Marks the event ready and sets the device flag. m_mutex stays locked during the whole call.
+            void trigger()
+            {
+                std::unique_lock<std::mutex> lock(m_mutex);
+                m_bIsReady = true;
+
+                // Set the current device.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipSetDevice(m_dev.getNativeHandle()));
+                // Set each flag byte to 1: the 32-bit word becomes 0x0101'0101, the value the enqueue loops poll for.
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    hipMemset(m_devMem, static_cast<int>(1u), static_cast<size_t>(sizeof(int32_t))));
+                // Give alpaka time to update into the new state, process all events and tasks.
+                std::this_thread::sleep_for(std::chrono::milliseconds(200u));
+            }
+
+        public:
+            DevHipRt const m_dev; //!< The device this event is bound to.
+
+            mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.
+            void* m_devMem; //!< Device buffer of sizeof(int32_t) bytes holding the trigger flag.
+
+            bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
+                             //!< completed).
+        };
+    } // namespace hip::detail
+
+    class EventHostManualTriggerHip final
+    {
+    public:
+        ALPAKA_FN_HOST EventHostManualTriggerHip(DevHipRt const& dev)
+            : m_spEventImpl(std::make_shared<hip::detail::EventHostManualTriggerHipImpl>(dev))
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+        }
+        //! Two events compare equal if they share the same implementation object.
+        ALPAKA_FN_HOST auto operator==(EventHostManualTriggerHip const& rhs) const -> bool
+        {
+            return (m_spEventImpl == rhs.m_spEventImpl);
+        }
+        //! Negation of operator==.
+        ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerHip const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //! Triggers the shared implementation and then sleeps another 200 ms on the calling thread.
+        void trigger()
+        {
+            m_spEventImpl->trigger();
+            // Give alpaka time to update into the new state, process all events and tasks.
+            std::this_thread::sleep_for(std::chrono::milliseconds(200u));
+        }
+
+    public:
+        std::shared_ptr<hip::detail::EventHostManualTriggerHipImpl> m_spEventImpl;
+    };
+
+    namespace trait
+    {
+        template<>
+        struct EventHostManualTriggerType<DevHipRt>
+        {
+            using type = test::EventHostManualTriggerHip;
+        };
+
+        //! The HIP event host manual trigger support get trait specialization. Always reports "not supported".
+        template<>
+        struct IsEventHostManualTriggerSupported<DevHipRt>
+        {
+            // TODO: there is no CUDA_VERSION in the HIP compiler path.
+            // TODO: there is a hipDeviceGetAttribute, but there is no counterpart for
+            // CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
+            ALPAKA_FN_HOST static auto isSupported(DevHipRt const&) -> bool
+            {
+                return false;
+            }
+        };
+    } // namespace trait
+} // namespace alpaka::test
+
+namespace alpaka::trait
+{
+    //! The HIP manual-trigger test event device get trait specialization.
+    template<>
+    struct GetDev<test::EventHostManualTriggerHip>
+    {
+        ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerHip const& event) -> DevHipRt
+        {
+            return event.m_spEventImpl->m_dev;
+        }
+    };
+
+    //! The HIP manual-trigger test event completion test trait specialization.
+    template<>
+    struct IsComplete<test::EventHostManualTriggerHip>
+    {
+        //! \return If the event is not waiting within a queue (not enqueued or already handled).
+        ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerHip const& event) -> bool
+        {
+            std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
+
+            return event.m_spEventImpl->m_bIsReady;
+        }
+    };
+
+    template<>
+    struct Enqueue<QueueHipRtNonBlocking, test::EventHostManualTriggerHip>
+    {
+        using TApi = alpaka::ApiHipRt;
+
+        //! Marks the event as enqueued and polls its device flag until it reaches the triggered value.
+        //! The wait is emulated by host-side polling, so this call blocks the calling thread.
+        ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, test::EventHostManualTriggerHip& event)
+            -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+            auto spEventImpl(event.m_spEventImpl);
+
+            {
+                // Only the state change happens under the mutex. trigger() locks the same mutex before it writes
+                // the device flag, so keeping it locked while polling below could never finish (deadlock).
+                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+                // The event should not yet be enqueued; set its state to enqueued.
+                ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+                spEventImpl->m_bIsReady = false;
+            }
+
+            // Known issue from the PGI Profiler's User Guide (Events and Metrics): in event or metric profiling,
+            // kernel launches are blocking, so kernels waiting on host updates may hang. This includes host-device
+            // synchronization built upon value-based CUDA queue synchronization APIs such as cuStreamWaitValue32()
+            // and cuStreamWriteValue32().
+            int32_t hostMem = 0;
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
+            std::cerr << "[Workaround] polling of device-located value in stream, as hipStreamWaitValue32 is not "
+                         "available.\n";
+#    endif
+            while(hostMem < 0x0101'0101)
+            {
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipMemcpyDtoHAsync(
+                    &hostMem,
+                    reinterpret_cast<hipDeviceptr_t>(spEventImpl->m_devMem),
+                    sizeof(int32_t),
+                    queue.getNativeHandle()));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipStreamSynchronize(queue.getNativeHandle()));
+            }
+        }
+    };
+
+    template<>
+    struct Enqueue<QueueHipRtBlocking, test::EventHostManualTriggerHip>
+    {
+        using TApi = alpaka::ApiHipRt;
+
+        //! Marks the event as enqueued and polls its device flag every 10 ms until it reaches the triggered
+        //! value. The queue argument is unused.
+        ALPAKA_FN_HOST static auto enqueue(QueueHipRtBlocking& /* queue */, test::EventHostManualTriggerHip& event)
+            -> void
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
+            auto spEventImpl(event.m_spEventImpl);
+
+            {
+                // Only the state change happens under the mutex. trigger() locks the same mutex before it writes
+                // the device flag, so keeping it locked while polling below could never finish (deadlock).
+                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
+                // The event should not yet be enqueued; set its state to enqueued.
+                ALPAKA_ASSERT(spEventImpl->m_bIsReady);
+                spEventImpl->m_bIsReady = false;
+            }
+
+            // Known issue from the PGI Profiler's User Guide (Events and Metrics): in event or metric profiling,
+            // kernel launches are blocking, so kernels waiting on host updates may hang. This includes host-device
+            // synchronization built upon value-based HIP queue synchronization APIs such as cuStreamWaitValue32()
+            // and cuStreamWriteValue32().
+
+            // workaround for missing cuStreamWaitValue32 in HIP
+            std::uint32_t hmem = 0;
+            do
+            {
+                std::this_thread::sleep_for(std::chrono::milliseconds(10u));
+                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+                    hipMemcpy(&hmem, spEventImpl->m_devMem, sizeof(std::uint32_t), hipMemcpyDefault));
+            } while(hmem < 0x0101'0101u);
+        }
+    };
+} // namespace alpaka::trait
+#endif
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+namespace alpaka
+{
+    namespace test
+    {
+        template<typename TTag>
+        class EventHostManualTriggerSycl
+        {
+        public:
+            EventHostManualTriggerSycl(DevGenericSycl<TTag> const&)
+            {
+            }
+            //! Placeholder without effect; isSupported below reports manual triggering as unsupported for SYCL.
+            auto trigger()
+            {
+            }
+        };
+
+        namespace trait
+        {
+            template<typename TTag>
+            struct EventHostManualTriggerType<DevGenericSycl<TTag>>
+            {
+                using type = alpaka::test::EventHostManualTriggerSycl<TTag>;
+            };
+            //! The SYCL event host manual trigger support get trait specialization. Always reports "not supported".
+            template<typename TTag>
+            struct IsEventHostManualTriggerSupported<DevGenericSycl<TTag>>
+            {
+                ALPAKA_FN_HOST static auto isSupported(DevGenericSycl<TTag> const&) -> bool
+                {
+                    return false;
+                }
+            };
+        } // namespace trait
+    } // namespace test
+
+    namespace trait
+    {
+        template<typename TTag>
+        struct Enqueue<QueueGenericSyclBlocking<TTag>, test::EventHostManualTriggerSycl<TTag>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueGenericSyclBlocking<TTag>& /* queue */,
+                test::EventHostManualTriggerSycl<TTag>& /* event */) -> void
+            {
+            }
+        };
+        //! No-op like the blocking variant: the SYCL placeholder event is not enqueued anywhere.
+        template<typename TTag>
+        struct Enqueue<QueueGenericSyclNonBlocking<TTag>, test::EventHostManualTriggerSycl<TTag>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueGenericSyclNonBlocking<TTag>& /* queue */,
+                test::EventHostManualTriggerSycl<TTag>& /* event */) -> void
+            {
+            }
+        };
+        //! The SYCL placeholder event always reports completion.
+        template<typename TTag>
+        struct IsComplete<test::EventHostManualTriggerSycl<TTag>>
+        {
+            ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerSycl<TTag> const& /* event */) -> bool
+            {
+                return true;
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+#endif
diff --git a/include/alpaka/test/idx/TestIdxs.hpp b/include/alpaka/test/idx/TestIdxs.hpp
new file mode 100644
index 0000000..19bf5a9
--- /dev/null
+++ b/include/alpaka/test/idx/TestIdxs.hpp
@@ -0,0 +1,28 @@
+/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <tuple>
+
+namespace alpaka::test
+{
+    //! A std::tuple holding idx types; std::int64_t and std::uint32_t are only included if ALPAKA_CI is not defined.
+    using TestIdxs = std::tuple<
+    // size_t is most probably identical to either std::uint64_t or std::uint32_t.
+    // Listing it would lead to duplicate tests (especially test names), which is not allowed.
+    // std::size_t,
+#if !defined(ALPAKA_CI)
+        std::int64_t,
+#endif
+        std::uint64_t,
+        std::int32_t
+#if !defined(ALPAKA_CI)
+        ,
+        std::uint32_t
+#endif
+        // index type must be >=32bit
+        >;
+} // namespace alpaka::test
diff --git a/include/alpaka/test/mem/view/Iterator.hpp b/include/alpaka/test/mem/view/Iterator.hpp
new file mode 100644
index 0000000..314d1c0
--- /dev/null
+++ b/include/alpaka/test/mem/view/Iterator.hpp
@@ -0,0 +1,143 @@
+/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Bernhard Manfred Gruber, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/alpaka.hpp"
+
+#include <type_traits>
+
+namespace alpaka::test
+{
+    namespace trait
+    {
+        //! \tparam T Type that is made const if TSource is const and non-const otherwise.
+        //! \tparam TSource Type to mimic the constness of.
+        template<typename T, typename TSource>
+        using MimicConst = std::conditional_t<std::is_const_v<TSource>, std::add_const_t<T>, std::remove_const_t<T>>;
+
+        template<typename TView, typename TSfinae = void>
+        class IteratorView
+        {
+            using TViewDecayed = std::decay_t<TView>;
+            using Dim = alpaka::Dim<TViewDecayed>;
+            using Idx = alpaka::Idx<TViewDecayed>;
+            using Elem = MimicConst<alpaka::Elem<TViewDecayed>, TView>;
+            // Iterator over a view that stores a linear index and resolves it against extents and pitches on access.
+        public:
+            ALPAKA_FN_HOST IteratorView(TView& view, Idx const idx)
+                : m_nativePtr(getPtrNative(view))
+                , m_currentIdx(idx)
+                , m_extents(getExtents(view))
+                , m_pitchBytes(getPitchesInBytes(view))
+            {
+            }
+            //! Creates an iterator positioned at linear index 0.
+            ALPAKA_FN_HOST explicit IteratorView(TView& view) : IteratorView(view, 0)
+            {
+            }
+            //! Pre-increment: advances the linear index by one.
+            ALPAKA_FN_HOST_ACC auto operator++() -> IteratorView&
+            {
+                ++m_currentIdx;
+                return *this;
+            }
+            //! Pre-decrement: moves the linear index back by one.
+            ALPAKA_FN_HOST_ACC auto operator--() -> IteratorView&
+            {
+                --m_currentIdx;
+                return *this;
+            }
+            //! Post-increment: advances the linear index and returns a copy of the previous iterator.
+            ALPAKA_FN_HOST_ACC auto operator++(int) -> IteratorView
+            {
+                IteratorView iterCopy = *this;
+                m_currentIdx++;
+                return iterCopy;
+            }
+            //! Post-decrement: moves the linear index back and returns a copy of the previous iterator.
+            ALPAKA_FN_HOST_ACC auto operator--(int) -> IteratorView
+            {
+                IteratorView iterCopy = *this;
+                m_currentIdx--;
+                return iterCopy;
+            }
+            //! Compares only the linear indices. Takes a const reference, so temporaries can be compared as well.
+            template<typename TIter>
+            ALPAKA_FN_HOST_ACC auto operator==(TIter const& other) const -> bool
+            {
+                return m_currentIdx == other.m_currentIdx;
+            }
+            //! Compares only the linear indices. Takes a const reference, so temporaries can be compared as well.
+            template<typename TIter>
+            ALPAKA_FN_HOST_ACC auto operator!=(TIter const& other) const -> bool
+            {
+                return m_currentIdx != other.m_currentIdx;
+            }
+            //! \return A reference to the element the linear index maps to (via mapIdx and the pitches in bytes).
+            ALPAKA_FN_HOST_ACC auto operator*() const -> Elem&
+            {
+                if constexpr(Dim::value == 0)
+                    return *m_nativePtr;
+                else
+                {
+                    Vec<Dim, Idx> const currentIdxDimx
+                        = mapIdx<Dim::value>(Vec<DimInt<1>, Idx>{m_currentIdx}, m_extents);
+                    auto const offsetInBytes = (currentIdxDimx * m_pitchBytes).sum();
+                    using QualifiedByte = MimicConst<std::byte, Elem>;
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+                    // "cast from 'Byte*' to 'Elem*' increases required alignment of target type"
+#    pragma GCC diagnostic ignored "-Wcast-align"
+#endif
+                    return *reinterpret_cast<Elem*>(reinterpret_cast<QualifiedByte*>(m_nativePtr) + offsetInBytes);
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+                }
+                ALPAKA_UNREACHABLE(*m_nativePtr);
+            }
+
+        private:
+            Elem* m_nativePtr;
+            Idx m_currentIdx;
+            Vec<Dim, Idx> m_extents;
+            Vec<Dim, Idx> m_pitchBytes;
+        };
+
+        //! Default begin/end traits: both build an IteratorView from the view and a linear index.
+        template<typename TView, typename TSfinae = void>
+        struct Begin
+        {
+            ALPAKA_FN_HOST static auto begin(TView& view) -> IteratorView<TView>
+            {
+                return IteratorView<TView>(view);
+            }
+        };
+
+        template<typename TView, typename TSfinae = void>
+        struct End
+        {
+            ALPAKA_FN_HOST static auto end(TView& view) -> IteratorView<TView>
+            {
+                return IteratorView<TView>(view, getExtents(view).prod()); // index = product of all extents
+            }
+        };
+    } // namespace trait
+
+    template<typename TView>
+    using Iterator = trait::IteratorView<TView>;
+    //! \return The iterator produced by trait::Begin for the given view.
+    template<typename TView>
+    ALPAKA_FN_HOST auto begin(TView& view) -> Iterator<TView>
+    {
+        return trait::Begin<TView>::begin(view);
+    }
+    //! \return The iterator produced by trait::End for the given view.
+    template<typename TView>
+    ALPAKA_FN_HOST auto end(TView& view) -> Iterator<TView>
+    {
+        return trait::End<TView>::end(view);
+    }
+} // namespace alpaka::test
diff --git a/include/alpaka/test/mem/view/ViewTest.hpp b/include/alpaka/test/mem/view/ViewTest.hpp
new file mode 100644
index 0000000..eef3b5a
--- /dev/null
+++ b/include/alpaka/test/mem/view/ViewTest.hpp
@@ -0,0 +1,264 @@
+/* Copyright 2023 Benjamin Worpitz, Sergei Bastrakov, René Widera, Bernhard Manfred Gruber, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/alpaka.hpp"
+#include "alpaka/test/KernelExecutionFixture.hpp"
+#include "alpaka/test/mem/view/Iterator.hpp"
+
+#include <catch2/catch_test_macros.hpp>
+
+#include <numeric>
+#include <type_traits>
+
+//! The test specifics.
+namespace alpaka::test
+{
+    template<typename TElem, typename TDim, typename TIdx, typename TDev, typename TView>
+    ALPAKA_FN_HOST auto testViewImmutable(
+        TView const& view,
+        TDev const& dev,
+        Vec<TDim, TIdx> const& extent,
+        Vec<TDim, TIdx> const& offset) -> void
+    {
+        // trait::DevType
+        {
+            static_assert(
+                std::is_same_v<Dev<TView>, TDev>,
+                "The device type of the view has to be equal to the specified one.");
+        }
+
+        // trait::GetDev
+        {
+            REQUIRE(dev == getDev(view));
+        }
+
+        // trait::DimType
+        {
+            static_assert(
+                Dim<TView>::value == TDim::value,
+                "The dimensionality of the view has to be equal to the specified one.");
+        }
+
+        // trait::ElemType
+        {
+            static_assert(
+                std::is_same_v<Elem<TView>, TElem>,
+                "The element type of the view has to be equal to the specified one.");
+        }
+
+        // trait::GetExtents
+        {
+            REQUIRE(extent == getExtents(view));
+        }
+
+        // getPitchesInBytes: every pitch of the view has to be at least the minimal pitch computed from the extent.
+        {
+            auto const pitchMinimum = alpaka::detail::calculatePitchesFromExtents<TElem>(extent);
+            auto const pitchView = getPitchesInBytes(view);
+
+            for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
+            {
+                REQUIRE(pitchView[i - 1] >= pitchMinimum[i - 1]);
+            }
+        }
+
+        // trait::GetPtrNative
+        {
+            // The view is a const& so the pointer has to point to a const value.
+            using NativePtr = decltype(getPtrNative(view));
+            static_assert(std::is_pointer_v<NativePtr>, "The value returned by getPtrNative has to be a pointer.");
+            static_assert(
+                std::is_const_v<std::remove_pointer_t<NativePtr>>,
+                "The value returned by getPtrNative has to be const when the view is const.");
+
+            if(getExtentProduct(view) != static_cast<TIdx>(0u))
+            {
+                // The pointer is only required to be non-null when the extent is > 0.
+                TElem const* const invalidPtr(nullptr);
+                REQUIRE(invalidPtr != getPtrNative(view));
+            }
+            else
+            {
+                // When the extent is 0, the pointer is undefined but it should still be possible to get it.
+                getPtrNative(view);
+            }
+        }
+
+        // trait::GetOffsets
+        {
+            REQUIRE(offset == getOffsets(view));
+        }
+
+        // trait::IdxType
+        {
+            static_assert(
+                std::is_same_v<Idx<TView>, TIdx>,
+                "The idx type of the view has to be equal to the specified one.");
+        }
+    }
+
+    //! Kernel checking that every byte of each element in [begin, end) equals byte; sets *success to false if not.
+    struct VerifyBytesSetKernel
+    {
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TAcc, typename TIter>
+        ALPAKA_FN_ACC void operator()(
+            TAcc const& acc [[maybe_unused]], // used by SYCL back-end
+            bool* success,
+            TIter const& begin,
+            TIter const& end,
+            std::uint8_t const& byte) const
+        {
+            constexpr auto elemSizeInByte = static_cast<unsigned>(sizeof(decltype(*begin)));
+            for(auto it = begin; it != end; ++it)
+            {
+                auto const& elem = *it;
+                auto const pBytes = reinterpret_cast<std::uint8_t const*>(&elem); // byte-wise view of the element
+                for(unsigned i = 0; i < elemSizeInByte; ++i)
+                {
+                    if(pBytes[i] != byte)
+                    {
+                        printf("Byte at offset %u is different: %u != %u\n", i, unsigned{pBytes[i]}, unsigned{byte});
+                        *success = false;
+                    }
+                }
+            }
+        }
+    };
+
+    template<typename TAcc, typename TView>
+    ALPAKA_FN_HOST auto verifyBytesSet(TView const& view, std::uint8_t const& byte) -> void
+    {
+        // Runs VerifyBytesSetKernel over the whole view using a fixture built from an all-ones extent.
+        using ViewDim = Dim<TView>;
+        using ViewIdx = Idx<TView>;
+        KernelExecutionFixture<TAcc> fixture(Vec<ViewDim, ViewIdx>::ones());
+
+        VerifyBytesSetKernel kernel;
+
+        REQUIRE(fixture(kernel, test::begin(view), test::end(view), byte));
+    }
+
+    //! Kernel comparing [beginA, endA) element-wise with the range starting at beginB via ALPAKA_CHECK.
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wfloat-equal" // "comparing floating point with == or != is unsafe"
+#endif
+    struct VerifyViewsEqualKernel
+    {
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TAcc, typename TIterA, typename TIterB>
+        ALPAKA_FN_ACC void operator()(
+            TAcc const& acc [[maybe_unused]], // used by SYCL back-end
+            bool* success,
+            TIterA beginA,
+            TIterA const& endA,
+            TIterB beginB) const
+        {
+            for(; beginA != endA; ++beginA, ++beginB)
+            {
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wfloat-equal" // "comparing floating point with == or != is unsafe"
+#endif
+                ALPAKA_CHECK(*success, *beginA == *beginB);
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
+            }
+        }
+    };
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
+
+    template<typename TAcc, typename TViewB, typename TViewA>
+    ALPAKA_FN_HOST auto verifyViewsEqual(TViewA const& viewA, TViewB const& viewB) -> void
+    {
+        using DimA = Dim<TViewA>;
+        using IdxA = Idx<TViewA>;
+        using DimB = Dim<TViewB>;
+        using IdxB = Idx<TViewB>;
+        // Both views have to agree in dimensionality and index type before they are compared.
+        static_assert(DimA::value == DimB::value, "viewA and viewB are required to have identical Dim");
+        static_assert(std::is_same_v<IdxA, IdxB>, "viewA and viewB are required to have identical Idx");
+        // Runs VerifyViewsEqualKernel over both views; the fixture is built from an all-ones vector.
+        test::KernelExecutionFixture<TAcc> fixture(Vec<DimA, IdxA>::ones());
+        VerifyViewsEqualKernel kernel;
+
+        REQUIRE(fixture(kernel, test::begin(viewA), test::end(viewA), test::begin(viewB)));
+    }
+
+    //! Writes the sequence 0, 1, 2, ... into the given view by staging it in host memory and copying it over.
+    template<typename TView, typename TQueue>
+    ALPAKA_FN_HOST auto iotaFillView(TQueue& queue, TView& view) -> void
+    {
+        using Value = Elem<TView>;
+
+        auto const viewExtent = getExtents(view);
+
+        // Stage the ascending sequence in a host-side vector holding extent.prod() elements.
+        std::vector<Value> staged(static_cast<std::size_t>(viewExtent.prod()), static_cast<Value>(0));
+        std::iota(std::begin(staged), std::end(staged), static_cast<Value>(0));
+
+        // Wrap the vector in a view bound to the CPU device with index 0.
+        auto const hostPlatform = alpaka::PlatformCpu{};
+        auto const hostDev = alpaka::getDevByIdx(hostPlatform, 0);
+        auto hostView = createView(hostDev, staged, viewExtent);
+
+        // Enqueue the copy and wait for the queue while `staged` is still alive.
+        memcpy(queue, view, hostView);
+        wait(queue);
+    }
+
+    template<typename TAcc, typename TView, typename TQueue>
+    ALPAKA_FN_HOST auto testViewMutable(TQueue& queue, TView& view) -> void
+    {
+        // trait::GetPtrNative (compile-time check; the set and copy sections below run on the given queue)
+        {
+            // The view is non-const, so the pointer has to point to a non-const value.
+            using NativePtr = decltype(getPtrNative(view));
+            static_assert(std::is_pointer_v<NativePtr>, "The value returned by getPtrNative has to be a pointer.");
+            static_assert(
+                !std::is_const_v<std::remove_pointer_t<NativePtr>>,
+                "The value returned by getPtrNative has to be non-const when the view is non-const.");
+        }
+
+        // set: fill the view with a single byte value, then verify every byte on the accelerator
+        {
+            auto const byte(static_cast<std::uint8_t>(42u));
+            memset(queue, view, byte);
+            wait(queue);
+            verifyBytesSet<TAcc>(view, byte);
+        }
+
+        // copy: exercise the view both as destination and as source of a memcpy
+        {
+            using Elem = Elem<TView>;
+            using Idx = Idx<TView>;
+            // Both scratch buffers live on the view's device and have the view's extent.
+            auto const devAcc = getDev(view);
+            auto const extent = getExtents(view);
+
+            // copy into given view (source buffer is filled with 0, 1, 2, ... first)
+            {
+                auto srcBufAcc = allocBuf<Elem, Idx>(devAcc, extent);
+                iotaFillView(queue, srcBufAcc);
+                memcpy(queue, view, srcBufAcc);
+                wait(queue);
+                verifyViewsEqual<TAcc>(view, srcBufAcc);
+            }
+
+            // copy from given view (the view still holds the data written by the previous step)
+            {
+                auto dstBufAcc = allocBuf<Elem, Idx>(devAcc, extent);
+                memcpy(queue, dstBufAcc, view);
+                wait(queue);
+                verifyViewsEqual<TAcc>(dstBufAcc, view);
+            }
+        }
+    }
+} // namespace alpaka::test
diff --git a/include/alpaka/test/queue/Queue.hpp b/include/alpaka/test/queue/Queue.hpp
new file mode 100644
index 0000000..0518e6d
--- /dev/null
+++ b/include/alpaka/test/queue/Queue.hpp
@@ -0,0 +1,146 @@
+/* Copyright 2024 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci,
+ * Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/alpaka.hpp"
+
+namespace alpaka::test
+{
+    namespace trait
+    {
+        //! The default queue type trait for devices. Specializations expose the queue type as the member `type`.
+        template<typename TDev, typename TSfinae = void>
+        struct DefaultQueueType;
+
+        //! Specialization for the CPU device: blocking queue if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL, else non-blocking.
+        template<>
+        struct DefaultQueueType<DevCpu>
+        {
+#if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            using type = QueueCpuBlocking;
+#else
+            using type = QueueCpuNonBlocking;
+#endif
+        };
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+        //! Specialization for the CUDA/HIP device; the same debug-level switch selects blocking or non-blocking.
+        template<typename TApi>
+        struct DefaultQueueType<DevUniformCudaHipRt<TApi>>
+        {
+#    if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            using type = QueueUniformCudaHipRtBlocking<TApi>;
+#    else
+            using type = QueueUniformCudaHipRtNonBlocking<TApi>;
+#    endif
+        };
+#endif
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+        //! Specialization for the SYCL device; the same debug-level switch selects blocking or non-blocking.
+        template<typename TTag>
+        struct DefaultQueueType<DevGenericSycl<TTag>>
+        {
+#    if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            using type = QueueGenericSyclBlocking<TTag>;
+#    else
+            using type = QueueGenericSyclNonBlocking<TTag>;
+#    endif
+        };
+#endif
+
+        //! The blocking queue trait. Specializations provide a static constexpr member `value`.
+        template<typename TQueue, typename TSfinae = void>
+        struct IsBlockingQueue;
+
+        //! Specialization for the generic-threads blocking queue (presumably what the CPU blocking queue is).
+        template<typename TDev>
+        struct IsBlockingQueue<QueueGenericThreadsBlocking<TDev>>
+        {
+            static constexpr bool value = true;
+        };
+
+        //! Specialization for the generic-threads non-blocking queue (presumably the CPU non-blocking queue).
+        template<typename TDev>
+        struct IsBlockingQueue<QueueGenericThreadsNonBlocking<TDev>>
+        {
+            static constexpr bool value = false;
+        };
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+        //! The blocking queue trait specialization for a blocking CUDA/HIP RT queue.
+        template<typename TApi>
+        struct IsBlockingQueue<QueueUniformCudaHipRtBlocking<TApi>>
+        {
+            static constexpr bool value = true;
+        };
+
+        //! The blocking queue trait specialization for a non-blocking CUDA/HIP RT queue.
+        template<typename TApi>
+        struct IsBlockingQueue<QueueUniformCudaHipRtNonBlocking<TApi>>
+        {
+            static constexpr bool value = false;
+        };
+#endif
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+        template<typename TTag>
+        struct IsBlockingQueue<QueueGenericSyclBlocking<TTag>>
+        {
+            static constexpr auto value = true; //!< The blocking SYCL queue; `auto` deduces bool here.
+        };
+
+        template<typename TTag>
+        struct IsBlockingQueue<QueueGenericSyclNonBlocking<TTag>>
+        {
+            static constexpr auto value = false; //!< The non-blocking SYCL queue; `auto` deduces bool here.
+        };
+#endif
+    } // namespace trait
+
+    //! The queue type that should be used for the given device.
+    template<typename TDev>
+    using DefaultQueue = typename trait::DefaultQueueType<TDev>::type;
+
+    //! The trait telling whether the given queue type is blocking; the answer is its member `value`.
+    template<typename TQueue>
+    using IsBlockingQueue = trait::IsBlockingQueue<TQueue>;
+
+    //! A std::tuple of (device, queue) tuples: both CPU queues always, plus those of each enabled back-end.
+    using TestQueues = std::tuple<
+        std::tuple<DevCpu, QueueCpuBlocking>,
+        std::tuple<DevCpu, QueueCpuNonBlocking>
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+        ,
+        std::tuple<DevCudaRt, QueueCudaRtBlocking>,
+        std::tuple<DevCudaRt, QueueCudaRtNonBlocking>
+#endif
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+        ,
+        std::tuple<DevHipRt, QueueHipRtBlocking>,
+        std::tuple<DevHipRt, QueueHipRtNonBlocking>
+#endif
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+#    ifdef ALPAKA_SYCL_ONEAPI_CPU
+        ,
+        std::tuple<alpaka::DevCpuSycl, alpaka::QueueCpuSyclBlocking>,
+        std::tuple<alpaka::DevCpuSycl, alpaka::QueueCpuSyclNonBlocking>
+#    endif
+#    ifdef ALPAKA_SYCL_ONEAPI_FPGA
+        ,
+        std::tuple<alpaka::DevFpgaSyclIntel, alpaka::QueueFpgaSyclIntelBlocking>,
+        std::tuple<alpaka::DevFpgaSyclIntel, alpaka::QueueFpgaSyclIntelNonBlocking>
+#    endif
+#    ifdef ALPAKA_SYCL_ONEAPI_GPU
+        ,
+        std::tuple<alpaka::DevGpuSyclIntel, alpaka::QueueGpuSyclIntelBlocking>,
+        std::tuple<alpaka::DevGpuSyclIntel, alpaka::QueueGpuSyclIntelNonBlocking>
+#    endif
+#endif
+        >;
+} // namespace alpaka::test
diff --git a/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp b/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
new file mode 100644
index 0000000..4b346c8
--- /dev/null
+++ b/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
@@ -0,0 +1,297 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/event/EventCpu.hpp"
+#include "alpaka/event/Traits.hpp"
+#include "alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp"
+#include "alpaka/queue/QueueCpuBlocking.hpp"
+#include "alpaka/queue/Traits.hpp"
+#include "alpaka/queue/cpu/ICpuQueue.hpp"
+#include "alpaka/test/event/EventHostManualTrigger.hpp"
+#include "alpaka/test/queue/Queue.hpp"
+#include "alpaka/wait/Traits.hpp"
+
+#include <atomic>
+#include <mutex>
+
+#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
+
+#    if _OPENMP < 200203
+#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
+#    endif
+
+#    include <omp.h>
+
+namespace alpaka
+{
+    namespace cpu::detail
+    {
+#    if BOOST_COMP_CLANG
+// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
+// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wweak-vtables"
+#    endif
+        //! The CPU collective device queue implementation: a wrapped QueueCpuBlocking plus a running-task counter.
+        struct QueueCpuOmp2CollectiveImpl final : cpu::ICpuQueue
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+        {
+            explicit QueueCpuOmp2CollectiveImpl(DevCpu const& dev) noexcept : m_dev(dev), blockingQueue(dev)
+            {
+            }
+            //! Forwards the event to alpaka::enqueue with this implementation object as the queue.
+            void enqueue(EventCpu& ev) final
+            {
+                alpaka::enqueue(*this, ev);
+            }
+            //! Forwards the event to alpaka::wait with this implementation object as the waiter.
+            void wait(EventCpu const& ev) final
+            {
+                alpaka::wait(*this, ev);
+            }
+            //! Spins, without sleeping or yielding, until empty(blockingQueue) returns true.
+            void busyWaitUntilBlockingQueueEmpty()
+            {
+                while(!empty(blockingQueue))
+                    ;
+            }
+
+            DevCpu const m_dev; //!< The device this queue is bound to.
+            std::mutex mutable m_mutex; //!< Held while tasks are enqueued or waited for outside parallel regions.
+            QueueCpuBlocking blockingQueue; //!< Receives everything enqueued outside of an OpenMP parallel region.
+            std::atomic<uint32_t> m_uCurrentlyExecutingTask = 0; //!< Raised while a task runs in a parallel region.
+        };
+    } // namespace cpu::detail
+
+    //! The CPU collective device queue.
+    //
+    // @attention This queue can only be used together with the accelerator AccCpuOmp2Blocks.
+    //
+    // This queue is an example of a user-provided queue; its behavior is strongly coupled
+    // to the user's workflow.
+    //
+    // Inside an OpenMP parallel region, kernels are executed collectively by the threads of the region.
+    // All other operations are performed by a single thread (which one is unspecified), and there is no
+    // implicit synchronization between those operations within the parallel region. Operations executed
+    // inside a parallel region run only after the tasks that were enqueued before the region was entered
+    // have completed.
+    //
+    // Outside of an OpenMP parallel region the queue behaves like QueueCpuBlocking.
+    struct QueueCpuOmp2Collective final : concepts::Implements<ConceptCurrentThreadWaitFor, QueueCpuOmp2Collective>
+    {
+        explicit QueueCpuOmp2Collective(DevCpu const& dev)
+            : m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuOmp2CollectiveImpl>(dev))
+        {
+            dev.registerQueue(m_spQueueImpl); // the device is handed the shared implementation object
+        }
+        //! Two handles compare equal when they share the same implementation object.
+        auto operator==(QueueCpuOmp2Collective const& rhs) const -> bool
+        {
+            return m_spQueueImpl == rhs.m_spQueueImpl;
+        }
+        //! Negation of operator==.
+        auto operator!=(QueueCpuOmp2Collective const& rhs) const -> bool
+        {
+            return !((*this) == rhs);
+        }
+        //! The shared implementation; copies of this handle refer to the same implementation object.
+        std::shared_ptr<cpu::detail::QueueCpuOmp2CollectiveImpl> m_spQueueImpl;
+    };
+
+    namespace trait
+    {
+        //! The CPU OpenMP2 collective queue device type trait specialization.
+        template<>
+        struct DevType<QueueCpuOmp2Collective>
+        {
+            using type = DevCpu;
+        };
+
+        //! The CPU OpenMP2 collective queue device get trait specialization; returns a copy of the stored device.
+        template<>
+        struct GetDev<QueueCpuOmp2Collective>
+        {
+            ALPAKA_FN_HOST static auto getDev(QueueCpuOmp2Collective const& queue) -> DevCpu
+            {
+                return queue.m_spQueueImpl->m_dev;
+            }
+        };
+
+        //! The CPU OpenMP2 collective queue event type trait specialization.
+        template<>
+        struct EventType<QueueCpuOmp2Collective>
+        {
+            using type = EventCpu;
+        };
+
+        //! The CPU OpenMP2 collective queue enqueue trait specialization for generic tasks.
+        //! Inside a parallel region one thread invokes the task; outside, it goes to the wrapped blocking queue.
+        template<typename TTask>
+        struct Enqueue<QueueCpuOmp2Collective, TTask>
+        {
+            ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, TTask const& task) -> void
+            {
+                if(::omp_in_parallel() != 0)
+                {
+                    // wait for all tasks enqueued before the parallel region
+                    queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
+                    ++queue.m_spQueueImpl->m_uCurrentlyExecutingTask; // done by every calling thread
+#    pragma omp single nowait
+                    task(); // the pragma covers only this statement: one thread runs it, no barrier follows
+                    --queue.m_spQueueImpl->m_uCurrentlyExecutingTask; // done by every calling thread
+                }
+                else
+                {
+                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+                    alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, task);
+                }
+            }
+        };
+
+        //! Emptiness query for the CPU OpenMP2 collective queue: no task counted as running and wrapped queue empty.
+        template<>
+        struct Empty<QueueCpuOmp2Collective>
+        {
+            ALPAKA_FN_HOST static auto empty(QueueCpuOmp2Collective const& queue) -> bool
+            {
+                auto& impl = *queue.m_spQueueImpl;
+                return impl.m_uCurrentlyExecutingTask != 0u ? false : alpaka::empty(impl.blockingQueue);
+            }
+        };
+
+        //! The event enqueue trait specialization for the queue implementation type: only an OpenMP barrier.
+        template<>
+        struct Enqueue<cpu::detail::QueueCpuOmp2CollectiveImpl, EventCpu>
+        {
+            ALPAKA_FN_HOST static auto enqueue(cpu::detail::QueueCpuOmp2CollectiveImpl&, EventCpu&) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    pragma omp barrier
+            }
+        };
+
+        //! The CPU OpenMP2 collective queue event enqueue trait specialization.
+        template<>
+        struct Enqueue<QueueCpuOmp2Collective, EventCpu>
+        {
+            ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, EventCpu& event) -> void
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+                if(::omp_in_parallel() != 0)
+                {
+                    // wait for all tasks enqueued before the parallel region; the event itself is not touched here
+                    queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
+#    pragma omp barrier
+                }
+                else
+                {
+                    // NOTE(review): unlike the task overloads, m_mutex is not taken here - confirm this is intended
+                    alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, event);
+                }
+            }
+        };
+
+        //! The CPU OpenMP2 collective queue enqueue trait specialization for OpenMP2 block kernels.
+        //! Inside a parallel region every calling thread invokes the task itself (there is no `omp single`).
+        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
+        struct Enqueue<QueueCpuOmp2Collective, TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(
+                QueueCpuOmp2Collective& queue,
+                TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
+            {
+                if(::omp_in_parallel() != 0)
+                {
+                    queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty(); // drain work queued before the region
+                    ++queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
+                    // execute task within an OpenMP parallel region
+                    task();
+                    --queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
+                }
+                else
+                {
+                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+                    alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, task);
+                }
+            }
+        };
+
+        template<>
+        struct Enqueue<QueueCpuOmp2Collective, test::EventHostManualTriggerCpu<>>
+        {
+            ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective&, test::EventHostManualTriggerCpu<>&) -> void
+            {
+                // EventHostManualTriggerCpu is not supported together with the queue QueueCpuOmp2Collective,
+                // but a specialization is needed for the EventTests to pass. It intentionally does nothing.
+            }
+        };
+
+        //! The CPU OpenMP2 collective queue thread wait trait specialization.
+        //!
+        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
+        //! tasks (kernels, data copies, ...)
+        template<>
+        struct CurrentThreadWaitFor<QueueCpuOmp2Collective>
+        {
+            ALPAKA_FN_HOST static auto currentThreadWaitFor(QueueCpuOmp2Collective const& queue) -> void
+            {
+                if(::omp_in_parallel() != 0)
+                {
+                    // wait for all tasks enqueued before the parallel region, then meet at a barrier
+                    queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
+#    pragma omp barrier
+                }
+                else
+                {
+                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
+                    wait(queue.m_spQueueImpl->blockingQueue); // delegate to the wrapped blocking queue
+                }
+            }
+        };
+
+        //! The event wait trait specialization for the queue implementation type: only an OpenMP barrier.
+        template<>
+        struct WaiterWaitFor<cpu::detail::QueueCpuOmp2CollectiveImpl, EventCpu>
+        {
+            ALPAKA_FN_HOST static auto waiterWaitFor(cpu::detail::QueueCpuOmp2CollectiveImpl&, EventCpu const&) -> void
+            {
+#    pragma omp barrier
+            }
+        };
+
+        //! The CPU OpenMP2 collective queue event wait trait specialization.
+        template<>
+        struct WaiterWaitFor<QueueCpuOmp2Collective, EventCpu>
+        {
+            ALPAKA_FN_HOST static auto waiterWaitFor(QueueCpuOmp2Collective& queue, EventCpu const& event) -> void
+            {
+                if(::omp_in_parallel() != 0)
+                {
+                    // wait for all tasks enqueued before the parallel region
+                    queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
+                    wait(queue); // the event is ignored on this path; the whole queue is waited for instead
+                }
+                else // outside a parallel region: the wrapped blocking queue waits for the event
+                    wait(queue.m_spQueueImpl->blockingQueue, event);
+            }
+        };
+    } // namespace trait
+
+    //! The blocking queue trait specialization for the OpenMP2 collective CPU queue: it is reported as blocking.
+    template<>
+    struct test::trait::IsBlockingQueue<QueueCpuOmp2Collective> : std::true_type
+    {
+    };
+} // namespace alpaka
+
+#    include "alpaka/event/EventCpu.hpp"
+
+#endif
diff --git a/include/alpaka/test/queue/QueueTestFixture.hpp b/include/alpaka/test/queue/QueueTestFixture.hpp
new file mode 100644
index 0000000..ad6f815
--- /dev/null
+++ b/include/alpaka/test/queue/QueueTestFixture.hpp
@@ -0,0 +1,23 @@
+/* Copyright 2023 Benjamin Worpitz, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+#include "alpaka/alpaka.hpp"
+
+#include <tuple>
+
+namespace alpaka::test
+{
+    template<typename TDevQueue> // TDevQueue: a tuple-like type of (device type, queue type)
+    struct QueueTestFixture
+    {
+        using Dev = std::tuple_element_t<0, TDevQueue>; //!< First tuple element: the device type.
+        using Queue = std::tuple_element_t<1, TDevQueue>; //!< Second tuple element: the queue type.
+        using Platform = alpaka::Platform<Dev>;
+        // The members are initialized in declaration order; each initializer uses the member declared before it.
+        Platform m_platform{};
+        Dev m_dev{getDevByIdx(m_platform, 0)};
+        Queue m_queue{m_dev};
+    };
+} // namespace alpaka::test
diff --git a/include/alpaka/traits/Traits.hpp b/include/alpaka/traits/Traits.hpp
new file mode 100644
index 0000000..987a48a
--- /dev/null
+++ b/include/alpaka/traits/Traits.hpp
@@ -0,0 +1,37 @@
+/* Copyright 2022 Antonio Di Pilato
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+
+namespace alpaka
+{
+    //! The common trait.
+    namespace trait
+    {
+        //! The native handle trait. This primary template is the fallback: instantiating its function fails to compile.
+        template<typename TImpl, typename TSfinae = void>
+        struct NativeHandle
+        {
+            static auto getNativeHandle(TImpl const&)
+            {
+                static_assert(!sizeof(TImpl), "This type does not have a native handle!"); // depends on TImpl
+                return 0; // gives the `auto` return type something to deduce from
+            }
+        };
+    } // namespace trait
+
+    //! Get the native handle of the alpaka object by forwarding to trait::NativeHandle<TImpl>::getNativeHandle.
+    //! It will return the alpaka object handle if there is any, otherwise it generates a compile time error.
+    template<typename TImpl>
+    ALPAKA_FN_HOST auto getNativeHandle(TImpl const& impl)
+    {
+        return trait::NativeHandle<TImpl>::getNativeHandle(impl);
+    }
+
+    //! Alias to the type of the native handle, i.e. the type returned by getNativeHandle for a TImpl object.
+    template<typename TImpl>
+    using NativeHandle = decltype(getNativeHandle(std::declval<TImpl>()));
+} // namespace alpaka
diff --git a/include/alpaka/vec/Traits.hpp b/include/alpaka/vec/Traits.hpp
new file mode 100644
index 0000000..531fe04
--- /dev/null
+++ b/include/alpaka/vec/Traits.hpp
@@ -0,0 +1,102 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/meta/IntegerSequence.hpp"
+
+#include <utility>
+
+namespace alpaka
+{
+    //! The vec traits.
+    namespace trait
+    {
+        //! Trait for selecting a sub-vector; a specialization provides the static function subVecFromIndices(vec).
+        template<typename TVec, typename TIndexSequence, typename TSfinae = void>
+        struct SubVecFromIndices;
+
+        //! Trait for casting a vector; a specialization provides the static function castVec(vec).
+        template<typename TVal, typename TVec, typename TSfinae = void>
+        struct CastVec;
+
+        //! Trait for reversing a vector; a specialization provides the static function reverseVec(vec).
+        template<typename TVec, typename TSfinae = void>
+        struct ReverseVec;
+
+        //! Trait for concatenating two vectors; a specialization provides the static function concatVec(vecL, vecR).
+        template<typename TVecL, typename TVecR, typename TSfinae = void>
+        struct ConcatVec;
+    } // namespace trait
+
+    //! Builds a new vector by selecting the elements of the source vector in the order given by TIndexSequence.
+    //! Repeating and swizzling elements is allowed.
+    //! \return The sub-vector consisting of the elements specified by the indices.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TIndexSequence, typename TVec>
+    ALPAKA_FN_HOST_ACC constexpr auto subVecFromIndices(TVec const& vec)
+    {
+        return trait::SubVecFromIndices<TVec, TIndexSequence>::subVecFromIndices(vec);
+    }
+
+    //! \tparam TVec has to specialize SubVecFromIndices.
+    //! \return The sub-vector consisting of the first N elements of the source vector, with N = TSubDim::value.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TSubDim, typename TVec>
+    ALPAKA_FN_HOST_ACC constexpr auto subVecBegin(TVec const& vec)
+    {
+        static_assert(
+            TSubDim::value <= Dim<TVec>::value,
+            "The sub-Vec has to be smaller (or same size) then the original Vec.");
+
+        //! A sequence of integers from 0 to TSubDim::value-1.
+        using IdxSubSequence = std::make_integer_sequence<std::size_t, TSubDim::value>;
+        return subVecFromIndices<IdxSubSequence>(vec);
+    }
+
+    //! \tparam TVec has to specialize SubVecFromIndices.
+    //! \return The sub-vector consisting of the last N elements of the source vector, with N = TSubDim::value.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TSubDim, typename TVec>
+    ALPAKA_FN_HOST_ACC constexpr auto subVecEnd(TVec const& vec)
+    {
+        static_assert(
+            TSubDim::value <= Dim<TVec>::value,
+            "The sub-Vec has to be smaller (or same size) then the original Vec.");
+
+        constexpr std::size_t idxOffset = Dim<TVec>::value - TSubDim::value; // index of the first kept element
+
+        //! TSubDim::value indices shifted by idxOffset; presumably idxOffset ... dim-1 (confirm in IntegerSequence.hpp).
+        using IdxSubSequence = meta::MakeIntegerSequenceOffset<std::size_t, idxOffset, TSubDim::value>;
+        return subVecFromIndices<IdxSubSequence>(vec);
+    }
+
+    //! \return The cast vector, as produced by trait::CastVec<TVal, TVec>::castVec.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TVal, typename TVec>
+    ALPAKA_FN_HOST_ACC constexpr auto castVec(TVec const& vec)
+    {
+        return trait::CastVec<TVal, TVec>::castVec(vec);
+    }
+
+    //! \return The reversed vector, as produced by trait::ReverseVec<TVec>::reverseVec.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TVec>
+    ALPAKA_FN_HOST_ACC constexpr auto reverseVec(TVec const& vec)
+    {
+        return trait::ReverseVec<TVec>::reverseVec(vec);
+    }
+
+    //! \return The concatenated vector, as produced by trait::ConcatVec<TVecL, TVecR>::concatVec.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TVecL, typename TVecR>
+    ALPAKA_FN_HOST_ACC constexpr auto concatVec(TVecL const& vecL, TVecR const& vecR)
+    {
+        return trait::ConcatVec<TVecL, TVecR>::concatVec(vecL, vecR);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/vec/Vec.hpp b/include/alpaka/vec/Vec.hpp
new file mode 100644
index 0000000..d327f60
--- /dev/null
+++ b/include/alpaka/vec/Vec.hpp
@@ -0,0 +1,799 @@
+/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan,
+ *                Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Align.hpp"
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Unreachable.hpp"
+#include "alpaka/dim/DimIntegralConst.hpp"
+#include "alpaka/dim/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/meta/Fold.hpp"
+#include "alpaka/meta/Functional.hpp"
+#include "alpaka/meta/IntegerSequence.hpp"
+#include "alpaka/vec/Traits.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <ostream>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace alpaka
+{
+    template<typename TDim, typename TVal>
+    class Vec;
+
+    //! A n-dimensional vector.
+    template<typename TDim, typename TVal>
+    class Vec final
+    {
+    public:
+        static_assert(TDim::value >= 0u, "Invalid dimensionality");
+
+        using Dim = TDim;
+        using Val = TVal;
+        using value_type = Val; //!< STL-like value_type.
+
+    private:
+        //! A sequence of integers from 0 to dim-1.
+        //! This can be used to write compile time indexing algorithms.
+        using IdxSequence = std::make_integer_sequence<std::size_t, TDim::value>;
+
+    public:
+        ALPAKA_FN_HOST_ACC constexpr Vec() : m_data{}
+        {
+        }
+
+        //! Value constructor: initializes the elements, in order, from the given arguments (each cast to TVal).
+        //! Only available if the number of arguments equals TDim::value and each is convertible to TVal.
+        ALPAKA_NO_HOST_ACC_WARNING
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(11, 3, 0)                                              \
+    && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 4, 0)
+        // This constructor tries to avoid SFINAE, which crashes nvcc 11.3. We also need to have a first
+        // argument, so an unconstrained ctor with forwarding references does not hijack the compiler provided
+        // copy-ctor. The argument count and convertibility are checked by the static_assert in the body instead.
+        template<typename... TArgs>
+        ALPAKA_FN_HOST_ACC constexpr Vec(TVal arg0, TArgs&&... args)
+            : m_data{std::move(arg0), static_cast<TVal>(std::forward<TArgs>(args))...}
+        {
+            static_assert(
+                1 + sizeof...(TArgs) == TDim::value && (std::is_convertible_v<std::decay_t<TArgs>, TVal> && ...),
+                "Wrong number of arguments to Vec constructor or types are not convertible to TVal.");
+        }
+#else
+        template<
+            typename... TArgs,
+            typename = std::enable_if_t<
+                sizeof...(TArgs) == TDim::value && (std::is_convertible_v<std::decay_t<TArgs>, TVal> && ...)>>
+        ALPAKA_FN_HOST_ACC constexpr Vec(TArgs&&... args) : m_data{static_cast<TVal>(std::forward<TArgs>(args))...}
+        {
+        }
+#endif
+
+        //! Generator constructor.
+        //! Initializes the vector with the values returned from generator(IC) in order, where IC::value runs from 0 to
+        //! TDim - 1 (inclusive).
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(11, 3, 0)                                              \
+    && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 4, 0)
+        template<typename F>
+        ALPAKA_FN_HOST_ACC constexpr explicit Vec(
+            F&& generator,
+            std::void_t<decltype(generator(std::integral_constant<std::size_t, 0>{}))>* ignore = nullptr)
+            : Vec(std::forward<F>(generator), std::make_index_sequence<TDim::value>{})
+        {
+            static_cast<void>(ignore);
+        }
+#else
+        template<typename F, std::enable_if_t<std::is_invocable_v<F, std::integral_constant<std::size_t, 0>>, int> = 0>
+        ALPAKA_FN_HOST_ACC constexpr explicit Vec(F&& generator)
+            : Vec(std::forward<F>(generator), std::make_index_sequence<TDim::value>{})
+        {
+        }
+#endif
+
+    private:
+        template<typename F, std::size_t... Is>
+        ALPAKA_FN_HOST_ACC constexpr explicit Vec(F&& generator, std::index_sequence<Is...>)
+            : m_data{generator(std::integral_constant<std::size_t, Is>{})...}
+        {
+        }
+
+    public:
+        //! \brief Single value constructor.
+        //!
+        //! Creates a vector with all values set to val.
+        //! \param val The initial value.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC static constexpr auto all(TVal const& val) -> Vec<TDim, TVal>
+        {
+            Vec<TDim, TVal> v;
+            for(auto& e : v)
+                e = val;
+            return v;
+        }
+
+        //! Zero value constructor.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC static constexpr auto zeros() -> Vec<TDim, TVal>
+        {
+            return all(static_cast<TVal>(0));
+        }
+
+        //! One value constructor.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC static constexpr auto ones() -> Vec<TDim, TVal>
+        {
+            return all(static_cast<TVal>(1));
+        }
+
+        ALPAKA_FN_HOST_ACC constexpr auto begin() -> TVal*
+        {
+            return m_data;
+        }
+
+        ALPAKA_FN_HOST_ACC constexpr auto begin() const -> TVal const*
+        {
+            return m_data;
+        }
+
+        ALPAKA_FN_HOST_ACC constexpr auto end() -> TVal*
+        {
+            return m_data + TDim::value;
+        }
+
+        ALPAKA_FN_HOST_ACC constexpr auto end() const -> TVal const*
+        {
+            return m_data + TDim::value;
+        }
+
+        ALPAKA_FN_HOST_ACC constexpr auto front() -> TVal&
+        {
+            return m_data[0];
+        }
+
+        ALPAKA_FN_HOST_ACC constexpr auto front() const -> TVal const&
+        {
+            return m_data[0];
+        }
+
+        ALPAKA_FN_HOST_ACC constexpr auto back() -> TVal&
+        {
+            return m_data[Dim::value - 1];
+        }
+
+        ALPAKA_FN_HOST_ACC constexpr auto back() const -> TVal const&
+        {
+            return m_data[Dim::value - 1];
+        }
+
+        //! Access elements by name, counted from the back: x is the last element, y the second to last, and so on.
+        //!
+        //! names: x,y,z,w (available from dimensionality 1, 2, 3 and 4 respectively)
+        //! @{
+        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 1, int> = 0>
+        ALPAKA_FN_HOST_ACC constexpr decltype(auto) x() const
+        {
+            return m_data[Dim::value - 1];
+        }
+
+        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 1, int> = 0>
+        ALPAKA_FN_HOST_ACC constexpr decltype(auto) x()
+        {
+            return m_data[Dim::value - 1];
+        }
+
+        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 2, int> = 0>
+        ALPAKA_FN_HOST_ACC constexpr decltype(auto) y() const
+        {
+            return m_data[Dim::value - 2];
+        }
+
+        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 2, int> = 0>
+        ALPAKA_FN_HOST_ACC constexpr decltype(auto) y()
+        {
+            return m_data[Dim::value - 2];
+        }
+
+        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 3, int> = 0>
+        ALPAKA_FN_HOST_ACC constexpr decltype(auto) z() const
+        {
+            return m_data[Dim::value - 3];
+        }
+
+        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 3, int> = 0>
+        ALPAKA_FN_HOST_ACC constexpr decltype(auto) z()
+        {
+            return m_data[Dim::value - 3];
+        }
+
+        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 4, int> = 0>
+        ALPAKA_FN_HOST_ACC constexpr decltype(auto) w() const
+        {
+            return m_data[Dim::value - 4];
+        }
+
+        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 4, int> = 0>
+        ALPAKA_FN_HOST_ACC constexpr decltype(auto) w()
+        {
+            return m_data[Dim::value - 4];
+        }
+
+        //! @}
+
+        //! Value reference accessor at the given non-negative integer index.
+        //! \return A reference to the value at the given index.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TIdx, typename = std::enable_if_t<std::is_integral_v<TIdx>>>
+        ALPAKA_FN_HOST_ACC constexpr auto operator[](TIdx const iIdx) -> TVal&
+        {
+            core::assertValueUnsigned(iIdx); // the index must not be negative
+            auto const idx = static_cast<typename TDim::value_type>(iIdx);
+            core::assertGreaterThan<TDim>(idx); // TDim::value has to be greater than the index
+            return m_data[idx];
+        }
+
+        //! Value accessor at the given non-negative integer index.
+        //! \return A copy of the value at the given index.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TIdx, typename = std::enable_if_t<std::is_integral_v<TIdx>>>
+        ALPAKA_FN_HOST_ACC constexpr auto operator[](TIdx const iIdx) const -> TVal
+        {
+            core::assertValueUnsigned(iIdx); // the index must not be negative
+            auto const idx = static_cast<typename TDim::value_type>(iIdx);
+            core::assertGreaterThan<TDim>(idx); // TDim::value has to be greater than the index
+            return m_data[idx];
+        }
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TFnObj, std::size_t... TIndices>
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto foldrByIndices(
+            TFnObj const& f,
+            std::integer_sequence<std::size_t, TIndices...>) const
+        {
+            return meta::foldr(f, (*this)[TIndices]...);
+        }
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TFnObj, std::size_t... TIndices>
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto foldrByIndices(
+            TFnObj const& f,
+            std::integer_sequence<std::size_t, TIndices...>,
+            TVal initial) const
+        {
+            return meta::foldr(f, (*this)[TIndices]..., initial);
+        }
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TFnObj>
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto foldrAll(TFnObj const& f) const
+        {
+            return foldrByIndices(f, IdxSequence());
+        }
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TFnObj>
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto foldrAll(TFnObj const& f, TVal initial) const
+        {
+            return foldrByIndices(f, IdxSequence(), initial);
+        }
+
+// suppress strange warning produced by nvcc+MSVC in release mode
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(push)
+#    pragma warning(disable : 4702) // unreachable code
+#endif
+        //! \return The product of all values.
+        ALPAKA_NO_HOST_ACC_WARNING
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto prod() const -> TVal
+        {
+            return foldrAll(std::multiplies<TVal>{}, TVal{1});
+        }
+#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#    pragma warning(pop)
+#endif
+        //! \return The sum of all values.
+        ALPAKA_NO_HOST_ACC_WARNING
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto sum() const -> TVal
+        {
+            return foldrAll(std::plus<TVal>{}, TVal{0});
+        }
+
+        //! \return The min of all values.
+        ALPAKA_NO_HOST_ACC_WARNING
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto min() const -> TVal
+        {
+            return foldrAll(meta::min<TVal>{}, std::numeric_limits<TVal>::max());
+        }
+
+        //! \return The max of all values (seeded with lowest(), since min() is positive for floating-point types).
+        ALPAKA_NO_HOST_ACC_WARNING
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto max() const -> TVal
+        {
+            return foldrAll(meta::max<TVal>{}, std::numeric_limits<TVal>::lowest());
+        }
+
+        //! \return True if all values are true, i.e., the "logical and" of all values.
+        ALPAKA_NO_HOST_ACC_WARNING
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto all() const -> bool
+        {
+            return foldrAll(std::logical_and<TVal>{}, true);
+        }
+
+        //! \return True if any value is true, i.e., the "logical or" of all values.
+        ALPAKA_NO_HOST_ACC_WARNING
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto any() const -> bool
+        {
+            return foldrAll(std::logical_or<TVal>{}, false);
+        }
+
+        //! \return True if none of the values are true
+        ALPAKA_NO_HOST_ACC_WARNING
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto none() const -> bool
+        {
+            return !foldrAll(std::logical_or<TVal>{}, false);
+        }
+
+        //! \return The index of the minimal element (the first one if several compare equal). Host-only.
+        [[nodiscard]] ALPAKA_FN_HOST constexpr auto minElem() const -> typename TDim::value_type
+        {
+            return static_cast<typename TDim::value_type>(
+                std::distance(std::begin(m_data), std::min_element(std::begin(m_data), std::end(m_data))));
+        }
+
+        //! \return The index of the maximal element (the first one if several compare equal). Host-only.
+        [[nodiscard]] ALPAKA_FN_HOST constexpr auto maxElem() const -> typename TDim::value_type
+        {
+            return static_cast<typename TDim::value_type>(
+                std::distance(std::begin(m_data), std::max_element(std::begin(m_data), std::end(m_data))));
+        }
+
+        template<size_t I>
+        ALPAKA_FN_HOST_ACC constexpr auto get() -> TVal&
+        {
+            return (*this)[I];
+        }
+
+        template<size_t I>
+        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto get() const -> TVal
+        {
+            return (*this)[I];
+        }
+
+        //! \return The element-wise sum of two vectors.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator+(Vec const& p, Vec const& q) -> Vec
+        {
+            Vec r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                    r[i] = p[i] + q[i];
+            }
+            return r;
+        }
+
+        //! \return The element-wise difference of two vectors.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator-(Vec const& p, Vec const& q) -> Vec
+        {
+            Vec r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+#    pragma diag_suppress = unsigned_compare_with_zero
+#endif
+                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+#    pragma diag_default = unsigned_compare_with_zero
+#endif
+                    r[i] = p[i] - q[i];
+            }
+            return r;
+        }
+
+        //! \return The element-wise product of two vectors.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator*(Vec const& p, Vec const& q) -> Vec
+        {
+            Vec r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                    r[i] = p[i] * q[i];
+            }
+            return r;
+        }
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator==(Vec const& a, Vec const& b) -> bool
+        {
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+#    pragma diag_suppress = unsigned_compare_with_zero
+#endif
+                for(typename TDim::value_type i(0); i < TDim::value; ++i)
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+#    pragma diag_default = unsigned_compare_with_zero
+#endif
+                {
+                    if(a[i] != b[i])
+                        return false;
+                }
+            }
+            return true;
+        }
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator!=(Vec const& a, Vec const& b) -> bool
+        {
+            return !(a == b);
+        }
+
+        //! \return The element-wise less than relation of two vectors.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator<(Vec const& p, Vec const& q) -> Vec<TDim, bool>
+        {
+            Vec<TDim, bool> r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                    r[i] = p[i] < q[i];
+            }
+            return r;
+        }
+
+        //! \return The element-wise less than or equal relation of two vectors.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator<=(Vec const& p, Vec const& q) -> Vec<TDim, bool>
+        {
+            Vec<TDim, bool> r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                    r[i] = p[i] <= q[i];
+            }
+            return r;
+        }
+
+        //! \return The element-wise greater than relation of two vectors.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator>(Vec const& p, Vec const& q) -> Vec<TDim, bool>
+        {
+            Vec<TDim, bool> r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                    r[i] = p[i] > q[i];
+            }
+            return r;
+        }
+
+        //! \return The element-wise greater equal than relation of two vectors.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator>=(Vec const& p, Vec const& q) -> Vec<TDim, bool>
+        {
+            Vec<TDim, bool> r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                    r[i] = p[i] >= q[i];
+            }
+            return r;
+        }
+
+        //! \return The element-wise logical and relation of two vectors.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator&&(Vec const& p, Vec const& q) -> Vec<TDim, bool>
+        {
+            Vec<TDim, bool> r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                    r[i] = p[i] && q[i];
+            }
+            return r;
+        }
+
+        //! \return The element-wise logical or relation of two vectors.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator||(Vec const& p, Vec const& q) -> Vec<TDim, bool>
+        {
+            Vec<TDim, bool> r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                    r[i] = p[i] || q[i];
+            }
+            return r;
+        }
+
+        ALPAKA_FN_HOST friend constexpr auto operator<<(std::ostream& os, Vec const& v) -> std::ostream&
+        {
+            os << "("; // elements are written as a parenthesized, comma separated list
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+            if(TDim::value > 0)
+#else
+            if constexpr(TDim::value > 0)
+#endif
+            {
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+#    pragma diag_suppress = unsigned_compare_with_zero
+#endif
+                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+#    pragma diag_default = unsigned_compare_with_zero
+#endif
+                {
+                    os << v[i];
+                    if(i != TDim::value - 1) // no separator after the last element
+                        os << ", ";
+                }
+            }
+            else
+                os << "."; // a zero-dimensional vector has no elements to print
+            os << ")";
+
+            return os;
+        }
+
+    private:
+        // Zero sized arrays are not allowed, therefore zero-dimensional vectors have one member.
+        TVal m_data[TDim::value == 0u ? 1u : TDim::value];
+    };
+
+    template<typename TFirstIndex, typename... TRestIndices>
+    ALPAKA_FN_HOST_ACC Vec(TFirstIndex&&, TRestIndices&&...)
+        -> Vec<DimInt<1 + sizeof...(TRestIndices)>, std::decay_t<TFirstIndex>>;
+
+    template<typename T>
+    inline constexpr bool isVec = false;
+
+    template<typename TDim, typename TVal>
+    inline constexpr bool isVec<Vec<TDim, TVal>> = true;
+
+    //! Converts a Vec to a std::array with the same value type and TDim::value elements, copied in index order.
+    template<typename TDim, typename TVal>
+    ALPAKA_FN_HOST_ACC constexpr auto toArray(Vec<TDim, TVal> const& v) -> std::array<TVal, TDim::value>
+    {
+        std::array<TVal, TDim::value> a{};
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+        if(TDim::value > 0)
+#else
+        if constexpr(TDim::value > 0)
+#endif
+        {
+            for(unsigned i = 0; i < TDim::value; i++)
+                a[i] = v[i];
+        }
+        return a;
+    }
+
+    //! \return The element-wise minimum of one or more vectors.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<
+        typename TDim,
+        typename TVal,
+        typename... Vecs,
+        typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
+    ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal>
+    {
+        Vec<TDim, TVal> r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+        if(TDim::value > 0)
+#else
+        if constexpr(TDim::value > 0)
+#endif
+        {
+            for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                r[i] = std::min({p[i], qs[i]...});
+        }
+        return r;
+    }
+
+    //! \return The element-wise maximum of one or more vectors.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<
+        typename TDim,
+        typename TVal,
+        typename... Vecs,
+        typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
+    ALPAKA_FN_HOST_ACC constexpr auto elementwise_max(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal>
+    {
+        Vec<TDim, TVal> r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+        if(TDim::value > 0)
+#else
+        if constexpr(TDim::value > 0)
+#endif
+        {
+            for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                r[i] = std::max({p[i], qs[i]...});
+        }
+        return r;
+    }
+
+    namespace trait
+    {
+        //! The Vec dimension get trait specialization.
+        template<typename TDim, typename TVal>
+        struct DimType<Vec<TDim, TVal>>
+        {
+            using type = TDim;
+        };
+
+        //! The Vec idx type trait specialization.
+        template<typename TDim, typename TVal>
+        struct IdxType<Vec<TDim, TVal>>
+        {
+            using type = TVal;
+        };
+
+        //! Specialization for selecting a sub-vector.
+        template<typename TDim, typename TVal, std::size_t... TIndices>
+        struct SubVecFromIndices<Vec<TDim, TVal>, std::index_sequence<TIndices...>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static constexpr auto subVecFromIndices(
+                Vec<TDim, TVal> const& vec) -> Vec<DimInt<sizeof...(TIndices)>, TVal>
+            {
+                if constexpr(std::is_same_v<std::index_sequence<TIndices...>, std::make_index_sequence<TDim::value>>)
+                {
+                    return vec; // Return whole vector.
+                }
+                else
+                {
+                    static_assert(
+                        sizeof...(TIndices) <= TDim::value,
+                        "The sub-vector's dimensionality must be smaller than or equal to the original "
+                        "dimensionality.");
+                    return {vec[TIndices]...}; // Return sub-vector.
+                }
+                ALPAKA_UNREACHABLE({});
+            }
+        };
+
+        template<typename TValNew, typename TDim, typename TVal>
+        struct CastVec<TValNew, Vec<TDim, TVal>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static constexpr auto castVec(Vec<TDim, TVal> const& vec) -> Vec<TDim, TValNew>
+            {
+                if constexpr(std::is_same_v<TValNew, TVal>)
+                {
+                    return vec;
+                }
+                else
+                {
+                    Vec<TDim, TValNew> r;
+#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
+                    if(TDim::value > 0)
+#else
+                    if constexpr(TDim::value > 0)
+#endif
+                    {
+                        for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                            r[i] = static_cast<TValNew>(vec[i]);
+                    }
+                    return r;
+                }
+                ALPAKA_UNREACHABLE({});
+            }
+        };
+
+        //! ReverseVec specialization for Vec.
+        template<typename TDim, typename TVal>
+        struct ReverseVec<Vec<TDim, TVal>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static constexpr auto reverseVec(Vec<TDim, TVal> const& vec) -> Vec<TDim, TVal>
+            {
+                if constexpr(TDim::value <= 1)
+                {
+                    return vec;
+                }
+                else
+                {
+                    Vec<TDim, TVal> r;
+                    for(typename TDim::value_type i = 0; i < TDim::value; ++i)
+                        r[i] = vec[TDim::value - 1u - i];
+                    return r;
+                }
+                ALPAKA_UNREACHABLE({});
+            }
+        };
+
+        //! Concatenation specialization for Vec.
+        template<typename TDimL, typename TDimR, typename TVal>
+        struct ConcatVec<Vec<TDimL, TVal>, Vec<TDimR, TVal>>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static constexpr auto concatVec(
+                Vec<TDimL, TVal> const& vecL,
+                Vec<TDimR, TVal> const& vecR) -> Vec<DimInt<TDimL::value + TDimR::value>, TVal>
+            {
+                Vec<DimInt<TDimL::value + TDimR::value>, TVal> r;
+                if constexpr(TDimL::value > 0)
+                {
+                    for(typename TDimL::value_type i = 0; i < TDimL::value; ++i)
+                        r[i] = vecL[i];
+                }
+                if constexpr(TDimR::value > 0)
+                {
+                    for(typename TDimR::value_type i = 0; i < TDimR::value; ++i)
+                        r[TDimL::value + i] = vecR[i];
+                }
+                return r;
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
+
+#if defined(__clang__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wmismatched-tags"
+#endif
+namespace std
+{
+    template<typename TDim, typename TVal>
+    struct tuple_size<alpaka::Vec<TDim, TVal>> : integral_constant<size_t, TDim::value>
+    {
+    };
+
+    template<size_t I, typename TDim, typename TVal>
+    struct tuple_element<I, alpaka::Vec<TDim, TVal>>
+    {
+        using type = TVal;
+    };
+} // namespace std
+#if defined(__clang__)
+#    pragma GCC diagnostic pop
+#endif
diff --git a/include/alpaka/version.hpp b/include/alpaka/version.hpp
new file mode 100644
index 0000000..9ea2db7
--- /dev/null
+++ b/include/alpaka/version.hpp
@@ -0,0 +1,14 @@
+/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Jan Stephan
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include <boost/predef/version_number.h>
+
+#define ALPAKA_VERSION_MAJOR 1
+#define ALPAKA_VERSION_MINOR 2
+#define ALPAKA_VERSION_PATCH 0
+
+//! The alpaka library version number
+#define ALPAKA_VERSION BOOST_VERSION_NUMBER(ALPAKA_VERSION_MAJOR, ALPAKA_VERSION_MINOR, ALPAKA_VERSION_PATCH)
diff --git a/include/alpaka/wait/Traits.hpp b/include/alpaka/wait/Traits.hpp
new file mode 100644
index 0000000..c0cfa89
--- /dev/null
+++ b/include/alpaka/wait/Traits.hpp
@@ -0,0 +1,50 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+namespace alpaka
+{
+    struct ConceptCurrentThreadWaitFor
+    {
+    };
+
+    //! The wait traits.
+    namespace trait
+    {
+        //! The thread wait trait.
+        template<typename TAwaited, typename TSfinae = void>
+        struct CurrentThreadWaitFor;
+
+        //! The waiter wait trait.
+        template<typename TWaiter, typename TAwaited, typename TSfinae = void>
+        struct WaiterWaitFor;
+    } // namespace trait
+
+    //! Makes the calling thread wait until the given awaited action has completed.
+    //!
+    //! Special Handling for events:
+    //!   If the event is re-enqueued, wait() returns once the re-enqueued event is ready; previously
+    //!   enqueued states of the event are ignored.
+    template<typename TAwaited>
+    ALPAKA_FN_HOST auto wait(TAwaited const& awaited) -> void
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptCurrentThreadWaitFor, TAwaited>;
+        trait::CurrentThreadWaitFor<ImplementationBase>::currentThreadWaitFor(awaited);
+    }
+
+    //! The waiter waits for the given awaited action to complete.
+    //!
+    //! Special Handling if \p waiter is a queue and \p awaited an event:
+    //!   The \p waiter waits for the event state to become ready based on the recently captured event state at the
+    //!   time of the API call even if the event is being re-enqueued later.
+    template<typename TWaiter, typename TAwaited>
+    ALPAKA_FN_HOST auto wait(TWaiter& waiter, TAwaited const& awaited) -> void
+    {
+        trait::WaiterWaitFor<TWaiter, TAwaited>::waiterWaitFor(waiter, awaited);
+    }
+} // namespace alpaka
diff --git a/include/alpaka/warp/Traits.hpp b/include/alpaka/warp/Traits.hpp
new file mode 100644
index 0000000..f4cfb4d
--- /dev/null
+++ b/include/alpaka/warp/Traits.hpp
@@ -0,0 +1,317 @@
+/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+
+#include <cstdint>
+#include <type_traits>
+
+namespace alpaka::warp
+{
+    struct ConceptWarp
+    {
+    };
+
+    //! The warp traits.
+    namespace trait
+    {
+        //! The warp size trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct GetSize;
+
+        //! The all warp vote trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct All;
+
+        //! The any warp vote trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct Any;
+
+        //! The ballot warp vote trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct Ballot;
+
+        //! The shfl warp swizzling trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct Shfl;
+
+        //! The shfl up warp swizzling trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct ShflUp;
+
+        //! The shfl down warp swizzling trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct ShflDown;
+
+        //! The shfl xor warp swizzling trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct ShflXor;
+
+        //! The active mask trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct Activemask;
+    } // namespace trait
+
+    //! Returns warp size.
+    //!
+    //! \tparam TWarp The warp implementation type.
+    //! \param warp The warp implementation.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp>
+    ALPAKA_FN_ACC auto getSize(TWarp const& warp) -> std::int32_t
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::GetSize<ImplementationBase>::getSize(warp);
+    }
+
+    //! Returns a 32- or 64-bit unsigned integer (depending on the
+    //! accelerator) whose Nth bit is set if and only if the Nth thread
+    //! of the warp is active.
+    //!
+    //! Note: decltype for return type is required there, otherwise
+    //! compilation with a CPU and a GPU accelerator enabled fails as it
+    //! tries to call device function from a host-device one. The reason
+    //! is unclear, but likely related to deducing the return type.
+    //!
+    //! Note:
+    //! * The programmer must ensure that all threads calling this function are executing
+    //!   the same line of code. In particular it is not portable to write
+    //!   if(a) {activemask} else {activemask}.
+    //!
+    //! \tparam TWarp The warp implementation type.
+    //! \param warp The warp implementation.
+    //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp>
+    ALPAKA_FN_ACC auto activemask(TWarp const& warp)
+        -> decltype(trait::Activemask<concepts::ImplementationBase<ConceptWarp, TWarp>>::activemask(warp))
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::Activemask<ImplementationBase>::activemask(warp);
+    }
+
+    //! Evaluates predicate for all active threads of the warp and returns
+    //! non-zero if and only if predicate evaluates to non-zero for all of them.
+    //!
+    //! It follows the logic of __all(predicate) in CUDA before version 9.0 and HIP,
+    //! the operation is applied for all active threads.
+    //! The modern CUDA counterpart would be __all_sync(__activemask(), predicate).
+    //!
+    //! Note:
+    //! * The programmer must ensure that all threads calling this function are executing
+    //!   the same line of code. In particular it is not portable to write
+    //!   if(a) {all} else {all}.
+    //!
+    //! \tparam TWarp The warp implementation type.
+    //! \param warp The warp implementation.
+    //! \param predicate The predicate value for current thread.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp>
+    ALPAKA_FN_ACC auto all(TWarp const& warp, std::int32_t predicate) -> std::int32_t
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::All<ImplementationBase>::all(warp, predicate);
+    }
+
+    //! Evaluates predicate for all active threads of the warp and returns
+    //! non-zero if and only if predicate evaluates to non-zero for any of them.
+    //!
+    //! It follows the logic of __any(predicate) in CUDA before version 9.0 and HIP,
+    //! the operation is applied for all active threads.
+    //! The modern CUDA counterpart would be __any_sync(__activemask(), predicate).
+    //!
+    //! Note:
+    //! * The programmer must ensure that all threads calling this function are executing
+    //!   the same line of code. In particular it is not portable to write
+    //!   if(a) {any} else {any}.
+    //!
+    //! \tparam TWarp The warp implementation type.
+    //! \param warp The warp implementation.
+    //! \param predicate The predicate value for current thread.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp>
+    ALPAKA_FN_ACC auto any(TWarp const& warp, std::int32_t predicate) -> std::int32_t
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::Any<ImplementationBase>::any(warp, predicate);
+    }
+
+    //! Evaluates predicate for all non-exited threads in a warp and returns
+    //! a 32- or 64-bit unsigned integer (depending on the accelerator)
+    //! whose Nth bit is set if and only if predicate evaluates to non-zero
+    //! for the Nth thread of the warp and the Nth thread is active.
+    //!
+    //! It follows the logic of __ballot(predicate) in CUDA before version 9.0 and HIP,
+    //! the operation is applied for all active threads.
+    //! The modern CUDA counterpart would be __ballot_sync(__activemask(), predicate).
+    //! The return type is deduced from the accelerator's implementation (see \return below).
+    //!
+    //! Note:
+    //! * The programmer must ensure that all threads calling this function are executing
+    //!   the same line of code. In particular it is not portable to write
+    //!   if(a) {ballot} else {ballot}.
+    //!
+    //! \tparam TWarp The warp implementation type.
+    //! \param warp The warp implementation.
+    //! \param predicate The predicate value for current thread.
+    //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp>
+    ALPAKA_FN_ACC auto ballot(TWarp const& warp, std::int32_t predicate)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::Ballot<ImplementationBase>::ballot(warp, predicate);
+    }
+
+    //! Exchange data between threads within a warp.
+    //!
+    //! Effectively executes:
+    //!
+    //!     __shared__ int32_t values[warpsize];
+    //!     values[threadIdx.x] = value;
+    //!     __syncthreads();
+    //!     return values[width*(threadIdx.x/width) + srcLane%width];
+    //!
+    //! However, it does not use shared memory.
+    //!
+    //! Notes:
+    //! * The programmer must ensure that all threads calling this
+    //!   function (and the srcLane) are executing the same line of code.
+    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
+    //!
+    //! * Commonly used with width = warpsize (the default), (returns values[srcLane])
+    //!
+    //! * Width must be a power of 2.
+    //!
+    //! \tparam TWarp   warp implementation type
+    //! \param  warp    warp implementation
+    //! \param  value   value to broadcast (only meaningful from threadIdx == srcLane)
+    //! \param  srcLane source lane sending value
+    //! \param  width   number of threads receiving a single value
+    //! \return val from the thread index srcLane.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
+    }
+
+    //! Exchange data between threads within a warp.
+    //! It copies from a lane with lower ID relative to caller.
+    //! The lane ID is calculated by subtracting delta from the caller’s lane ID.
+    //!
+    //! Effectively executes:
+    //!
+    //!     __shared__ int32_t values[warpsize];
+    //!     values[threadIdx.x] = value;
+    //!     __syncthreads();
+    //!     return (threadIdx.x % width >= delta) ? values[threadIdx.x - delta] : values[threadIdx.x];
+    //!
+    //! However, it does not use shared memory.
+    //!
+    //! Notes:
+    //! * The programmer must ensure that all threads calling this
+    //!   function (and the srcLane) are executing the same line of code.
+    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
+    //!
+    //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x - delta] if threadIdx.x >=
+    //! delta)
+    //!
+    //! * Width must be a power of 2.
+    //!
+    //! \tparam TWarp   warp implementation type
+    //! \tparam T       value type
+    //! \param  warp    warp implementation
+    //! \param  value   value to broadcast
+    //! \param  offset  corresponds to the delta used to compute the lane ID
+    //! \param  width   size of the group participating in the shuffle operation
+    //! \return val from the thread index lane ID.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::ShflUp<ImplementationBase>::shfl_up(warp, value, offset, width ? width : getSize(warp));
+    }
+
+    //! Exchange data between threads within a warp.
+    //! It copies from a lane with higher ID relative to caller.
+    //! The lane ID is calculated by adding delta to the caller’s lane ID.
+    //!
+    //! Effectively executes:
+    //!
+    //!     __shared__ int32_t values[warpsize];
+    //!     values[threadIdx.x] = value;
+    //!     __syncthreads();
+    //!     return (threadIdx.x % width + delta < width) ? values[threadIdx.x + delta] : values[threadIdx.x];
+    //!
+    //! However, it does not use shared memory.
+    //!
+    //! Notes:
+    //! * The programmer must ensure that all threads calling this
+    //!   function (and the srcLane) are executing the same line of code.
+    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
+    //!
+    //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x+delta] if threadIdx.x+delta <
+    //! warpsize)
+    //!
+    //! * Width must be a power of 2.
+    //!
+    //! \tparam TWarp   warp implementation type
+    //! \tparam T       value type
+    //! \param  warp    warp implementation
+    //! \param  value   value to broadcast
+    //! \param  offset  corresponds to the delta used to compute the lane ID
+    //! \param  width   size of the group participating in the shuffle operation
+    //! \return val from the thread index lane ID.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::ShflDown<ImplementationBase>::shfl_down(warp, value, offset, width ? width : getSize(warp));
+    }
+
+    //! Exchange data between threads within a warp.
+    //! It copies from a lane based on bitwise XOR of own lane ID.
+    //! The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with mask
+    //!
+    //! Effectively executes:
+    //!
+    //!     __shared__ int32_t values[warpsize];
+    //!     values[threadIdx.x] = value;
+    //!     __syncthreads();
+    //!     int lane = threadIdx.x ^ mask;
+    //!     return values[lane / width > threadIdx.x / width ? threadIdx.x : lane];
+    //!
+    //! However, it does not use shared memory.
+    //!
+    //! Notes:
+    //! * The programmer must ensure that all threads calling this
+    //!   function (and the srcLane) are executing the same line of code.
+    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
+    //!
+    //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x^mask])
+    //!
+    //! * Width must be a power of 2.
+    //!
+    //! \tparam TWarp   warp implementation type
+    //! \tparam T       value type
+    //! \param  warp    warp implementation
+    //! \param  value   value to broadcast
+    //! \param  mask    corresponds to the mask used to compute the lane ID
+    //! \param  width   size of the group participating in the shuffle operation
+    //! \return val from the thread index lane ID.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t mask, std::int32_t width = 0)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::ShflXor<ImplementationBase>::shfl_xor(warp, value, mask, width ? width : getSize(warp));
+    }
+} // namespace alpaka::warp
diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp
new file mode 100644
index 0000000..51957ba
--- /dev/null
+++ b/include/alpaka/warp/WarpGenericSycl.hpp
@@ -0,0 +1,200 @@
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ *
+ * The implementations of Shfl::shfl(), ShflUp::shfl_up(), ShflDown::shfl_down() and ShflXor::shfl_xor() are derived
+ * from Intel DPCT.
+ * Copyright (C) Intel Corporation.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ * See https://llvm.org/LICENSE.txt for license information.
+ */
+
+#pragma once
+
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/warp/Traits.hpp"
+
+#include <cstdint>
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka::warp
+{
+    //! The SYCL warp.
+    template<typename TDim>
+    class WarpGenericSycl : public concepts::Implements<alpaka::warp::ConceptWarp, WarpGenericSycl<TDim>>
+    {
+    public:
+        WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_warp{my_item}
+        {
+        }
+
+        sycl::nd_item<TDim::value> m_item_warp;
+    };
+} // namespace alpaka::warp
+
+namespace alpaka::warp::trait
+{
+    template<typename TDim>
+    struct GetSize<warp::WarpGenericSycl<TDim>>
+    {
+        static auto getSize(warp::WarpGenericSycl<TDim> const& warp) -> std::int32_t
+        {
+            auto const sub_group = warp.m_item_warp.get_sub_group();
+            // SYCL sub-groups are always 1D
+            return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
+        }
+    };
+
+    template<typename TDim>
+    struct Activemask<warp::WarpGenericSycl<TDim>>
+    {
+        // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+        // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+        // Restrict to warpSize <= 32 for now.
+        static auto activemask(warp::WarpGenericSycl<TDim> const& warp) -> std::uint32_t
+        {
+            static_assert(!sizeof(warp), "activemask is not supported on SYCL");
+            // SYCL does not have an API to get the activemask. It is also questionable (to me, bgruber) whether an
+            // "activemask" even exists on some hardware architectures, since the idea is bound to threads being
+            // "turned off" when they take different control flow in a warp. A SYCL implementation could run each
+            // thread as a SIMD lane, in which case the "thread" is always active, but some SIMD lanes are either
+            // predicated off, or side-effects are masked out when writing them back.
+            //
+            // An implementation via oneAPI's sycl::ext::oneapi::group_ballot causes UB, because activemask is expected
+            // to be callable when less than all threads are active in a warp (CUDA). But SYCL requires all threads of
+            // a group to call the function.
+            //
+            // Intel's CUDA -> SYCL migration tool also suggests that there is no direct equivalent and the user must
+            // rewrite their kernel logic. See also:
+            // https://oneapi-src.github.io/SYCLomatic/dev_guide/diagnostic_ref/dpct1086.html
+
+            return ~std::uint32_t{0};
+        }
+    };
+
+    template<typename TDim>
+    struct All<warp::WarpGenericSycl<TDim>>
+    {
+        static auto all(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
+        {
+            auto const sub_group = warp.m_item_warp.get_sub_group();
+            return static_cast<std::int32_t>(sycl::all_of_group(sub_group, static_cast<bool>(predicate)));
+        }
+    };
+
+    template<typename TDim>
+    struct Any<warp::WarpGenericSycl<TDim>>
+    {
+        static auto any(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
+        {
+            auto const sub_group = warp.m_item_warp.get_sub_group();
+            return static_cast<std::int32_t>(sycl::any_of_group(sub_group, static_cast<bool>(predicate)));
+        }
+    };
+
+    template<typename TDim>
+    struct Ballot<warp::WarpGenericSycl<TDim>>
+    {
+        // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+        // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+        // Restrict to warpSize <= 32 for now.
+        static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::uint32_t
+        {
+            auto const sub_group = warp.m_item_warp.get_sub_group();
+            auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
+            // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+            // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+            // Restrict to warpSize <= 32 for now.
+            std::uint32_t bits = 0;
+            mask.extract_bits(bits);
+            return bits;
+        }
+    };
+
+    template<typename TDim>
+    struct Shfl<warp::WarpGenericSycl<TDim>>
+    {
+        template<typename T>
+        static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
+        {
+            ALPAKA_ASSERT_ACC(width > 0);
+            ALPAKA_ASSERT_ACC(srcLane >= 0);
+
+            /* If width is smaller than the sub-group size, the sub-group is split into assumed subdivisions. The
+               first item of each subdivision has the assumed index 0. srcLane (modulo width) is relative to it.
+
+               Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions:
+               The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
+               first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            std::uint32_t const w = static_cast<std::uint32_t>(width);
+            std::uint32_t const start_index = actual_group.get_local_linear_id() / w * w;
+            return sycl::select_from_group(actual_group, value, start_index + static_cast<std::uint32_t>(srcLane) % w);
+        }
+    };
+
+    template<typename TDim>
+    struct ShflUp<warp::WarpGenericSycl<TDim>>
+    {
+        template<typename T>
+        static auto shfl_up(
+            warp::WarpGenericSycl<TDim> const& warp,
+            T value,
+            std::uint32_t offset, /* must be the same for all work-items in the group */
+            std::int32_t width)
+        {
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            std::uint32_t const w = static_cast<std::uint32_t>(width);
+            std::uint32_t const id = actual_group.get_local_linear_id();
+            std::uint32_t const start_index = id / w * w;
+            T result = sycl::shift_group_right(actual_group, value, offset);
+            if((id - start_index) < offset)
+            {
+                result = value;
+            }
+            return result;
+        }
+    };
+
+    template<typename TDim>
+    struct ShflDown<warp::WarpGenericSycl<TDim>>
+    {
+        template<typename T>
+        static auto shfl_down(
+            warp::WarpGenericSycl<TDim> const& warp,
+            T value,
+            std::uint32_t offset,
+            std::int32_t width)
+        {
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            std::uint32_t const w = static_cast<std::uint32_t>(width);
+            std::uint32_t const id = actual_group.get_local_linear_id();
+            std::uint32_t const end_index = (id / w + 1) * w;
+            T result = sycl::shift_group_left(actual_group, value, offset);
+            if((id + offset) >= end_index)
+            {
+                result = value;
+            }
+            return result;
+        }
+    };
+
+    template<typename TDim>
+    struct ShflXor<warp::WarpGenericSycl<TDim>>
+    {
+        template<typename T>
+        static auto shfl_xor(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t mask, std::int32_t width)
+        {
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            std::uint32_t const w = static_cast<std::uint32_t>(width);
+            std::uint32_t const id = actual_group.get_local_linear_id();
+            std::uint32_t const start_index = id / w * w;
+            std::uint32_t const target_offset = (id % w) ^ static_cast<std::uint32_t>(mask);
+            return sycl::select_from_group(actual_group, value, target_offset < w ? start_index + target_offset : id);
+        }
+    };
+} // namespace alpaka::warp::trait
+
+#endif
diff --git a/include/alpaka/warp/WarpSingleThread.hpp b/include/alpaka/warp/WarpSingleThread.hpp
new file mode 100644
index 0000000..d271303
--- /dev/null
+++ b/include/alpaka/warp/WarpSingleThread.hpp
@@ -0,0 +1,121 @@
+/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/warp/Traits.hpp"
+
+#include <cstdint>
+
+namespace alpaka::warp
+{
+    //! The single-threaded warp to emulate it on CPUs.
+    class WarpSingleThread : public concepts::Implements<ConceptWarp, WarpSingleThread>
+    {
+    };
+
+    namespace trait
+    {
+        template<>
+        struct GetSize<WarpSingleThread>
+        {
+            static auto getSize(warp::WarpSingleThread const& /*warp*/)
+            {
+                return 1;
+            }
+        };
+
+        template<>
+        struct Activemask<WarpSingleThread>
+        {
+            static auto activemask(warp::WarpSingleThread const& /*warp*/)
+            {
+                return 1u;
+            }
+        };
+
+        template<>
+        struct All<WarpSingleThread>
+        {
+            static auto all(warp::WarpSingleThread const& /*warp*/, std::int32_t predicate)
+            {
+                return predicate;
+            }
+        };
+
+        template<>
+        struct Any<WarpSingleThread>
+        {
+            static auto any(warp::WarpSingleThread const& /*warp*/, std::int32_t predicate)
+            {
+                return predicate;
+            }
+        };
+
+        template<>
+        struct Ballot<WarpSingleThread>
+        {
+            static auto ballot(warp::WarpSingleThread const& /*warp*/, std::int32_t predicate)
+            {
+                return predicate ? 1u : 0u;
+            }
+        };
+
+        template<>
+        struct Shfl<WarpSingleThread>
+        {
+            template<typename T>
+            static auto shfl(
+                warp::WarpSingleThread const& /*warp*/,
+                T val,
+                std::int32_t /*srcLane*/,
+                std::int32_t /*width*/)
+            {
+                return val;
+            }
+        };
+
+        template<>
+        struct ShflUp<WarpSingleThread>
+        {
+            template<typename T>
+            static auto shfl_up(
+                warp::WarpSingleThread const& /*warp*/,
+                T val,
+                std::uint32_t /*offset*/,
+                std::int32_t /*width*/)
+            {
+                return val; // offset and width are ignored: the caller's own value is handed back
+            }
+        };
+
+        template<>
+        struct ShflDown<WarpSingleThread>
+        {
+            template<typename T>
+            static auto shfl_down(
+                warp::WarpSingleThread const& /*warp*/,
+                T val,
+                std::uint32_t /*offset*/,
+                std::int32_t /*width*/)
+            {
+                return val; // offset and width are ignored: the caller's own value is handed back
+            }
+        };
+
+        template<>
+        struct ShflXor<WarpSingleThread>
+        {
+            template<typename T>
+            static auto shfl_xor(
+                warp::WarpSingleThread const& /*warp*/,
+                T val,
+                std::int32_t /*mask*/,
+                std::int32_t /*width*/)
+            {
+                return val; // mask and width are ignored: the caller's own value is handed back
+            }
+        };
+    } // namespace trait
+} // namespace alpaka::warp
diff --git a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..3a6d495
--- /dev/null
+++ b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,189 @@
+/* Copyright 2023 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/warp/Traits.hpp"
+
+#include <cstdint>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka::warp
+{
+    //! The GPU CUDA/HIP warp.
+    class WarpUniformCudaHipBuiltIn : public concepts::Implements<ConceptWarp, WarpUniformCudaHipBuiltIn>
+    {
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+    namespace trait
+    {
+        template<>
+        struct GetSize<WarpUniformCudaHipBuiltIn>
+        {
+            __device__ static auto getSize(warp::WarpUniformCudaHipBuiltIn const& /*warp*/) -> std::int32_t
+            {
+                return warpSize;
+            }
+        };
+
+        template<>
+        struct Activemask<WarpUniformCudaHipBuiltIn>
+        {
+            __device__ static auto activemask(warp::WarpUniformCudaHipBuiltIn const& /*warp*/)
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                -> std::uint32_t
+#        else
+                -> std::uint64_t
+#        endif
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __activemask();
+#        else
+                // No HIP intrinsic for it, emulate via ballot
+                return __ballot(1);
+#        endif
+            }
+        };
+
+        template<>
+        struct All<WarpUniformCudaHipBuiltIn>
+        {
+            __device__ static auto all(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                std::int32_t predicate) -> std::int32_t
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __all_sync(0xffff'ffff, predicate);
+#        else
+                return __all(predicate);
+#        endif
+            }
+        };
+
+        template<>
+        struct Any<WarpUniformCudaHipBuiltIn>
+        {
+            __device__ static auto any(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                std::int32_t predicate) -> std::int32_t
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __any_sync(0xffff'ffff, predicate);
+#        else
+                return __any(predicate);
+#        endif
+            }
+        };
+
+        template<>
+        struct Ballot<WarpUniformCudaHipBuiltIn>
+        {
+            __device__ static auto ballot(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                std::int32_t predicate)
+            // return type is required by the compiler
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                -> std::uint32_t
+#        else
+                -> std::uint64_t
+#        endif
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __ballot_sync(0xffff'ffff, predicate);
+#        else
+                return __ballot(predicate);
+#        endif
+            }
+        };
+
+        template<>
+        struct Shfl<WarpUniformCudaHipBuiltIn>
+        {
+            template<typename T>
+            __device__ static auto shfl(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                T val,
+                int srcLane,
+                std::int32_t width) -> T
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __shfl_sync(0xffff'ffff, val, srcLane, width);
+#        else
+                return __shfl(val, srcLane, width);
+#        endif
+            }
+        };
+
+        template<>
+        struct ShflUp<WarpUniformCudaHipBuiltIn>
+        {
+            template<typename T>
+            __device__ static auto shfl_up(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                T val,
+                std::uint32_t offset,
+                std::int32_t width) -> T
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __shfl_up_sync(0xffff'ffff, val, offset, width);
+#        else
+                return __shfl_up(val, offset, width);
+#        endif
+            }
+        };
+
+        template<>
+        struct ShflDown<WarpUniformCudaHipBuiltIn>
+        {
+            template<typename T>
+            __device__ static auto shfl_down(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                T val,
+                std::uint32_t offset,
+                std::int32_t width) -> T
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __shfl_down_sync(0xffff'ffff, val, offset, width);
+#        else
+                return __shfl_down(val, offset, width);
+#        endif
+            }
+        };
+
+        template<>
+        struct ShflXor<WarpUniformCudaHipBuiltIn>
+        {
+            template<typename T>
+            __device__ static auto shfl_xor(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                T val,
+                std::int32_t mask,
+                std::int32_t width) -> T
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __shfl_xor_sync(0xffff'ffff, val, mask, width);
+#        else
+                return __shfl_xor(val, mask, width);
+#        endif
+            }
+        };
+
+    } // namespace trait
+#    endif
+} // namespace alpaka::warp
+
+#endif
diff --git a/include/alpaka/workdiv/Traits.hpp b/include/alpaka/workdiv/Traits.hpp
new file mode 100644
index 0000000..211d688
--- /dev/null
+++ b/include/alpaka/workdiv/Traits.hpp
@@ -0,0 +1,77 @@
+/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Positioning.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+
+#include <type_traits>
+#include <utility>
+
+namespace alpaka
+{
+    struct ConceptWorkDiv // Empty tag type; getWorkDiv passes it to concepts::ImplementationBase.
+    {
+    };
+
+    //! The work division traits.
+    namespace trait
+    {
+        //! The work div trait. Specializations provide a static getWorkDiv(workDiv) for one origin/unit pair.
+        template<typename TWorkDiv, typename TOrigin, typename TUnit, typename TSfinae = void>
+        struct GetWorkDiv;
+    } // namespace trait
+
+    //! Get the extent requested by TOrigin and TUnit; forwards to trait::GetWorkDiv of the implementation base.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TOrigin, typename TUnit, typename TWorkDiv>
+    ALPAKA_FN_HOST_ACC auto getWorkDiv(TWorkDiv const& workDiv) -> Vec<Dim<TWorkDiv>, Idx<TWorkDiv>>
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWorkDiv, TWorkDiv>;
+        return trait::GetWorkDiv<ImplementationBase, TOrigin, TUnit>::getWorkDiv(workDiv);
+    }
+
+    namespace trait
+    {
+        //! The work div grid thread extent trait specialization: (blocks per grid) * (threads per block).
+        template<typename TWorkDiv>
+        struct GetWorkDiv<TWorkDiv, origin::Grid, unit::Threads>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(TWorkDiv const& workDiv)
+            {
+                return alpaka::getWorkDiv<origin::Grid, unit::Blocks>(workDiv)
+                       * alpaka::getWorkDiv<origin::Block, unit::Threads>(workDiv);
+            }
+        };
+
+        //! The work div grid element extent trait specialization: (threads per grid) * (elements per thread).
+        template<typename TWorkDiv>
+        struct GetWorkDiv<TWorkDiv, origin::Grid, unit::Elems>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(TWorkDiv const& workDiv)
+            {
+                return alpaka::getWorkDiv<origin::Grid, unit::Threads>(workDiv)
+                       * alpaka::getWorkDiv<origin::Thread, unit::Elems>(workDiv);
+            }
+        };
+
+        //! The work div block element extent trait specialization: (threads per block) * (elements per thread).
+        template<typename TWorkDiv>
+        struct GetWorkDiv<TWorkDiv, origin::Block, unit::Elems>
+        {
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(TWorkDiv const& workDiv)
+            {
+                return alpaka::getWorkDiv<origin::Block, unit::Threads>(workDiv)
+                       * alpaka::getWorkDiv<origin::Thread, unit::Elems>(workDiv);
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
new file mode 100644
index 0000000..26e0075
--- /dev/null
+++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
@@ -0,0 +1,119 @@
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+#include "alpaka/workdiv/Traits.hpp"
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    //! The SYCL accelerator work division. Holds the thread element extent and the sycl::nd_item it was built from.
+    template<typename TDim, typename TIdx>
+    class WorkDivGenericSycl : public concepts::Implements<ConceptWorkDiv, WorkDivGenericSycl<TDim, TIdx>>
+    {
+        static_assert(TDim::value > 0, "The SYCL work division must have a dimension greater than zero.");
+
+    public:
+        using WorkDivBase = WorkDivGenericSycl;
+
+        WorkDivGenericSycl(Vec<TDim, TIdx> const& threadElemExtent, sycl::nd_item<TDim::value> work_item)
+            : m_threadElemExtent{threadElemExtent}
+            , m_item_workdiv{work_item}
+        {
+        }
+
+        Vec<TDim, TIdx> const& m_threadElemExtent; // NOTE(review): reference member - the bound Vec must outlive this object.
+        sycl::nd_item<TDim::value> m_item_workdiv; // stored by value
+    };
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    //! The SYCL accelerator work division dimension get trait specialization: the dimensionality is TDim.
+    template<typename TDim, typename TIdx>
+    struct DimType<WorkDivGenericSycl<TDim, TIdx>>
+    {
+        using type = TDim;
+    };
+
+    //! The SYCL accelerator work division idx type trait specialization: the index type is TIdx.
+    template<typename TDim, typename TIdx>
+    struct IdxType<WorkDivGenericSycl<TDim, TIdx>>
+    {
+        using type = TIdx;
+    };
+
+    //! The SYCL accelerator work division grid block extent trait specialization.
+    template<typename TDim, typename TIdx>
+    struct GetWorkDiv<WorkDivGenericSycl<TDim, TIdx>, origin::Grid, unit::Blocks>
+    {
+        //! \return The number of blocks per grid dimension, read via get_group_range() in descending SYCL index.
+        static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
+        {
+            if constexpr(TDim::value == 0) // NOTE(review): WorkDivGenericSycl static_asserts TDim::value > 0.
+                return Vec<TDim, TIdx>{};
+            else if constexpr(TDim::value == 1)
+                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
+            else if constexpr(TDim::value == 2)
+            {
+                return Vec<TDim, TIdx>{
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
+            }
+            else // every other dimensionality is handled as three-dimensional
+            {
+                return Vec<TDim, TIdx>{
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(2)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
+            }
+        }
+    };
+
+    //! The SYCL accelerator work division block thread extent trait specialization.
+    template<typename TDim, typename TIdx>
+    struct GetWorkDiv<WorkDivGenericSycl<TDim, TIdx>, origin::Block, unit::Threads>
+    {
+        //! \return The number of threads per block dimension, read via get_local_range() in descending SYCL index.
+        static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
+        {
+            if constexpr(TDim::value == 0) // NOTE(review): WorkDivGenericSycl static_asserts TDim::value > 0.
+                return Vec<TDim, TIdx>{};
+            else if constexpr(TDim::value == 1)
+                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
+            else if constexpr(TDim::value == 2)
+            {
+                return Vec<TDim, TIdx>{
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
+            }
+            else // every other dimensionality is handled as three-dimensional
+            {
+                return Vec<TDim, TIdx>{
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(2)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
+            }
+        }
+    };
+
+    //! The SYCL accelerator work division thread element extent trait specialization.
+    template<typename TDim, typename TIdx>
+    struct GetWorkDiv<WorkDivGenericSycl<TDim, TIdx>, origin::Thread, unit::Elems>
+    {
+        //! \return The number of elements in each dimension of the thread (a copy of the stored extent).
+        static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
+        {
+            return workDiv.m_threadElemExtent;
+        }
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/workdiv/WorkDivHelpers.hpp b/include/alpaka/workdiv/WorkDivHelpers.hpp
new file mode 100644
index 0000000..c15319c
--- /dev/null
+++ b/include/alpaka/workdiv/WorkDivHelpers.hpp
@@ -0,0 +1,554 @@
+/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/acc/Traits.hpp"
+#include "alpaka/core/Assert.hpp"
+#include "alpaka/core/Common.hpp"
+#include "alpaka/core/Utility.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/kernel/KernelFunctionAttributes.hpp"
+#include "alpaka/kernel/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+#include "alpaka/workdiv/WorkDivMembers.hpp"
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <functional>
+#include <set>
+#include <type_traits>
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wswitch-default"
+#endif
+
+//! The alpaka library.
+namespace alpaka
+{
+    //! The grid block extent subdivision restrictions, i.e. the shape subDivideGridElems may give a block.
+    enum class GridBlockExtentSubDivRestrictions
+    {
+        EqualExtent, //!< The block thread extent will be equal in all dimensions.
+        CloseToEqualExtent, //!< The block thread extent will be as close to equal as possible in all dimensions.
+        Unrestricted, //!< The block thread extent will not have any restrictions.
+    };
+
+    namespace detail
+    {
+        //! Finds the largest divisor where dividend % divisor == 0
+        //! \param dividend The dividend.
+        //! \param maxDivisor The maximum divisor. Must be at least 1 and not greater than the dividend.
+        //! \return The biggest number that satisfies the following conditions:
+        //!     1) dividend%ret==0
+        //!     2) ret<=maxDivisor
+        template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
+        ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const& dividend, T const& maxDivisor) -> T
+        {
+            core::assertValueUnsigned(dividend);
+            core::assertValueUnsigned(maxDivisor);
+            ALPAKA_ASSERT(maxDivisor >= T{1} && dividend >= maxDivisor); // a zero divisor would make the modulo below UB
+
+            T divisor = maxDivisor;
+            while(dividend % divisor != 0) // terminates at 1 at the latest, since 1 divides everything
+                --divisor;
+            return divisor;
+        }
+
+        //! \param val The value to find divisors of.
+        //! \param maxDivisor The maximum. Must not be greater than val.
+        //! \return The set of all divisors of val that are less than or equal to the given maximum.
+        template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
+        ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const& val, T const& maxDivisor) -> std::set<T>
+        {
+            std::set<T> divisorSet;
+
+            core::assertValueUnsigned(val);
+            core::assertValueUnsigned(maxDivisor);
+            ALPAKA_ASSERT(maxDivisor <= val);
+
+            for(T i(1); i <= std::min(val, maxDivisor); ++i)
+            {
+                if(val % i == 0)
+                {
+                    divisorSet.insert(i); // i itself is the divisor bounded by maxDivisor; val / i may exceed it
+                }
+            }
+
+            return divisorSet;
+        }
+    } // namespace detail
+
+    //! \tparam TDim The dimensionality of the accelerator device properties.
+    //! \tparam TIdx The idx type of the accelerator device properties.
+    //! \param accDevProps The maxima for the work division.
+    //! \return True if all maximum counts and all per-dimension maximum extents are at least 1, false otherwise.
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps<TDim, TIdx> const& accDevProps) -> bool
+    {
+        // Check that the maximum counts are greater than or equal to 1.
+        if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
+           || (accDevProps.m_threadElemCountMax < 1))
+        {
+            return false;
+        }
+
+        // Store the maxima allowed for extents of grid, blocks and threads.
+        auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
+        auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
+        auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
+
+        // Check that the extents for all dimensions are correct.
+        for(typename TDim::value_type i(0); i < TDim::value; ++i)
+        {
+            // Check that the maximum extents are greater than or equal to 1.
+            if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    //! Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
+    //! 1. The maxima of block, thread and element extent and counts
+    //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
+    //! 3. The requirement of the block extent.
+    //! \return The work division made of the grid block, block thread and (clipped) thread element extents.
+    //! \param gridElemExtent The full extent of elements in the grid.
+    //! \param threadElemExtent the number of elements computed per thread.
+    //! \param accDevProps The maxima for the work division.
+    //! \param kernelBlockThreadCountMax The maximum number of threads per block. If it is zero this argument is not
+    //! used, device hard limits are used.
+    //! \param blockThreadMustDivideGridThreadExtent If this is true, the grid thread extent will be multiples of the
+    //! corresponding block thread extent.
+    //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
+    //!     thread extent will be one in this dimension.
+    //! \param gridBlockExtentSubDivRestrictions The grid block extent subdivision restrictions.
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto subDivideGridElems(
+        Vec<TDim, TIdx> const& gridElemExtent,
+        Vec<TDim, TIdx> const& threadElemExtent,
+        AccDevProps<TDim, TIdx> const& accDevProps,
+        TIdx kernelBlockThreadCountMax = static_cast<TIdx>(0u),
+        bool blockThreadMustDivideGridThreadExtent = true,
+        GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
+        = GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers<TDim, TIdx>
+    {
+        using Vec = Vec<TDim, TIdx>;
+        using DimLoopInd = typename TDim::value_type;
+
+        for(DimLoopInd i(0); i < TDim::value; ++i)
+        {
+            ALPAKA_ASSERT(gridElemExtent[i] >= 1);
+            ALPAKA_ASSERT(threadElemExtent[i] >= 1);
+            ALPAKA_ASSERT(threadElemExtent[i] <= accDevProps.m_threadElemExtentMax[i]);
+        }
+        ALPAKA_ASSERT(threadElemExtent.prod() <= accDevProps.m_threadElemCountMax);
+        ALPAKA_ASSERT(isValidAccDevProps(accDevProps));
+
+        // Handle threadElemExtent and compute gridThreadExtent. Afterwards, only the blockThreadExtent has to be
+        // optimized.
+        auto clippedThreadElemExtent = elementwise_min(threadElemExtent, gridElemExtent);
+        auto const gridThreadExtent = [&]
+        {
+            Vec r;
+            for(DimLoopInd i(0u); i < TDim::value; ++i)
+                r[i] = core::divCeil(gridElemExtent[i], clippedThreadElemExtent[i]);
+            return r;
+        }();
+
+        ///////////////////////////////////////////////////////////////////
+        // Try to calculate an optimal blockThreadExtent.
+
+        // Restrict the max block thread extent from the maximum possible to the grid thread extent.
+        // This removes dimensions not required in the grid thread extent.
+        // This has to be done before the blockThreadCountMax clipping to get the maximum correctly.
+        auto blockThreadExtent = elementwise_min(accDevProps.m_blockThreadExtentMax, gridThreadExtent);
+
+        // For equal block thread extent, restrict it to its minimum component.
+        // For example (512, 256, 1024) will get (256, 256, 256).
+        if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
+            blockThreadExtent = Vec::all(blockThreadExtent.min() != TIdx(0) ? blockThreadExtent.min() : TIdx(1));
+
+        // Choose kernelBlockThreadCountMax if it is not zero. It is assumed not to exceed the accelerator limit.
+        TIdx const& blockThreadCountMax
+            = (kernelBlockThreadCountMax != 0) ? kernelBlockThreadCountMax : accDevProps.m_blockThreadCountMax;
+
+        // Block thread extent could be {1024,1024,1024} although max threads per block is 1024. Block thread extent
+        // shows the max number of threads along each axis, it is not a measure to get max number of threads per block.
+        // It must be further limited (clipped above) by the kernel limit along each axis, using device limits is not
+        // enough.
+        for(typename TDim::value_type i(0); i < TDim::value; ++i)
+        {
+            blockThreadExtent[i] = std::min(blockThreadExtent[i], blockThreadCountMax);
+        }
+
+        // Make the blockThreadExtent product smaller or equal to the accelerator's limit.
+        if(blockThreadCountMax == 1)
+        {
+            blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
+        }
+        else if(blockThreadExtent.prod() > blockThreadCountMax)
+        {
+            switch(gridBlockExtentSubDivRestrictions)
+            {
+            case GridBlockExtentSubDivRestrictions::EqualExtent:
+                blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
+                break;
+            case GridBlockExtentSubDivRestrictions::CloseToEqualExtent:
+                // Very primitive clipping. Just halve the largest value until it fits.
+                while(blockThreadExtent.prod() > blockThreadCountMax)
+                    blockThreadExtent[blockThreadExtent.maxElem()] /= TIdx{2};
+                break;
+            case GridBlockExtentSubDivRestrictions::Unrestricted:
+                // Very primitive clipping. Just halve the smallest value (which is not 1) until it fits.
+                while(blockThreadExtent.prod() > blockThreadCountMax)
+                {
+                    auto const it = std::min_element(
+                        blockThreadExtent.begin(),
+                        blockThreadExtent.end() - 1, //! \todo why omit the last element?
+                        [](TIdx const& a, TIdx const& b)
+                        {
+                            if(a == TIdx{1})
+                                return false;
+                            if(b == TIdx{1})
+                                return true;
+                            return a < b;
+                        });
+                    *it /= TIdx{2};
+                }
+                break;
+            }
+        }
+
+
+        // Make the block thread extent divide the grid thread extent.
+        if(blockThreadMustDivideGridThreadExtent)
+        {
+            switch(gridBlockExtentSubDivRestrictions)
+            {
+            case GridBlockExtentSubDivRestrictions::EqualExtent:
+                {
+                    // For equal size block extent we have to compute the gcd of all grid thread extent that is less
+                    // than the current maximal block thread extent. For this we compute the divisors of all grid
+                    // thread extent less than the current maximal block thread extent.
+                    std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
+                    for(DimLoopInd i(0u); i < TDim::value; ++i)
+                    {
+                        gridThreadExtentDivisors[i]
+                            = detail::allDivisorsLessOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
+                    }
+                    // The largest divisor common to ALL dimensions is the optimal solution: keep a running intersection.
+                    std::set<TIdx> commonDivisors = gridThreadExtentDivisors[0];
+                    for(DimLoopInd i(1u); i < TDim::value; ++i)
+                    {
+                        std::set<TIdx> narrowed;
+                        std::set_intersection(
+                            std::begin(commonDivisors),
+                            std::end(commonDivisors),
+                            std::begin(gridThreadExtentDivisors[i]),
+                            std::end(gridThreadExtentDivisors[i]),
+                            std::inserter(narrowed, std::begin(narrowed)));
+                        commonDivisors.swap(narrowed);
+                    }
+                    // One divides every extent, so it is the safe fallback if the sets share no element.
+                    blockThreadExtent = Vec::all(commonDivisors.empty() ? TIdx{1} : *commonDivisors.rbegin());
+                    break;
+                }
+            case GridBlockExtentSubDivRestrictions::CloseToEqualExtent:
+                [[fallthrough]];
+            case GridBlockExtentSubDivRestrictions::Unrestricted:
+                for(DimLoopInd i(0u); i < TDim::value; ++i)
+                {
+                    blockThreadExtent[i] = detail::nextDivisorLowerOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
+                }
+                break;
+            }
+        }
+
+        // grid blocks extent = grid thread / block thread extent. quotient is rounded up.
+        auto gridBlockExtent = [&]
+        {
+            Vec r;
+            for(DimLoopInd i = 0; i < TDim::value; ++i)
+                r[i] = core::divCeil(gridThreadExtent[i], blockThreadExtent[i]);
+            return r;
+        }();
+
+
+        // Store the maxima allowed for extents of grid, blocks and threads.
+        auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
+        auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
+        auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
+
+        // Check that the extents for all dimensions are correct.
+        for(typename TDim::value_type i(0); i < TDim::value; ++i)
+        {
+            // Clip every extent that exceeds the per-dimension maximum of the device to that maximum.
+            if(gridBlockExtentMax[i] < gridBlockExtent[i])
+            {
+                gridBlockExtent[i] = gridBlockExtentMax[i];
+            }
+            if(blockThreadExtentMax[i] < blockThreadExtent[i])
+            {
+                blockThreadExtent[i] = blockThreadExtentMax[i];
+            }
+            if(threadElemExtentMax[i] < threadElemExtent[i])
+            {
+                clippedThreadElemExtent[i] = threadElemExtentMax[i];
+            }
+        }
+
+        return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
+    }
+
+    //! Kernel start configuration to determine a valid work division
+    //! \tparam TAcc The accelerator type; its Dim and Idx must match those of both extent types.
+    //! \tparam TGridElemExtent The type of the grid element extent.
+    //! \tparam TThreadElemExtent The type of the thread element extent.
+    template<
+        typename TAcc,
+        typename TGridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>,
+        typename TThreadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>>
+    struct KernelCfg
+    {
+        //! The full extent of elements in the grid.
+        TGridElemExtent const gridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
+        //! The number of elements computed per thread.
+        TThreadElemExtent const threadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
+        //! If this is true, the grid thread extent will be multiples of
+        //! the corresponding block thread extent.
+        //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
+        //!     thread extent will be one in this dimension.
+        bool blockThreadMustDivideGridThreadExtent = true;
+        //! The grid block extent subdivision restrictions.
+        GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
+            = GridBlockExtentSubDivRestrictions::Unrestricted;
+
+        static_assert(
+            Dim<TGridElemExtent>::value == Dim<TAcc>::value,
+            "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
+        static_assert(
+            Dim<TThreadElemExtent>::value == Dim<TAcc>::value,
+            "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
+        static_assert(
+            std::is_same_v<Idx<TGridElemExtent>, Idx<TAcc>>,
+            "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
+        static_assert(
+            std::is_same_v<Idx<TThreadElemExtent>, Idx<TAcc>>,
+            "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
+    };
+
+    //! \tparam TDev The type of the device.
+    //! \tparam TGridElemExtent The type of the grid element extent.
+    //! \tparam TThreadElemExtent The type of the thread element extent.
+    //! \param dev The device the work division should be valid for.
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args The kernel invocation arguments.
+    //! \return The work division for the accelerator based on the kernel and argument types
+    template<
+        typename TAcc,
+        typename TDev,
+        typename TGridElemExtent,
+        typename TThreadElemExtent,
+        typename TKernelFnObj,
+        typename... TArgs>
+    ALPAKA_FN_HOST auto getValidWorkDiv(
+        KernelCfg<TAcc, TGridElemExtent, TThreadElemExtent> const& kernelCfg, // extents and subdivision options
+        [[maybe_unused]] TDev const& dev,
+        TKernelFnObj const& kernelFnObj,
+        TArgs&&... args) -> WorkDivMembers<Dim<TAcc>, Idx<TAcc>>
+    {
+        using Acc = TAcc;
+
+        // Get max number of threads per block depending on the kernel function attributes.
+        // For GPU backend; number of registers used by the kernel, local and shared memory usage of the kernel
+        // determines the max number of threads per block. This number could be equal or less than the max number of
+        // threads per block defined by device properties.
+        auto const kernelFunctionAttributes
+            = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
+        auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
+
+        if constexpr(Dim<TGridElemExtent>::value == 0)
+        {
+            auto const zero = Vec<DimInt<0>, Idx<Acc>>{}; // zero-dimensional case: nothing to subdivide
+            ALPAKA_ASSERT(kernelCfg.gridElemExtent == zero);
+            ALPAKA_ASSERT(kernelCfg.threadElemExtent == zero);
+            return WorkDivMembers<DimInt<0>, Idx<Acc>>{zero, zero, zero};
+        }
+        else
+            return subDivideGridElems(
+                getExtents(kernelCfg.gridElemExtent),
+                getExtents(kernelCfg.threadElemExtent),
+                getAccDevProps<Acc>(dev),
+                static_cast<Idx<Acc>>(threadsPerBlock), // passed as kernelBlockThreadCountMax
+                kernelCfg.blockThreadMustDivideGridThreadExtent,
+                kernelCfg.gridBlockExtentSubDivRestrictions);
+
+        using V [[maybe_unused]] = Vec<Dim<TGridElemExtent>, Idx<TGridElemExtent>>; // both branches above return
+        ALPAKA_UNREACHABLE(WorkDivMembers<Dim<TGridElemExtent>, Idx<TGridElemExtent>>{V{}, V{}, V{}});
+    }
+
+    //! Checks if the work division is supported
+    //!
+    //! \tparam TWorkDiv The type of the work division.
+    //! \tparam TDim The dimensionality of the accelerator device properties.
+    //! \tparam TIdx The idx type of the accelerator device properties.
+    //! \param workDiv The work division to test for validity.
+    //! \param accDevProps The maxima for the work division.
+    //! \return If the work division is valid for the given accelerator device properties.
+    template<typename TWorkDiv, typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, AccDevProps<TDim, TIdx> const& accDevProps) -> bool
+    {
+        // Get the extents of grid, blocks and threads of the work division to check.
+        auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
+        auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
+        auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
+
+        // Check that the maximal counts are satisfied (product over all dimensions against the count limit).
+        if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
+        {
+            return false;
+        }
+        if(accDevProps.m_blockThreadCountMax < blockThreadExtent.prod())
+        {
+            return false;
+        }
+        if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
+        {
+            return false;
+        }
+
+        // Check that the extents for all dimensions are correct.
+        if constexpr(Dim<TWorkDiv>::value > 0)
+        {
+            // Store the maxima allowed for extents of grid, blocks and threads.
+            auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
+            auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
+            auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
+
+            for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
+            {
+                // No extent is allowed to be zero or greater than the allowed maximum.
+                if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
+                   || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
+                   || (threadElemExtentMax[i] < threadElemExtent[i]))
+                {
+                    return false;
+                }
+            }
+        }
+
+        return true;
+    }
+
+    //! Checks if the work division is supported by the device and by the kernel
+    //! \tparam TAcc The accelerator type (not referenced in the function body).
+    //! \tparam TWorkDiv The type of the work division.
+    //! \tparam TDim The dimensionality of the accelerator device properties.
+    //! \tparam TIdx The idx type of the accelerator device properties.
+    //! \param workDiv The work division to test for validity.
+    //! \param accDevProps The maxima for the work division.
+    //! \param kernelFunctionAttributes Kernel attributes, including the maximum number of threads per block that can
+    //! be used by this kernel on the given device. This number can be equal to or smaller than the number of
+    //! threads per block supported by the device.
+    //! \return Returns true if the work division is valid for the given accelerator device properties and for the
+    //! given kernel. Otherwise returns false.
+    template<typename TAcc, typename TWorkDiv, typename TDim, typename TIdx>
+    ALPAKA_FN_HOST auto isValidWorkDiv(
+        TWorkDiv const& workDiv,
+        AccDevProps<TDim, TIdx> const& accDevProps,
+        KernelFunctionAttributes const& kernelFunctionAttributes) -> bool
+    {
+        // Get the extents of grid, blocks and threads of the work division to check.
+        auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
+        auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
+        auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
+        // Use kernel properties to find the max threads per block for the kernel
+        auto const threadsPerBlockForKernel = kernelFunctionAttributes.maxThreadsPerBlock;
+        // Select the minimum of the kernel limit and the device limit as the upper bound for the threads per block
+        auto const allowedThreadsPerBlock = std::min(
+            static_cast<TIdx>(threadsPerBlockForKernel),
+            static_cast<TIdx>(accDevProps.m_blockThreadCountMax));
+        // Check that the maximal counts are satisfied.
+        if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
+        {
+            return false;
+        }
+        if(allowedThreadsPerBlock < blockThreadExtent.prod())
+        {
+            return false;
+        }
+        if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
+        {
+            return false;
+        }
+
+        // Check that the extents for all dimensions are correct.
+        if constexpr(Dim<TWorkDiv>::value > 0)
+        {
+            // Store the maxima allowed for extents of grid, blocks and threads.
+            auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
+            auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
+            auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
+
+            for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
+            {
+                // No extent is allowed to be zero or greater than the allowed maximum.
+                if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
+                   || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
+                   || (threadElemExtentMax[i] < threadElemExtent[i]))
+                {
+                    return false;
+                }
+            }
+        }
+
+        return true;
+    }
+
+    //! Checks if the work division is supported for the kernel on the device
+    //!
+    //! \tparam TAcc The accelerator to test the validity on.
+    //! \tparam TDev The type of the device.
+    //! \tparam TWorkDiv The type of work division to test for validity.
+    //! \param workDiv The work division to test for validity.
+    //! \param dev The device to test the work division for validity on.
+    //! \param kernelFnObj The kernel function object which should be executed.
+    //! \param args The kernel invocation arguments.
+    //! \return The result of isValidWorkDiv for the device properties and the function attributes of the kernel.
+    template<typename TAcc, typename TWorkDiv, typename TDev, typename TKernelFnObj, typename... TArgs>
+    ALPAKA_FN_HOST auto isValidWorkDiv(
+        TWorkDiv const& workDiv,
+        TDev const& dev,
+        TKernelFnObj const& kernelFnObj,
+        TArgs&&... args) -> bool
+    {
+        return isValidWorkDiv<TAcc>(
+            workDiv,
+            getAccDevProps<TAcc>(dev), // device limits
+            getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...)); // kernel limits
+    }
+
+    //! Checks if the work division is supported by the device (device properties only, no kernel attributes)
+    //!
+    //! \tparam TAcc The accelerator to test the validity on.
+    //! \param workDiv The work division to test for validity.
+    //! \param dev The device to test the work division for validity on.
+    //! \return If the work division is valid on this accelerator.
+    template<typename TAcc, typename TWorkDiv, typename TDev>
+    ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, TDev const& dev) -> bool
+    {
+        return isValidWorkDiv(workDiv, getAccDevProps<TAcc>(dev)); // forwards to the (workDiv, accDevProps) overload
+    }
+} // namespace alpaka
+
+#if BOOST_COMP_CLANG
+#    pragma clang diagnostic pop
+#endif
diff --git a/include/alpaka/workdiv/WorkDivMembers.hpp b/include/alpaka/workdiv/WorkDivMembers.hpp
new file mode 100644
index 0000000..3d36450
--- /dev/null
+++ b/include/alpaka/workdiv/WorkDivMembers.hpp
@@ -0,0 +1,159 @@
+/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/Common.hpp"
+#include "alpaka/extent/Traits.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+#include "alpaka/workdiv/Traits.hpp"
+
+#include <iosfwd>
+
+namespace alpaka
+{
+    //! A basic class holding the work division as grid block extent, block thread extent and thread element extent.
+    template<typename TDim, typename TIdx>
+    class WorkDivMembers : public concepts::Implements<ConceptWorkDiv, WorkDivMembers<TDim, TIdx>>
+    {
+    public:
+        ALPAKA_FN_HOST_ACC WorkDivMembers() = delete;
+
+        //! Accepts different alpaka vector types and takes the last TDim number of items.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TGridBlockExtent, typename TBlockThreadExtent, typename TThreadElemExtent>
+        ALPAKA_FN_HOST_ACC explicit WorkDivMembers(
+            TGridBlockExtent const& gridBlockExtent = TGridBlockExtent(),
+            TBlockThreadExtent const& blockThreadExtent = TBlockThreadExtent(),
+            TThreadElemExtent const& threadElemExtent = TThreadElemExtent())
+            : m_gridBlockExtent(getExtentVecEnd<TDim>(gridBlockExtent))
+            , m_blockThreadExtent(getExtentVecEnd<TDim>(blockThreadExtent))
+            , m_threadElemExtent(getExtentVecEnd<TDim>(threadElemExtent))
+        {
+        }
+
+        //! \brief Accepts three Vec<TDim, TIdx> extents and can be called without explicit template parameters.
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC WorkDivMembers(
+            alpaka::Vec<TDim, TIdx> const& gridBlockExtent,
+            alpaka::Vec<TDim, TIdx> const& blockThreadExtent,
+            alpaka::Vec<TDim, TIdx> const& elemExtent)
+            : m_gridBlockExtent(gridBlockExtent)
+            , m_blockThreadExtent(blockThreadExtent)
+            , m_threadElemExtent(elemExtent)
+        {
+        }
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        ALPAKA_FN_HOST_ACC WorkDivMembers(WorkDivMembers const& other)
+            : m_gridBlockExtent(other.m_gridBlockExtent)
+            , m_blockThreadExtent(other.m_blockThreadExtent)
+            , m_threadElemExtent(other.m_threadElemExtent)
+        {
+        }
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST_ACC explicit WorkDivMembers(TWorkDiv const& other)
+            : m_gridBlockExtent(subVecEnd<TDim>(getWorkDiv<Grid, Blocks>(other)))
+            , m_blockThreadExtent(subVecEnd<TDim>(getWorkDiv<Block, Threads>(other)))
+            , m_threadElemExtent(subVecEnd<TDim>(getWorkDiv<Thread, Elems>(other)))
+        {
+        }
+
+        WorkDivMembers(WorkDivMembers&&) = default;
+        auto operator=(WorkDivMembers const&) -> WorkDivMembers& = default;
+        auto operator=(WorkDivMembers&&) -> WorkDivMembers& = default;
+
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWorkDiv>
+        ALPAKA_FN_HOST_ACC auto operator=(TWorkDiv const& other) -> WorkDivMembers<TDim, TIdx>&
+        {
+            m_gridBlockExtent = subVecEnd<TDim>(getWorkDiv<Grid, Blocks>(other));
+            m_blockThreadExtent = subVecEnd<TDim>(getWorkDiv<Block, Threads>(other));
+            m_threadElemExtent = subVecEnd<TDim>(getWorkDiv<Thread, Elems>(other));
+            return *this;
+        }
+
+        ALPAKA_FN_HOST_ACC friend constexpr auto operator==(WorkDivMembers const& a, WorkDivMembers const& b) -> bool
+        {
+            return a.m_gridBlockExtent == b.m_gridBlockExtent && a.m_blockThreadExtent == b.m_blockThreadExtent
+                   && a.m_threadElemExtent == b.m_threadElemExtent;
+        }
+
+        ALPAKA_FN_HOST friend auto operator<<(std::ostream& os, WorkDivMembers const& workDiv) -> std::ostream&
+        {
+            return os << "{gridBlockExtent: " << workDiv.m_gridBlockExtent
+                      << ", blockThreadExtent: " << workDiv.m_blockThreadExtent
+                      << ", threadElemExtent: " << workDiv.m_threadElemExtent << "}";
+        }
+
+    public:
+        Vec<TDim, TIdx> m_gridBlockExtent;
+        Vec<TDim, TIdx> m_blockThreadExtent;
+        Vec<TDim, TIdx> m_threadElemExtent;
+    };
+
+    //! Deduction guide for the three-Vec constructor, so that it can be called without explicit template arguments.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_HOST_ACC WorkDivMembers(
+        alpaka::Vec<TDim, TIdx> const& gridBlockExtent,
+        alpaka::Vec<TDim, TIdx> const& blockThreadExtent,
+        alpaka::Vec<TDim, TIdx> const& elemExtent) -> WorkDivMembers<TDim, TIdx>;
+
+    namespace trait
+    {
+        //! The WorkDivMembers dimension get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<WorkDivMembers<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! The WorkDivMembers idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<WorkDivMembers<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        //! The WorkDivMembers grid block extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //! \return The number of blocks in each dimension of the grid.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(WorkDivMembers<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
+            {
+                return workDiv.m_gridBlockExtent;
+            }
+        };
+
+        //! The WorkDivMembers block thread extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //! \return The number of threads in each dimension of a block.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(WorkDivMembers<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
+            {
+                return workDiv.m_blockThreadExtent;
+            }
+        };
+
+        //! The WorkDivMembers thread element extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Thread, unit::Elems>
+        {
+            //! \return The number of elements in each dimension of a thread.
+            ALPAKA_NO_HOST_ACC_WARNING
+            ALPAKA_FN_HOST_ACC static auto getWorkDiv(WorkDivMembers<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
+            {
+                return workDiv.m_threadElemExtent;
+            }
+        };
+    } // namespace trait
+} // namespace alpaka
diff --git a/include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp b/include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp
new file mode 100644
index 0000000..8915267
--- /dev/null
+++ b/include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp
@@ -0,0 +1,117 @@
+/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/core/Cuda.hpp"
+#include "alpaka/core/Hip.hpp"
+#include "alpaka/idx/Traits.hpp"
+#include "alpaka/vec/Vec.hpp"
+#include "alpaka/workdiv/Traits.hpp"
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+
+namespace alpaka
+{
+    //! The GPU CUDA/HIP accelerator work division. Holds a reference to the given thread element extent, not a copy.
+    template<typename TDim, typename TIdx>
+    class WorkDivUniformCudaHipBuiltIn
+        : public concepts::Implements<ConceptWorkDiv, WorkDivUniformCudaHipBuiltIn<TDim, TIdx>>
+    {
+    public:
+        ALPAKA_FN_HOST_ACC WorkDivUniformCudaHipBuiltIn(Vec<TDim, TIdx> const& threadElemExtent)
+            : m_threadElemExtent(threadElemExtent)
+        {
+        }
+
+        // \TODO: Optimize! Add WorkDivUniformCudaHipBuiltInNoElems that has no member m_threadElemExtent as well as
+        // AccGpuUniformCudaHipRtNoElems. Use it instead of AccGpuUniformCudaHipRt if the thread element extent is one
+        // to reduce the register usage.
+        Vec<TDim, TIdx> const& m_threadElemExtent;
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#        endif
+
+#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#        endif
+
+    namespace trait
+    {
+        //! The GPU CUDA/HIP accelerator work division dimension get trait specialization.
+        template<typename TDim, typename TIdx>
+        struct DimType<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TDim;
+        };
+
+        //! The GPU CUDA/HIP accelerator work division idx type trait specialization.
+        template<typename TDim, typename TIdx>
+        struct IdxType<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>>
+        {
+            using type = TIdx;
+        };
+
+        //! The GPU CUDA/HIP accelerator work division grid block extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>, origin::Grid, unit::Blocks>
+        {
+            //! \return The number of blocks in each dimension of the grid.
+            __device__ static auto getWorkDiv(WorkDivUniformCudaHipBuiltIn<TDim, TIdx> const& /* workDiv */)
+                -> Vec<TDim, TIdx>
+            {
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                return castVec<TIdx>(getExtentVecEnd<TDim>(gridDim));
+#        else
+                return getExtentVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
+                    static_cast<TIdx>(hipGridDim_z),
+                    static_cast<TIdx>(hipGridDim_y),
+                    static_cast<TIdx>(hipGridDim_x)));
+#        endif
+            }
+        };
+
+        //! The GPU CUDA/HIP accelerator work division block thread extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>, origin::Block, unit::Threads>
+        {
+            //! \return The number of threads in each dimension of a block.
+            __device__ static auto getWorkDiv(WorkDivUniformCudaHipBuiltIn<TDim, TIdx> const& /* workDiv */)
+                -> Vec<TDim, TIdx>
+            {
+#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+                return castVec<TIdx>(getExtentVecEnd<TDim>(blockDim));
+#        else
+                return getExtentVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
+                    static_cast<TIdx>(hipBlockDim_z),
+                    static_cast<TIdx>(hipBlockDim_y),
+                    static_cast<TIdx>(hipBlockDim_x)));
+#        endif
+            }
+        };
+
+        //! The GPU CUDA/HIP accelerator work division thread element extent trait specialization.
+        template<typename TDim, typename TIdx>
+        struct GetWorkDiv<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>, origin::Thread, unit::Elems>
+        {
+            //! \return The number of elements in each dimension of a thread.
+            __device__ static auto getWorkDiv(WorkDivUniformCudaHipBuiltIn<TDim, TIdx> const& workDiv)
+                -> Vec<TDim, TIdx>
+            {
+                return workDiv.m_threadElemExtent;
+            }
+        };
+    } // namespace trait
+
+#    endif
+
+} // namespace alpaka
+
+#endif
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..b38f59f
--- /dev/null
+++ b/run.py
@@ -0,0 +1,68 @@
+import subprocess
+import sys
+import os
+
+def build_kernel_tests():
+    """
+    Runs `make` in the current working directory to build the kernel tests.
+    Returns True on success; False if the Makefile or `make` is missing or the build fails.
+    """
+    print("Building project with Make...")
+    try:
+        # Bail out early if there is no Makefile in the current working directory
+        if not os.path.exists("Makefile"):
+            print("Error: Makefile not found in current directory")
+            return False
+
+        # Run 'make'; check=True raises CalledProcessError on a non-zero exit status.
+        # Output is not captured (the subprocess.run default), so the user sees the compiler output in real-time
+        subprocess.run(["make"], check=True)
+        
+        print("✅ Build successful\n")
+        return True
+        
+    except subprocess.CalledProcessError:
+        print("❌ Build failed. Please fix C++ errors before running benchmarks.")
+        return False
+    except FileNotFoundError:
+        print("❌ Error: 'make' command not found. Is it installed?")
+        return False
+
+def run_benchmark(executable_path, args):
+    """
+    Runs the executable with the given args (each converted to str) and prints its captured stdout.
+    """
+    if not os.path.exists(executable_path):
+        print(f"❌ Error: Executable '{executable_path}' not found after build.")
+        return
+
+    print(f"🚀 Running {executable_path} with args: {args}...")
+    try:
+        # Build the argv list; every argument is converted to a string
+        cmd = [executable_path] + [str(a) for a in args]
+        
+        # Capture stdout/stderr as text for parsing; check=True raises CalledProcessError on a non-zero exit
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        
+        print("--- Output ---")
+        print(result.stdout)
+        
+        # TODO: Add your parsing logic here (regex or string split) to get the time
+        
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Execution failed with return code {e.returncode}")
+        print("Stderr:", e.stderr)
+
+if __name__ == "__main__":
+    # Build Phase
+    if not build_kernel_tests():
+        sys.exit(1)
+
+    # Benchmark Phase
+    # Adjust this path to match where your Makefile outputs the binary
+    binary_path = "./build/alpaka_test_kernel" 
+    
+    input_sizes = [1024, 2048, 4096]
+    
+    for size in input_sizes:
+        run_benchmark(binary_path, [size])

From 3077d7415c91beb72cb6b3902a769e5bfe66c152 Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Tue, 16 Dec 2025 11:25:28 +0100
Subject: [PATCH 02/33] run.py

---
 .gitattributes                                |    2 -
 include/alpaka/acc/AccCpuOmp2Blocks.hpp       |  234 ---
 include/alpaka/acc/AccCpuOmp2Threads.hpp      |  237 ---
 include/alpaka/acc/AccCpuSerial.hpp           |  227 ---
 include/alpaka/acc/AccCpuSycl.hpp             |   38 -
 include/alpaka/acc/AccCpuTbbBlocks.hpp        |  228 ---
 include/alpaka/acc/AccCpuThreads.hpp          |  245 ---
 include/alpaka/acc/AccDevProps.hpp            |   34 -
 include/alpaka/acc/AccFpgaSyclIntel.hpp       |   38 -
 include/alpaka/acc/AccGenericSycl.hpp         |  214 ---
 include/alpaka/acc/AccGpuCudaRt.hpp           |   34 -
 include/alpaka/acc/AccGpuHipRt.hpp            |   34 -
 include/alpaka/acc/AccGpuSyclIntel.hpp        |   38 -
 include/alpaka/acc/AccGpuUniformCudaHipRt.hpp |  307 ----
 include/alpaka/acc/Tag.hpp                    |   72 -
 include/alpaka/acc/TagAccIsEnabled.hpp        |   36 -
 include/alpaka/acc/Traits.hpp                 |  115 --
 include/alpaka/alpaka.hpp                     |  229 ---
 include/alpaka/atomic/AtomicAtomicRef.hpp     |  237 ---
 include/alpaka/atomic/AtomicCpu.hpp           |   30 -
 include/alpaka/atomic/AtomicGenericSycl.hpp   |  263 ---
 include/alpaka/atomic/AtomicHierarchy.hpp     |   34 -
 include/alpaka/atomic/AtomicNoOp.hpp          |   37 -
 include/alpaka/atomic/AtomicOmpBuiltIn.hpp    |  320 ----
 include/alpaka/atomic/AtomicStdLibLock.hpp    |  103 --
 .../alpaka/atomic/AtomicUniformCudaHip.hpp    |  512 ------
 .../atomic/AtomicUniformCudaHipBuiltIn.hpp    |  321 ----
 include/alpaka/atomic/Op.hpp                  |  249 ---
 include/alpaka/atomic/Traits.hpp              |  304 ----
 .../dyn/BlockSharedDynMemberAllocKiB.hpp      |   15 -
 .../dyn/BlockSharedMemDynGenericSycl.hpp      |   43 -
 .../shared/dyn/BlockSharedMemDynMember.hpp    |  113 --
 ...BlockSharedMemDynUniformCudaHipBuiltIn.hpp |   57 -
 include/alpaka/block/shared/dyn/Traits.hpp    |   44 -
 .../shared/st/BlockSharedMemStGenericSycl.hpp |   67 -
 .../shared/st/BlockSharedMemStMember.hpp      |   59 -
 .../st/BlockSharedMemStMemberMasterSync.hpp   |   86 -
 .../BlockSharedMemStUniformCudaHipBuiltIn.hpp |   60 -
 include/alpaka/block/shared/st/Traits.hpp     |   59 -
 .../st/detail/BlockSharedMemStMemberImpl.hpp  |  145 --
 .../alpaka/block/sync/BlockSyncBarrierOmp.hpp |  109 --
 .../block/sync/BlockSyncBarrierThread.hpp     |   62 -
 .../block/sync/BlockSyncGenericSycl.hpp       |   79 -
 include/alpaka/block/sync/BlockSyncNoOp.hpp   |   40 -
 .../sync/BlockSyncUniformCudaHipBuiltIn.hpp   |  122 --
 include/alpaka/block/sync/Traits.hpp          |  107 --
 include/alpaka/core/Align.hpp                 |   65 -
 include/alpaka/core/AlignedAlloc.hpp          |   23 -
 include/alpaka/core/ApiCudaRt.hpp             |  402 -----
 include/alpaka/core/ApiHipRt.hpp              |  441 -----
 include/alpaka/core/Assert.hpp                |  105 --
 include/alpaka/core/BarrierThread.hpp         |  168 --
 include/alpaka/core/BoostPredef.hpp           |   79 -
 include/alpaka/core/CallbackThread.hpp        |  171 --
 include/alpaka/core/ClipCast.hpp              |   27 -
 include/alpaka/core/Common.hpp                |  221 ---
 include/alpaka/core/Concepts.hpp              |   67 -
 include/alpaka/core/Cuda.hpp                  |   58 -
 include/alpaka/core/CudaHipCommon.hpp         |  161 --
 include/alpaka/core/Debug.hpp                 |   77 -
 include/alpaka/core/Decay.hpp                 |   16 -
 include/alpaka/core/DemangleTypeNames.hpp     |   23 -
 include/alpaka/core/Hip.hpp                   |   14 -
 include/alpaka/core/OmpSchedule.hpp           |   88 -
 include/alpaka/core/Positioning.hpp           |   49 -
 include/alpaka/core/RemoveRestrict.hpp        |   35 -
 include/alpaka/core/RuntimeMacros.hpp         |   52 -
 include/alpaka/core/Sycl.hpp                  |  199 ---
 include/alpaka/core/ThreadPool.hpp            |  104 --
 include/alpaka/core/UniformCudaHip.hpp        |  113 --
 include/alpaka/core/Unreachable.hpp           |   25 -
 include/alpaka/core/Unroll.hpp                |   25 -
 include/alpaka/core/Utility.hpp               |   62 -
 include/alpaka/core/Vectorize.hpp             |  358 ----
 include/alpaka/dev/DevCpu.hpp                 |  207 ---
 include/alpaka/dev/DevCpuSycl.hpp             |   17 -
 include/alpaka/dev/DevCudaRt.hpp              |   18 -
 include/alpaka/dev/DevFpgaSyclIntel.hpp       |   17 -
 include/alpaka/dev/DevGenericSycl.hpp         |  282 ----
 include/alpaka/dev/DevGpuSyclIntel.hpp        |   17 -
 include/alpaka/dev/DevHipRt.hpp               |   18 -
 include/alpaka/dev/DevUniformCudaHipRt.hpp    |  269 ---
 include/alpaka/dev/Traits.hpp                 |  140 --
 include/alpaka/dev/common/QueueRegistry.hpp   |   59 -
 include/alpaka/dev/cpu/SysInfo.hpp            |  237 ---
 include/alpaka/dev/cpu/Wait.hpp               |   27 -
 include/alpaka/dim/DimArithmetic.hpp          |   19 -
 include/alpaka/dim/DimIntegralConst.hpp       |   16 -
 include/alpaka/dim/Traits.hpp                 |   20 -
 include/alpaka/elem/Traits.hpp                |   33 -
 include/alpaka/event/EventCpu.hpp             |   13 -
 include/alpaka/event/EventCpuSycl.hpp         |   17 -
 include/alpaka/event/EventCudaRt.hpp          |   18 -
 include/alpaka/event/EventFpgaSyclIntel.hpp   |   17 -
 include/alpaka/event/EventGenericSycl.hpp     |  161 --
 include/alpaka/event/EventGenericThreads.hpp  |  395 -----
 include/alpaka/event/EventGpuSyclIntel.hpp    |   17 -
 include/alpaka/event/EventHipRt.hpp           |   18 -
 .../alpaka/event/EventUniformCudaHipRt.hpp    |  263 ---
 include/alpaka/event/Traits.hpp               |   38 -
 include/alpaka/example/ExampleDefaultAcc.hpp  |   41 -
 .../alpaka/example/ExecuteForEachAccTag.hpp   |   27 -
 include/alpaka/exec/ElementIndex.hpp          |   18 -
 include/alpaka/exec/IndependentElements.hpp   |  454 -----
 include/alpaka/exec/Once.hpp                  |   56 -
 include/alpaka/exec/UniformElements.hpp       | 1145 -------------
 include/alpaka/extent/Traits.hpp              |  162 --
 include/alpaka/idx/Accessors.hpp              |  116 --
 include/alpaka/idx/MapIdx.hpp                 |   98 --
 include/alpaka/idx/Traits.hpp                 |   44 -
 include/alpaka/idx/bt/IdxBtGenericSycl.hpp    |   77 -
 include/alpaka/idx/bt/IdxBtLinear.hpp         |   72 -
 include/alpaka/idx/bt/IdxBtOmp.hpp            |   77 -
 include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp |   77 -
 .../idx/bt/IdxBtUniformCudaHipBuiltIn.hpp     |   81 -
 include/alpaka/idx/bt/IdxBtZero.hpp           |   53 -
 include/alpaka/idx/gb/IdxGbGenericSycl.hpp    |   77 -
 include/alpaka/idx/gb/IdxGbLinear.hpp         |   73 -
 include/alpaka/idx/gb/IdxGbRef.hpp            |   59 -
 .../idx/gb/IdxGbUniformCudaHipBuiltIn.hpp     |   81 -
 include/alpaka/intrinsic/IntrinsicCpu.hpp     |   88 -
 .../alpaka/intrinsic/IntrinsicFallback.hpp    |   77 -
 .../alpaka/intrinsic/IntrinsicGenericSycl.hpp |   57 -
 .../IntrinsicUniformCudaHipBuiltIn.hpp        |   78 -
 include/alpaka/intrinsic/Traits.hpp           |   84 -
 .../kernel/KernelFunctionAttributes.hpp       |   25 -
 include/alpaka/kernel/SyclSubgroupSize.hpp    |  120 --
 .../alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp |  991 -----------
 .../kernel/TaskKernelCpuOmp2Threads.hpp       |  232 ---
 include/alpaka/kernel/TaskKernelCpuSerial.hpp |  171 --
 include/alpaka/kernel/TaskKernelCpuSycl.hpp   |   20 -
 .../alpaka/kernel/TaskKernelCpuTbbBlocks.hpp  |  183 --
 .../alpaka/kernel/TaskKernelCpuThreads.hpp    |  240 ---
 .../alpaka/kernel/TaskKernelFpgaSyclIntel.hpp |   20 -
 .../alpaka/kernel/TaskKernelGenericSycl.hpp   |  314 ----
 include/alpaka/kernel/TaskKernelGpuCudaRt.hpp |   19 -
 include/alpaka/kernel/TaskKernelGpuHipRt.hpp  |   18 -
 .../alpaka/kernel/TaskKernelGpuSyclIntel.hpp  |   20 -
 .../kernel/TaskKernelGpuUniformCudaHipRt.hpp  |  373 -----
 include/alpaka/kernel/Traits.hpp              |  383 -----
 include/alpaka/math/Complex.hpp               |  582 -------
 include/alpaka/math/FloatEqualExact.hpp       |   50 -
 include/alpaka/math/MathGenericSycl.hpp       |  751 ---------
 include/alpaka/math/MathStdLib.hpp            |  299 ----
 .../alpaka/math/MathUniformCudaHipBuiltIn.hpp | 1373 ---------------
 include/alpaka/math/Traits.hpp                | 1488 -----------------
 include/alpaka/mem/alloc/AllocCpuAligned.hpp  |   67 -
 include/alpaka/mem/alloc/AllocCpuNew.hpp      |   39 -
 include/alpaka/mem/alloc/Traits.hpp           |   46 -
 include/alpaka/mem/buf/BufCpu.hpp             |  314 ----
 include/alpaka/mem/buf/BufCpuSycl.hpp         |   19 -
 include/alpaka/mem/buf/BufCudaRt.hpp          |   18 -
 include/alpaka/mem/buf/BufFpgaSyclIntel.hpp   |   19 -
 include/alpaka/mem/buf/BufGenericSycl.hpp     |  272 ---
 include/alpaka/mem/buf/BufGpuSyclIntel.hpp    |   19 -
 include/alpaka/mem/buf/BufHipRt.hpp           |   18 -
 .../alpaka/mem/buf/BufUniformCudaHipRt.hpp    |  422 -----
 include/alpaka/mem/buf/SetKernel.hpp          |   58 -
 include/alpaka/mem/buf/Traits.hpp             |  192 ---
 include/alpaka/mem/buf/cpu/Copy.hpp           |  220 ---
 include/alpaka/mem/buf/cpu/Set.hpp            |  186 ---
 include/alpaka/mem/buf/sycl/Common.hpp        |   57 -
 include/alpaka/mem/buf/sycl/Copy.hpp          |  240 ---
 include/alpaka/mem/buf/sycl/Set.hpp           |  212 ---
 .../alpaka/mem/buf/uniformCudaHip/Copy.hpp    |  643 -------
 include/alpaka/mem/buf/uniformCudaHip/Set.hpp |  385 -----
 include/alpaka/mem/fence/MemFenceCpu.hpp      |   61 -
 .../alpaka/mem/fence/MemFenceCpuSerial.hpp    |   49 -
 .../alpaka/mem/fence/MemFenceGenericSycl.hpp  |   60 -
 .../alpaka/mem/fence/MemFenceOmp2Blocks.hpp   |   54 -
 .../alpaka/mem/fence/MemFenceOmp2Threads.hpp  |   68 -
 .../fence/MemFenceUniformCudaHipBuiltIn.hpp   |   65 -
 include/alpaka/mem/fence/Traits.hpp           |   66 -
 include/alpaka/mem/global/DeviceGlobalCpu.hpp |  151 --
 .../mem/global/DeviceGlobalGenericSycl.hpp    |   96 --
 .../DeviceGlobalUniformCudaHipBuiltIn.hpp     |  187 ---
 include/alpaka/mem/global/Traits.hpp          |   45 -
 include/alpaka/mem/view/Traits.hpp            |  614 -------
 include/alpaka/mem/view/ViewAccessOps.hpp     |  151 --
 include/alpaka/mem/view/ViewConst.hpp         |  115 --
 include/alpaka/mem/view/ViewPlainPtr.hpp      |  192 ---
 include/alpaka/mem/view/ViewStdArray.hpp      |   94 --
 include/alpaka/mem/view/ViewStdVector.hpp     |   92 -
 include/alpaka/mem/view/ViewSubView.hpp       |  217 ---
 include/alpaka/meta/Apply.hpp                 |   22 -
 include/alpaka/meta/CartesianProduct.hpp      |   84 -
 include/alpaka/meta/Concatenate.hpp           |   29 -
 include/alpaka/meta/DependentFalseType.hpp    |   17 -
 include/alpaka/meta/Filter.hpp                |   47 -
 include/alpaka/meta/Fold.hpp                  |   24 -
 include/alpaka/meta/ForEachType.hpp           |   52 -
 include/alpaka/meta/Functional.hpp            |   30 -
 include/alpaka/meta/InheritFromList.hpp       |   16 -
 include/alpaka/meta/IntegerSequence.hpp       |  125 --
 include/alpaka/meta/Integral.hpp              |   56 -
 include/alpaka/meta/IsArrayOrVector.hpp       |   65 -
 include/alpaka/meta/IsStrictBase.hpp          |   15 -
 include/alpaka/meta/NdLoop.hpp                |   85 -
 include/alpaka/meta/NonZero.hpp               |   27 -
 include/alpaka/meta/Set.hpp                   |   60 -
 include/alpaka/meta/Transform.hpp             |   22 -
 include/alpaka/meta/TypeListOps.hpp           |   95 --
 include/alpaka/meta/Unique.hpp                |   41 -
 include/alpaka/offset/Traits.hpp              |  132 --
 include/alpaka/platform/PlatformCpu.hpp       |   69 -
 include/alpaka/platform/PlatformCpuSycl.hpp   |   33 -
 include/alpaka/platform/PlatformCudaRt.hpp    |   18 -
 .../alpaka/platform/PlatformFpgaSyclIntel.hpp |   51 -
 .../alpaka/platform/PlatformGenericSycl.hpp   |  746 ---------
 .../alpaka/platform/PlatformGpuSyclIntel.hpp  |   36 -
 include/alpaka/platform/PlatformHipRt.hpp     |   18 -
 .../platform/PlatformUniformCudaHipRt.hpp     |  265 ---
 include/alpaka/platform/Traits.hpp            |   94 --
 include/alpaka/queue/Properties.hpp           |   20 -
 include/alpaka/queue/QueueCpuBlocking.hpp     |   13 -
 include/alpaka/queue/QueueCpuNonBlocking.hpp  |   13 -
 include/alpaka/queue/QueueCpuSyclBlocking.hpp |   17 -
 .../alpaka/queue/QueueCpuSyclNonBlocking.hpp  |   17 -
 include/alpaka/queue/QueueCudaRtBlocking.hpp  |   18 -
 .../alpaka/queue/QueueCudaRtNonBlocking.hpp   |   18 -
 .../queue/QueueFpgaSyclIntelBlocking.hpp      |   17 -
 .../queue/QueueFpgaSyclIntelNonBlocking.hpp   |   17 -
 .../alpaka/queue/QueueGenericSyclBlocking.hpp |   17 -
 .../queue/QueueGenericSyclNonBlocking.hpp     |   17 -
 .../queue/QueueGenericThreadsBlocking.hpp     |  166 --
 .../queue/QueueGenericThreadsNonBlocking.hpp  |  156 --
 .../queue/QueueGpuSyclIntelBlocking.hpp       |   17 -
 .../queue/QueueGpuSyclIntelNonBlocking.hpp    |   17 -
 include/alpaka/queue/QueueHipRtBlocking.hpp   |   18 -
 .../alpaka/queue/QueueHipRtNonBlocking.hpp    |   18 -
 .../queue/QueueUniformCudaHipRtBlocking.hpp   |   19 -
 .../QueueUniformCudaHipRtNonBlocking.hpp      |   19 -
 include/alpaka/queue/Traits.hpp               |   71 -
 include/alpaka/queue/cpu/ICpuQueue.hpp        |   14 -
 .../alpaka/queue/cpu/IGenericThreadsQueue.hpp |   35 -
 .../queue/cuda_hip/QueueUniformCudaHipRt.hpp  |  245 ---
 .../queue/sycl/QueueGenericSyclBase.hpp       |  289 ----
 .../rand/Philox/MultiplyAndSplit64to32.hpp    |   43 -
 .../alpaka/rand/Philox/PhiloxBaseCommon.hpp   |   92 -
 .../alpaka/rand/Philox/PhiloxConstants.hpp    |   70 -
 include/alpaka/rand/Philox/PhiloxSingle.hpp   |  148 --
 .../alpaka/rand/Philox/PhiloxStateless.hpp    |  125 --
 .../rand/Philox/PhiloxStatelessKeyedBase.hpp  |   36 -
 include/alpaka/rand/Philox/PhiloxVector.hpp   |  102 --
 include/alpaka/rand/RandDefault.hpp           |  216 ---
 include/alpaka/rand/RandGenericSycl.hpp       |  198 ---
 include/alpaka/rand/RandPhilox.hpp            |  201 ---
 include/alpaka/rand/RandPhiloxStateless.hpp   |   30 -
 include/alpaka/rand/RandStdLib.hpp            |  279 ----
 .../alpaka/rand/RandUniformCudaHipRand.hpp    |  283 ----
 include/alpaka/rand/TinyMT/Engine.hpp         |   66 -
 include/alpaka/rand/TinyMT/LICENSE.txt        |   38 -
 include/alpaka/rand/TinyMT/tinymt32.h         |  429 -----
 include/alpaka/rand/Traits.hpp                |  100 --
 include/alpaka/standalone/CpuOmp2Blocks.hpp   |    9 -
 include/alpaka/standalone/CpuOmp2Threads.hpp  |    9 -
 include/alpaka/standalone/CpuSerial.hpp       |    9 -
 include/alpaka/standalone/CpuSycl.hpp         |   13 -
 include/alpaka/standalone/CpuTbbBlocks.hpp    |    9 -
 include/alpaka/standalone/CpuThreads.hpp      |    9 -
 include/alpaka/standalone/FpgaSyclIntel.hpp   |   13 -
 include/alpaka/standalone/GenericSycl.hpp     |    9 -
 include/alpaka/standalone/GpuCudaRt.hpp       |   21 -
 include/alpaka/standalone/GpuHipRt.hpp        |    9 -
 include/alpaka/standalone/GpuSyclIntel.hpp    |   13 -
 include/alpaka/test/Array.hpp                 |   29 -
 include/alpaka/test/Check.hpp                 |   19 -
 include/alpaka/test/Extent.hpp                |   42 -
 .../alpaka/test/KernelExecutionFixture.hpp    |  105 --
 include/alpaka/test/MeasureKernelRunTime.hpp  |   47 -
 include/alpaka/test/acc/TestAccs.hpp          |  183 --
 include/alpaka/test/dim/TestDims.hpp          |   34 -
 .../test/event/EventHostManualTrigger.hpp     |  779 ---------
 include/alpaka/test/idx/TestIdxs.hpp          |   28 -
 include/alpaka/test/mem/view/Iterator.hpp     |  143 --
 include/alpaka/test/mem/view/ViewTest.hpp     |  264 ---
 include/alpaka/test/queue/Queue.hpp           |  146 --
 .../test/queue/QueueCpuOmp2Collective.hpp     |  297 ----
 .../alpaka/test/queue/QueueTestFixture.hpp    |   23 -
 include/alpaka/traits/Traits.hpp              |   37 -
 include/alpaka/vec/Traits.hpp                 |  102 --
 include/alpaka/vec/Vec.hpp                    |  799 ---------
 include/alpaka/version.hpp                    |   14 -
 include/alpaka/wait/Traits.hpp                |   50 -
 include/alpaka/warp/Traits.hpp                |  317 ----
 include/alpaka/warp/WarpGenericSycl.hpp       |  200 ---
 include/alpaka/warp/WarpSingleThread.hpp      |  121 --
 .../alpaka/warp/WarpUniformCudaHipBuiltIn.hpp |  189 ---
 include/alpaka/workdiv/Traits.hpp             |   77 -
 include/alpaka/workdiv/WorkDivGenericSycl.hpp |  119 --
 include/alpaka/workdiv/WorkDivHelpers.hpp     |  554 ------
 include/alpaka/workdiv/WorkDivMembers.hpp     |  159 --
 .../workdiv/WorkDivUniformCudaHipBuiltIn.hpp  |  117 --
 293 files changed, 39602 deletions(-)
 delete mode 100644 .gitattributes
 delete mode 100644 include/alpaka/acc/AccCpuOmp2Blocks.hpp
 delete mode 100644 include/alpaka/acc/AccCpuOmp2Threads.hpp
 delete mode 100644 include/alpaka/acc/AccCpuSerial.hpp
 delete mode 100644 include/alpaka/acc/AccCpuSycl.hpp
 delete mode 100644 include/alpaka/acc/AccCpuTbbBlocks.hpp
 delete mode 100644 include/alpaka/acc/AccCpuThreads.hpp
 delete mode 100644 include/alpaka/acc/AccDevProps.hpp
 delete mode 100644 include/alpaka/acc/AccFpgaSyclIntel.hpp
 delete mode 100644 include/alpaka/acc/AccGenericSycl.hpp
 delete mode 100644 include/alpaka/acc/AccGpuCudaRt.hpp
 delete mode 100644 include/alpaka/acc/AccGpuHipRt.hpp
 delete mode 100644 include/alpaka/acc/AccGpuSyclIntel.hpp
 delete mode 100644 include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
 delete mode 100644 include/alpaka/acc/Tag.hpp
 delete mode 100644 include/alpaka/acc/TagAccIsEnabled.hpp
 delete mode 100644 include/alpaka/acc/Traits.hpp
 delete mode 100644 include/alpaka/alpaka.hpp
 delete mode 100644 include/alpaka/atomic/AtomicAtomicRef.hpp
 delete mode 100644 include/alpaka/atomic/AtomicCpu.hpp
 delete mode 100644 include/alpaka/atomic/AtomicGenericSycl.hpp
 delete mode 100644 include/alpaka/atomic/AtomicHierarchy.hpp
 delete mode 100644 include/alpaka/atomic/AtomicNoOp.hpp
 delete mode 100644 include/alpaka/atomic/AtomicOmpBuiltIn.hpp
 delete mode 100644 include/alpaka/atomic/AtomicStdLibLock.hpp
 delete mode 100644 include/alpaka/atomic/AtomicUniformCudaHip.hpp
 delete mode 100644 include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/atomic/Op.hpp
 delete mode 100644 include/alpaka/atomic/Traits.hpp
 delete mode 100644 include/alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp
 delete mode 100644 include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
 delete mode 100644 include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp
 delete mode 100644 include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/block/shared/dyn/Traits.hpp
 delete mode 100644 include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
 delete mode 100644 include/alpaka/block/shared/st/BlockSharedMemStMember.hpp
 delete mode 100644 include/alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp
 delete mode 100644 include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/block/shared/st/Traits.hpp
 delete mode 100644 include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp
 delete mode 100644 include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
 delete mode 100644 include/alpaka/block/sync/BlockSyncBarrierThread.hpp
 delete mode 100644 include/alpaka/block/sync/BlockSyncGenericSycl.hpp
 delete mode 100644 include/alpaka/block/sync/BlockSyncNoOp.hpp
 delete mode 100644 include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/block/sync/Traits.hpp
 delete mode 100644 include/alpaka/core/Align.hpp
 delete mode 100644 include/alpaka/core/AlignedAlloc.hpp
 delete mode 100644 include/alpaka/core/ApiCudaRt.hpp
 delete mode 100644 include/alpaka/core/ApiHipRt.hpp
 delete mode 100644 include/alpaka/core/Assert.hpp
 delete mode 100644 include/alpaka/core/BarrierThread.hpp
 delete mode 100644 include/alpaka/core/BoostPredef.hpp
 delete mode 100644 include/alpaka/core/CallbackThread.hpp
 delete mode 100644 include/alpaka/core/ClipCast.hpp
 delete mode 100644 include/alpaka/core/Common.hpp
 delete mode 100644 include/alpaka/core/Concepts.hpp
 delete mode 100644 include/alpaka/core/Cuda.hpp
 delete mode 100644 include/alpaka/core/CudaHipCommon.hpp
 delete mode 100644 include/alpaka/core/Debug.hpp
 delete mode 100644 include/alpaka/core/Decay.hpp
 delete mode 100644 include/alpaka/core/DemangleTypeNames.hpp
 delete mode 100644 include/alpaka/core/Hip.hpp
 delete mode 100644 include/alpaka/core/OmpSchedule.hpp
 delete mode 100644 include/alpaka/core/Positioning.hpp
 delete mode 100644 include/alpaka/core/RemoveRestrict.hpp
 delete mode 100644 include/alpaka/core/RuntimeMacros.hpp
 delete mode 100644 include/alpaka/core/Sycl.hpp
 delete mode 100644 include/alpaka/core/ThreadPool.hpp
 delete mode 100644 include/alpaka/core/UniformCudaHip.hpp
 delete mode 100644 include/alpaka/core/Unreachable.hpp
 delete mode 100644 include/alpaka/core/Unroll.hpp
 delete mode 100644 include/alpaka/core/Utility.hpp
 delete mode 100644 include/alpaka/core/Vectorize.hpp
 delete mode 100644 include/alpaka/dev/DevCpu.hpp
 delete mode 100644 include/alpaka/dev/DevCpuSycl.hpp
 delete mode 100644 include/alpaka/dev/DevCudaRt.hpp
 delete mode 100644 include/alpaka/dev/DevFpgaSyclIntel.hpp
 delete mode 100644 include/alpaka/dev/DevGenericSycl.hpp
 delete mode 100644 include/alpaka/dev/DevGpuSyclIntel.hpp
 delete mode 100644 include/alpaka/dev/DevHipRt.hpp
 delete mode 100644 include/alpaka/dev/DevUniformCudaHipRt.hpp
 delete mode 100644 include/alpaka/dev/Traits.hpp
 delete mode 100644 include/alpaka/dev/common/QueueRegistry.hpp
 delete mode 100644 include/alpaka/dev/cpu/SysInfo.hpp
 delete mode 100644 include/alpaka/dev/cpu/Wait.hpp
 delete mode 100644 include/alpaka/dim/DimArithmetic.hpp
 delete mode 100644 include/alpaka/dim/DimIntegralConst.hpp
 delete mode 100644 include/alpaka/dim/Traits.hpp
 delete mode 100644 include/alpaka/elem/Traits.hpp
 delete mode 100644 include/alpaka/event/EventCpu.hpp
 delete mode 100644 include/alpaka/event/EventCpuSycl.hpp
 delete mode 100644 include/alpaka/event/EventCudaRt.hpp
 delete mode 100644 include/alpaka/event/EventFpgaSyclIntel.hpp
 delete mode 100644 include/alpaka/event/EventGenericSycl.hpp
 delete mode 100644 include/alpaka/event/EventGenericThreads.hpp
 delete mode 100644 include/alpaka/event/EventGpuSyclIntel.hpp
 delete mode 100644 include/alpaka/event/EventHipRt.hpp
 delete mode 100644 include/alpaka/event/EventUniformCudaHipRt.hpp
 delete mode 100644 include/alpaka/event/Traits.hpp
 delete mode 100644 include/alpaka/example/ExampleDefaultAcc.hpp
 delete mode 100644 include/alpaka/example/ExecuteForEachAccTag.hpp
 delete mode 100644 include/alpaka/exec/ElementIndex.hpp
 delete mode 100644 include/alpaka/exec/IndependentElements.hpp
 delete mode 100644 include/alpaka/exec/Once.hpp
 delete mode 100644 include/alpaka/exec/UniformElements.hpp
 delete mode 100644 include/alpaka/extent/Traits.hpp
 delete mode 100644 include/alpaka/idx/Accessors.hpp
 delete mode 100644 include/alpaka/idx/MapIdx.hpp
 delete mode 100644 include/alpaka/idx/Traits.hpp
 delete mode 100644 include/alpaka/idx/bt/IdxBtGenericSycl.hpp
 delete mode 100644 include/alpaka/idx/bt/IdxBtLinear.hpp
 delete mode 100644 include/alpaka/idx/bt/IdxBtOmp.hpp
 delete mode 100644 include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
 delete mode 100644 include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/idx/bt/IdxBtZero.hpp
 delete mode 100644 include/alpaka/idx/gb/IdxGbGenericSycl.hpp
 delete mode 100644 include/alpaka/idx/gb/IdxGbLinear.hpp
 delete mode 100644 include/alpaka/idx/gb/IdxGbRef.hpp
 delete mode 100644 include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/intrinsic/IntrinsicCpu.hpp
 delete mode 100644 include/alpaka/intrinsic/IntrinsicFallback.hpp
 delete mode 100644 include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
 delete mode 100644 include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/intrinsic/Traits.hpp
 delete mode 100644 include/alpaka/kernel/KernelFunctionAttributes.hpp
 delete mode 100644 include/alpaka/kernel/SyclSubgroupSize.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelCpuSerial.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelCpuSycl.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelCpuThreads.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelGenericSycl.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelGpuHipRt.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
 delete mode 100644 include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
 delete mode 100644 include/alpaka/kernel/Traits.hpp
 delete mode 100644 include/alpaka/math/Complex.hpp
 delete mode 100644 include/alpaka/math/FloatEqualExact.hpp
 delete mode 100644 include/alpaka/math/MathGenericSycl.hpp
 delete mode 100644 include/alpaka/math/MathStdLib.hpp
 delete mode 100644 include/alpaka/math/MathUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/math/Traits.hpp
 delete mode 100644 include/alpaka/mem/alloc/AllocCpuAligned.hpp
 delete mode 100644 include/alpaka/mem/alloc/AllocCpuNew.hpp
 delete mode 100644 include/alpaka/mem/alloc/Traits.hpp
 delete mode 100644 include/alpaka/mem/buf/BufCpu.hpp
 delete mode 100644 include/alpaka/mem/buf/BufCpuSycl.hpp
 delete mode 100644 include/alpaka/mem/buf/BufCudaRt.hpp
 delete mode 100644 include/alpaka/mem/buf/BufFpgaSyclIntel.hpp
 delete mode 100644 include/alpaka/mem/buf/BufGenericSycl.hpp
 delete mode 100644 include/alpaka/mem/buf/BufGpuSyclIntel.hpp
 delete mode 100644 include/alpaka/mem/buf/BufHipRt.hpp
 delete mode 100644 include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
 delete mode 100644 include/alpaka/mem/buf/SetKernel.hpp
 delete mode 100644 include/alpaka/mem/buf/Traits.hpp
 delete mode 100644 include/alpaka/mem/buf/cpu/Copy.hpp
 delete mode 100644 include/alpaka/mem/buf/cpu/Set.hpp
 delete mode 100644 include/alpaka/mem/buf/sycl/Common.hpp
 delete mode 100644 include/alpaka/mem/buf/sycl/Copy.hpp
 delete mode 100644 include/alpaka/mem/buf/sycl/Set.hpp
 delete mode 100644 include/alpaka/mem/buf/uniformCudaHip/Copy.hpp
 delete mode 100644 include/alpaka/mem/buf/uniformCudaHip/Set.hpp
 delete mode 100644 include/alpaka/mem/fence/MemFenceCpu.hpp
 delete mode 100644 include/alpaka/mem/fence/MemFenceCpuSerial.hpp
 delete mode 100644 include/alpaka/mem/fence/MemFenceGenericSycl.hpp
 delete mode 100644 include/alpaka/mem/fence/MemFenceOmp2Blocks.hpp
 delete mode 100644 include/alpaka/mem/fence/MemFenceOmp2Threads.hpp
 delete mode 100644 include/alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/mem/fence/Traits.hpp
 delete mode 100644 include/alpaka/mem/global/DeviceGlobalCpu.hpp
 delete mode 100644 include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp
 delete mode 100644 include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/mem/global/Traits.hpp
 delete mode 100644 include/alpaka/mem/view/Traits.hpp
 delete mode 100644 include/alpaka/mem/view/ViewAccessOps.hpp
 delete mode 100644 include/alpaka/mem/view/ViewConst.hpp
 delete mode 100644 include/alpaka/mem/view/ViewPlainPtr.hpp
 delete mode 100644 include/alpaka/mem/view/ViewStdArray.hpp
 delete mode 100644 include/alpaka/mem/view/ViewStdVector.hpp
 delete mode 100644 include/alpaka/mem/view/ViewSubView.hpp
 delete mode 100644 include/alpaka/meta/Apply.hpp
 delete mode 100644 include/alpaka/meta/CartesianProduct.hpp
 delete mode 100644 include/alpaka/meta/Concatenate.hpp
 delete mode 100644 include/alpaka/meta/DependentFalseType.hpp
 delete mode 100644 include/alpaka/meta/Filter.hpp
 delete mode 100644 include/alpaka/meta/Fold.hpp
 delete mode 100644 include/alpaka/meta/ForEachType.hpp
 delete mode 100644 include/alpaka/meta/Functional.hpp
 delete mode 100644 include/alpaka/meta/InheritFromList.hpp
 delete mode 100644 include/alpaka/meta/IntegerSequence.hpp
 delete mode 100644 include/alpaka/meta/Integral.hpp
 delete mode 100644 include/alpaka/meta/IsArrayOrVector.hpp
 delete mode 100644 include/alpaka/meta/IsStrictBase.hpp
 delete mode 100644 include/alpaka/meta/NdLoop.hpp
 delete mode 100644 include/alpaka/meta/NonZero.hpp
 delete mode 100644 include/alpaka/meta/Set.hpp
 delete mode 100644 include/alpaka/meta/Transform.hpp
 delete mode 100644 include/alpaka/meta/TypeListOps.hpp
 delete mode 100644 include/alpaka/meta/Unique.hpp
 delete mode 100644 include/alpaka/offset/Traits.hpp
 delete mode 100644 include/alpaka/platform/PlatformCpu.hpp
 delete mode 100644 include/alpaka/platform/PlatformCpuSycl.hpp
 delete mode 100644 include/alpaka/platform/PlatformCudaRt.hpp
 delete mode 100644 include/alpaka/platform/PlatformFpgaSyclIntel.hpp
 delete mode 100644 include/alpaka/platform/PlatformGenericSycl.hpp
 delete mode 100644 include/alpaka/platform/PlatformGpuSyclIntel.hpp
 delete mode 100644 include/alpaka/platform/PlatformHipRt.hpp
 delete mode 100644 include/alpaka/platform/PlatformUniformCudaHipRt.hpp
 delete mode 100644 include/alpaka/platform/Traits.hpp
 delete mode 100644 include/alpaka/queue/Properties.hpp
 delete mode 100644 include/alpaka/queue/QueueCpuBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueCpuNonBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueCpuSyclBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueCudaRtBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueCudaRtNonBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueGenericSyclBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueGenericSyclNonBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueGenericThreadsBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueHipRtBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueHipRtNonBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp
 delete mode 100644 include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp
 delete mode 100644 include/alpaka/queue/Traits.hpp
 delete mode 100644 include/alpaka/queue/cpu/ICpuQueue.hpp
 delete mode 100644 include/alpaka/queue/cpu/IGenericThreadsQueue.hpp
 delete mode 100644 include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp
 delete mode 100644 include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
 delete mode 100644 include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp
 delete mode 100644 include/alpaka/rand/Philox/PhiloxBaseCommon.hpp
 delete mode 100644 include/alpaka/rand/Philox/PhiloxConstants.hpp
 delete mode 100644 include/alpaka/rand/Philox/PhiloxSingle.hpp
 delete mode 100644 include/alpaka/rand/Philox/PhiloxStateless.hpp
 delete mode 100644 include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp
 delete mode 100644 include/alpaka/rand/Philox/PhiloxVector.hpp
 delete mode 100644 include/alpaka/rand/RandDefault.hpp
 delete mode 100644 include/alpaka/rand/RandGenericSycl.hpp
 delete mode 100644 include/alpaka/rand/RandPhilox.hpp
 delete mode 100644 include/alpaka/rand/RandPhiloxStateless.hpp
 delete mode 100644 include/alpaka/rand/RandStdLib.hpp
 delete mode 100644 include/alpaka/rand/RandUniformCudaHipRand.hpp
 delete mode 100644 include/alpaka/rand/TinyMT/Engine.hpp
 delete mode 100644 include/alpaka/rand/TinyMT/LICENSE.txt
 delete mode 100644 include/alpaka/rand/TinyMT/tinymt32.h
 delete mode 100644 include/alpaka/rand/Traits.hpp
 delete mode 100644 include/alpaka/standalone/CpuOmp2Blocks.hpp
 delete mode 100644 include/alpaka/standalone/CpuOmp2Threads.hpp
 delete mode 100644 include/alpaka/standalone/CpuSerial.hpp
 delete mode 100644 include/alpaka/standalone/CpuSycl.hpp
 delete mode 100644 include/alpaka/standalone/CpuTbbBlocks.hpp
 delete mode 100644 include/alpaka/standalone/CpuThreads.hpp
 delete mode 100644 include/alpaka/standalone/FpgaSyclIntel.hpp
 delete mode 100644 include/alpaka/standalone/GenericSycl.hpp
 delete mode 100644 include/alpaka/standalone/GpuCudaRt.hpp
 delete mode 100644 include/alpaka/standalone/GpuHipRt.hpp
 delete mode 100644 include/alpaka/standalone/GpuSyclIntel.hpp
 delete mode 100644 include/alpaka/test/Array.hpp
 delete mode 100644 include/alpaka/test/Check.hpp
 delete mode 100644 include/alpaka/test/Extent.hpp
 delete mode 100644 include/alpaka/test/KernelExecutionFixture.hpp
 delete mode 100644 include/alpaka/test/MeasureKernelRunTime.hpp
 delete mode 100644 include/alpaka/test/acc/TestAccs.hpp
 delete mode 100644 include/alpaka/test/dim/TestDims.hpp
 delete mode 100644 include/alpaka/test/event/EventHostManualTrigger.hpp
 delete mode 100644 include/alpaka/test/idx/TestIdxs.hpp
 delete mode 100644 include/alpaka/test/mem/view/Iterator.hpp
 delete mode 100644 include/alpaka/test/mem/view/ViewTest.hpp
 delete mode 100644 include/alpaka/test/queue/Queue.hpp
 delete mode 100644 include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
 delete mode 100644 include/alpaka/test/queue/QueueTestFixture.hpp
 delete mode 100644 include/alpaka/traits/Traits.hpp
 delete mode 100644 include/alpaka/vec/Traits.hpp
 delete mode 100644 include/alpaka/vec/Vec.hpp
 delete mode 100644 include/alpaka/version.hpp
 delete mode 100644 include/alpaka/wait/Traits.hpp
 delete mode 100644 include/alpaka/warp/Traits.hpp
 delete mode 100644 include/alpaka/warp/WarpGenericSycl.hpp
 delete mode 100644 include/alpaka/warp/WarpSingleThread.hpp
 delete mode 100644 include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
 delete mode 100644 include/alpaka/workdiv/Traits.hpp
 delete mode 100644 include/alpaka/workdiv/WorkDivGenericSycl.hpp
 delete mode 100644 include/alpaka/workdiv/WorkDivHelpers.hpp
 delete mode 100644 include/alpaka/workdiv/WorkDivMembers.hpp
 delete mode 100644 include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp

diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index f5d92e1..0000000
--- a/.gitattributes
+++ /dev/null
@@ -1,2 +0,0 @@
-# External libraries
-include/alpaka linguist-vendored
diff --git a/include/alpaka/acc/AccCpuOmp2Blocks.hpp b/include/alpaka/acc/AccCpuOmp2Blocks.hpp
deleted file mode 100644
index 27661f5..0000000
--- a/include/alpaka/acc/AccCpuOmp2Blocks.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Base classes.
-#include "alpaka/atomic/AtomicCpu.hpp"
-#include "alpaka/atomic/AtomicHierarchy.hpp"
-#include "alpaka/atomic/AtomicNoOp.hpp"
-#include "alpaka/atomic/AtomicOmpBuiltIn.hpp"
-#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
-#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
-#include "alpaka/block/sync/BlockSyncNoOp.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
-#include "alpaka/idx/bt/IdxBtZero.hpp"
-#include "alpaka/idx/gb/IdxGbRef.hpp"
-#include "alpaka/intrinsic/IntrinsicCpu.hpp"
-#include "alpaka/math/MathStdLib.hpp"
-#include "alpaka/mem/fence/MemFenceOmp2Blocks.hpp"
-#include "alpaka/rand/RandDefault.hpp"
-#include "alpaka/rand/RandStdLib.hpp"
-#include "alpaka/warp/WarpSingleThread.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/ClipCast.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-
-#include <limits>
-#include <typeinfo>
-
-#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-
-#    if _OPENMP < 200203
-#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#    endif
-
-namespace alpaka
-{
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelCpuOmp2Blocks;
-
-    //! The CPU OpenMP 2.0 block accelerator.
-    //!
-    //! This accelerator allows parallel kernel execution on a CPU device.
-    //! It uses OpenMP 2.0 to implement the grid block parallelism.
-    //! The block idx is restricted to 1x1x1.
-    template<typename TDim, typename TIdx>
-    class AccCpuOmp2Blocks final
-        : public WorkDivMembers<TDim, TIdx>
-        , public gb::IdxGbRef<TDim, TIdx>
-        , public bt::IdxBtZero<TDim, TIdx>
-        , public AtomicHierarchy<
-              AtomicCpu, // grid atomics
-              AtomicOmpBuiltIn, // block atomics
-              AtomicNoOp> // thread atomics
-        , public math::MathStdLib
-        , public BlockSharedMemDynMember<>
-        , public BlockSharedMemStMember<>
-        , public BlockSyncNoOp
-        , public IntrinsicCpu
-        , public MemFenceOmp2Blocks
-#    ifdef ALPAKA_DISABLE_VENDOR_RNG
-        , public rand::RandDefault
-#    else
-        , public rand::RandStdLib
-#    endif
-        , public warp::WarpSingleThread
-        , public concepts::Implements<ConceptAcc, AccCpuOmp2Blocks<TDim, TIdx>>
-    {
-        static_assert(
-            sizeof(TIdx) >= sizeof(int),
-            "Index type is not supported, consider using int or a larger type.");
-
-    public:
-        // Partial specialization with the correct TDim and TIdx is not allowed.
-        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
-        friend class ::alpaka::TaskKernelCpuOmp2Blocks;
-
-        AccCpuOmp2Blocks(AccCpuOmp2Blocks const&) = delete;
-        AccCpuOmp2Blocks(AccCpuOmp2Blocks&&) = delete;
-        auto operator=(AccCpuOmp2Blocks const&) -> AccCpuOmp2Blocks& = delete;
-        auto operator=(AccCpuOmp2Blocks&&) -> AccCpuOmp2Blocks& = delete;
-
-    private:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST AccCpuOmp2Blocks(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
-            : WorkDivMembers<TDim, TIdx>(workDiv)
-            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
-            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
-            , BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
-            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
-        {
-        }
-
-    private:
-        // getIdx
-        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
-    };
-
-    namespace trait
-    {
-        //! The CPU OpenMP 2.0 block accelerator accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct AccType<AccCpuOmp2Blocks<TDim, TIdx>>
-        {
-            using type = AccCpuOmp2Blocks<TDim, TIdx>;
-        };
-
-        //! The CPU OpenMP 2.0 block single thread accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IsSingleThreadAcc<AccCpuOmp2Blocks<TDim, TIdx>> : std::true_type
-        {
-        };
-
-        //! The CPU OpenMP 2.0 block multi thread accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IsMultiThreadAcc<AccCpuOmp2Blocks<TDim, TIdx>> : std::false_type
-        {
-        };
-
-        //! The CPU OpenMP 2.0 block accelerator device properties get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetAccDevProps<AccCpuOmp2Blocks<TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> alpaka::AccDevProps<TDim, TIdx>
-            {
-                return {// m_multiProcessorCount
-                        alpaka::core::clipCast<TIdx>(omp_get_max_threads()),
-                        // m_gridBlockExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        Vec<TDim, TIdx>::ones(),
-                        // m_blockThreadCountMax
-                        static_cast<TIdx>(1),
-                        // m_threadElemExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_sharedMemSizeBytes
-                        static_cast<size_t>(AccCpuOmp2Blocks<TDim, TIdx>::staticAllocBytes()),
-                        // m_globalMemSizeBytes
-                        getMemBytes(dev)};
-            }
-        };
-
-        //! The CPU OpenMP 2.0 block accelerator name trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetAccName<AccCpuOmp2Blocks<TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccName() -> std::string
-            {
-                return "AccCpuOmp2Blocks<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
-            }
-        };
-
-        //! The CPU OpenMP 2.0 block accelerator device type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DevType<AccCpuOmp2Blocks<TDim, TIdx>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU OpenMP 2.0 block accelerator dimension getter trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<AccCpuOmp2Blocks<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU OpenMP 2.0 block accelerator execution task type trait specialization.
-        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-        struct CreateTaskKernel<AccCpuOmp2Blocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
-        {
-            ALPAKA_FN_HOST static auto createTaskKernel(
-                TWorkDiv const& workDiv,
-                TKernelFnObj const& kernelFnObj,
-                TArgs&&... args)
-            {
-                if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
-                {
-                    throw std::runtime_error(
-                        "The given work division is not valid for a single thread Acc: "
-                        + getAccName<AccCpuOmp2Blocks<TDim, TIdx>>() + ". Threads per block should be 1!");
-                }
-
-                return TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>(
-                    workDiv,
-                    kernelFnObj,
-                    std::forward<TArgs>(args)...);
-            }
-        };
-
-        //! The CPU OpenMP 2.0 block execution task platform type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct PlatformType<AccCpuOmp2Blocks<TDim, TIdx>>
-        {
-            using type = PlatformCpu;
-        };
-
-        //! The CPU OpenMP 2.0 block accelerator idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<AccCpuOmp2Blocks<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct AccToTag<alpaka::AccCpuOmp2Blocks<TDim, TIdx>>
-        {
-            using type = alpaka::TagCpuOmp2Blocks;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct TagToAcc<alpaka::TagCpuOmp2Blocks, TDim, TIdx>
-        {
-            using type = alpaka::AccCpuOmp2Blocks<TDim, TIdx>;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/acc/AccCpuOmp2Threads.hpp b/include/alpaka/acc/AccCpuOmp2Threads.hpp
deleted file mode 100644
index bc326bc..0000000
--- a/include/alpaka/acc/AccCpuOmp2Threads.hpp
+++ /dev/null
@@ -1,237 +0,0 @@
-/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Base classes.
-#include "alpaka/atomic/AtomicCpu.hpp"
-#include "alpaka/atomic/AtomicHierarchy.hpp"
-#include "alpaka/atomic/AtomicOmpBuiltIn.hpp"
-#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
-#include "alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp"
-#include "alpaka/block/sync/BlockSyncBarrierOmp.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
-#include "alpaka/idx/bt/IdxBtOmp.hpp"
-#include "alpaka/idx/gb/IdxGbRef.hpp"
-#include "alpaka/intrinsic/IntrinsicCpu.hpp"
-#include "alpaka/math/MathStdLib.hpp"
-#include "alpaka/mem/fence/MemFenceOmp2Threads.hpp"
-#include "alpaka/rand/RandDefault.hpp"
-#include "alpaka/rand/RandStdLib.hpp"
-#include "alpaka/warp/WarpSingleThread.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/ClipCast.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-
-#include <limits>
-#include <typeinfo>
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-
-#    if _OPENMP < 200203
-#        error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#    endif
-
-#    include <omp.h>
-
-namespace alpaka
-{
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelCpuOmp2Threads;
-
-    //! The CPU OpenMP 2.0 thread accelerator.
-    //!
-    //! This accelerator allows parallel kernel execution on a CPU device.
-    //! It uses OpenMP 2.0 to implement the block thread parallelism.
-    template<typename TDim, typename TIdx>
-    class AccCpuOmp2Threads final
-        : public WorkDivMembers<TDim, TIdx>
-        , public gb::IdxGbRef<TDim, TIdx>
-        , public bt::IdxBtOmp<TDim, TIdx>
-        , public AtomicHierarchy<
-              AtomicCpu, // grid atomics
-              AtomicOmpBuiltIn, // block atomics
-              AtomicOmpBuiltIn> // thread atomics
-        , public math::MathStdLib
-        , public BlockSharedMemDynMember<>
-        , public BlockSharedMemStMemberMasterSync<>
-        , public BlockSyncBarrierOmp
-        , public IntrinsicCpu
-        , public MemFenceOmp2Threads
-#    ifdef ALPAKA_DISABLE_VENDOR_RNG
-        , public rand::RandDefault
-#    else
-        , public rand::RandStdLib
-#    endif
-        , public warp::WarpSingleThread
-        , public concepts::Implements<ConceptAcc, AccCpuOmp2Threads<TDim, TIdx>>
-    {
-        static_assert(
-            sizeof(TIdx) >= sizeof(int),
-            "Index type is not supported, consider using int or a larger type.");
-
-    public:
-        // Partial specialization with the correct TDim and TIdx is not allowed.
-        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
-        friend class ::alpaka::TaskKernelCpuOmp2Threads;
-
-        AccCpuOmp2Threads(AccCpuOmp2Threads const&) = delete;
-        AccCpuOmp2Threads(AccCpuOmp2Threads&&) = delete;
-        auto operator=(AccCpuOmp2Threads const&) -> AccCpuOmp2Threads& = delete;
-        auto operator=(AccCpuOmp2Threads&&) -> AccCpuOmp2Threads& = delete;
-
-    private:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST AccCpuOmp2Threads(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
-            : WorkDivMembers<TDim, TIdx>(workDiv)
-            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
-            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
-            , BlockSharedMemStMemberMasterSync<>(
-                  staticMemBegin(),
-                  staticMemCapacity(),
-                  [this]() { syncBlockThreads(*this); },
-                  []() noexcept { return (::omp_get_thread_num() == 0); })
-            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
-        {
-        }
-
-    private:
-        // getIdx
-        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
-    };
-
-    namespace trait
-    {
-        //! The CPU OpenMP 2.0 thread accelerator accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct AccType<AccCpuOmp2Threads<TDim, TIdx>>
-        {
-            using type = AccCpuOmp2Threads<TDim, TIdx>;
-        };
-
-        //! The CPU OpenMP 2.0 thread single thread accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IsSingleThreadAcc<AccCpuOmp2Threads<TDim, TIdx>> : std::false_type
-        {
-        };
-
-        //! The CPU OpenMP 2.0 thread multi thread accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IsMultiThreadAcc<AccCpuOmp2Threads<TDim, TIdx>> : std::true_type
-        {
-        };
-
-        //! The CPU OpenMP 2.0 thread accelerator device properties get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetAccDevProps<AccCpuOmp2Threads<TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> alpaka::AccDevProps<TDim, TIdx>
-            {
-#    ifdef ALPAKA_CI
-                auto const blockThreadCountMax = alpaka::core::clipCast<TIdx>(std::min(4, ::omp_get_max_threads()));
-#    else
-                auto const blockThreadCountMax = alpaka::core::clipCast<TIdx>(::omp_get_max_threads());
-#    endif
-                auto const memBytes = getMemBytes(dev);
-                return {// m_multiProcessorCount
-                        static_cast<TIdx>(1),
-                        // m_gridBlockExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        Vec<TDim, TIdx>::all(blockThreadCountMax),
-                        // m_blockThreadCountMax
-                        blockThreadCountMax,
-                        // m_threadElemExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_sharedMemSizeBytes
-                        memBytes,
-                        // m_globalMemSizeBytes
-                        memBytes};
-            }
-        };
-
-        //! The CPU OpenMP 2.0 thread accelerator name trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetAccName<AccCpuOmp2Threads<TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccName() -> std::string
-            {
-                return "AccCpuOmp2Threads<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
-            }
-        };
-
-        //! The CPU OpenMP 2.0 thread accelerator device type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DevType<AccCpuOmp2Threads<TDim, TIdx>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU OpenMP 2.0 thread accelerator dimension getter trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<AccCpuOmp2Threads<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU OpenMP 2.0 thread accelerator execution task type trait specialization.
-        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-        struct CreateTaskKernel<AccCpuOmp2Threads<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
-        {
-            ALPAKA_FN_HOST static auto createTaskKernel(
-                TWorkDiv const& workDiv,
-                TKernelFnObj const& kernelFnObj,
-                TArgs&&... args)
-            {
-                return TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>(
-                    workDiv,
-                    kernelFnObj,
-                    std::forward<TArgs>(args)...);
-            }
-        };
-
-        //! The CPU OpenMP 2.0 thread execution task platform type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct PlatformType<AccCpuOmp2Threads<TDim, TIdx>>
-        {
-            using type = PlatformCpu;
-        };
-
-        //! The CPU OpenMP 2.0 thread accelerator idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<AccCpuOmp2Threads<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct AccToTag<alpaka::AccCpuOmp2Threads<TDim, TIdx>>
-        {
-            using type = alpaka::TagCpuOmp2Threads;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct TagToAcc<alpaka::TagCpuOmp2Threads, TDim, TIdx>
-        {
-            using type = alpaka::AccCpuOmp2Threads<TDim, TIdx>;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/acc/AccCpuSerial.hpp b/include/alpaka/acc/AccCpuSerial.hpp
deleted file mode 100644
index e1b223f..0000000
--- a/include/alpaka/acc/AccCpuSerial.hpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Base classes.
-#include "alpaka/atomic/AtomicCpu.hpp"
-#include "alpaka/atomic/AtomicHierarchy.hpp"
-#include "alpaka/atomic/AtomicNoOp.hpp"
-#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
-#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
-#include "alpaka/block/sync/BlockSyncNoOp.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
-#include "alpaka/idx/bt/IdxBtZero.hpp"
-#include "alpaka/idx/gb/IdxGbRef.hpp"
-#include "alpaka/intrinsic/IntrinsicCpu.hpp"
-#include "alpaka/math/MathStdLib.hpp"
-#include "alpaka/mem/fence/MemFenceCpuSerial.hpp"
-#include "alpaka/rand/RandDefault.hpp"
-#include "alpaka/rand/RandStdLib.hpp"
-#include "alpaka/warp/WarpSingleThread.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-
-#include <memory>
-#include <typeinfo>
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-
-namespace alpaka
-{
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelCpuSerial;
-
-    //! The CPU serial accelerator.
-    //!
-    //! This accelerator allows serial kernel execution on a CPU device.
-    //! The block idx is restricted to 1x1x1 and all blocks are executed serially so there is no parallelism at all.
-    template<typename TDim, typename TIdx>
-    class AccCpuSerial final
-        : public WorkDivMembers<TDim, TIdx>
-        , public gb::IdxGbRef<TDim, TIdx>
-        , public bt::IdxBtZero<TDim, TIdx>
-        , public AtomicHierarchy<
-              AtomicCpu, // grid atomics
-              AtomicNoOp, // block atomics
-              AtomicNoOp> // thread atomics
-        , public math::MathStdLib
-        , public BlockSharedMemDynMember<>
-        , public BlockSharedMemStMember<>
-        , public BlockSyncNoOp
-        , public IntrinsicCpu
-        , public MemFenceCpuSerial
-#    ifdef ALPAKA_DISABLE_VENDOR_RNG
-        , public rand::RandDefault
-#    else
-        , public rand::RandStdLib
-#    endif
-        , public warp::WarpSingleThread
-        , public concepts::Implements<ConceptAcc, AccCpuSerial<TDim, TIdx>>
-    {
-        static_assert(
-            sizeof(TIdx) >= sizeof(int),
-            "Index type is not supported, consider using int or a larger type.");
-
-    public:
-        // Partial specialization with the correct TDim and TIdx is not allowed.
-        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
-        friend class ::alpaka::TaskKernelCpuSerial;
-
-        AccCpuSerial(AccCpuSerial const&) = delete;
-        AccCpuSerial(AccCpuSerial&&) = delete;
-        auto operator=(AccCpuSerial const&) -> AccCpuSerial& = delete;
-        auto operator=(AccCpuSerial&&) -> AccCpuSerial& = delete;
-
-    private:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST AccCpuSerial(TWorkDiv const& workDiv, size_t const& blockSharedMemDynSizeBytes)
-            : WorkDivMembers<TDim, TIdx>(workDiv)
-            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
-            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
-            , BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
-            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
-        {
-        }
-
-    private:
-        // getIdx
-        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
-    };
-
-    namespace trait
-    {
-        //! The CPU serial accelerator accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct AccType<AccCpuSerial<TDim, TIdx>>
-        {
-            using type = AccCpuSerial<TDim, TIdx>;
-        };
-
-        //! The CPU serial single thread accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IsSingleThreadAcc<AccCpuSerial<TDim, TIdx>> : std::true_type
-        {
-        };
-
-        //! The CPU serial multi thread accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IsMultiThreadAcc<AccCpuSerial<TDim, TIdx>> : std::false_type
-        {
-        };
-
-        //! The CPU serial accelerator device properties get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetAccDevProps<AccCpuSerial<TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
-            {
-                return {// m_multiProcessorCount
-                        static_cast<TIdx>(1),
-                        // m_gridBlockExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        Vec<TDim, TIdx>::ones(),
-                        // m_blockThreadCountMax
-                        static_cast<TIdx>(1),
-                        // m_threadElemExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_sharedMemSizeBytes
-                        static_cast<size_t>(AccCpuSerial<TDim, TIdx>::staticAllocBytes()),
-                        // m_globalMemSizeBytes
-                        getMemBytes(dev)};
-            }
-        };
-
-        //! The CPU serial accelerator name trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetAccName<AccCpuSerial<TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccName() -> std::string
-            {
-                return "AccCpuSerial<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
-            }
-        };
-
-        //! The CPU serial accelerator device type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DevType<AccCpuSerial<TDim, TIdx>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU serial accelerator dimension getter trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<AccCpuSerial<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU serial accelerator execution task type trait specialization.
-        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-        struct CreateTaskKernel<AccCpuSerial<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
-        {
-            ALPAKA_FN_HOST static auto createTaskKernel(
-                TWorkDiv const& workDiv,
-                TKernelFnObj const& kernelFnObj,
-                TArgs&&... args)
-            {
-                if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
-                {
-                    throw std::runtime_error(
-                        "The given work division is not valid for a single thread Acc: "
-                        + getAccName<AccCpuSerial<TDim, TIdx>>() + ". Threads per block should be 1!");
-                }
-
-                return TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>(
-                    workDiv,
-                    kernelFnObj,
-                    std::forward<TArgs>(args)...);
-            }
-        };
-
-        //! The CPU serial execution task platform type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct PlatformType<AccCpuSerial<TDim, TIdx>>
-        {
-            using type = PlatformCpu;
-        };
-
-        //! The CPU serial accelerator idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<AccCpuSerial<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct AccToTag<alpaka::AccCpuSerial<TDim, TIdx>>
-        {
-            using type = alpaka::TagCpuSerial;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct TagToAcc<alpaka::TagCpuSerial, TDim, TIdx>
-        {
-            using type = alpaka::AccCpuSerial<TDim, TIdx>;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/acc/AccCpuSycl.hpp b/include/alpaka/acc/AccCpuSycl.hpp
deleted file mode 100644
index e4e7378..0000000
--- a/include/alpaka/acc/AccCpuSycl.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/AccGenericSycl.hpp"
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/Sycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
-
-namespace alpaka
-{
-    //! The CPU SYCL accelerator.
-    //!
-    //! This accelerator allows parallel kernel execution on a oneAPI-capable CPU target device.
-    template<typename TDim, typename TIdx>
-    using AccCpuSycl = AccGenericSycl<TagCpuSycl, TDim, TIdx>;
-
-    namespace trait
-    {
-        template<typename TDim, typename TIdx>
-        struct AccToTag<alpaka::AccCpuSycl<TDim, TIdx>>
-        {
-            using type = alpaka::TagCpuSycl;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct TagToAcc<alpaka::TagCpuSycl, TDim, TIdx>
-        {
-            using type = alpaka::AccCpuSycl<TDim, TIdx>;
-        };
-    } // namespace trait
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/acc/AccCpuTbbBlocks.hpp b/include/alpaka/acc/AccCpuTbbBlocks.hpp
deleted file mode 100644
index d283523..0000000
--- a/include/alpaka/acc/AccCpuTbbBlocks.hpp
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Copyright 2024 Axel Huebl, Benjamin Worpitz, Erik Zenker, René Widera, Jan Stephan, Bernhard Manfred Gruber,
- *                Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Base classes.
-#include "alpaka/atomic/AtomicCpu.hpp"
-#include "alpaka/atomic/AtomicHierarchy.hpp"
-#include "alpaka/atomic/AtomicNoOp.hpp"
-#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
-#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
-#include "alpaka/block/sync/BlockSyncNoOp.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
-#include "alpaka/idx/bt/IdxBtZero.hpp"
-#include "alpaka/idx/gb/IdxGbRef.hpp"
-#include "alpaka/intrinsic/IntrinsicCpu.hpp"
-#include "alpaka/math/MathStdLib.hpp"
-#include "alpaka/mem/fence/MemFenceCpu.hpp"
-#include "alpaka/rand/RandDefault.hpp"
-#include "alpaka/rand/RandStdLib.hpp"
-#include "alpaka/warp/WarpSingleThread.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/ClipCast.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-
-#include <memory>
-#include <typeinfo>
-
-#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-
-#    include <tbb/tbb.h>
-
-namespace alpaka
-{
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelCpuTbbBlocks;
-
-    //! The CPU TBB block accelerator.
-    template<typename TDim, typename TIdx>
-    class AccCpuTbbBlocks final
-        : public WorkDivMembers<TDim, TIdx>
-        , public gb::IdxGbRef<TDim, TIdx>
-        , public bt::IdxBtZero<TDim, TIdx>
-        , public AtomicHierarchy<
-              AtomicCpu, // grid atomics
-              AtomicCpu, // block atomics
-              AtomicNoOp> // thread atomics
-        , public math::MathStdLib
-        , public BlockSharedMemDynMember<>
-        , public BlockSharedMemStMember<>
-        , public BlockSyncNoOp
-        , public IntrinsicCpu
-        , public MemFenceCpu
-#    ifdef ALPAKA_DISABLE_VENDOR_RNG
-        , public rand::RandDefault
-#    else
-        , public rand::RandStdLib
-#    endif
-        , public warp::WarpSingleThread
-        , public concepts::Implements<ConceptAcc, AccCpuTbbBlocks<TDim, TIdx>>
-    {
-        static_assert(
-            sizeof(TIdx) >= sizeof(int),
-            "Index type is not supported, consider using int or a larger type.");
-
-    public:
-        // Partial specialization with the correct TDim and TIdx is not allowed.
-        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
-        friend class ::alpaka::TaskKernelCpuTbbBlocks;
-
-        AccCpuTbbBlocks(AccCpuTbbBlocks const&) = delete;
-        AccCpuTbbBlocks(AccCpuTbbBlocks&&) = delete;
-        auto operator=(AccCpuTbbBlocks const&) -> AccCpuTbbBlocks& = delete;
-        auto operator=(AccCpuTbbBlocks&&) -> AccCpuTbbBlocks& = delete;
-
-    private:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST AccCpuTbbBlocks(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
-            : WorkDivMembers<TDim, TIdx>(workDiv)
-            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
-            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
-            , BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
-            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
-        {
-        }
-
-    private:
-        // getIdx
-        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
-    };
-
-    namespace trait
-    {
-        //! The CPU TBB block accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct AccType<AccCpuTbbBlocks<TDim, TIdx>>
-        {
-            using type = AccCpuTbbBlocks<TDim, TIdx>;
-        };
-
-        //! The CPU TBB block single thread accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IsSingleThreadAcc<AccCpuTbbBlocks<TDim, TIdx>> : std::true_type
-        {
-        };
-
-        //! The CPU TBB block multi thread accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IsMultiThreadAcc<AccCpuTbbBlocks<TDim, TIdx>> : std::false_type
-        {
-        };
-
-        //! The CPU TBB block accelerator device properties get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetAccDevProps<AccCpuTbbBlocks<TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
-            {
-                return {// m_multiProcessorCount
-                        alpaka::core::clipCast<TIdx>(tbb::this_task_arena::max_concurrency()),
-                        // m_gridBlockExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        Vec<TDim, TIdx>::ones(),
-                        // m_blockThreadCountMax
-                        static_cast<TIdx>(1),
-                        // m_threadElemExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_sharedMemSizeBytes
-                        static_cast<size_t>(AccCpuTbbBlocks<TDim, TIdx>::staticAllocBytes()),
-                        // m_globalMemSizeBytes
-                        getMemBytes(dev)};
-            }
-        };
-
-        //! The CPU TBB block accelerator name trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetAccName<AccCpuTbbBlocks<TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccName() -> std::string
-            {
-                return "AccCpuTbbBlocks<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
-            }
-        };
-
-        //! The CPU TBB block accelerator device type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DevType<AccCpuTbbBlocks<TDim, TIdx>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU TBB block accelerator dimension getter trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<AccCpuTbbBlocks<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU TBB block accelerator execution task type trait specialization.
-        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-        struct CreateTaskKernel<AccCpuTbbBlocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
-        {
-            ALPAKA_FN_HOST static auto createTaskKernel(
-                TWorkDiv const& workDiv,
-                TKernelFnObj const& kernelFnObj,
-                TArgs&&... args)
-            {
-                if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
-                {
-                    throw std::runtime_error(
-                        "The given work division is not valid for a single thread Acc: "
-                        + getAccName<AccCpuTbbBlocks<TDim, TIdx>>() + ". Threads per block should be 1!");
-                }
-
-                return TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>(
-                    workDiv,
-                    kernelFnObj,
-                    std::forward<TArgs>(args)...);
-            }
-        };
-
-        //! The CPU TBB block execution task platform type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct PlatformType<AccCpuTbbBlocks<TDim, TIdx>>
-        {
-            using type = PlatformCpu;
-        };
-
-        //! The CPU TBB block accelerator idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<AccCpuTbbBlocks<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct AccToTag<alpaka::AccCpuTbbBlocks<TDim, TIdx>>
-        {
-            using type = alpaka::TagCpuTbbBlocks;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct TagToAcc<alpaka::TagCpuTbbBlocks, TDim, TIdx>
-        {
-            using type = alpaka::AccCpuTbbBlocks<TDim, TIdx>;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/acc/AccCpuThreads.hpp b/include/alpaka/acc/AccCpuThreads.hpp
deleted file mode 100644
index ce8f04a..0000000
--- a/include/alpaka/acc/AccCpuThreads.hpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/* Copyright 2024 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Base classes.
-#include "alpaka/atomic/AtomicCpu.hpp"
-#include "alpaka/atomic/AtomicHierarchy.hpp"
-#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
-#include "alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp"
-#include "alpaka/block/sync/BlockSyncBarrierThread.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
-#include "alpaka/idx/bt/IdxBtRefThreadIdMap.hpp"
-#include "alpaka/idx/gb/IdxGbRef.hpp"
-#include "alpaka/intrinsic/IntrinsicCpu.hpp"
-#include "alpaka/math/MathStdLib.hpp"
-#include "alpaka/mem/fence/MemFenceCpu.hpp"
-#include "alpaka/rand/RandDefault.hpp"
-#include "alpaka/rand/RandStdLib.hpp"
-#include "alpaka/warp/WarpSingleThread.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/ClipCast.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-
-#include <memory>
-#include <thread>
-#include <typeinfo>
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-
-namespace alpaka
-{
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelCpuThreads;
-
-    //! The CPU threads accelerator.
-    //!
-    //! This accelerator allows parallel kernel execution on a CPU device.
-    //! It uses std::thread to implement the parallelism.
-    template<typename TDim, typename TIdx>
-    class AccCpuThreads final
-        : public WorkDivMembers<TDim, TIdx>
-        , public gb::IdxGbRef<TDim, TIdx>
-        , public bt::IdxBtRefThreadIdMap<TDim, TIdx>
-        , public AtomicHierarchy<
-              AtomicCpu, // grid atomics
-              AtomicCpu, // block atomics
-              AtomicCpu> // thread atomics
-        , public math::MathStdLib
-        , public BlockSharedMemDynMember<>
-        , public BlockSharedMemStMemberMasterSync<>
-        , public BlockSyncBarrierThread<TIdx>
-        , public IntrinsicCpu
-        , public MemFenceCpu
-#    ifdef ALPAKA_DISABLE_VENDOR_RNG
-        , public rand::RandDefault
-#    else
-        , public rand::RandStdLib
-#    endif
-        , public warp::WarpSingleThread
-        , public concepts::Implements<ConceptAcc, AccCpuThreads<TDim, TIdx>>
-    {
-        static_assert(
-            sizeof(TIdx) >= sizeof(int),
-            "Index type is not supported, consider using int or a larger type.");
-
-    public:
-        // Partial specialization with the correct TDim and TIdx is not allowed.
-        template<typename TDim2, typename TIdx2, typename TKernelFnObj, typename... TArgs>
-        friend class ::alpaka::TaskKernelCpuThreads;
-
-        AccCpuThreads(AccCpuThreads const&) = delete;
-        AccCpuThreads(AccCpuThreads&&) = delete;
-        auto operator=(AccCpuThreads const&) -> AccCpuThreads& = delete;
-        auto operator=(AccCpuThreads&&) -> AccCpuThreads& = delete;
-
-    private:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST AccCpuThreads(TWorkDiv const& workDiv, std::size_t const& blockSharedMemDynSizeBytes)
-            : WorkDivMembers<TDim, TIdx>(workDiv)
-            , gb::IdxGbRef<TDim, TIdx>(m_gridBlockIdx)
-            , bt::IdxBtRefThreadIdMap<TDim, TIdx>(m_threadToIndexMap)
-            , BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
-            , BlockSharedMemStMemberMasterSync<>(
-                  staticMemBegin(),
-                  staticMemCapacity(),
-                  [this]() { syncBlockThreads(*this); },
-                  [this]() noexcept { return (m_idMasterThread == std::this_thread::get_id()); })
-            , BlockSyncBarrierThread<TIdx>(getWorkDiv<Block, Threads>(workDiv).prod())
-            , m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
-        {
-        }
-
-    private:
-        // getIdx
-        std::mutex mutable m_mtxMapInsert; //!< The mutex used to secure insertion into the ThreadIdToIdxMap.
-        typename bt::IdxBtRefThreadIdMap<TDim, TIdx>::
-            ThreadIdToIdxMap mutable m_threadToIndexMap; //!< The mapping of thread id's to indices.
-        Vec<TDim, TIdx> mutable m_gridBlockIdx; //!< The index of the currently executed block.
-
-        // allocBlockSharedArr
-        std::thread::id mutable m_idMasterThread; //!< The id of the master thread.
-    };
-
-    namespace trait
-    {
-        //! The CPU threads accelerator accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct AccType<AccCpuThreads<TDim, TIdx>>
-        {
-            using type = AccCpuThreads<TDim, TIdx>;
-        };
-
-        //! The CPU threads single thread accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IsSingleThreadAcc<AccCpuThreads<TDim, TIdx>> : std::false_type
-        {
-        };
-
-        //! The CPU threads multi thread accelerator type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IsMultiThreadAcc<AccCpuThreads<TDim, TIdx>> : std::true_type
-        {
-        };
-
-        //! The CPU threads accelerator device properties get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetAccDevProps<AccCpuThreads<TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccDevProps(DevCpu const& dev) -> AccDevProps<TDim, TIdx>
-            {
-#    ifdef ALPAKA_CI
-                auto const blockThreadCountMax = static_cast<TIdx>(8);
-#    else
-                // \TODO: Magic number. What is the maximum? Just set a reasonable value? There is a implementation
-                // defined maximum where the creation of a new thread crashes. std::thread::hardware_concurrency can
-                // return 0, so 1 is the default case?
-                auto const blockThreadCountMax = std::max(
-                    static_cast<TIdx>(1),
-                    alpaka::core::clipCast<TIdx>(std::thread::hardware_concurrency() * 8));
-#    endif
-                auto const memBytes = getMemBytes(dev);
-                return {// m_multiProcessorCount
-                        static_cast<TIdx>(1),
-                        // m_gridBlockExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        Vec<TDim, TIdx>::all(blockThreadCountMax),
-                        // m_blockThreadCountMax
-                        blockThreadCountMax,
-                        // m_threadElemExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_sharedMemSizeBytes
-                        memBytes,
-                        // m_globalMemSizeBytes
-                        memBytes};
-            }
-        };
-
-        //! The CPU threads accelerator name trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetAccName<AccCpuThreads<TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccName() -> std::string
-            {
-                return "AccCpuThreads<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
-            }
-        };
-
-        //! The CPU threads accelerator device type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DevType<AccCpuThreads<TDim, TIdx>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU threads accelerator dimension getter trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<AccCpuThreads<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU threads accelerator execution task type trait specialization.
-        template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-        struct CreateTaskKernel<AccCpuThreads<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
-        {
-            ALPAKA_FN_HOST static auto createTaskKernel(
-                TWorkDiv const& workDiv,
-                TKernelFnObj const& kernelFnObj,
-                TArgs&&... args)
-            {
-                return TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>(
-                    workDiv,
-                    kernelFnObj,
-                    std::forward<TArgs>(args)...);
-            }
-        };
-
-        //! The CPU threads execution task platform type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct PlatformType<AccCpuThreads<TDim, TIdx>>
-        {
-            using type = PlatformCpu;
-        };
-
-        //! The CPU threads accelerator idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<AccCpuThreads<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct AccToTag<alpaka::AccCpuThreads<TDim, TIdx>>
-        {
-            using type = alpaka::TagCpuThreads;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct TagToAcc<alpaka::TagCpuThreads, TDim, TIdx>
-        {
-            using type = alpaka::AccCpuThreads<TDim, TIdx>;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/acc/AccDevProps.hpp b/include/alpaka/acc/AccDevProps.hpp
deleted file mode 100644
index a199d54..0000000
--- a/include/alpaka/acc/AccDevProps.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-namespace alpaka
-{
-    //! The acceleration properties on a device.
-    //
-    // \TODO:
-    //  TIdx m_maxClockFrequencyHz;            //!< Maximum clock frequency of the device in Hz.
-    template<typename TDim, typename TIdx>
-    struct AccDevProps
-    {
-        static_assert(
-            sizeof(TIdx) >= sizeof(int),
-            "Index type is not supported, consider using int or a larger type.");
-
-        // Please keep the order of data members so aggregate initialization does not break!
-        TIdx m_multiProcessorCount; //!< The number of multiprocessors.
-        Vec<TDim, TIdx> m_gridBlockExtentMax; //!< The maximum number of blocks in each dimension of the grid.
-        TIdx m_gridBlockCountMax; //!< The maximum number of blocks in a grid.
-        Vec<TDim, TIdx> m_blockThreadExtentMax; //!< The maximum number of threads in each dimension of a block.
-        TIdx m_blockThreadCountMax; //!< The maximum number of threads in a block.
-        Vec<TDim, TIdx> m_threadElemExtentMax; //!< The maximum number of elements in each dimension of a thread.
-        TIdx m_threadElemCountMax; //!< The maximum number of elements in a threads.
-        size_t m_sharedMemSizeBytes; //!< The size of shared memory per block
-        size_t m_globalMemSizeBytes; //!< The size of global memory
-    };
-} // namespace alpaka
diff --git a/include/alpaka/acc/AccFpgaSyclIntel.hpp b/include/alpaka/acc/AccFpgaSyclIntel.hpp
deleted file mode 100644
index d0e099f..0000000
--- a/include/alpaka/acc/AccFpgaSyclIntel.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/AccGenericSycl.hpp"
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/Sycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
-
-namespace alpaka
-{
-    //! The Intel FPGA SYCL accelerator.
-    //!
-    //! This accelerator allows parallel kernel execution on a oneAPI-capable Intel FPGA target device.
-    template<typename TDim, typename TIdx>
-    using AccFpgaSyclIntel = AccGenericSycl<TagFpgaSyclIntel, TDim, TIdx>;
-
-    namespace trait
-    {
-        template<typename TDim, typename TIdx>
-        struct AccToTag<alpaka::AccFpgaSyclIntel<TDim, TIdx>>
-        {
-            using type = alpaka::TagFpgaSyclIntel;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct TagToAcc<alpaka::TagFpgaSyclIntel, TDim, TIdx>
-        {
-            using type = alpaka::AccFpgaSyclIntel<TDim, TIdx>;
-        };
-    } // namespace trait
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp
deleted file mode 100644
index 4679344..0000000
--- a/include/alpaka/acc/AccGenericSycl.hpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Base classes.
-#include "alpaka/atomic/AtomicGenericSycl.hpp"
-#include "alpaka/atomic/AtomicHierarchy.hpp"
-#include "alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp"
-#include "alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp"
-#include "alpaka/block/sync/BlockSyncGenericSycl.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/idx/bt/IdxBtGenericSycl.hpp"
-#include "alpaka/idx/gb/IdxGbGenericSycl.hpp"
-#include "alpaka/intrinsic/IntrinsicGenericSycl.hpp"
-#include "alpaka/math/MathGenericSycl.hpp"
-#include "alpaka/mem/fence/MemFenceGenericSycl.hpp"
-#include "alpaka/platform/PlatformGenericSycl.hpp"
-#include "alpaka/rand/RandDefault.hpp"
-#include "alpaka/rand/RandGenericSycl.hpp"
-#include "alpaka/warp/WarpGenericSycl.hpp"
-#include "alpaka/workdiv/WorkDivGenericSycl.hpp"
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-// Implementation details.
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/ClipCast.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Sycl.hpp"
-
-#include <cstddef>
-#include <string>
-#include <type_traits>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelGenericSycl;
-
-    //! The SYCL accelerator.
-    //!
-    //! This accelerator allows parallel kernel execution on SYCL devices.
-    template<typename TTag, typename TDim, typename TIdx>
-    class AccGenericSycl
-        : public WorkDivGenericSycl<TDim, TIdx>
-        , public gb::IdxGbGenericSycl<TDim, TIdx>
-        , public bt::IdxBtGenericSycl<TDim, TIdx>
-        , public AtomicHierarchy<AtomicGenericSycl, AtomicGenericSycl, AtomicGenericSycl>
-        , public math::MathGenericSycl
-        , public BlockSharedMemDynGenericSycl
-        , public BlockSharedMemStGenericSycl
-        , public BlockSyncGenericSycl<TDim>
-        , public IntrinsicGenericSycl
-        , public MemFenceGenericSycl
-#    ifdef ALPAKA_DISABLE_VENDOR_RNG
-        , public rand::RandDefault
-#    else
-        , public rand::RandGenericSycl<TDim>
-#    endif
-        , public warp::WarpGenericSycl<TDim>
-        , public concepts::Implements<ConceptAcc, AccGenericSycl<TTag, TDim, TIdx>>
-    {
-        static_assert(TDim::value > 0, "The SYCL accelerator must have a dimension greater than zero.");
-
-    public:
-        AccGenericSycl(AccGenericSycl const&) = delete;
-        AccGenericSycl(AccGenericSycl&&) = delete;
-        auto operator=(AccGenericSycl const&) -> AccGenericSycl& = delete;
-        auto operator=(AccGenericSycl&&) -> AccGenericSycl& = delete;
-
-        AccGenericSycl(
-            Vec<TDim, TIdx> const& threadElemExtent,
-            sycl::nd_item<TDim::value> work_item,
-            sycl::local_accessor<std::byte> dyn_shared_acc,
-            sycl::local_accessor<std::byte> st_shared_acc)
-            : WorkDivGenericSycl<TDim, TIdx>{threadElemExtent, work_item}
-            , gb::IdxGbGenericSycl<TDim, TIdx>{work_item}
-            , bt::IdxBtGenericSycl<TDim, TIdx>{work_item}
-            , BlockSharedMemDynGenericSycl{dyn_shared_acc}
-            , BlockSharedMemStGenericSycl{st_shared_acc}
-            , BlockSyncGenericSycl<TDim>{work_item}
-#    ifndef ALPAKA_DISABLE_VENDOR_RNG
-            , rand::RandGenericSycl<TDim>{work_item}
-#    endif
-            , warp::WarpGenericSycl<TDim>{work_item}
-        {
-        }
-    };
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    //! The SYCL accelerator type trait specialization.
-    template<typename TTag, typename TDim, typename TIdx>
-    struct AccType<AccGenericSycl<TTag, TDim, TIdx>>
-    {
-        using type = AccGenericSycl<TTag, TDim, TIdx>;
-    };
-
-    //! The SYCL single thread accelerator type trait specialization.
-    template<typename TTag, typename TDim, typename TIdx>
-    struct IsSingleThreadAcc<AccGenericSycl<TTag, TDim, TIdx>> : std::false_type
-    {
-    };
-
-    //! The SYCL multi thread accelerator type trait specialization.
-    template<typename TTag, typename TDim, typename TIdx>
-    struct IsMultiThreadAcc<AccGenericSycl<TTag, TDim, TIdx>> : std::true_type
-    {
-    };
-
-    //! The SYCL accelerator device properties get trait specialization.
-    template<typename TTag, typename TDim, typename TIdx>
-    struct GetAccDevProps<AccGenericSycl<TTag, TDim, TIdx>>
-    {
-        static auto getAccDevProps(DevGenericSycl<TTag> const& dev) -> AccDevProps<TDim, TIdx>
-        {
-            auto const device = dev.getNativeHandle().first;
-            auto const max_threads_dim
-                = device.template get_info<sycl::info::device::max_work_item_sizes<TDim::value>>();
-            Vec<TDim, TIdx> max_threads_dim_vec{};
-            for(int i = 0; i < static_cast<int>(TDim::value); i++)
-                max_threads_dim_vec[i] = alpaka::core::clipCast<TIdx>(max_threads_dim[i]);
-            return {// m_multiProcessorCount
-                    alpaka::core::clipCast<TIdx>(device.template get_info<sycl::info::device::max_compute_units>()),
-                    // m_gridBlockExtentMax
-                    getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
-                        // WARNING: There is no SYCL way to determine these values
-                        std::numeric_limits<TIdx>::max(),
-                        std::numeric_limits<TIdx>::max(),
-                        std::numeric_limits<TIdx>::max())),
-                    // m_gridBlockCountMax
-                    std::numeric_limits<TIdx>::max(),
-                    // m_blockThreadExtentMax
-                    max_threads_dim_vec,
-                    // m_blockThreadCountMax
-                    alpaka::core::clipCast<TIdx>(device.template get_info<sycl::info::device::max_work_group_size>()),
-                    // m_threadElemExtentMax
-                    Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                    // m_threadElemCountMax
-                    std::numeric_limits<TIdx>::max(),
-                    // m_sharedMemSizeBytes
-                    device.template get_info<sycl::info::device::local_mem_size>(),
-                    // m_globalMemSizeBytes
-                    getMemBytes(dev)};
-        }
-    };
-
-    //! The SYCL accelerator name trait specialization.
-    template<typename TTag, typename TDim, typename TIdx>
-    struct GetAccName<AccGenericSycl<TTag, TDim, TIdx>>
-    {
-        static auto getAccName() -> std::string
-        {
-            return std::string("Acc") + core::demangled<TTag>.substr(__builtin_strlen("alpaka::Tag")) + "<"
-                   + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
-        }
-    };
-
-    //! The SYCL accelerator device type trait specialization.
-    template<typename TTag, typename TDim, typename TIdx>
-    struct DevType<AccGenericSycl<TTag, TDim, TIdx>>
-    {
-        using type = DevGenericSycl<TTag>;
-    };
-
-    //! The SYCL accelerator dimension getter trait specialization.
-    template<typename TTag, typename TDim, typename TIdx>
-    struct DimType<AccGenericSycl<TTag, TDim, TIdx>>
-    {
-        using type = TDim;
-    };
-
-    //! The SYCL accelerator execution task type trait specialization.
-    template<typename TTag, typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-    struct CreateTaskKernel<AccGenericSycl<TTag, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
-    {
-        static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
-        {
-            return TaskKernelGenericSycl<TTag, AccGenericSycl<TTag, TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>{
-                workDiv,
-                kernelFnObj,
-                std::forward<TArgs>(args)...};
-        }
-    };
-
-    //! The SYCL execution task platform type trait specialization.
-    template<typename TTag, typename TDim, typename TIdx>
-    struct PlatformType<AccGenericSycl<TTag, TDim, TIdx>>
-    {
-        using type = PlatformGenericSycl<TTag>;
-    };
-
-    //! The SYCL accelerator idx type trait specialization.
-    template<typename TTag, typename TDim, typename TIdx>
-    struct IdxType<AccGenericSycl<TTag, TDim, TIdx>>
-    {
-        using type = TIdx;
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/acc/AccGpuCudaRt.hpp b/include/alpaka/acc/AccGpuCudaRt.hpp
deleted file mode 100644
index 5f27e51..0000000
--- a/include/alpaka/acc/AccGpuCudaRt.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/AccGpuUniformCudaHipRt.hpp"
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/ApiCudaRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-namespace alpaka
-{
-    template<typename TDim, typename TIdx>
-    using AccGpuCudaRt = AccGpuUniformCudaHipRt<ApiCudaRt, TDim, TIdx>;
-
-    namespace trait
-    {
-        template<typename TDim, typename TIdx>
-        struct AccToTag<alpaka::AccGpuCudaRt<TDim, TIdx>>
-        {
-            using type = alpaka::TagGpuCudaRt;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct TagToAcc<alpaka::TagGpuCudaRt, TDim, TIdx>
-        {
-            using type = alpaka::AccGpuCudaRt<TDim, TIdx>;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/acc/AccGpuHipRt.hpp b/include/alpaka/acc/AccGpuHipRt.hpp
deleted file mode 100644
index 43c94ab..0000000
--- a/include/alpaka/acc/AccGpuHipRt.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/AccGpuUniformCudaHipRt.hpp"
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/ApiHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-namespace alpaka
-{
-    template<typename TDim, typename TIdx>
-    using AccGpuHipRt = AccGpuUniformCudaHipRt<ApiHipRt, TDim, TIdx>;
-
-    namespace trait
-    {
-        template<typename TDim, typename TIdx>
-        struct AccToTag<alpaka::AccGpuHipRt<TDim, TIdx>>
-        {
-            using type = alpaka::TagGpuHipRt;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct TagToAcc<alpaka::TagGpuHipRt, TDim, TIdx>
-        {
-            using type = alpaka::AccGpuHipRt<TDim, TIdx>;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/acc/AccGpuSyclIntel.hpp b/include/alpaka/acc/AccGpuSyclIntel.hpp
deleted file mode 100644
index 2e75b43..0000000
--- a/include/alpaka/acc/AccGpuSyclIntel.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/AccGenericSycl.hpp"
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/Sycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
-
-namespace alpaka
-{
-    //! The Intel GPU SYCL accelerator.
-    //!
-    //! This accelerator allows parallel kernel execution on a oneAPI-capable Intel GPU target device.
-    template<typename TDim, typename TIdx>
-    using AccGpuSyclIntel = AccGenericSycl<TagGpuSyclIntel, TDim, TIdx>;
-
-    namespace trait
-    {
-        template<typename TDim, typename TIdx>
-        struct AccToTag<alpaka::AccGpuSyclIntel<TDim, TIdx>>
-        {
-            using type = alpaka::TagGpuSyclIntel;
-        };
-
-        template<typename TDim, typename TIdx>
-        struct TagToAcc<alpaka::TagGpuSyclIntel, TDim, TIdx>
-        {
-            using type = alpaka::AccGpuSyclIntel<TDim, TIdx>;
-        };
-    } // namespace trait
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp b/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
deleted file mode 100644
index bc0e8cb..0000000
--- a/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
+++ /dev/null
@@ -1,307 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Base classes.
-#include "alpaka/atomic/AtomicHierarchy.hpp"
-#include "alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp"
-#include "alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp"
-#include "alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp"
-#include "alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
-#include "alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp"
-#include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp"
-#include "alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp"
-#include "alpaka/math/MathUniformCudaHipBuiltIn.hpp"
-#include "alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp"
-#include "alpaka/rand/RandDefault.hpp"
-#include "alpaka/rand/RandUniformCudaHipRand.hpp"
-#include "alpaka/warp/WarpUniformCudaHipBuiltIn.hpp"
-#include "alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp"
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/core/ClipCast.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-
-#include <typeinfo>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelGpuUniformCudaHipRt;
-
-    //! The GPU CUDA accelerator.
-    //!
-    //! This accelerator allows parallel kernel execution on devices supporting CUDA.
-    template<typename TApi, typename TDim, typename TIdx>
-    class AccGpuUniformCudaHipRt final
-        : public WorkDivUniformCudaHipBuiltIn<TDim, TIdx>
-        , public gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>
-        , public bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>
-        , public AtomicHierarchy<
-              AtomicUniformCudaHipBuiltIn, // grid atomics
-              AtomicUniformCudaHipBuiltIn, // block atomics
-              AtomicUniformCudaHipBuiltIn> // thread atomics
-        , public math::MathUniformCudaHipBuiltIn
-        , public BlockSharedMemDynUniformCudaHipBuiltIn
-        , public BlockSharedMemStUniformCudaHipBuiltIn
-        , public BlockSyncUniformCudaHipBuiltIn
-        , public IntrinsicUniformCudaHipBuiltIn
-        , public MemFenceUniformCudaHipBuiltIn
-#    ifdef ALPAKA_DISABLE_VENDOR_RNG
-        , public rand::RandDefault
-#    else
-        , public rand::RandUniformCudaHipRand<TApi>
-#    endif
-        , public warp::WarpUniformCudaHipBuiltIn
-        , public concepts::Implements<ConceptAcc, AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
-    {
-        static_assert(
-            sizeof(TIdx) >= sizeof(int),
-            "Index type is not supported, consider using int or a larger type.");
-
-    public:
-        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt const&) = delete;
-        AccGpuUniformCudaHipRt(AccGpuUniformCudaHipRt&&) = delete;
-        auto operator=(AccGpuUniformCudaHipRt const&) -> AccGpuUniformCudaHipRt& = delete;
-        auto operator=(AccGpuUniformCudaHipRt&&) -> AccGpuUniformCudaHipRt& = delete;
-
-        ALPAKA_FN_HOST_ACC AccGpuUniformCudaHipRt(Vec<TDim, TIdx> const& threadElemExtent)
-            : WorkDivUniformCudaHipBuiltIn<TDim, TIdx>(threadElemExtent)
-        {
-        }
-    };
-
-    namespace trait
-    {
-        //! The GPU CUDA accelerator accelerator type trait specialization.
-        template<typename TApi, typename TDim, typename TIdx>
-        struct AccType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
-        {
-            using type = AccGpuUniformCudaHipRt<TApi, TDim, TIdx>;
-        };
-
-        //! The GPU CUDA single thread accelerator type trait specialization.
-        template<typename TApi, typename TDim, typename TIdx>
-        struct IsSingleThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::false_type
-        {
-        };
-
-        //! The GPU CUDA multi thread accelerator type trait specialization.
-        template<typename TApi, typename TDim, typename TIdx>
-        struct IsMultiThreadAcc<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>> : std::true_type
-        {
-        };
-
-        //! The GPU CUDA accelerator device properties get trait specialization.
-        template<typename TApi, typename TDim, typename TIdx>
-        struct GetAccDevProps<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccDevProps(DevUniformCudaHipRt<TApi> const& dev) -> AccDevProps<TDim, TIdx>
-            {
-#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                // Reading only the necessary attributes with cudaDeviceGetAttribute is faster than reading all with
-                // cuda https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
-                int multiProcessorCount = {};
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
-                    &multiProcessorCount,
-                    TApi::deviceAttributeMultiprocessorCount,
-                    dev.getNativeHandle()));
-
-                int maxGridSize[3] = {};
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
-                    &maxGridSize[0],
-                    TApi::deviceAttributeMaxGridDimX,
-                    dev.getNativeHandle()));
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
-                    &maxGridSize[1],
-                    TApi::deviceAttributeMaxGridDimY,
-                    dev.getNativeHandle()));
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
-                    &maxGridSize[2],
-                    TApi::deviceAttributeMaxGridDimZ,
-                    dev.getNativeHandle()));
-
-                int maxBlockDim[3] = {};
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
-                    &maxBlockDim[0],
-                    TApi::deviceAttributeMaxBlockDimX,
-                    dev.getNativeHandle()));
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
-                    &maxBlockDim[1],
-                    TApi::deviceAttributeMaxBlockDimY,
-                    dev.getNativeHandle()));
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
-                    &maxBlockDim[2],
-                    TApi::deviceAttributeMaxBlockDimZ,
-                    dev.getNativeHandle()));
-
-                int maxThreadsPerBlock = {};
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
-                    &maxThreadsPerBlock,
-                    TApi::deviceAttributeMaxThreadsPerBlock,
-                    dev.getNativeHandle()));
-
-                int sharedMemSizeBytes = {};
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
-                    &sharedMemSizeBytes,
-                    TApi::deviceAttributeMaxSharedMemoryPerBlock,
-                    dev.getNativeHandle()));
-
-                return {// m_multiProcessorCount
-                        alpaka::core::clipCast<TIdx>(multiProcessorCount),
-                        // m_gridBlockExtentMax
-                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
-                            alpaka::core::clipCast<TIdx>(maxGridSize[2u]),
-                            alpaka::core::clipCast<TIdx>(maxGridSize[1u]),
-                            alpaka::core::clipCast<TIdx>(maxGridSize[0u]))),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
-                            alpaka::core::clipCast<TIdx>(maxBlockDim[2u]),
-                            alpaka::core::clipCast<TIdx>(maxBlockDim[1u]),
-                            alpaka::core::clipCast<TIdx>(maxBlockDim[0u]))),
-                        // m_blockThreadCountMax
-                        alpaka::core::clipCast<TIdx>(maxThreadsPerBlock),
-                        // m_threadElemExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_sharedMemSizeBytes
-                        static_cast<size_t>(sharedMemSizeBytes),
-                        // m_globalMemSizeBytes
-                        getMemBytes(dev)};
-
-#    else
-                typename TApi::DeviceProp_t properties;
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&properties, dev.getNativeHandle()));
-
-                return {// m_multiProcessorCount
-                        alpaka::core::clipCast<TIdx>(properties.multiProcessorCount),
-                        // m_gridBlockExtentMax
-                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
-                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[2u]),
-                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[1u]),
-                            alpaka::core::clipCast<TIdx>(properties.maxGridSize[0u]))),
-                        // m_gridBlockCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_blockThreadExtentMax
-                        getExtentVecEnd<TDim>(Vec<DimInt<3u>, TIdx>(
-                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[2u]),
-                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[1u]),
-                            alpaka::core::clipCast<TIdx>(properties.maxThreadsDim[0u]))),
-                        // m_blockThreadCountMax
-                        alpaka::core::clipCast<TIdx>(properties.maxThreadsPerBlock),
-                        // m_threadElemExtentMax
-                        Vec<TDim, TIdx>::all(std::numeric_limits<TIdx>::max()),
-                        // m_threadElemCountMax
-                        std::numeric_limits<TIdx>::max(),
-                        // m_sharedMemSizeBytes
-                        static_cast<size_t>(properties.sharedMemPerBlock),
-                        // m_globalMemSizeBytes
-                        getMemBytes(dev)};
-#    endif
-            }
-        };
-
-        //! The GPU CUDA accelerator name trait specialization.
-        template<typename TApi, typename TDim, typename TIdx>
-        struct GetAccName<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getAccName() -> std::string
-            {
-                return std::string("AccGpu") + TApi::name + "Rt<" + std::to_string(TDim::value) + ","
-                       + core::demangled<TIdx> + ">";
-            }
-        };
-
-        //! The GPU CUDA accelerator device type trait specialization.
-        template<typename TApi, typename TDim, typename TIdx>
-        struct DevType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
-        {
-            using type = DevUniformCudaHipRt<TApi>;
-        };
-
-        //! The GPU CUDA accelerator dimension getter trait specialization.
-        template<typename TApi, typename TDim, typename TIdx>
-        struct DimType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-    } // namespace trait
-
-    namespace detail
-    {
-        //! specialization of the TKernelFnObj return type evaluation
-        //
-        // It is not possible to determine the result type of a __device__ lambda for CUDA on the host side.
-        // https://github.com/alpaka-group/alpaka/pull/695#issuecomment-446103194
-        // The execution task TaskKernelGpuUniformCudaHipRt is therefore performing this check on device side.
-        template<typename TApi, typename TDim, typename TIdx>
-        struct CheckFnReturnType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
-        {
-            template<typename TKernelFnObj, typename... TArgs>
-            void operator()(TKernelFnObj const&, TArgs const&...)
-            {
-            }
-        };
-    } // namespace detail
-
-    namespace trait
-    {
-        //! The GPU CUDA accelerator execution task type trait specialization.
-        template<
-            typename TApi,
-            typename TDim,
-            typename TIdx,
-            typename TWorkDiv,
-            typename TKernelFnObj,
-            typename... TArgs>
-        struct CreateTaskKernel<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
-        {
-            ALPAKA_FN_HOST static auto createTaskKernel(
-                TWorkDiv const& workDiv,
-                TKernelFnObj const& kernelFnObj,
-                TArgs&&... args)
-            {
-                return TaskKernelGpuUniformCudaHipRt<
-                    TApi,
-                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
-                    TDim,
-                    TIdx,
-                    TKernelFnObj,
-                    TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
-            }
-        };
-
-        //! The CPU CUDA execution task platform type trait specialization.
-        template<typename TApi, typename TDim, typename TIdx>
-        struct PlatformType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
-        {
-            using type = PlatformUniformCudaHipRt<TApi>;
-        };
-
-        //! The GPU CUDA accelerator idx type trait specialization.
-        template<typename TApi, typename TDim, typename TIdx>
-        struct IdxType<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/acc/Tag.hpp b/include/alpaka/acc/Tag.hpp
deleted file mode 100644
index f7880af..0000000
--- a/include/alpaka/acc/Tag.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2023 Simeon Ehrig, Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-#include <iostream>
-#include <type_traits>
-
-#define CREATE_ACC_TAG(tag_name)                                                                                      \
-    struct tag_name                                                                                                   \
-    {                                                                                                                 \
-        static std::string get_name()                                                                                 \
-        {                                                                                                             \
-            return #tag_name;                                                                                         \
-        }                                                                                                             \
-    }
-
-namespace alpaka
-{
-    CREATE_ACC_TAG(TagCpuOmp2Blocks);
-    CREATE_ACC_TAG(TagCpuOmp2Threads);
-    CREATE_ACC_TAG(TagCpuSerial);
-    CREATE_ACC_TAG(TagCpuSycl);
-    CREATE_ACC_TAG(TagCpuTbbBlocks);
-    CREATE_ACC_TAG(TagCpuThreads);
-    CREATE_ACC_TAG(TagFpgaSyclIntel);
-    CREATE_ACC_TAG(TagGenericSycl);
-    CREATE_ACC_TAG(TagGpuCudaRt);
-    CREATE_ACC_TAG(TagGpuHipRt);
-    CREATE_ACC_TAG(TagGpuSyclIntel);
-
-    namespace trait
-    {
-        template<typename TAcc>
-        struct AccToTag;
-
-        template<typename TTag, typename TDim, typename TIdx>
-        struct TagToAcc;
-    } // namespace trait
-
-    //! \brief maps an acc type to a tag type
-    //! \tparam TAcc alpaka acc type
-    template<typename TAcc>
-    using AccToTag = typename trait::AccToTag<TAcc>::type;
-
-    //! \brief maps a tag type to an acc type
-    //! \tparam TTag alpaka tag type
-    //! \tparam TDim dimension of the mapped acc type
-    //! \tparam TIdx index type of the mapped acc type
-    template<typename TTag, typename TDim, typename TIdx>
-    using TagToAcc = typename trait::TagToAcc<TTag, TDim, TIdx>::type;
-
-    template<typename TAcc, typename... TTag>
-    inline constexpr bool accMatchesTags = (std::is_same_v<alpaka::AccToTag<TAcc>, TTag> || ...);
-
-    //! list of all available tags
-    using AccTags = std::tuple<
-        alpaka::TagCpuSerial,
-        alpaka::TagCpuThreads,
-        alpaka::TagCpuTbbBlocks,
-        alpaka::TagCpuOmp2Blocks,
-        alpaka::TagCpuOmp2Threads,
-        alpaka::TagGpuCudaRt,
-        alpaka::TagGpuHipRt,
-        alpaka::TagCpuSycl,
-        alpaka::TagFpgaSyclIntel,
-        alpaka::TagGpuSyclIntel>;
-
-} // namespace alpaka
diff --git a/include/alpaka/acc/TagAccIsEnabled.hpp b/include/alpaka/acc/TagAccIsEnabled.hpp
deleted file mode 100644
index c21fd2b..0000000
--- a/include/alpaka/acc/TagAccIsEnabled.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-#pragma once
-
-// include all Acc's because of the struct AccIsEnabled
-// if an acc is not include, it will be not enabled independent of the compiler flags
-#include "alpaka/acc/AccCpuOmp2Blocks.hpp"
-#include "alpaka/acc/AccCpuOmp2Threads.hpp"
-#include "alpaka/acc/AccCpuSerial.hpp"
-#include "alpaka/acc/AccCpuSycl.hpp"
-#include "alpaka/acc/AccCpuTbbBlocks.hpp"
-#include "alpaka/acc/AccCpuThreads.hpp"
-#include "alpaka/acc/AccFpgaSyclIntel.hpp"
-#include "alpaka/acc/AccGpuCudaRt.hpp"
-#include "alpaka/acc/AccGpuHipRt.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/meta/Filter.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //! \brief check if the accelerator is enabled for a given tag
-    //! \tparam TTag alpaka tag type
-    template<typename TTag, typename = void>
-    struct AccIsEnabled : std::false_type
-    {
-    };
-
-    template<typename TTag>
-    struct AccIsEnabled<TTag, std::void_t<TagToAcc<TTag, alpaka::DimInt<1>, int>>> : std::true_type
-    {
-    };
-
-    //! list of all tags where the related accelerator is enabled
-    using EnabledAccTags = alpaka::meta::Filter<AccTags, alpaka::AccIsEnabled>;
-
-} // namespace alpaka
diff --git a/include/alpaka/acc/Traits.hpp b/include/alpaka/acc/Traits.hpp
deleted file mode 100644
index 48fa0b1..0000000
--- a/include/alpaka/acc/Traits.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, Bernhard Manfred Gruber, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/AccDevProps.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-#include "alpaka/queue/Traits.hpp"
-
-#include <string>
-#include <type_traits>
-#include <typeinfo>
-
-namespace alpaka
-{
-    struct ConceptAcc
-    {
-    };
-
-    //! True if TAcc is an accelerator, i.e. if it implements the ConceptAcc concept.
-    template<typename TAcc>
-    inline constexpr bool isAccelerator = concepts::ImplementsConcept<ConceptAcc, TAcc>::value;
-
-    //! The accelerator traits.
-    namespace trait
-    {
-        //! The accelerator type trait.
-        template<typename T, typename TSfinae = void>
-        struct AccType;
-
-        //! The single thread accelerator trait.
-        //!
-        //! If TAcc is an accelerator that supports only a single thread per block, inherit from std::true_type.
-        //! If TAcc is not an accelerator, or an accelerator that supports multiple threads per block, inherit from
-        //! std::false_type.
-        template<typename TAcc, typename TSfinae = void>
-        struct IsSingleThreadAcc : std::false_type
-        {
-        };
-
-        //! The multi thread accelerator trait.
-        //!
-        //! If TAcc is an accelerator that supports multiple threads per block, inherit from std::true_type.
-        //! If TAcc is not an accelerator, or an accelerator that supports only a single thread per block, inherit from
-        //! std::false_type.
-        template<typename TAcc, typename TSfinae = void>
-        struct IsMultiThreadAcc : std::false_type
-        {
-        };
-
-        //! The device properties get trait.
-        template<typename TAcc, typename TSfinae = void>
-        struct GetAccDevProps;
-
-        //! The accelerator name trait.
-        //!
-        //! The default implementation returns the mangled class name.
-        template<typename TAcc, typename TSfinae = void>
-        struct GetAccName
-        {
-            ALPAKA_FN_HOST static auto getAccName() -> std::string
-            {
-                return core::demangled<TAcc>;
-            }
-        };
-    } // namespace trait
-
-    //! The accelerator type trait alias template to remove the ::type.
-    template<typename T>
-    using Acc = typename trait::AccType<T>::type;
-
-    //! True if TAcc is an accelerator that supports only a single thread per block, false otherwise.
-    template<typename TAcc>
-    inline constexpr bool isSingleThreadAcc = trait::IsSingleThreadAcc<TAcc>::value;
-
-    //! True if TAcc is an accelerator that supports multiple threads per block, false otherwise.
-    template<typename TAcc>
-    inline constexpr bool isMultiThreadAcc = trait::IsMultiThreadAcc<TAcc>::value;
-
-    //! \return The acceleration properties on the given device.
-    template<typename TAcc, typename TDev>
-    ALPAKA_FN_HOST auto getAccDevProps(TDev const& dev) -> AccDevProps<Dim<TAcc>, Idx<TAcc>>
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptAcc, TAcc>;
-        return trait::GetAccDevProps<ImplementationBase>::getAccDevProps(dev);
-    }
-
-    //! \return The accelerator name
-    //!
-    //! \tparam TAcc The accelerator type.
-    template<typename TAcc>
-    ALPAKA_FN_HOST auto getAccName() -> std::string
-    {
-        return trait::GetAccName<TAcc>::getAccName();
-    }
-
-    namespace trait
-    {
-        template<typename TAcc, typename TProperty>
-        struct QueueType<TAcc, TProperty, std::enable_if_t<concepts::ImplementsConcept<ConceptAcc, TAcc>::value>>
-        {
-            using type = typename QueueType<typename alpaka::trait::PlatformType<TAcc>::type, TProperty>::type;
-        };
-
-    } // namespace trait
-
-} // namespace alpaka
diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp
deleted file mode 100644
index fe410cf..0000000
--- a/include/alpaka/alpaka.hpp
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber,
- *                Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Include the whole library.
-
-// version number
-#include "alpaka/version.hpp"
-// acc
-#include "alpaka/acc/AccCpuOmp2Blocks.hpp"
-#include "alpaka/acc/AccCpuOmp2Threads.hpp"
-#include "alpaka/acc/AccCpuSerial.hpp"
-#include "alpaka/acc/AccCpuSycl.hpp"
-#include "alpaka/acc/AccCpuTbbBlocks.hpp"
-#include "alpaka/acc/AccCpuThreads.hpp"
-#include "alpaka/acc/AccDevProps.hpp"
-#include "alpaka/acc/AccFpgaSyclIntel.hpp"
-#include "alpaka/acc/AccGenericSycl.hpp"
-#include "alpaka/acc/AccGpuCudaRt.hpp"
-#include "alpaka/acc/AccGpuHipRt.hpp"
-#include "alpaka/acc/AccGpuSyclIntel.hpp"
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/acc/TagAccIsEnabled.hpp"
-#include "alpaka/acc/Traits.hpp"
-// atomic
-#include "alpaka/atomic/AtomicCpu.hpp"
-#include "alpaka/atomic/AtomicGenericSycl.hpp"
-#include "alpaka/atomic/AtomicNoOp.hpp"
-#include "alpaka/atomic/AtomicOmpBuiltIn.hpp"
-#include "alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp"
-#include "alpaka/atomic/Op.hpp"
-#include "alpaka/atomic/Traits.hpp"
-// block
-// shared
-// dynamic
-#include "alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp"
-#include "alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp"
-#include "alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp"
-#include "alpaka/block/shared/dyn/Traits.hpp"
-// static
-#include "alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp"
-#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
-#include "alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp"
-#include "alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp"
-#include "alpaka/block/shared/st/Traits.hpp"
-// sync
-#include "alpaka/block/sync/BlockSyncBarrierOmp.hpp"
-#include "alpaka/block/sync/BlockSyncBarrierThread.hpp"
-#include "alpaka/block/sync/BlockSyncGenericSycl.hpp"
-#include "alpaka/block/sync/BlockSyncNoOp.hpp"
-#include "alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp"
-#include "alpaka/block/sync/Traits.hpp"
-// core
-#include "alpaka/core/Align.hpp"
-#include "alpaka/core/AlignedAlloc.hpp"
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/BarrierThread.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/ClipCast.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Debug.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/core/OmpSchedule.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/core/RemoveRestrict.hpp"
-#include "alpaka/core/RuntimeMacros.hpp"
-#include "alpaka/core/Sycl.hpp"
-#include "alpaka/core/ThreadPool.hpp"
-#include "alpaka/core/Unreachable.hpp"
-#include "alpaka/core/Unroll.hpp"
-#include "alpaka/core/Utility.hpp"
-#include "alpaka/core/Vectorize.hpp"
-// dev
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/dev/DevCpuSycl.hpp"
-#include "alpaka/dev/DevCudaRt.hpp"
-#include "alpaka/dev/DevFpgaSyclIntel.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/dev/DevGpuSyclIntel.hpp"
-#include "alpaka/dev/DevHipRt.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dev/cpu/Wait.hpp"
-// dim
-#include "alpaka/dim/DimArithmetic.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/dim/Traits.hpp"
-// event
-#include "alpaka/event/EventCpu.hpp"
-#include "alpaka/event/EventCpuSycl.hpp"
-#include "alpaka/event/EventCudaRt.hpp"
-#include "alpaka/event/EventFpgaSyclIntel.hpp"
-#include "alpaka/event/EventGenericSycl.hpp"
-#include "alpaka/event/EventGpuSyclIntel.hpp"
-#include "alpaka/event/EventHipRt.hpp"
-#include "alpaka/event/Traits.hpp"
-// exec
-#include "alpaka/exec/ElementIndex.hpp"
-#include "alpaka/exec/IndependentElements.hpp"
-#include "alpaka/exec/Once.hpp"
-#include "alpaka/exec/UniformElements.hpp"
-// extent
-#include "alpaka/extent/Traits.hpp"
-// idx
-#include "alpaka/idx/Accessors.hpp"
-#include "alpaka/idx/MapIdx.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/idx/bt/IdxBtGenericSycl.hpp"
-#include "alpaka/idx/bt/IdxBtOmp.hpp"
-#include "alpaka/idx/bt/IdxBtRefThreadIdMap.hpp"
-#include "alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp"
-#include "alpaka/idx/bt/IdxBtZero.hpp"
-#include "alpaka/idx/gb/IdxGbGenericSycl.hpp"
-#include "alpaka/idx/gb/IdxGbRef.hpp"
-#include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp"
-// kernel
-#include "alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp"
-#include "alpaka/kernel/TaskKernelCpuOmp2Threads.hpp"
-#include "alpaka/kernel/TaskKernelCpuSerial.hpp"
-#include "alpaka/kernel/TaskKernelCpuSycl.hpp"
-#include "alpaka/kernel/TaskKernelCpuTbbBlocks.hpp"
-#include "alpaka/kernel/TaskKernelCpuThreads.hpp"
-#include "alpaka/kernel/TaskKernelFpgaSyclIntel.hpp"
-#include "alpaka/kernel/TaskKernelGenericSycl.hpp"
-#include "alpaka/kernel/TaskKernelGpuCudaRt.hpp"
-#include "alpaka/kernel/TaskKernelGpuHipRt.hpp"
-#include "alpaka/kernel/TaskKernelGpuSyclIntel.hpp"
-#include "alpaka/kernel/Traits.hpp"
-// math
-#include "alpaka/math/Complex.hpp"
-#include "alpaka/math/MathGenericSycl.hpp"
-#include "alpaka/math/MathStdLib.hpp"
-#include "alpaka/math/MathUniformCudaHipBuiltIn.hpp"
-// mem
-#include "alpaka/mem/alloc/AllocCpuAligned.hpp"
-#include "alpaka/mem/alloc/AllocCpuNew.hpp"
-#include "alpaka/mem/alloc/Traits.hpp"
-#include "alpaka/mem/buf/BufCpu.hpp"
-#include "alpaka/mem/buf/BufCpuSycl.hpp"
-#include "alpaka/mem/buf/BufCudaRt.hpp"
-#include "alpaka/mem/buf/BufFpgaSyclIntel.hpp"
-#include "alpaka/mem/buf/BufGenericSycl.hpp"
-#include "alpaka/mem/buf/BufGpuSyclIntel.hpp"
-#include "alpaka/mem/buf/BufHipRt.hpp"
-#include "alpaka/mem/buf/Traits.hpp"
-#include "alpaka/mem/fence/MemFenceCpu.hpp"
-#include "alpaka/mem/fence/MemFenceCpuSerial.hpp"
-#include "alpaka/mem/fence/MemFenceGenericSycl.hpp"
-#include "alpaka/mem/fence/MemFenceOmp2Blocks.hpp"
-#include "alpaka/mem/fence/MemFenceOmp2Threads.hpp"
-#include "alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp"
-#include "alpaka/mem/fence/Traits.hpp"
-#include "alpaka/mem/global/DeviceGlobalCpu.hpp"
-#include "alpaka/mem/global/DeviceGlobalGenericSycl.hpp"
-#include "alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp"
-#include "alpaka/mem/global/Traits.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/mem/view/ViewConst.hpp"
-#include "alpaka/mem/view/ViewPlainPtr.hpp"
-#include "alpaka/mem/view/ViewStdArray.hpp"
-#include "alpaka/mem/view/ViewStdVector.hpp"
-#include "alpaka/mem/view/ViewSubView.hpp"
-// meta
-#include "alpaka/meta/Apply.hpp"
-#include "alpaka/meta/CartesianProduct.hpp"
-#include "alpaka/meta/Concatenate.hpp"
-#include "alpaka/meta/DependentFalseType.hpp"
-#include "alpaka/meta/Filter.hpp"
-#include "alpaka/meta/Fold.hpp"
-#include "alpaka/meta/ForEachType.hpp"
-#include "alpaka/meta/Functional.hpp"
-#include "alpaka/meta/IntegerSequence.hpp"
-#include "alpaka/meta/Integral.hpp"
-#include "alpaka/meta/IsArrayOrVector.hpp"
-#include "alpaka/meta/IsStrictBase.hpp"
-#include "alpaka/meta/NdLoop.hpp"
-#include "alpaka/meta/NonZero.hpp"
-#include "alpaka/meta/Set.hpp"
-#include "alpaka/meta/Transform.hpp"
-#include "alpaka/meta/TypeListOps.hpp"
-// offset
-#include "alpaka/offset/Traits.hpp"
-// platform
-#include "alpaka/platform/PlatformCpu.hpp"
-#include "alpaka/platform/PlatformCpuSycl.hpp"
-#include "alpaka/platform/PlatformCudaRt.hpp"
-#include "alpaka/platform/PlatformFpgaSyclIntel.hpp"
-#include "alpaka/platform/PlatformGpuSyclIntel.hpp"
-#include "alpaka/platform/PlatformHipRt.hpp"
-#include "alpaka/platform/Traits.hpp"
-// rand
-#include "alpaka/rand/RandDefault.hpp"
-#include "alpaka/rand/RandGenericSycl.hpp"
-#include "alpaka/rand/RandPhilox.hpp"
-#include "alpaka/rand/RandStdLib.hpp"
-#include "alpaka/rand/RandUniformCudaHipRand.hpp"
-#include "alpaka/rand/Traits.hpp"
-// idx
-#include "alpaka/idx/Traits.hpp"
-// queue
-#include "alpaka/queue/Properties.hpp"
-#include "alpaka/queue/QueueCpuBlocking.hpp"
-#include "alpaka/queue/QueueCpuNonBlocking.hpp"
-#include "alpaka/queue/QueueCpuSyclBlocking.hpp"
-#include "alpaka/queue/QueueCpuSyclNonBlocking.hpp"
-#include "alpaka/queue/QueueCudaRtBlocking.hpp"
-#include "alpaka/queue/QueueCudaRtNonBlocking.hpp"
-#include "alpaka/queue/QueueFpgaSyclIntelBlocking.hpp"
-#include "alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp"
-#include "alpaka/queue/QueueGpuSyclIntelBlocking.hpp"
-#include "alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp"
-#include "alpaka/queue/QueueHipRtBlocking.hpp"
-#include "alpaka/queue/QueueHipRtNonBlocking.hpp"
-#include "alpaka/queue/Traits.hpp"
-// traits
-#include "alpaka/traits/Traits.hpp"
-// wait
-#include "alpaka/wait/Traits.hpp"
-// workdiv
-#include "alpaka/workdiv/Traits.hpp"
-#include "alpaka/workdiv/WorkDivHelpers.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-// vec
-#include "alpaka/vec/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
diff --git a/include/alpaka/atomic/AtomicAtomicRef.hpp b/include/alpaka/atomic/AtomicAtomicRef.hpp
deleted file mode 100644
index 61b825c..0000000
--- a/include/alpaka/atomic/AtomicAtomicRef.hpp
+++ /dev/null
@@ -1,237 +0,0 @@
-/* Copyright 2022 Felice Pantaleo, Andrea Bocci, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/atomic/Traits.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-
-#include <array>
-#include <atomic>
-#include <type_traits>
-
-#ifndef ALPAKA_DISABLE_ATOMIC_ATOMICREF
-#    ifndef ALPAKA_HAS_STD_ATOMIC_REF
-#        include <boost/atomic.hpp>
-#    endif
-
-namespace alpaka
-{
-    namespace detail
-    {
-#    if defined(ALPAKA_HAS_STD_ATOMIC_REF)
-        template<typename T>
-        using atomic_ref = std::atomic_ref<T>;
-#    else
-        template<typename T>
-        using atomic_ref = boost::atomic_ref<T>;
-#    endif
-    } // namespace detail
-
-    //! The atomic ops based on atomic_ref for CPU accelerators.
-    //
-    //  Atomics can be used in the grids, blocks and threads hierarchy levels.
-    //
-
-    class AtomicAtomicRef
-    {
-    };
-
-    template<typename T>
-    void isSupportedByAtomicAtomicRef()
-    {
-        static_assert(
-            std::is_trivially_copyable_v<T> && alpaka::detail::atomic_ref<T>::required_alignment <= alignof(T),
-            "Type not supported by AtomicAtomicRef, please recompile defining "
-            "ALPAKA_DISABLE_ATOMIC_ATOMICREF.");
-    }
-
-    namespace trait
-    {
-        //! The CPU accelerators AtomicAdd.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicAdd, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                return ref.fetch_add(value);
-            }
-        };
-
-        //! The CPU accelerators AtomicSub.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicSub, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                return ref.fetch_sub(value);
-            }
-        };
-
-        //! The CPU accelerators AtomicMin.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicMin, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                T old = ref;
-                T result = old;
-                result = std::min(result, value);
-                while(!ref.compare_exchange_weak(old, result))
-                {
-                    result = old;
-                    result = std::min(result, value);
-                }
-                return old;
-            }
-        };
-
-        //! The CPU accelerators AtomicMax.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicMax, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                T old = ref;
-                T result = old;
-                result = std::max(result, value);
-                while(!ref.compare_exchange_weak(old, result))
-                {
-                    result = old;
-                    result = std::max(result, value);
-                }
-                return old;
-            }
-        };
-
-        //! The CPU accelerators AtomicExch.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicExch, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                T old = ref;
-                T result = value;
-                while(!ref.compare_exchange_weak(old, result))
-                {
-                    result = value;
-                }
-                return old;
-            }
-        };
-
-        //! The CPU accelerators AtomicInc.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicInc, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                T old = ref;
-                T result = ((old >= value) ? 0 : static_cast<T>(old + 1));
-                while(!ref.compare_exchange_weak(old, result))
-                {
-                    result = ((old >= value) ? 0 : static_cast<T>(old + 1));
-                }
-                return old;
-            }
-        };
-
-        //! The CPU accelerators AtomicDec.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicDec, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                T old = ref;
-                T result = ((old >= value) ? 0 : static_cast<T>(old - 1));
-                while(!ref.compare_exchange_weak(old, result))
-                {
-                    result = ((old >= value) ? 0 : static_cast<T>(old - 1));
-                }
-                return old;
-            }
-        };
-
-        //! The CPU accelerators AtomicAnd.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicAnd, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                return ref.fetch_and(value);
-            }
-        };
-
-        //! The CPU accelerators AtomicOr.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicOr, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                return ref.fetch_or(value);
-            }
-        };
-
-        //! The CPU accelerators AtomicXor.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicXor, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicAtomicRef const&, T* const addr, T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                return ref.fetch_xor(value);
-            }
-        };
-
-        //! The CPU accelerators AtomicCas.
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicCas, AtomicAtomicRef, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(
-                AtomicAtomicRef const&,
-                T* const addr,
-                T const& compare,
-                T const& value) -> T
-            {
-                isSupportedByAtomicAtomicRef<T>();
-                alpaka::detail::atomic_ref<T> ref(*addr);
-                T old = ref;
-                T result;
-                do
-                {
-#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG
-#        pragma GCC diagnostic push
-#        pragma GCC diagnostic ignored "-Wfloat-equal"
-#    endif
-                    result = ((old == compare) ? value : old);
-#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG
-#        pragma GCC diagnostic pop
-#    endif
-                } while(!ref.compare_exchange_weak(old, result));
-                return old;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/atomic/AtomicCpu.hpp b/include/alpaka/atomic/AtomicCpu.hpp
deleted file mode 100644
index 5667bd0..0000000
--- a/include/alpaka/atomic/AtomicCpu.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2024 Andrea Bocci, Felice Pantaleo
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-// clang 9/10/11 together with nvcc<11.6.0 as host compiler fails at compile time when using boost::atomic_ref
-#ifdef BOOST_COMP_CLANG_AVAILABLE
-#    if(BOOST_COMP_CLANG < BOOST_VERSION_NUMBER(12, 0, 0) && BOOST_COMP_NVCC                                          \
-        && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 6, 0))
-#        if !defined(ALPAKA_DISABLE_ATOMIC_ATOMICREF)
-#            define ALPAKA_DISABLE_ATOMIC_ATOMICREF
-#        endif
-#    endif
-#endif // BOOST_COMP_CLANG_AVAILABLE
-
-#include "alpaka/atomic/AtomicAtomicRef.hpp"
-#include "alpaka/atomic/AtomicStdLibLock.hpp"
-
-namespace alpaka
-{
-#ifndef ALPAKA_DISABLE_ATOMIC_ATOMICREF
-    using AtomicCpu = AtomicAtomicRef;
-#else
-    using AtomicCpu = AtomicStdLibLock<16>;
-#endif // ALPAKA_DISABLE_ATOMIC_ATOMICREF
-
-} // namespace alpaka
diff --git a/include/alpaka/atomic/AtomicGenericSycl.hpp b/include/alpaka/atomic/AtomicGenericSycl.hpp
deleted file mode 100644
index bdfa53b..0000000
--- a/include/alpaka/atomic/AtomicGenericSycl.hpp
+++ /dev/null
@@ -1,263 +0,0 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/atomic/Op.hpp"
-#include "alpaka/atomic/Traits.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/meta/DependentFalseType.hpp"
-
-#include <cstdint>
-#include <type_traits>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    //! The SYCL accelerator atomic ops.
-    //
-    //  Atomics can used in the hierarchy level grids, blocks and threads.
-    //  Atomics are not guaranteed to be safe between devices
-    class AtomicGenericSycl
-    {
-    };
-
-    namespace detail
-    {
-        template<typename THierarchy>
-        struct SyclMemoryScope
-        {
-        };
-
-        template<>
-        struct SyclMemoryScope<hierarchy::Grids>
-        {
-            static constexpr auto value = sycl::memory_scope::device;
-        };
-
-        template<>
-        struct SyclMemoryScope<hierarchy::Blocks>
-        {
-            static constexpr auto value = sycl::memory_scope::device;
-        };
-
-        template<>
-        struct SyclMemoryScope<hierarchy::Threads>
-        {
-            static constexpr auto value = sycl::memory_scope::work_group;
-        };
-
-        template<typename T, typename THierarchy>
-        using sycl_atomic_ref = sycl::atomic_ref<T, sycl::memory_order::relaxed, SyclMemoryScope<THierarchy>::value>;
-
-        template<typename THierarchy, typename T, typename TOp>
-        inline auto callAtomicOp(T* const addr, TOp&& op)
-        {
-            auto ref = sycl_atomic_ref<T, THierarchy>{*addr};
-            return op(ref);
-        }
-
-        template<typename TRef, typename T, typename TEval>
-        inline auto casWithCondition(T* const addr, TEval&& eval)
-        {
-            auto ref = TRef{*addr};
-            auto old_val = ref.load();
-
-            // prefer compare_exchange_weak when in a loop, assuming that eval is not expensive
-            while(!ref.compare_exchange_weak(old_val, eval(old_val)))
-            {
-            }
-
-            return old_val;
-        }
-    } // namespace detail
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    // Add.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicAdd, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
-        {
-            return alpaka::detail::callAtomicOp<THierarchy>(
-                addr,
-                [&value](auto& ref) { return ref.fetch_add(value); });
-        }
-    };
-
-    // Sub.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicSub, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
-        {
-            return alpaka::detail::callAtomicOp<THierarchy>(
-                addr,
-                [&value](auto& ref) { return ref.fetch_sub(value); });
-        }
-    };
-
-    // Min.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicMin, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
-        {
-            return alpaka::detail::callAtomicOp<THierarchy>(
-                addr,
-                [&value](auto& ref) { return ref.fetch_min(value); });
-        }
-    };
-
-    // Max.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicMax, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
-        {
-            return alpaka::detail::callAtomicOp<THierarchy>(
-                addr,
-                [&value](auto& ref) { return ref.fetch_max(value); });
-        }
-    };
-
-    // Exch.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicExch, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(
-            (std::is_integral_v<T> || std::is_floating_point_v<T>) and(sizeof(T) == 4 || sizeof(T) == 8),
-            "SYCL atomics do not support this type");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
-        {
-            return alpaka::detail::callAtomicOp<THierarchy>(addr, [&value](auto& ref) { return ref.exchange(value); });
-        }
-    };
-
-    // Inc.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicInc, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(
-            std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
-            "SYCL atomics support only 32- and 64-bits unsigned integral types");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
-        {
-            auto inc = [&value](auto old_val)
-            { return (old_val >= value) ? static_cast<T>(0) : (old_val + static_cast<T>(1)); };
-            return alpaka::detail::casWithCondition<alpaka::detail::sycl_atomic_ref<T, THierarchy>>(addr, inc);
-        }
-    };
-
-    // Dec.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicDec, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(
-            std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
-            "SYCL atomics support only 32- and 64-bits unsigned integral types");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
-        {
-            auto dec = [&value](auto& old_val)
-            { return ((old_val == 0) || (old_val > value)) ? value : (old_val - static_cast<T>(1)); };
-            return alpaka::detail::casWithCondition<alpaka::detail::sycl_atomic_ref<T, THierarchy>>(addr, dec);
-        }
-    };
-
-    // And.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicAnd, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
-        {
-            return alpaka::detail::callAtomicOp<THierarchy>(
-                addr,
-                [&value](auto& ref) { return ref.fetch_and(value); });
-        }
-    };
-
-    // Or.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicOr, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
-        {
-            return alpaka::detail::callAtomicOp<THierarchy>(addr, [&value](auto& ref) { return ref.fetch_or(value); });
-        }
-    };
-
-    // Xor.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicXor, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
-        {
-            return alpaka::detail::callAtomicOp<THierarchy>(
-                addr,
-                [&value](auto& ref) { return ref.fetch_xor(value); });
-        }
-    };
-
-    // Cas.
-    //! The SYCL accelerator atomic operation.
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicCas, AtomicGenericSycl, T, THierarchy>
-    {
-        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
-
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& expected, T const& desired) -> T
-        {
-            auto cas = [&expected, &desired](auto& ref)
-            {
-                auto expected_ = expected;
-                // Atomically compares the value of `ref` with the value of `expected`.
-                // If the values are equal, replaces the value of `ref` with `desired`.
-                // Otherwise updates `expected` with the value of `ref`.
-                // Returns a bool telling us if the exchange happened or not, but the Alpaka API does not make use of
-                // it.
-                ref.compare_exchange_strong(expected_, desired);
-
-                // If the update succeded, return the previous value of `ref`.
-                // Otherwise, return the current value of `ref`.
-                return expected_;
-            };
-
-            return alpaka::detail::callAtomicOp<THierarchy>(addr, cas);
-        }
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/atomic/AtomicHierarchy.hpp b/include/alpaka/atomic/AtomicHierarchy.hpp
deleted file mode 100644
index d9c3c3a..0000000
--- a/include/alpaka/atomic/AtomicHierarchy.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2020 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/atomic/Traits.hpp"
-#include "alpaka/meta/InheritFromList.hpp"
-#include "alpaka/meta/Unique.hpp"
-
-#include <tuple>
-
-namespace alpaka
-{
-    //! build a single class to inherit from different atomic implementations
-    //
-    //  This implementation inherit from all three hierarchies.
-    //  The multiple usage of the same type for different levels is allowed.
-    //  The class provide the feature that each atomic operation can be focused
-    //  to a hierarchy level in alpaka. A operation to a hierarchy is independent
-    //  to the memory hierarchy.
-    //
-    //  \tparam TGridAtomic atomic implementation for atomic operations between grids within a device
-    //  \tparam TBlockAtomic atomic implementation for atomic operations between blocks within a grid
-    //  \tparam TThreadAtomic atomic implementation for atomic operations between threads within a block
-    template<typename TGridAtomic, typename TBlockAtomic, typename TThreadAtomic>
-    using AtomicHierarchy = alpaka::meta::InheritFromList<alpaka::meta::Unique<std::tuple<
-        TGridAtomic,
-        TBlockAtomic,
-        TThreadAtomic,
-        concepts::Implements<ConceptAtomicGrids, TGridAtomic>,
-        concepts::Implements<ConceptAtomicBlocks, TBlockAtomic>,
-        concepts::Implements<ConceptAtomicThreads, TThreadAtomic>>>>;
-} // namespace alpaka
diff --git a/include/alpaka/atomic/AtomicNoOp.hpp b/include/alpaka/atomic/AtomicNoOp.hpp
deleted file mode 100644
index d51a2c3..0000000
--- a/include/alpaka/atomic/AtomicNoOp.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/atomic/Traits.hpp"
-
-namespace alpaka
-{
-    //! The NoOp atomic ops.
-    class AtomicNoOp
-    {
-    };
-
-    namespace trait
-    {
-        //! The NoOp atomic operation.
-        template<typename TOp, typename T, typename THierarchy>
-        struct AtomicOp<TOp, AtomicNoOp, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicNoOp const& /* atomic */, T* const addr, T const& value) -> T
-            {
-                return TOp()(addr, value);
-            }
-
-            ALPAKA_FN_HOST static auto atomicOp(
-                AtomicNoOp const& /* atomic */,
-                T* const addr,
-                T const& compare,
-                T const& value) -> T
-            {
-                return TOp()(addr, compare, value);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/atomic/AtomicOmpBuiltIn.hpp b/include/alpaka/atomic/AtomicOmpBuiltIn.hpp
deleted file mode 100644
index e1f0ba0..0000000
--- a/include/alpaka/atomic/AtomicOmpBuiltIn.hpp
+++ /dev/null
@@ -1,320 +0,0 @@
-/* Copyright 2022 René Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/atomic/Op.hpp"
-#include "alpaka/atomic/Traits.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-
-#ifdef _OPENMP
-
-namespace alpaka
-{
-    //! The OpenMP accelerators atomic ops.
-    //
-    //  Atomics can be used in the blocks and threads hierarchy levels.
-    //  Atomics are not guaranteed to be safe between devices or grids.
-    class AtomicOmpBuiltIn
-    {
-    };
-
-    namespace trait
-    {
-// check for OpenMP 3.1+
-// "omp atomic capture" is not supported before OpenMP 3.1
-#    if _OPENMP >= 201107
-
-        //! The OpenMP accelerators atomic operation: ADD
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicAdd, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
-            {
-                T old;
-                auto& ref(*addr);
-// atomically update ref, but capture the original value in old
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic push
-#            pragma GCC diagnostic ignored "-Wconversion"
-#        endif
-#        pragma omp atomic capture
-                {
-                    old = ref;
-                    ref += value;
-                }
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic pop
-#        endif
-                return old;
-            }
-        };
-
-        //! The OpenMP accelerators atomic operation: SUB
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicSub, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
-            {
-                T old;
-                auto& ref(*addr);
-// atomically update ref, but capture the original value in old
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic push
-#            pragma GCC diagnostic ignored "-Wconversion"
-#        endif
-#        pragma omp atomic capture
-                {
-                    old = ref;
-                    ref -= value;
-                }
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic pop
-#        endif
-                return old;
-            }
-        };
-
-        //! The OpenMP accelerators atomic operation: EXCH
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicExch, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
-            {
-                T old;
-                auto& ref(*addr);
-// atomically update ref, but capture the original value in old
-#        pragma omp atomic capture
-                {
-                    old = ref;
-                    ref = value;
-                }
-                return old;
-            }
-        };
-
-        //! The OpenMP accelerators atomic operation: AND
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicAnd, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
-            {
-                T old;
-                auto& ref(*addr);
-// atomically update ref, but capture the original value in old
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic push
-#            pragma GCC diagnostic ignored "-Wconversion"
-#        endif
-#        pragma omp atomic capture
-                {
-                    old = ref;
-                    ref &= value;
-                }
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic pop
-#        endif
-                return old;
-            }
-        };
-
-        //! The OpenMP accelerators atomic operation: OR
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicOr, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
-            {
-                T old;
-                auto& ref(*addr);
-// atomically update ref, but capture the original value in old
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic push
-#            pragma GCC diagnostic ignored "-Wconversion"
-#        endif
-#        pragma omp atomic capture
-                {
-                    old = ref;
-                    ref |= value;
-                }
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic pop
-#        endif
-                return old;
-            }
-        };
-
-        //! The OpenMP accelerators atomic operation: XOR
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicXor, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
-            {
-                T old;
-                auto& ref(*addr);
-// atomically update ref, but capture the original value in old
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic push
-#            pragma GCC diagnostic ignored "-Wconversion"
-#        endif
-#        pragma omp atomic capture
-                {
-                    old = ref;
-                    ref ^= value;
-                }
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic pop
-#        endif
-                return old;
-            }
-        };
-
-#    endif // _OPENMP >= 201107
-
-// check for OpenMP 5.1+
-// "omp atomic compare" was introduced with OpenMP 5.1
-#    if _OPENMP >= 202011
-
-        //! The OpenMP accelerators atomic operation: Min
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicMin, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T value) -> T
-            {
-                T old;
-                auto& ref(*addr);
-// atomically update ref, but capture the original value in old
-#        pragma omp atomic capture compare
-                {
-                    old = ref;
-                    // Do not remove the curly brackets of the if body else
-                    // icpx 2024.0 is not able to compile the atomics.
-                    if(value < ref)
-                    {
-                        ref = value;
-                    }
-                }
-                return old;
-            }
-        };
-
-        //! The OpenMP accelerators atomic operation: Max
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicMax, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T value) -> T
-            {
-                T old;
-                auto& ref(*addr);
-// atomically update ref, but capture the original value in old
-#        pragma omp atomic capture compare
-                {
-                    old = ref;
-                    // Do not remove the curly brackets of the if body else
-                    // icpx 2024.0 is not able to compile the atomics.
-                    if(value > ref)
-                    {
-                        ref = value;
-                    }
-                }
-                return old;
-            }
-        };
-
-        //! The OpenMP accelerators atomic operation: Inc
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicInc, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
-            {
-                // TODO(bgruber): atomic increment with wrap around is not implementable in OpenMP 5.1
-                T old;
-#        pragma omp critical(AlpakaOmpAtomicOp)
-                {
-                    old = AtomicInc{}(addr, value);
-                }
-                return old;
-            }
-        };
-
-        //! The OpenMP accelerators atomic operation: Dec
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicDec, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
-            {
-                // TODO(bgruber): atomic decrement with wrap around is not implementable in OpenMP 5.1
-                T old;
-#        pragma omp critical(AlpakaOmpAtomicOp)
-                {
-                    old = AtomicDec{}(addr, value);
-                }
-                return old;
-            }
-        };
-
-        //! The OpenMP accelerators atomic operation: Cas
-        template<typename T, typename THierarchy>
-        struct AtomicOp<AtomicCas, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T compare, T value) -> T
-            {
-                T old;
-                auto& ref(*addr);
-// atomically update ref, but capture the original value in old
-#        pragma omp atomic capture compare
-                {
-                    old = ref;
-                    // Do not remove the curly brackets of the if body else
-                    // icpx 2024.0 is not able to compile the atomics.
-                    if(ref == compare)
-                    {
-                        ref = value;
-                    }
-                }
-                return old;
-            }
-        };
-
-#    else
-        //! The OpenMP accelerators atomic operation
-        //
-        // generic implementations for operations where native atomics are not available
-        template<typename TOp, typename T, typename THierarchy>
-        struct AtomicOp<TOp, AtomicOmpBuiltIn, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T
-            {
-                T old;
-                // \TODO: Currently not only the access to the same memory location is protected by a mutex but all
-                // atomic ops on all threads.
-#        pragma omp critical(AlpakaOmpAtomicOp)
-                {
-                    old = TOp()(addr, value);
-                }
-                return old;
-            }
-
-            ALPAKA_FN_HOST static auto atomicOp(
-                AtomicOmpBuiltIn const&,
-                T* const addr,
-                T const& compare,
-                T const& value) -> T
-            {
-                T old;
-                // \TODO: Currently not only the access to the same memory location is protected by a mutex but all
-                // atomic ops on all threads.
-#        pragma omp critical(AlpakaOmpAtomicOp2)
-                {
-                    old = TOp()(addr, compare, value);
-                }
-                return old;
-            }
-        };
-
-#    endif // _OPENMP >= 202011
-
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/atomic/AtomicStdLibLock.hpp b/include/alpaka/atomic/AtomicStdLibLock.hpp
deleted file mode 100644
index 16a659f..0000000
--- a/include/alpaka/atomic/AtomicStdLibLock.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/atomic/Traits.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-
-#include <array>
-#include <mutex>
-
-#ifdef ALPAKA_DISABLE_ATOMIC_ATOMICREF
-
-namespace alpaka
-{
-    //! The CPU threads accelerator atomic ops.
-    //
-    //  Atomics can be used in the grids, blocks and threads hierarchy levels.
-    //  Atomics are not guaranteed to be save between devices.
-    //
-    // \tparam THashTableSize size of the hash table to allow concurrency between
-    //                        atomics to different addresses
-    template<size_t THashTableSize>
-    class AtomicStdLibLock
-    {
-    public:
-        template<typename TAtomic, typename TOp, typename T, typename THierarchy, typename TSfinae>
-        friend struct trait::AtomicOp;
-
-        static constexpr auto nextPowerOf2(size_t const value, size_t const bit = 0u) -> size_t
-        {
-            return value <= (static_cast<size_t>(1u) << bit) ? (static_cast<size_t>(1u) << bit)
-                                                             : nextPowerOf2(value, bit + 1u);
-        }
-
-        //! get a hash value of the pointer
-        //
-        // This is no perfect hash, there will be collisions if the size of pointer type
-        // is not a power of two.
-        template<typename TPtr>
-        static auto hash(TPtr const* const ptr) -> size_t
-        {
-            auto const ptrAddr = reinterpret_cast<size_t>(ptr);
-            // using power of two for the next division will increase the performance
-            constexpr size_t typeSizePowerOf2 = nextPowerOf2(sizeof(TPtr));
-            // division removes the stride between indices
-            return (ptrAddr / typeSizePowerOf2);
-        }
-
-        template<typename TPtr>
-        auto getMutex(TPtr const* const ptr) const -> std::mutex&
-        {
-            //! get the size of the hash table
-            //
-            // The size is at least 1 or THashTableSize rounded up to the next power of 2
-            constexpr size_t hashTableSize = THashTableSize == 0u ? 1u : nextPowerOf2(THashTableSize);
-
-            size_t const hashedAddr = hash(ptr) & (hashTableSize - 1u);
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wexit-time-destructors"
-#    endif
-            static std::array<
-                std::mutex,
-                hashTableSize>
-                m_mtxAtomic; //!< The mutex protecting access for an atomic operation.
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
-            return m_mtxAtomic[hashedAddr];
-        }
-    };
-
-    namespace trait
-    {
-        //! The CPU threads accelerator atomic operation.
-        template<typename TOp, typename T, typename THierarchy, size_t THashTableSize>
-        struct AtomicOp<TOp, AtomicStdLibLock<THashTableSize>, T, THierarchy>
-        {
-            ALPAKA_FN_HOST static auto atomicOp(
-                AtomicStdLibLock<THashTableSize> const& atomic,
-                T* const addr,
-                T const& value) -> T
-            {
-                std::lock_guard<std::mutex> lock(atomic.getMutex(addr));
-                return TOp()(addr, value);
-            }
-
-            ALPAKA_FN_HOST static auto atomicOp(
-                AtomicStdLibLock<THashTableSize> const& atomic,
-                T* const addr,
-                T const& compare,
-                T const& value) -> T
-            {
-                std::lock_guard<std::mutex> lock(atomic.getMutex(addr));
-                return TOp()(addr, compare, value);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/atomic/AtomicUniformCudaHip.hpp b/include/alpaka/atomic/AtomicUniformCudaHip.hpp
deleted file mode 100644
index 330e3a4..0000000
--- a/include/alpaka/atomic/AtomicUniformCudaHip.hpp
+++ /dev/null
@@ -1,512 +0,0 @@
-/* Copyright 2022 René Widera
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/atomic/Op.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/core/Utility.hpp"
-
-#include <type_traits>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    //! The GPU CUDA/HIP accelerator atomic ops.
-    //
-    //  Atomics can be used in the hierarchy level grids, blocks and threads.
-    //  Atomics are not guaranteed to be safe between devices.
-    class AtomicUniformCudaHipBuiltIn
-    {
-    };
-} // namespace alpaka
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-//! clang is providing a builtin for different atomic functions even if these is not supported for architectures < 6.0
-#        define CLANG_CUDA_PTX_WORKAROUND                                                                             \
-            (BOOST_COMP_CLANG && BOOST_LANG_CUDA && BOOST_ARCH_PTX < BOOST_VERSION_NUMBER(6, 0, 0))
-
-//! These types must be in the global namespace for checking existence of respective functions in global namespace via
-//! SFINAE, so we use inline namespace.
-inline namespace alpakaGlobal
-{
-    //! Provide an interface to builtin atomic functions.
-    //
-    // To check for the existence of builtin functions located in the global namespace :: directly.
-    // This would not be possible without having these types in global namespace.
-    // If the functor is inheriting from std::false_type an signature is explicitly not available. This can be used to
-    // explicitly disable builtin function in case the builtin is broken.
-    // If the functor is inheriting from std::true_type a specialization must implement one of the following
-    // interfaces.
-    // \code{.cpp}
-    //    // interface for all atomics except atomicCas
-    //    __device__ static T atomic( T* add, T value);
-    //    // interface for atomicCas only
-    //    __device__ static T atomic( T* add, T compare, T value);
-    // \endcode
-    template<typename TOp, typename T, typename THierarchy, typename TSfinae = void>
-    struct AlpakaBuiltInAtomic : std::false_type
-    {
-    };
-
-    // Cas.
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicCas,
-        T,
-        THierarchy,
-        typename std::void_t<
-            decltype(atomicCAS(alpaka::core::declval<T*>(), alpaka::core::declval<T>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T compare, T value)
-        {
-            return atomicCAS(add, compare, value);
-        }
-    };
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicCas,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicCAS_block(
-            alpaka::core::declval<T*>(),
-            alpaka::core::declval<T>(),
-            alpaka::core::declval<T>()))>> : std::true_type
-    {
-        static __device__ T atomic(T* add, T compare, T value)
-        {
-            return atomicCAS_block(add, compare, value);
-        }
-    };
-#        endif
-
-
-    // Add.
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicAdd,
-        T,
-        THierarchy,
-        typename std::void_t<decltype(atomicAdd(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicAdd(add, value);
-        }
-    };
-
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicAdd,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicAdd_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicAdd_block(add, value);
-        }
-    };
-#        endif
-
-#        if CLANG_CUDA_PTX_WORKAROUND
-    // clang is providing a builtin for atomicAdd even if these is not supported by the current architecture
-    template<typename THierarchy>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicAdd, double, THierarchy> : std::false_type
-    {
-    };
-#        endif
-
-#        if(BOOST_LANG_HIP)
-    // HIP shows bad performance with builtin atomicAdd(float*,float) for the hierarchy threads therefore we do not
-    // call the buildin method and instead use the atomicCAS emulation. For details see:
-    // https://github.com/alpaka-group/alpaka/issues/1657
-    template<>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicAdd, float, alpaka::hierarchy::Threads> : std::false_type
-    {
-    };
-#        endif
-
-    // Sub.
-
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicSub,
-        T,
-        THierarchy,
-        typename std::void_t<decltype(atomicSub(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicSub(add, value);
-        }
-    };
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicSub,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicSub_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicSub_block(add, value);
-        }
-    };
-#        endif
-
-    // Min.
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicMin,
-        T,
-        THierarchy,
-        typename std::void_t<decltype(atomicMin(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicMin(add, value);
-        }
-    };
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicMin,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicMin_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicMin_block(add, value);
-        }
-    };
-#        endif
-
-// disable HIP atomicMin: see https://github.com/ROCm-Developer-Tools/hipamd/pull/40
-#        if(BOOST_LANG_HIP)
-    template<typename THierarchy>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, float, THierarchy> : std::false_type
-    {
-    };
-
-    template<>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, float, alpaka::hierarchy::Threads> : std::false_type
-    {
-    };
-
-    template<typename THierarchy>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, double, THierarchy> : std::false_type
-    {
-    };
-
-    template<>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, double, alpaka::hierarchy::Threads> : std::false_type
-    {
-    };
-
-#            if !__has_builtin(__hip_atomic_compare_exchange_strong)
-    template<typename THierarchy>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, unsigned long long, THierarchy> : std::false_type
-    {
-    };
-
-    template<>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMin, unsigned long long, alpaka::hierarchy::Threads> : std::false_type
-    {
-    };
-#            endif
-#        endif
-
-    // Max.
-
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicMax,
-        T,
-        THierarchy,
-        typename std::void_t<decltype(atomicMax(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicMax(add, value);
-        }
-    };
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicMax,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicMax_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicMax_block(add, value);
-        }
-    };
-#        endif
-
-    // disable HIP atomicMax: see https://github.com/ROCm-Developer-Tools/hipamd/pull/40
-#        if(BOOST_LANG_HIP)
-    template<typename THierarchy>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, float, THierarchy> : std::false_type
-    {
-    };
-
-    template<>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, float, alpaka::hierarchy::Threads> : std::false_type
-    {
-    };
-
-    template<typename THierarchy>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, double, THierarchy> : std::false_type
-    {
-    };
-
-    template<>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, double, alpaka::hierarchy::Threads> : std::false_type
-    {
-    };
-
-#            if !__has_builtin(__hip_atomic_compare_exchange_strong)
-    template<typename THierarchy>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, unsigned long long, THierarchy> : std::false_type
-    {
-    };
-
-    template<>
-    struct AlpakaBuiltInAtomic<alpaka::AtomicMax, unsigned long long, alpaka::hierarchy::Threads> : std::false_type
-    {
-    };
-#            endif
-#        endif
-
-
-    // Exch.
-
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicExch,
-        T,
-        THierarchy,
-        typename std::void_t<decltype(atomicExch(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicExch(add, value);
-        }
-    };
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicExch,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicExch_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicExch_block(add, value);
-        }
-    };
-#        endif
-
-    // Inc.
-
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicInc,
-        T,
-        THierarchy,
-        typename std::void_t<decltype(atomicInc(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicInc(add, value);
-        }
-    };
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicInc,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicInc_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicInc_block(add, value);
-        }
-    };
-#        endif
-
-    // Dec.
-
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicDec,
-        T,
-        THierarchy,
-        typename std::void_t<decltype(atomicDec(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicDec(add, value);
-        }
-    };
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicDec,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicDec_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicDec_block(add, value);
-        }
-    };
-#        endif
-
-    // And.
-
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicAnd,
-        T,
-        THierarchy,
-        typename std::void_t<decltype(atomicAnd(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicAnd(add, value);
-        }
-    };
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicAnd,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicAnd_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicAnd_block(add, value);
-        }
-    };
-#        endif
-
-    // Or.
-
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicOr,
-        T,
-        THierarchy,
-        typename std::void_t<decltype(atomicOr(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicOr(add, value);
-        }
-    };
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicOr,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicOr_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicOr_block(add, value);
-        }
-    };
-#        endif
-
-    // Xor.
-
-    template<typename T, typename THierarchy>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicXor,
-        T,
-        THierarchy,
-        typename std::void_t<decltype(atomicXor(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicXor(add, value);
-        }
-    };
-
-#        if !CLANG_CUDA_PTX_WORKAROUND
-    template<typename T>
-    struct AlpakaBuiltInAtomic<
-        alpaka::AtomicXor,
-        T,
-        alpaka::hierarchy::Threads,
-        typename std::void_t<decltype(atomicXor_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
-        : std::true_type
-    {
-        static __device__ T atomic(T* add, T value)
-        {
-            return atomicXor_block(add, value);
-        }
-    };
-#        endif
-
-} // namespace alpakaGlobal
-
-#        undef CLANG_CUDA_PTX_WORKAROUND
-#    endif
-
-#endif
diff --git a/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp b/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index 86c5120..0000000
--- a/include/alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,321 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/atomic/AtomicUniformCudaHip.hpp"
-#include "alpaka/atomic/Op.hpp"
-#include "alpaka/atomic/Traits.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Decay.hpp"
-#include "alpaka/core/Unreachable.hpp"
-
-#include <limits>
-#include <type_traits>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-namespace alpaka::trait
-{
-    namespace detail
-    {
-        struct EmulationBase
-        {
-            //! reinterprets an address as an 32bit value for atomicCas emulation usage
-            template<typename TAddressType>
-            static __device__ auto reinterpretAddress(TAddressType* address)
-                -> std::enable_if_t<sizeof(TAddressType) == 4u, unsigned int*>
-            {
-                return reinterpret_cast<unsigned int*>(address);
-            }
-
-            //! reinterprets a address as an 64bit value for atomicCas emulation usage
-            template<typename TAddressType>
-            static __device__ auto reinterpretAddress(TAddressType* address)
-                -> std::enable_if_t<sizeof(TAddressType) == 8u, unsigned long long int*>
-            {
-                return reinterpret_cast<unsigned long long int*>(address);
-            }
-
-            //! reinterprets a value to be usable for the atomicCAS emulation
-            template<typename T_Type>
-            static __device__ auto reinterpretValue(T_Type value)
-            {
-                return *reinterpretAddress(&value);
-            }
-        };
-
-        //! Emulate atomic
-        //
-        // The default implementation will emulate all atomic functions with atomicCAS.
-        template<
-            typename TOp,
-            typename TAtomic,
-            typename T,
-            typename THierarchy,
-            typename TSfinae = void,
-            typename TDefer = void>
-        struct EmulateAtomic : private EmulationBase
-        {
-        public:
-            static __device__ auto atomic(
-                alpaka::AtomicUniformCudaHipBuiltIn const& ctx,
-                T* const addr,
-                T const& value) -> T
-            {
-                auto* const addressAsIntegralType = reinterpretAddress(addr);
-                using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;
-
-                // Emulating atomics with atomicCAS is mentioned in the programming guide too.
-                // http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
-#        if BOOST_LANG_HIP
-#            if __has_builtin(__hip_atomic_load)
-                EmulatedType old{__hip_atomic_load(addressAsIntegralType, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)};
-#            else
-                EmulatedType old{__atomic_load_n(addressAsIntegralType, __ATOMIC_RELAXED)};
-#            endif
-#        else
-                EmulatedType old{*addressAsIntegralType};
-#        endif
-                EmulatedType assumed;
-                do
-                {
-                    assumed = old;
-                    T v = *(reinterpret_cast<T*>(&assumed));
-                    TOp{}(&v, value);
-                    using Cas = alpaka::trait::
-                        AtomicOp<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, EmulatedType, THierarchy>;
-                    old = Cas::atomicOp(ctx, addressAsIntegralType, assumed, reinterpretValue(v));
-                    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-                } while(assumed != old);
-                return *(reinterpret_cast<T*>(&old));
-            }
-        };
-
-        //! Emulate AtomicCas with equivalent unisigned integral type
-        template<typename T, typename THierarchy>
-        struct EmulateAtomic<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, T, THierarchy>
-            : private EmulationBase
-        {
-            static __device__ auto atomic(
-                alpaka::AtomicUniformCudaHipBuiltIn const& ctx,
-                T* const addr,
-                T const& compare,
-                T const& value) -> T
-            {
-                auto* const addressAsIntegralType = reinterpretAddress(addr);
-                using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;
-                EmulatedType reinterpretedCompare = reinterpretValue(compare);
-                EmulatedType reinterpretedValue = reinterpretValue(value);
-
-                auto old = alpaka::trait::
-                    AtomicOp<alpaka::AtomicCas, alpaka::AtomicUniformCudaHipBuiltIn, EmulatedType, THierarchy>::
-                        atomicOp(ctx, addressAsIntegralType, reinterpretedCompare, reinterpretedValue);
-
-                return *(reinterpret_cast<T*>(&old));
-            }
-        };
-
-        //! Emulate AtomicSub with atomicAdd
-        template<typename T, typename THierarchy>
-        struct EmulateAtomic<alpaka::AtomicSub, alpaka::AtomicUniformCudaHipBuiltIn, T, THierarchy>
-        {
-            static __device__ auto atomic(
-                alpaka::AtomicUniformCudaHipBuiltIn const& ctx,
-                T* const addr,
-                T const& value) -> T
-            {
-                return alpaka::trait::AtomicOp<alpaka::AtomicAdd, alpaka::AtomicUniformCudaHipBuiltIn, T, THierarchy>::
-                    atomicOp(ctx, addr, -value);
-            }
-        };
-
-        //! AtomicDec can not be implemented for floating point types!
-        template<typename T, typename THierarchy>
-        struct EmulateAtomic<
-            alpaka::AtomicDec,
-            alpaka::AtomicUniformCudaHipBuiltIn,
-            T,
-            THierarchy,
-            std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
-            {
-                static_assert(
-                    !sizeof(T),
-                    "EmulateAtomic<alpaka::AtomicDec> is not supported for floating point data types!");
-                return T{};
-            }
-        };
-
-        //! AtomicInc can not be implemented for floating point types!
-        template<typename T, typename THierarchy>
-        struct EmulateAtomic<
-            alpaka::AtomicInc,
-            alpaka::AtomicUniformCudaHipBuiltIn,
-            T,
-            THierarchy,
-            std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
-            {
-                static_assert(
-                    !sizeof(T),
-                    "EmulateAtomic<alpaka::AtomicInc> is not supported for floating point data types!");
-                return T{};
-            }
-        };
-
-        //! AtomicAnd can not be implemented for floating point types!
-        template<typename T, typename THierarchy>
-        struct EmulateAtomic<
-            alpaka::AtomicAnd,
-            alpaka::AtomicUniformCudaHipBuiltIn,
-            T,
-            THierarchy,
-            std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
-            {
-                static_assert(
-                    !sizeof(T),
-                    "EmulateAtomic<alpaka::AtomicAnd> is not supported for floating point data types!");
-                return T{};
-            }
-        };
-
-        //! AtomicOr can not be implemented for floating point types!
-        template<typename T, typename THierarchy>
-        struct EmulateAtomic<
-            alpaka::AtomicOr,
-            alpaka::AtomicUniformCudaHipBuiltIn,
-            T,
-            THierarchy,
-            std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
-            {
-                static_assert(
-                    !sizeof(T),
-                    "EmulateAtomic<alpaka::AtomicOr> is not supported for floating point data types!");
-                return T{};
-            }
-        };
-
-        //! AtomicXor can not be implemented for floating point types!
-        template<typename T, typename THierarchy>
-        struct EmulateAtomic<
-            alpaka::AtomicXor,
-            alpaka::AtomicUniformCudaHipBuiltIn,
-            T,
-            THierarchy,
-            std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            static __device__ auto atomic(alpaka::AtomicUniformCudaHipBuiltIn const&, T* const, T const&) -> T
-            {
-                static_assert(
-                    !sizeof(T),
-                    "EmulateAtomic<alpaka::AtomicXor> is not supported for floating point data types!");
-                return T{};
-            }
-        };
-
-    } // namespace detail
-
-    //! Generic atomic implementation
-    //
-    // - unsigned long int will be redirected to unsigned long long int or unsigned int implementation depending if
-    //   unsigned long int is a 64 or 32bit data type.
-    // - Atomics which are not available as builtin atomic will be emulated.
-    template<typename TOp, typename T, typename THierarchy>
-    struct AtomicOp<TOp, AtomicUniformCudaHipBuiltIn, T, THierarchy>
-    {
-        static __device__ auto atomicOp(
-            AtomicUniformCudaHipBuiltIn const& ctx,
-            [[maybe_unused]] T* const addr,
-            [[maybe_unused]] T const& value) -> T
-        {
-            static_assert(
-                sizeof(T) == 4u || sizeof(T) == 8u,
-                "atomicOp<TOp, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, value) is not supported! Only 64 and "
-                "32bit atomics are supported.");
-
-            if constexpr(::AlpakaBuiltInAtomic<TOp, T, THierarchy>::value)
-                return ::AlpakaBuiltInAtomic<TOp, T, THierarchy>::atomic(addr, value);
-
-            else if constexpr(std::is_same_v<unsigned long int, T>)
-            {
-                if constexpr(sizeof(T) == 4u && ::AlpakaBuiltInAtomic<TOp, unsigned int, THierarchy>::value)
-                    return ::AlpakaBuiltInAtomic<TOp, unsigned int, THierarchy>::atomic(
-                        reinterpret_cast<unsigned int*>(addr),
-                        static_cast<unsigned int>(value));
-                else if constexpr(
-                    sizeof(T) == 8u && ::AlpakaBuiltInAtomic<TOp, unsigned long long int, THierarchy>::value) // LP64
-                {
-                    return ::AlpakaBuiltInAtomic<TOp, unsigned long long int, THierarchy>::atomic(
-                        reinterpret_cast<unsigned long long int*>(addr),
-                        static_cast<unsigned long long int>(value));
-                }
-            }
-
-            return detail::EmulateAtomic<TOp, AtomicUniformCudaHipBuiltIn, T, THierarchy>::atomic(ctx, addr, value);
-        }
-    };
-
-    template<typename T, typename THierarchy>
-    struct AtomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, T, THierarchy>
-    {
-        static __device__ auto atomicOp(
-            [[maybe_unused]] AtomicUniformCudaHipBuiltIn const& ctx,
-            [[maybe_unused]] T* const addr,
-            [[maybe_unused]] T const& compare,
-            [[maybe_unused]] T const& value) -> T
-        {
-            static_assert(
-                sizeof(T) == 4u || sizeof(T) == 8u,
-                "atomicOp<AtomicCas, AtomicUniformCudaHipBuiltIn, T>(atomic, addr, compare, value) is not "
-                "supported! Only 64 and "
-                "32bit atomics are supported.");
-
-            if constexpr(::AlpakaBuiltInAtomic<AtomicCas, T, THierarchy>::value)
-                return ::AlpakaBuiltInAtomic<AtomicCas, T, THierarchy>::atomic(addr, compare, value);
-
-            else if constexpr(std::is_same_v<unsigned long int, T>)
-            {
-                if constexpr(sizeof(T) == 4u && ::AlpakaBuiltInAtomic<AtomicCas, unsigned int, THierarchy>::value)
-                    return ::AlpakaBuiltInAtomic<AtomicCas, unsigned int, THierarchy>::atomic(
-                        reinterpret_cast<unsigned int*>(addr),
-                        static_cast<unsigned int>(compare),
-                        static_cast<unsigned int>(value));
-                else if constexpr(
-                    sizeof(T) == 8u
-                    && ::AlpakaBuiltInAtomic<AtomicCas, unsigned long long int, THierarchy>::value) // LP64
-                {
-                    return ::AlpakaBuiltInAtomic<AtomicCas, unsigned long long int, THierarchy>::atomic(
-                        reinterpret_cast<unsigned long long int*>(addr),
-                        static_cast<unsigned long long int>(compare),
-                        static_cast<unsigned long long int>(value));
-                }
-            }
-
-            return detail::EmulateAtomic<AtomicCas, AtomicUniformCudaHipBuiltIn, T, THierarchy>::atomic(
-                ctx,
-                addr,
-                compare,
-                value);
-        }
-    };
-} // namespace alpaka::trait
-#    endif
-#endif
diff --git a/include/alpaka/atomic/Op.hpp b/include/alpaka/atomic/Op.hpp
deleted file mode 100644
index 2912556..0000000
--- a/include/alpaka/atomic/Op.hpp
+++ /dev/null
@@ -1,249 +0,0 @@
-/* Copyright 2020 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Common.hpp"
-
-#include <algorithm>
-#include <type_traits>
-
-namespace alpaka
-{
-    //! The addition function object.
-    struct AtomicAdd
-    {
-        //! \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wconversion"
-#endif
-            ref += value;
-            return old;
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-        }
-    };
-
-    //! The subtraction function object.
-    struct AtomicSub
-    {
-        //! \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wconversion"
-#endif
-            ref -= value;
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-            return old;
-        }
-    };
-
-    //! The minimum function object.
-    struct AtomicMin
-    {
-        //! \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-            ref = std::min(ref, value);
-            return old;
-        }
-    };
-
-    //! The maximum function object.
-    struct AtomicMax
-    {
-        //! \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-            ref = std::max(ref, value);
-            return old;
-        }
-    };
-
-    //! The exchange function object.
-    struct AtomicExch
-    {
-        //! \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-            ref = value;
-            return old;
-        }
-    };
-
-    //! The increment function object.
-    struct AtomicInc
-    {
-        //! Increments up to value, then reset to 0.
-        //!
-        //! \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-            ref = ((old >= value) ? static_cast<T>(0) : static_cast<T>(old + static_cast<T>(1)));
-            return old;
-        }
-    };
-
-    //! The decrement function object.
-    struct AtomicDec
-    {
-        //! Decrement down to 0, then reset to value.
-        //!
-        //! \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-            ref = (((old == static_cast<T>(0)) || (old > value)) ? value : static_cast<T>(old - static_cast<T>(1)));
-            return old;
-        }
-    };
-
-    //! The and function object.
-    struct AtomicAnd
-    {
-        //! \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-            ref &= value;
-            return old;
-        }
-    };
-
-    //! The or function object.
-    struct AtomicOr
-    {
-        //! \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-            ref |= value;
-            return old;
-        }
-    };
-
-    //! The exclusive or function object.
-    struct AtomicXor
-    {
-        //! \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-            ref ^= value;
-            return old;
-        }
-    };
-
-    //! The compare and swap function object.
-    struct AtomicCas
-    {
-        //! AtomicCas for non floating point values
-        // \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T, std::enable_if_t<!std::is_floating_point_v<T>, bool> = true>
-        ALPAKA_FN_HOST_ACC auto operator()(T* addr, T const& compare, T const& value) const -> T
-        {
-            auto const old = *addr;
-            auto& ref = *addr;
-
-// gcc-7.4.0 assumes for an optimization that a signed overflow does not occur here.
-// That's fine, so ignore that warning.
-#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wstrict-overflow"
-#endif
-            // check if values are bit-wise equal
-            ref = ((old == compare) ? value : old);
-#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
-#    pragma GCC diagnostic pop
-#endif
-            return old;
-        }
-
-        //! AtomicCas for floating point values
-        // \return The old value of addr.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T, std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
-        ALPAKA_FN_HOST_ACC auto operator()(T* addr, T const& compare, T const& value) const -> T
-        {
-            static_assert(sizeof(T) == 4u || sizeof(T) == 8u, "AtomicCas is supporting only 32bit and 64bit values!");
-            // Type to reinterpret too to perform the bit comparison
-            using BitType = std::conditional_t<sizeof(T) == 4u, unsigned int, unsigned long long>;
-
-            // type used to have a safe way to reinterprete the data into another type
-            // std::variant can not be used because clang8 has issues to compile std::variant
-            struct BitUnion
-            {
-                union
-                {
-                    T value;
-                    BitType r;
-                };
-            };
-
-            auto const old = *addr;
-            auto& ref = *addr;
-
-// gcc-7.4.0 assumes for an optimization that a signed overflow does not occur here.
-// That's fine, so ignore that warning.
-#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wstrict-overflow"
-#endif
-            BitUnion o{old};
-            BitUnion c{compare};
-
-            ref = ((o.r == c.r) ? value : old);
-#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 4, 0))
-#    pragma GCC diagnostic pop
-#endif
-            return old;
-        }
-    };
-} // namespace alpaka
diff --git a/include/alpaka/atomic/Traits.hpp b/include/alpaka/atomic/Traits.hpp
deleted file mode 100644
index 160da8c..0000000
--- a/include/alpaka/atomic/Traits.hpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/atomic/Op.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-    struct ConceptAtomicGrids
-    {
-    };
-
-    struct ConceptAtomicBlocks
-    {
-    };
-
-    struct ConceptAtomicThreads
-    {
-    };
-
-    namespace detail
-    {
-        template<typename THierarchy>
-        struct AtomicHierarchyConceptType;
-
-        template<>
-        struct AtomicHierarchyConceptType<hierarchy::Threads>
-        {
-            using type = ConceptAtomicThreads;
-        };
-
-        template<>
-        struct AtomicHierarchyConceptType<hierarchy::Blocks>
-        {
-            using type = ConceptAtomicBlocks;
-        };
-
-        template<>
-        struct AtomicHierarchyConceptType<hierarchy::Grids>
-        {
-            using type = ConceptAtomicGrids;
-        };
-    } // namespace detail
-
-    template<typename THierarchy>
-    using AtomicHierarchyConcept = typename detail::AtomicHierarchyConceptType<THierarchy>::type;
-
-    //! The atomic operation trait.
-    namespace trait
-    {
-        //! The atomic operation trait.
-        template<typename TOp, typename TAtomic, typename T, typename THierarchy, typename TSfinae = void>
-        struct AtomicOp;
-    } // namespace trait
-
-    //! Executes the given operation atomically.
-    //!
-    //! \tparam TOp The operation type.
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TOp, typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicOp(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& = THierarchy()) -> T
-    {
-        using ImplementationBase = typename concepts::ImplementationBase<AtomicHierarchyConcept<THierarchy>, TAtomic>;
-        return trait::AtomicOp<TOp, ImplementationBase, T, THierarchy>::atomicOp(atomic, addr, value);
-    }
-
-    //! Executes the given operation atomically.
-    //!
-    //! \tparam TOp The operation type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \tparam T The value type.
-    //! \param atomic The atomic implementation.
-    //! \param addr The value to change atomically.
-    //! \param compare The comparison value used in the atomic operation.
-    //! \param value The value used in the atomic operation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TOp, typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicOp(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& compare,
-        T const& value,
-        THierarchy const& = THierarchy()) -> T
-    {
-        using ImplementationBase = typename concepts::ImplementationBase<AtomicHierarchyConcept<THierarchy>, TAtomic>;
-        return trait::AtomicOp<TOp, ImplementationBase, T, THierarchy>::atomicOp(atomic, addr, compare, value);
-    }
-
-    //! Executes an atomic add operation.
-    //!
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicAdd(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicAdd>(atomic, addr, value, hier);
-    }
-
-    //! Executes an atomic sub operation.
-    //!
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicSub(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicSub>(atomic, addr, value, hier);
-    }
-
-    //! Executes an atomic min operation.
-    //!
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicMin(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicMin>(atomic, addr, value, hier);
-    }
-
-    //! Executes an atomic max operation.
-    //!
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicMax(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicMax>(atomic, addr, value, hier);
-    }
-
-    //! Executes an atomic exchange operation.
-    //!
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicExch(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicExch>(atomic, addr, value, hier);
-    }
-
-    //! Executes an atomic increment operation.
-    //!
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicInc(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicInc>(atomic, addr, value, hier);
-    }
-
-    //! Executes an atomic decrement operation.
-    //!
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicDec(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicDec>(atomic, addr, value, hier);
-    }
-
-    //! Executes an atomic and operation.
-    //!
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicAnd(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicAnd>(atomic, addr, value, hier);
-    }
-
-    //! Executes an atomic or operation.
-    //!
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicOr(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicOr>(atomic, addr, value, hier);
-    }
-
-    //! Executes an atomic xor operation.
-    //!
-    //! \tparam T The value type.
-    //! \tparam TAtomic The atomic implementation type.
-    //! \param addr The value to change atomically.
-    //! \param value The value used in the atomic operation.
-    //! \param atomic The atomic implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicXor(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicXor>(atomic, addr, value, hier);
-    }
-
-    //! Executes an atomic compare-and-swap operation.
-    //!
-    //! \tparam TAtomic The atomic implementation type.
-    //! \tparam T The value type.
-    //! \param atomic The atomic implementation.
-    //! \param addr The value to change atomically.
-    //! \param compare The comparison value used in the atomic operation.
-    //! \param value The value used in the atomic operation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAtomic, typename T, typename THierarchy = hierarchy::Grids>
-    ALPAKA_FN_HOST_ACC auto atomicCas(
-        TAtomic const& atomic,
-        T* const addr,
-        T const& compare,
-        T const& value,
-        THierarchy const& hier = THierarchy()) -> T
-    {
-        return atomicOp<AtomicCas>(atomic, addr, compare, value, hier);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp b/include/alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp
deleted file mode 100644
index 88e4d4b..0000000
--- a/include/alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp
+++ /dev/null
@@ -1,15 +0,0 @@
-/* Copyright 2022 Jeffrey Kelling
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <cstdint>
-
-namespace alpaka
-{
-#ifndef ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
-#    define ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB 47u
-#endif
-    constexpr std::uint32_t BlockSharedDynMemberAllocKiB = ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB;
-} // namespace alpaka
diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
deleted file mode 100644
index 0c09cf1..0000000
--- a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/shared/dyn/Traits.hpp"
-
-#include <cstddef>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    //! The SYCL block shared memory allocator.
-    class BlockSharedMemDynGenericSycl
-        : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynGenericSycl>
-    {
-    public:
-        using BlockSharedMemDynBase = BlockSharedMemDynGenericSycl;
-
-        BlockSharedMemDynGenericSycl(sycl::local_accessor<std::byte> accessor) : m_accessor{accessor}
-        {
-        }
-
-        sycl::local_accessor<std::byte> m_accessor;
-    };
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    template<typename T>
-    struct GetDynSharedMem<T, BlockSharedMemDynGenericSycl>
-    {
-        static auto getMem(BlockSharedMemDynGenericSycl const& shared) -> T*
-        {
-            return reinterpret_cast<T*>(shared.m_accessor.get_multi_ptr<sycl::access::decorated::no>().get());
-        }
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp
deleted file mode 100644
index c6a3239..0000000
--- a/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright 2023 Jeffrey Kelling, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp"
-#include "alpaka/block/shared/dyn/Traits.hpp"
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Vectorize.hpp"
-
-#include <array>
-#include <cstdint>
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace detail
-    {
-        //! "namespace" for static constexpr members that should be in BlockSharedMemDynMember
-        //! but cannot be because having a static const member breaks GCC 10
-        //! OpenMP target: type not mappable.
-        template<std::size_t TStaticAllocKiB>
-        struct BlockSharedMemDynMemberStatic
-        {
-            //! Storage size in bytes
-            static constexpr std::uint32_t staticAllocBytes = static_cast<std::uint32_t>(TStaticAllocKiB << 10u);
-        };
-    } // namespace detail
-
-#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
-#    pragma warning(push)
-#    pragma warning(disable : 4324) // warning C4324: structure was padded due to alignment specifier
-#endif
-    //! Dynamic block shared memory provider using fixed-size
-    //! member array to allocate memory on the stack or in shared
-    //! memory.
-    template<std::size_t TStaticAllocKiB = BlockSharedDynMemberAllocKiB>
-    class alignas(core::vectorization::defaultAlignment) BlockSharedMemDynMember
-        : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynMember<TStaticAllocKiB>>
-    {
-    public:
-        BlockSharedMemDynMember(std::size_t sizeBytes) : m_dynPitch(getPitch(sizeBytes))
-        {
-            ALPAKA_ASSERT_ACC(static_cast<std::uint32_t>(sizeBytes) <= staticAllocBytes());
-        }
-
-        auto dynMemBegin() const -> uint8_t*
-        {
-            return std::data(m_mem);
-        }
-
-        /*! \return the pointer to the begin of data after the portion allocated as dynamical shared memory.
-         */
-        auto staticMemBegin() const -> uint8_t*
-        {
-            return std::data(m_mem) + m_dynPitch;
-        }
-
-        /*! \return the remaining capacity for static block shared memory,
-                    returns a 32-bit type for register efficiency on GPUs
-            */
-        auto staticMemCapacity() const -> std::uint32_t
-        {
-            return staticAllocBytes() - m_dynPitch;
-        }
-
-        //! \return size of statically allocated memory available for both
-        //!         dynamic and static shared memory. Value is of a 32-bit type
-        //!         for register efficiency on GPUs
-        static constexpr auto staticAllocBytes() -> std::uint32_t
-        {
-            return detail::BlockSharedMemDynMemberStatic<TStaticAllocKiB>::staticAllocBytes;
-        }
-
-    private:
-        static auto getPitch(std::size_t sizeBytes) -> std::uint32_t
-        {
-            constexpr auto alignment = core::vectorization::defaultAlignment;
-            return static_cast<std::uint32_t>((sizeBytes / alignment + (sizeBytes % alignment > 0u)) * alignment);
-        }
-
-        mutable std::array<uint8_t, detail::BlockSharedMemDynMemberStatic<TStaticAllocKiB>::staticAllocBytes> m_mem;
-        std::uint32_t m_dynPitch;
-    };
-#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
-#    pragma warning(pop)
-#endif
-
-    namespace trait
-    {
-        template<typename T, std::size_t TStaticAllocKiB>
-        struct GetDynSharedMem<T, BlockSharedMemDynMember<TStaticAllocKiB>>
-        {
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored                                                                                    \
-        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
-#endif
-            static auto getMem(BlockSharedMemDynMember<TStaticAllocKiB> const& mem) -> T*
-            {
-                static_assert(
-                    core::vectorization::defaultAlignment >= alignof(T),
-                    "Unable to get block shared dynamic memory for types with alignment higher than "
-                    "defaultAlignment!");
-                return reinterpret_cast<T*>(mem.dynMemBegin());
-            }
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index 8364019..0000000
--- a/include/alpaka/block/shared/dyn/BlockSharedMemDynUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, René Widera, Andrea Bocci, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/shared/dyn/Traits.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#include <cstddef>
-#include <type_traits>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    //! The GPU CUDA/HIP block shared memory allocator.
-    class BlockSharedMemDynUniformCudaHipBuiltIn
-        : public concepts::Implements<ConceptBlockSharedDyn, BlockSharedMemDynUniformCudaHipBuiltIn>
-    {
-    };
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-    namespace trait
-    {
-        template<typename T>
-        struct GetDynSharedMem<T, BlockSharedMemDynUniformCudaHipBuiltIn>
-        {
-            __device__ static auto getMem(BlockSharedMemDynUniformCudaHipBuiltIn const&) -> T*
-            {
-                // Because unaligned access to variables is not allowed in device code,
-                // we use the widest possible alignment supported by CUDA types to have
-                // all types aligned correctly.
-                // See:
-                //   - http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared
-                //   - http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vector-types
-                extern __shared__ std::byte shMem alignas(std::max_align_t)[];
-                return reinterpret_cast<T*>(shMem);
-            }
-        };
-    } // namespace trait
-
-#    endif
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/block/shared/dyn/Traits.hpp b/include/alpaka/block/shared/dyn/Traits.hpp
deleted file mode 100644
index 17df89c..0000000
--- a/include/alpaka/block/shared/dyn/Traits.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-    struct ConceptBlockSharedDyn
-    {
-    };
-
-    //! The block shared dynamic memory operation traits.
-    namespace trait
-    {
-        //! The block shared dynamic memory get trait.
-        template<typename T, typename TBlockSharedMemDyn, typename TSfinae = void>
-        struct GetDynSharedMem;
-    } // namespace trait
-
-    //! Get block shared dynamic memory.
-    //!
-    //! The available size of the memory can be defined by specializing the trait
-    //! BlockSharedMemDynSizeBytes for a kernel.
-    //! The Memory can be accessed by all threads within a block.
-    //! Access to the memory is not thread safe.
-    //!
-    //! \tparam T The element type.
-    //! \tparam TBlockSharedMemDyn The block shared dynamic memory implementation type.
-    //! \param blockSharedMemDyn The block shared dynamic memory implementation.
-    //! \return Pointer to pre-allocated contiguous memory.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TBlockSharedMemDyn>
-    ALPAKA_FN_ACC auto getDynSharedMem(TBlockSharedMemDyn const& blockSharedMemDyn) -> T*
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedDyn, TBlockSharedMemDyn>;
-        return trait::GetDynSharedMem<T, ImplementationBase>::getMem(blockSharedMemDyn);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
deleted file mode 100644
index 060414d..0000000
--- a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/shared/st/Traits.hpp"
-#include "alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp"
-
-#include <cstddef>
-#include <cstdint>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    //! The generic SYCL shared memory allocator.
-    class BlockSharedMemStGenericSycl
-        : public alpaka::detail::BlockSharedMemStMemberImpl<>
-        , public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStGenericSycl>
-    {
-    public:
-        BlockSharedMemStGenericSycl(sycl::local_accessor<std::byte> accessor)
-            : BlockSharedMemStMemberImpl(
-                reinterpret_cast<std::uint8_t*>(accessor.get_multi_ptr<sycl::access::decorated::no>().get()),
-                accessor.size())
-            , m_accessor{accessor}
-        {
-        }
-
-    private:
-        sycl::local_accessor<std::byte> m_accessor;
-    };
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    template<typename T, std::size_t TUniqueId>
-    struct DeclareSharedVar<T, TUniqueId, BlockSharedMemStGenericSycl>
-    {
-        static auto declareVar(BlockSharedMemStGenericSycl const& smem) -> T&
-        {
-            auto* data = smem.template getVarPtr<T>(TUniqueId);
-
-            if(!data)
-            {
-                smem.template alloc<T>(TUniqueId);
-                data = smem.template getLatestVarPtr<T>();
-            }
-            ALPAKA_ASSERT(data != nullptr);
-            return *data;
-        }
-    };
-
-    template<>
-    struct FreeSharedVars<BlockSharedMemStGenericSycl>
-    {
-        static auto freeVars(BlockSharedMemStGenericSycl const&) -> void
-        {
-            // shared memory block data will be reused
-        }
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp b/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp
deleted file mode 100644
index 93c65e5..0000000
--- a/include/alpaka/block/shared/st/BlockSharedMemStMember.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2022 Jeffrey Kelling, Rene Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/shared/st/Traits.hpp"
-#include "alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp"
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Vectorize.hpp"
-
-#include <algorithm>
-#include <cstdint>
-#include <type_traits>
-
-namespace alpaka
-{
-    //! Static block shared memory provider using a pointer to
-    //! externally allocated fixed-size memory, likely provided by
-    //! BlockSharedMemDynMember.
-    //! \warning This class is not thread safe!
-    template<std::size_t TDataAlignBytes = core::vectorization::defaultAlignment>
-    class BlockSharedMemStMember
-        : public detail::BlockSharedMemStMemberImpl<TDataAlignBytes>
-        , public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStMember<TDataAlignBytes>>
-    {
-    public:
-        using detail::BlockSharedMemStMemberImpl<TDataAlignBytes>::BlockSharedMemStMemberImpl;
-    };
-
-    namespace trait
-    {
-        template<typename T, std::size_t TDataAlignBytes, std::size_t TuniqueId>
-        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStMember<TDataAlignBytes>>
-        {
-            static auto declareVar(BlockSharedMemStMember<TDataAlignBytes> const& smem) -> T&
-            {
-                auto* data = smem.template getVarPtr<T>(TuniqueId);
-
-                if(!data)
-                {
-                    smem.template alloc<T>(TuniqueId);
-                    data = smem.template getLatestVarPtr<T>();
-                }
-                ALPAKA_ASSERT(data != nullptr);
-                return *data;
-            }
-        };
-
-        template<std::size_t TDataAlignBytes>
-        struct FreeSharedVars<BlockSharedMemStMember<TDataAlignBytes>>
-        {
-            static auto freeVars(BlockSharedMemStMember<TDataAlignBytes> const&) -> void
-            {
-                // shared memory block data will be reused
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp b/include/alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp
deleted file mode 100644
index 65bd304..0000000
--- a/include/alpaka/block/shared/st/BlockSharedMemStMemberMasterSync.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/shared/st/Traits.hpp"
-#include "alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp"
-#include "alpaka/core/AlignedAlloc.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Vectorize.hpp"
-
-#include <functional>
-#include <memory>
-#include <utility>
-#include <vector>
-
-namespace alpaka
-{
-    template<std::size_t TDataAlignBytes = core::vectorization::defaultAlignment>
-    class BlockSharedMemStMemberMasterSync
-        : public detail::BlockSharedMemStMemberImpl<TDataAlignBytes>
-        , public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStMemberMasterSync<TDataAlignBytes>>
-    {
-    public:
-        BlockSharedMemStMemberMasterSync(
-            uint8_t* mem,
-            std::size_t capacity,
-            std::function<void()> fnSync,
-            std::function<bool()> fnIsMasterThread)
-            : detail::BlockSharedMemStMemberImpl<TDataAlignBytes>(mem, capacity)
-            , m_syncFn(std::move(fnSync))
-            , m_isMasterThreadFn(std::move(fnIsMasterThread))
-        {
-        }
-
-        std::function<void()> m_syncFn;
-        std::function<bool()> m_isMasterThreadFn;
-    };
-
-    namespace trait
-    {
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored                                                                                    \
-        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
-#endif
-        template<typename T, std::size_t TDataAlignBytes, std::size_t TuniqueId>
-        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStMemberMasterSync<TDataAlignBytes>>
-        {
-            ALPAKA_FN_HOST static auto declareVar(
-                BlockSharedMemStMemberMasterSync<TDataAlignBytes> const& blockSharedMemSt) -> T&
-            {
-                auto* data = blockSharedMemSt.template getVarPtr<T>(TuniqueId);
-
-                if(!data)
-                {
-                    // Assure that all threads have executed the return of the last allocBlockSharedArr function (if
-                    // there was one before).
-                    blockSharedMemSt.m_syncFn();
-                    if(blockSharedMemSt.m_isMasterThreadFn())
-                    {
-                        blockSharedMemSt.template alloc<T>(TuniqueId);
-                    }
-
-                    blockSharedMemSt.m_syncFn();
-                    // lookup for the data chunk allocated by the master thread
-                    data = blockSharedMemSt.template getLatestVarPtr<T>();
-                }
-                ALPAKA_ASSERT(data != nullptr);
-                return *data;
-            }
-        };
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-        template<std::size_t TDataAlignBytes>
-        struct FreeSharedVars<BlockSharedMemStMemberMasterSync<TDataAlignBytes>>
-        {
-            ALPAKA_FN_HOST static auto freeVars(BlockSharedMemStMemberMasterSync<TDataAlignBytes> const&) -> void
-            {
-                // shared memory block data will be reused
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp b/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index 9f4ed0c..0000000
--- a/include/alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Erik Zenker, René Widera, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/shared/st/Traits.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#include <cstdint>
-#include <type_traits>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    //! The GPU CUDA/HIP block shared memory allocator.
-    class BlockSharedMemStUniformCudaHipBuiltIn
-        : public concepts::Implements<ConceptBlockSharedSt, BlockSharedMemStUniformCudaHipBuiltIn>
-    {
-    };
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-    namespace trait
-    {
-        template<typename T, std::size_t TuniqueId>
-        struct DeclareSharedVar<T, TuniqueId, BlockSharedMemStUniformCudaHipBuiltIn>
-        {
-            __device__ static auto declareVar(BlockSharedMemStUniformCudaHipBuiltIn const&) -> T&
-            {
-                __shared__ uint8_t shMem alignas(alignof(T))[sizeof(T)];
-                return *(reinterpret_cast<T*>(shMem));
-            }
-        };
-
-        template<>
-        struct FreeSharedVars<BlockSharedMemStUniformCudaHipBuiltIn>
-        {
-            __device__ static auto freeVars(BlockSharedMemStUniformCudaHipBuiltIn const&) -> void
-            {
-                // Nothing to do. CUDA/HIP block shared memory is automatically freed when all threads left the block.
-            }
-        };
-    } // namespace trait
-
-#    endif
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/block/shared/st/Traits.hpp b/include/alpaka/block/shared/st/Traits.hpp
deleted file mode 100644
index 3cc7ab2..0000000
--- a/include/alpaka/block/shared/st/Traits.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-    struct ConceptBlockSharedSt
-    {
-    };
-
-    //! The block shared static memory operation trait.
-    namespace trait
-    {
-        //! The block shared static memory variable allocation operation trait.
-        template<typename T, std::size_t TuniqueId, typename TBlockSharedMemSt, typename TSfinae = void>
-        struct DeclareSharedVar;
-        //! The block shared static memory free operation trait.
-        template<typename TBlockSharedMemSt, typename TSfinae = void>
-        struct FreeSharedVars;
-    } // namespace trait
-
-    //! Declare a block shared variable.
-    //!
-    //! The variable is uninitialized and not default constructed!
-    //! The variable can be accessed by all threads within a block.
-    //! Access to the variable is not thread safe.
-    //!
-    //! \tparam T The element type.
-    //! \tparam TuniqueId id those is unique inside a kernel
-    //! \tparam TBlockSharedMemSt The block shared allocator implementation type.
-    //! \param blockSharedMemSt The block shared allocator implementation.
-    //! \return Uninitialized variable stored in shared memory.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, std::size_t TuniqueId, typename TBlockSharedMemSt>
-    ALPAKA_FN_ACC auto declareSharedVar(TBlockSharedMemSt const& blockSharedMemSt) -> T&
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedSt, TBlockSharedMemSt>;
-        return trait::DeclareSharedVar<T, TuniqueId, ImplementationBase>::declareVar(blockSharedMemSt);
-    }
-
-    //! Frees all memory used by block shared variables.
-    //!
-    //! \tparam TBlockSharedMemSt The block shared allocator implementation type.
-    //! \param blockSharedMemSt The block shared allocator implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TBlockSharedMemSt>
-    ALPAKA_FN_ACC auto freeSharedVars(TBlockSharedMemSt& blockSharedMemSt) -> void
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSharedSt, TBlockSharedMemSt>;
-        trait::FreeSharedVars<ImplementationBase>::freeVars(blockSharedMemSt);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp b/include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp
deleted file mode 100644
index eb09790..0000000
--- a/include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright 2022 Jeffrey Kelling, Rene Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/shared/st/Traits.hpp"
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Vectorize.hpp"
-
-#include <algorithm>
-#include <cstdint>
-#include <functional>
-#include <limits>
-#include <type_traits>
-
-namespace alpaka::detail
-{
-    //! Implementation of static block shared memory provider.
-    //!
-    //! externally allocated fixed-size memory, likely provided by BlockSharedMemDynMember.
-    template<std::size_t TMinDataAlignBytes = core::vectorization::defaultAlignment>
-    class BlockSharedMemStMemberImpl
-    {
-        struct MetaData
-        {
-            //! Unique id if the next data chunk.
-            std::uint32_t id = std::numeric_limits<std::uint32_t>::max();
-            //! Offset to the next meta data header, relative to m_mem.
-            //! To access the meta data header the offset must by aligned first.
-            std::uint32_t offset = 0;
-        };
-
-        static constexpr std::uint32_t metaDataSize = sizeof(MetaData);
-
-    public:
-#ifndef NDEBUG
-        BlockSharedMemStMemberImpl(std::uint8_t* mem, std::size_t capacity)
-            : m_mem(mem)
-            , m_capacity(static_cast<std::uint32_t>(capacity))
-        {
-            ALPAKA_ASSERT_ACC((m_mem == nullptr) == (m_capacity == 0u));
-        }
-#else
-        BlockSharedMemStMemberImpl(std::uint8_t* mem, std::size_t) : m_mem(mem)
-        {
-        }
-#endif
-
-        template<typename T>
-        void alloc(std::uint32_t id) const
-        {
-            // Add meta data chunk in front of the user data
-            m_allocdBytes = varChunkEnd<MetaData>(m_allocdBytes);
-            ALPAKA_ASSERT_ACC(m_allocdBytes <= m_capacity);
-            auto* meta = getLatestVarPtr<MetaData>();
-
-            // Allocate variable
-            m_allocdBytes = varChunkEnd<T>(m_allocdBytes);
-            ALPAKA_ASSERT_ACC(m_allocdBytes <= m_capacity);
-
-            // Update meta data with id and offset for the allocated variable.
-            meta->id = id;
-            meta->offset = m_allocdBytes;
-        }
-
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored                                                                                    \
-        "-Wcast-align" // "cast from 'unsigned char*' to 'unsigned int*' increases required alignment of target type"
-#endif
-
-        //! Give the pointer to an exiting variable
-        //!
-        //! @tparam T type of the variable
-        //! @param id unique id of the variable
-        //! @return nullptr if variable with id not exists
-        template<typename T>
-        auto getVarPtr(std::uint32_t id) const -> T*
-        {
-            // Offset in bytes to the next unaligned meta data header behind the variable.
-            std::uint32_t off = 0;
-
-            // Iterate over allocated data only
-            while(off < m_allocdBytes)
-            {
-                // Adjust offset to be aligned
-                std::uint32_t const alignedMetaDataOffset
-                    = varChunkEnd<MetaData>(off) - static_cast<std::uint32_t>(sizeof(MetaData));
-                ALPAKA_ASSERT_ACC(
-                    (alignedMetaDataOffset + static_cast<std::uint32_t>(sizeof(MetaData))) <= m_allocdBytes);
-                auto* metaDataPtr = reinterpret_cast<MetaData*>(m_mem + alignedMetaDataOffset);
-                off = metaDataPtr->offset;
-
-                if(metaDataPtr->id == id)
-                    return reinterpret_cast<T*>(&m_mem[off - sizeof(T)]);
-            }
-
-            // Variable not found.
-            return nullptr;
-        }
-
-        //! Get last allocated variable.
-        template<typename T>
-        auto getLatestVarPtr() const -> T*
-        {
-            return reinterpret_cast<T*>(&m_mem[m_allocdBytes - sizeof(T)]);
-        }
-
-    private:
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-
-        //! Byte offset to the end of the memory chunk
-        //!
-        //! Calculate bytes required to store a type with a aligned starting address in m_mem.
-        //! Start offset to the origin of the user data chunk can be calculated with `result - sizeof(T)`.
-        //! The padding is always before the origin of the user data chunk and can be zero byte.
-        //!
-        //! \tparam T type should fit into the chunk
-        //! \param byteOffset Current byte offset.
-        //! \result Byte offset to the end of the data chunk, relative to m_mem..
-        template<typename T>
-        auto varChunkEnd(std::uint32_t byteOffset) const -> std::uint32_t
-        {
-            auto const ptr = reinterpret_cast<std::size_t>(m_mem + byteOffset);
-            constexpr size_t align = std::max(TMinDataAlignBytes, alignof(T));
-            std::size_t const newPtrAdress = ((ptr + align - 1u) / align) * align + sizeof(T);
-            return static_cast<uint32_t>(newPtrAdress - reinterpret_cast<std::size_t>(m_mem));
-        }
-
-        //! Offset in bytes relative to m_mem to next free data area.
-        //! The last aligned before the free area is always a meta data header.
-        mutable std::uint32_t m_allocdBytes = 0u;
-
-        //! Memory layout
-        //! |Header|Padding|Variable|Padding|Header|....uninitialized Data ....
-        //! Size of padding can be zero if data after padding is already aligned.
-        std::uint8_t* const m_mem;
-#ifndef NDEBUG
-        const std::uint32_t m_capacity;
-#endif
-    };
-} // namespace alpaka::detail
diff --git a/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp b/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
deleted file mode 100644
index c8d9ace..0000000
--- a/include/alpaka/block/sync/BlockSyncBarrierOmp.hpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/sync/Traits.hpp"
-#include "alpaka/core/Common.hpp"
-
-#include <cstdint>
-
-#ifdef _OPENMP
-
-namespace alpaka
-{
-    //! The OpenMP barrier block synchronization.
-    class BlockSyncBarrierOmp : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierOmp>
-    {
-    public:
-        std::uint8_t mutable m_generation = 0u;
-        int mutable m_result[2];
-    };
-
-    namespace trait
-    {
-        template<>
-        struct SyncBlockThreads<BlockSyncBarrierOmp>
-        {
-            ALPAKA_FN_HOST static auto syncBlockThreads(BlockSyncBarrierOmp const& /* blockSync */) -> void
-            {
-// NOTE: This waits for all threads in all blocks.
-// If multiple blocks are executed in parallel this is not optimal.
-#    pragma omp barrier
-            }
-        };
-
-        namespace detail
-        {
-            template<typename TOp>
-            struct AtomicOp;
-
-            template<>
-            struct AtomicOp<BlockCount>
-            {
-                void operator()(int& result, bool value)
-                {
-#    pragma omp atomic
-                    result += static_cast<int>(value);
-                }
-            };
-
-            template<>
-            struct AtomicOp<BlockAnd>
-            {
-                void operator()(int& result, bool value)
-                {
-#    pragma omp atomic
-                    result &= static_cast<int>(value);
-                }
-            };
-
-            template<>
-            struct AtomicOp<BlockOr>
-            {
-                void operator()(int& result, bool value)
-                {
-#    pragma omp atomic
-                    result |= static_cast<int>(value);
-                }
-            };
-        } // namespace detail
-
-        template<typename TOp>
-        struct SyncBlockThreadsPredicate<TOp, BlockSyncBarrierOmp>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-
-            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(BlockSyncBarrierOmp const& blockSync, int predicate)
-                -> int
-            {
-// The first thread initializes the value.
-// There is an implicit barrier at the end of omp single.
-// NOTE: This code is executed only once for all OpenMP threads.
-// If multiple blocks with multiple threads are executed in parallel
-// this reduction is executed only for one block!
-#    pragma omp single
-                {
-                    ++blockSync.m_generation;
-                    blockSync.m_result[blockSync.m_generation % 2u] = TOp::InitialValue;
-                }
-
-                auto const generationMod2(blockSync.m_generation % 2u);
-                int& result(blockSync.m_result[generationMod2]);
-                bool const predicateBool(predicate != 0);
-
-                detail::AtomicOp<TOp>()(result, predicateBool);
-
-// Wait for all threads to write their predicate into the vector.
-// NOTE: This waits for all threads in all blocks.
-// If multiple blocks are executed in parallel this is not optimal.
-#    pragma omp barrier
-
-                return blockSync.m_result[generationMod2];
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/block/sync/BlockSyncBarrierThread.hpp b/include/alpaka/block/sync/BlockSyncBarrierThread.hpp
deleted file mode 100644
index 61cb6b9..0000000
--- a/include/alpaka/block/sync/BlockSyncBarrierThread.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/sync/Traits.hpp"
-#include "alpaka/core/BarrierThread.hpp"
-#include "alpaka/core/Common.hpp"
-
-#include <map>
-#include <mutex>
-#include <thread>
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-
-namespace alpaka
-{
-    //! The thread id map barrier block synchronization.
-    template<typename TIdx>
-    class BlockSyncBarrierThread : public concepts::Implements<ConceptBlockSync, BlockSyncBarrierThread<TIdx>>
-    {
-    public:
-        using Barrier = core::threads::BarrierThread<TIdx>;
-        using BarrierWithPredicate = core::threads::BarrierThreadWithPredicate<TIdx>;
-
-        ALPAKA_FN_HOST BlockSyncBarrierThread(TIdx const& blockThreadCount)
-            : m_barrier(blockThreadCount)
-            , m_barrierWithPredicate(blockThreadCount)
-        {
-        }
-
-        Barrier mutable m_barrier;
-        BarrierWithPredicate mutable m_barrierWithPredicate;
-    };
-
-    namespace trait
-    {
-        template<typename TIdx>
-        struct SyncBlockThreads<BlockSyncBarrierThread<TIdx>>
-        {
-            ALPAKA_FN_HOST static auto syncBlockThreads(BlockSyncBarrierThread<TIdx> const& blockSync) -> void
-            {
-                blockSync.m_barrier.wait();
-            }
-        };
-
-        template<typename TOp, typename TIdx>
-        struct SyncBlockThreadsPredicate<TOp, BlockSyncBarrierThread<TIdx>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(
-                BlockSyncBarrierThread<TIdx> const& blockSync,
-                int predicate) -> int
-            {
-                return blockSync.m_barrierWithPredicate.template wait<TOp>(predicate);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp
deleted file mode 100644
index 67e9749..0000000
--- a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2022 Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/sync/Traits.hpp"
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    //! The SYCL block synchronization.
-    template<typename TDim>
-    class BlockSyncGenericSycl : public concepts::Implements<ConceptBlockSync, BlockSyncGenericSycl<TDim>>
-    {
-    public:
-        using BlockSyncBase = BlockSyncGenericSycl<TDim>;
-
-        BlockSyncGenericSycl(sycl::nd_item<TDim::value> work_item) : my_item{work_item}
-        {
-        }
-
-        sycl::nd_item<TDim::value> my_item;
-    };
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    template<typename TDim>
-    struct SyncBlockThreads<BlockSyncGenericSycl<TDim>>
-    {
-        static auto syncBlockThreads(BlockSyncGenericSycl<TDim> const& blockSync) -> void
-        {
-            blockSync.my_item.barrier();
-        }
-    };
-
-    template<typename TDim>
-    struct SyncBlockThreadsPredicate<BlockCount, BlockSyncGenericSycl<TDim>>
-    {
-        static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const& blockSync, int predicate) -> int
-        {
-            auto const group = blockSync.my_item.get_group();
-            blockSync.my_item.barrier();
-
-            auto const counter = (predicate != 0) ? 1 : 0;
-            return sycl::reduce_over_group(group, counter, sycl::plus<>{});
-        }
-    };
-
-    template<typename TDim>
-    struct SyncBlockThreadsPredicate<BlockAnd, BlockSyncGenericSycl<TDim>>
-    {
-        static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const& blockSync, int predicate) -> int
-        {
-            auto const group = blockSync.my_item.get_group();
-            blockSync.my_item.barrier();
-
-            return static_cast<int>(sycl::all_of_group(group, static_cast<bool>(predicate)));
-        }
-    };
-
-    template<typename TDim>
-    struct SyncBlockThreadsPredicate<BlockOr, BlockSyncGenericSycl<TDim>>
-    {
-        static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const& blockSync, int predicate) -> int
-        {
-            auto const group = blockSync.my_item.get_group();
-            blockSync.my_item.barrier();
-
-            return static_cast<int>(sycl::any_of_group(group, static_cast<bool>(predicate)));
-        }
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/block/sync/BlockSyncNoOp.hpp b/include/alpaka/block/sync/BlockSyncNoOp.hpp
deleted file mode 100644
index 57aae90..0000000
--- a/include/alpaka/block/sync/BlockSyncNoOp.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/sync/Traits.hpp"
-#include "alpaka/core/Common.hpp"
-
-namespace alpaka
-{
-    //! The no op block synchronization.
-    class BlockSyncNoOp : public concepts::Implements<ConceptBlockSync, BlockSyncNoOp>
-    {
-    };
-
-    namespace trait
-    {
-        template<>
-        struct SyncBlockThreads<BlockSyncNoOp>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_ACC static auto syncBlockThreads(BlockSyncNoOp const& /* blockSync */) -> void
-            {
-                // Nothing to do.
-            }
-        };
-
-        template<typename TOp>
-        struct SyncBlockThreadsPredicate<TOp, BlockSyncNoOp>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_ACC static auto syncBlockThreadsPredicate(BlockSyncNoOp const& /* blockSync */, int predicate)
-                -> int
-            {
-                return predicate;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp b/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index ddc369d..0000000
--- a/include/alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/block/sync/Traits.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    //! The GPU CUDA/HIP block synchronization.
-    class BlockSyncUniformCudaHipBuiltIn
-        : public concepts::Implements<ConceptBlockSync, BlockSyncUniformCudaHipBuiltIn>
-    {
-    };
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-    namespace trait
-    {
-        template<>
-        struct SyncBlockThreads<BlockSyncUniformCudaHipBuiltIn>
-        {
-            __device__ static auto syncBlockThreads(BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/) -> void
-            {
-                __syncthreads();
-            }
-        };
-
-        template<>
-        struct SyncBlockThreadsPredicate<BlockCount, BlockSyncUniformCudaHipBuiltIn>
-        {
-            __device__ static auto syncBlockThreadsPredicate(
-                BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
-                int predicate) -> int
-            {
-#        if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
-                // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
-                __shared__ int tmp;
-                __syncthreads();
-                if(threadIdx.x == 0)
-                    tmp = 0;
-                __syncthreads();
-                if(predicate)
-                    ::atomicAdd(&tmp, 1);
-                __syncthreads();
-
-                return tmp;
-#        else
-                return __syncthreads_count(predicate);
-#        endif
-            }
-        };
-
-        template<>
-        struct SyncBlockThreadsPredicate<BlockAnd, BlockSyncUniformCudaHipBuiltIn>
-        {
-            __device__ static auto syncBlockThreadsPredicate(
-                BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
-                int predicate) -> int
-            {
-#        if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
-                // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
-                __shared__ int tmp;
-                __syncthreads();
-                if(threadIdx.x == 0)
-                    tmp = 1;
-                __syncthreads();
-                if(!predicate)
-                    ::atomicAnd(&tmp, 0);
-                __syncthreads();
-
-                return tmp;
-#        else
-                return __syncthreads_and(predicate);
-#        endif
-            }
-        };
-
-        template<>
-        struct SyncBlockThreadsPredicate<BlockOr, BlockSyncUniformCudaHipBuiltIn>
-        {
-            __device__ static auto syncBlockThreadsPredicate(
-                BlockSyncUniformCudaHipBuiltIn const& /*blockSync*/,
-                int predicate) -> int
-            {
-#        if defined(__HIP_ARCH_HAS_SYNC_THREAD_EXT__) && __HIP_ARCH_HAS_SYNC_THREAD_EXT__ == 0 && BOOST_COMP_HIP
-                // workaround for unsupported syncthreads_* operation on AMD hardware without sync extension
-                __shared__ int tmp;
-                __syncthreads();
-                if(threadIdx.x == 0)
-                    tmp = 0;
-                __syncthreads();
-                if(predicate)
-                    ::atomicOr(&tmp, 1);
-                __syncthreads();
-
-                return tmp;
-#        else
-                return __syncthreads_or(predicate);
-#        endif
-            }
-        };
-    } // namespace trait
-
-#    endif
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/block/sync/Traits.hpp b/include/alpaka/block/sync/Traits.hpp
deleted file mode 100644
index f6c6563..0000000
--- a/include/alpaka/block/sync/Traits.hpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-    struct ConceptBlockSync
-    {
-    };
-
-    //! The block synchronization traits.
-    namespace trait
-    {
-        //! The block synchronization operation trait.
-        template<typename TBlockSync, typename TSfinae = void>
-        struct SyncBlockThreads;
-
-        //! The block synchronization and predicate operation trait.
-        template<typename TOp, typename TBlockSync, typename TSfinae = void>
-        struct SyncBlockThreadsPredicate;
-    } // namespace trait
-
-    //! Synchronizes all threads within the current block (independently for all blocks).
-    //!
-    //! \tparam TBlockSync The block synchronization implementation type.
-    //! \param blockSync The block synchronization implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TBlockSync>
-    ALPAKA_FN_ACC auto syncBlockThreads(TBlockSync const& blockSync) -> void
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSync, TBlockSync>;
-        trait::SyncBlockThreads<ImplementationBase>::syncBlockThreads(blockSync);
-    }
-
-    //! The counting function object.
-    struct BlockCount
-    {
-        enum
-        {
-            InitialValue = 0u
-        };
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T const& currentResult, T const& value) const -> T
-        {
-            return currentResult + static_cast<T>(value != static_cast<T>(0));
-        }
-    };
-
-    //! The logical and function object.
-    struct BlockAnd
-    {
-        enum
-        {
-            InitialValue = 1u
-        };
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T const& currentResult, T const& value) const -> T
-        {
-            return static_cast<T>(currentResult && (value != static_cast<T>(0)));
-        }
-    };
-
-    //! The logical or function object.
-    struct BlockOr
-    {
-        enum
-        {
-            InitialValue = 0u
-        };
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        ALPAKA_FN_HOST_ACC auto operator()(T const& currentResult, T const& value) const -> T
-        {
-            return static_cast<T>(currentResult || (value != static_cast<T>(0)));
-        }
-    };
-
-    //! Synchronizes all threads within the current block (independently for all blocks),
-    //! evaluates the predicate for all threads and returns the combination of all the results
-    //! computed via TOp.
-    //!
-    //! \tparam TOp The operation used to combine the predicate values of all threads.
-    //! \tparam TBlockSync The block synchronization implementation type.
-    //! \param blockSync The block synchronization implementation.
-    //! \param predicate The predicate value of the current thread.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TOp, typename TBlockSync>
-    ALPAKA_FN_ACC auto syncBlockThreadsPredicate(TBlockSync const& blockSync, int predicate) -> int
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptBlockSync, TBlockSync>;
-        return trait::SyncBlockThreadsPredicate<TOp, ImplementationBase>::syncBlockThreadsPredicate(
-            blockSync,
-            predicate);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/core/Align.hpp b/include/alpaka/core/Align.hpp
deleted file mode 100644
index d2be014..0000000
--- a/include/alpaka/core/Align.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-#include <cstddef>
-#include <type_traits>
-
-namespace alpaka::core
-{
-    //! Rounds to the next higher power of two (if not already power of two).
-    // Adapted from llvm/ADT/SmallPtrSet.h
-    template<std::size_t N>
-    struct RoundUpToPowerOfTwo;
-
-    //! Defines implementation details that should not be used directly by the user.
-    namespace detail
-    {
-        //! Base case for N being a power of two.
-        template<std::size_t N, bool TisPowerTwo>
-        struct RoundUpToPowerOfTwoHelper : std::integral_constant<std::size_t, N>
-        {
-        };
-
-        //! Case for N not being a power of two.
-        // We could just use NextVal = N+1, but this converges faster.  N|(N-1) sets
-        // the right-most zero bits to one all at once, e.g. 0b0011000 -> 0b0011111.
-        template<std::size_t N>
-        struct RoundUpToPowerOfTwoHelper<N, false>
-            : std::integral_constant<std::size_t, RoundUpToPowerOfTwo<(N | (N - 1)) + 1>::value>
-        {
-        };
-    } // namespace detail
-
-    template<std::size_t N>
-    struct RoundUpToPowerOfTwo
-        : std::integral_constant<std::size_t, detail::RoundUpToPowerOfTwoHelper<N, (N & (N - 1)) == 0>::value>
-    {
-    };
-
-    //! The alignment specifics.
-    namespace align
-    {
-        //! Calculates the optimal alignment for data of the given size.
-        template<std::size_t TsizeBytes>
-        struct OptimalAlignment
-            : std::integral_constant<
-                  std::size_t,
-#if BOOST_COMP_GNUC
-                  // GCC does not support alignments larger then 128: "warning: requested alignment 256 is larger
-                  // than 128[-Wattributes]".
-                  (TsizeBytes > 64) ? 128 :
-#endif
-                                    (RoundUpToPowerOfTwo<TsizeBytes>::value)>
-        {
-        };
-    } // namespace align
-} // namespace alpaka::core
-
-// The optimal alignment for a type is the next higher or equal power of two.
-#define ALPAKA_OPTIMAL_ALIGNMENT(...)                                                                                 \
-    ::alpaka::core::align::OptimalAlignment<sizeof(std::remove_cv_t<__VA_ARGS__>)>::value
diff --git a/include/alpaka/core/AlignedAlloc.hpp b/include/alpaka/core/AlignedAlloc.hpp
deleted file mode 100644
index 2dca319..0000000
--- a/include/alpaka/core/AlignedAlloc.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright 2022 René Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Common.hpp"
-
-#include <new>
-
-namespace alpaka::core
-{
-    ALPAKA_FN_INLINE ALPAKA_FN_HOST auto alignedAlloc(size_t alignment, size_t size) -> void*
-    {
-        return ::operator new(size, std::align_val_t{alignment});
-    }
-
-    ALPAKA_FN_INLINE ALPAKA_FN_HOST void alignedFree(size_t alignment, void* ptr)
-    {
-        ::operator delete(ptr, std::align_val_t{alignment});
-    }
-} // namespace alpaka::core
diff --git a/include/alpaka/core/ApiCudaRt.hpp b/include/alpaka/core/ApiCudaRt.hpp
deleted file mode 100644
index ee2cdb2..0000000
--- a/include/alpaka/core/ApiCudaRt.hpp
+++ /dev/null
@@ -1,402 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <boost/predef.h>
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-#    include <cuda_runtime_api.h>
-
-namespace alpaka
-{
-    struct ApiCudaRt
-    {
-        // Names
-        static constexpr char name[] = "Cuda";
-        static constexpr auto version = BOOST_PREDEF_MAKE_10_VVRRP(CUDART_VERSION);
-
-        // Types
-        using DeviceAttr_t = ::cudaDeviceAttr;
-        using DeviceProp_t = ::cudaDeviceProp;
-        using Error_t = ::cudaError_t;
-        using Event_t = ::cudaEvent_t;
-        using Extent_t = ::cudaExtent;
-        using Flag_t = unsigned int;
-        using FuncAttributes_t = ::cudaFuncAttributes;
-        using HostFn_t = void (*)(void* data); // same as cudaHostFn_t, without the CUDART_CB calling convention
-        using Limit_t = ::cudaLimit;
-        using Memcpy3DParms_t = ::cudaMemcpy3DParms;
-        using MemcpyKind_t = ::cudaMemcpyKind;
-        using PitchedPtr_t = ::cudaPitchedPtr;
-        using Pos_t = ::cudaPos;
-        using Stream_t = ::cudaStream_t;
-
-        // Constants
-        static constexpr Error_t success = ::cudaSuccess;
-        static constexpr Error_t errorNotReady = ::cudaErrorNotReady;
-        static constexpr Error_t errorHostMemoryAlreadyRegistered = ::cudaErrorHostMemoryAlreadyRegistered;
-        static constexpr Error_t errorHostMemoryNotRegistered = ::cudaErrorHostMemoryNotRegistered;
-        static constexpr Error_t errorUnsupportedLimit = ::cudaErrorUnsupportedLimit;
-        static constexpr Error_t errorUnknown = ::cudaErrorUnknown;
-
-        static constexpr Flag_t eventDefault = cudaEventDefault;
-        static constexpr Flag_t eventBlockingSync = cudaEventBlockingSync;
-        static constexpr Flag_t eventDisableTiming = cudaEventDisableTiming;
-        static constexpr Flag_t eventInterprocess = cudaEventInterprocess;
-
-        static constexpr Flag_t hostMallocDefault = cudaHostAllocDefault;
-        static constexpr Flag_t hostMallocMapped = cudaHostAllocMapped;
-        static constexpr Flag_t hostMallocPortable = cudaHostAllocPortable;
-        static constexpr Flag_t hostMallocWriteCombined = cudaHostAllocWriteCombined;
-        static constexpr Flag_t hostMallocCoherent = cudaHostAllocDefault; // Not supported.
-        static constexpr Flag_t hostMallocNonCoherent = cudaHostAllocDefault; // Not supported.
-
-        static constexpr Flag_t hostRegisterDefault = cudaHostRegisterDefault;
-        static constexpr Flag_t hostRegisterPortable = cudaHostRegisterPortable;
-        static constexpr Flag_t hostRegisterMapped = cudaHostRegisterMapped;
-        static constexpr Flag_t hostRegisterIoMemory = cudaHostRegisterIoMemory;
-
-        static constexpr MemcpyKind_t memcpyDefault = ::cudaMemcpyDefault;
-        static constexpr MemcpyKind_t memcpyDeviceToDevice = ::cudaMemcpyDeviceToDevice;
-        static constexpr MemcpyKind_t memcpyDeviceToHost = ::cudaMemcpyDeviceToHost;
-        static constexpr MemcpyKind_t memcpyHostToDevice = ::cudaMemcpyHostToDevice;
-
-        static constexpr Flag_t streamDefault = cudaStreamDefault;
-        static constexpr Flag_t streamNonBlocking = cudaStreamNonBlocking;
-
-        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimX = ::cudaDevAttrMaxBlockDimX;
-        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimY = ::cudaDevAttrMaxBlockDimY;
-        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimZ = ::cudaDevAttrMaxBlockDimZ;
-        static constexpr DeviceAttr_t deviceAttributeMaxGridDimX = ::cudaDevAttrMaxGridDimX;
-        static constexpr DeviceAttr_t deviceAttributeMaxGridDimY = ::cudaDevAttrMaxGridDimY;
-        static constexpr DeviceAttr_t deviceAttributeMaxGridDimZ = ::cudaDevAttrMaxGridDimZ;
-        static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock = ::cudaDevAttrMaxSharedMemoryPerBlock;
-        static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock;
-        static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount;
-        static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize;
-
-        static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize;
-        static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize;
-
-        // Host function helper
-        // Encapsulates the different function signatures used by cudaStreamAddCallback and cudaLaunchHostFn, and the
-        // different calling conventions used by CUDA (__stdcall on Win32) and HIP (standard).
-        struct HostFnAdaptor
-        {
-            HostFn_t func_;
-            void* data_;
-
-            static void CUDART_CB hostFunction(void* data)
-            {
-                auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
-                ptr->func_(ptr->data_);
-                delete ptr;
-            }
-
-            static void CUDART_CB streamCallback(Stream_t, Error_t, void* data)
-            {
-                auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
-                ptr->func_(ptr->data_);
-                delete ptr;
-            }
-        };
-
-        // Runtime API
-        static inline Error_t deviceGetAttribute(int* value, DeviceAttr_t attr, int device)
-        {
-            return ::cudaDeviceGetAttribute(value, attr, device);
-        }
-
-        static inline Error_t deviceGetLimit(size_t* pValue, Limit_t limit)
-        {
-            return ::cudaDeviceGetLimit(pValue, limit);
-        }
-
-        static inline Error_t deviceReset()
-        {
-            return ::cudaDeviceReset();
-        }
-
-        static inline Error_t deviceSetLimit(Limit_t limit, size_t value)
-        {
-            return ::cudaDeviceSetLimit(limit, value);
-        }
-
-        static inline Error_t deviceSynchronize()
-        {
-            return ::cudaDeviceSynchronize();
-        }
-
-        static inline Error_t eventCreate(Event_t* event)
-        {
-            return ::cudaEventCreate(event);
-        }
-
-        static inline Error_t eventCreateWithFlags(Event_t* event, Flag_t flags)
-        {
-            return ::cudaEventCreateWithFlags(event, flags);
-        }
-
-        static inline Error_t eventDestroy(Event_t event)
-        {
-            return ::cudaEventDestroy(event);
-        }
-
-        static inline Error_t eventQuery(Event_t event)
-        {
-            return ::cudaEventQuery(event);
-        }
-
-        static inline Error_t eventRecord(Event_t event, Stream_t stream)
-        {
-            return ::cudaEventRecord(event, stream);
-        }
-
-        static inline Error_t eventSynchronize(Event_t event)
-        {
-            return ::cudaEventSynchronize(event);
-        }
-
-        static inline Error_t free(void* devPtr)
-        {
-            return ::cudaFree(devPtr);
-        }
-
-        static inline Error_t freeAsync([[maybe_unused]] void* devPtr, [[maybe_unused]] Stream_t stream)
-        {
-#    if CUDART_VERSION >= 11020
-            return ::cudaFreeAsync(devPtr, stream);
-#    else
-            // Not implemented.
-            return errorUnknown;
-#    endif
-        }
-
-        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, void const* func)
-        {
-            return ::cudaFuncGetAttributes(attr, func);
-        }
-
-        template<typename T>
-        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
-        {
-#    if BOOST_COMP_GNUC
-#        pragma GCC diagnostic push
-#        pragma GCC diagnostic ignored "-Wconditionally-supported"
-#    endif
-            return ::cudaFuncGetAttributes(attr, reinterpret_cast<void const*>(func));
-#    if BOOST_COMP_GNUC
-#        pragma GCC diagnostic pop
-#    endif
-        }
-
-        static inline Error_t getDeviceCount(int* count)
-        {
-            return ::cudaGetDeviceCount(count);
-        }
-
-        static inline Error_t getDeviceProperties(DeviceProp_t* prop, int device)
-        {
-            return ::cudaGetDeviceProperties(prop, device);
-        }
-
-        static inline char const* getErrorName(Error_t error)
-        {
-            return ::cudaGetErrorName(error);
-        }
-
-        static inline char const* getErrorString(Error_t error)
-        {
-            return ::cudaGetErrorString(error);
-        }
-
-        static inline Error_t getLastError()
-        {
-            return ::cudaGetLastError();
-        }
-
-        static inline Error_t getSymbolAddress(void** devPtr, void const* symbol)
-        {
-            return ::cudaGetSymbolAddress(devPtr, symbol);
-        }
-
-        template<class T>
-        static inline Error_t getSymbolAddress(void** devPtr, T const& symbol)
-        {
-            return ::cudaGetSymbolAddress(devPtr, symbol);
-        }
-
-        static inline Error_t hostGetDevicePointer(void** pDevice, void* pHost, Flag_t flags)
-        {
-            return ::cudaHostGetDevicePointer(pDevice, pHost, flags);
-        }
-
-        static inline Error_t hostFree(void* ptr)
-        {
-            return ::cudaFreeHost(ptr);
-        }
-
-        static inline Error_t hostMalloc(void** ptr, size_t size, Flag_t flags)
-        {
-            return ::cudaHostAlloc(ptr, size, flags);
-        }
-
-        static inline Error_t hostRegister(void* ptr, size_t size, Flag_t flags)
-        {
-            return ::cudaHostRegister(ptr, size, flags);
-        }
-
-        static inline Error_t hostUnregister(void* ptr)
-        {
-            return ::cudaHostUnregister(ptr);
-        }
-
-        static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
-        {
-#    if CUDART_VERSION >= 10000
-            // Wrap the host function using the proper calling convention
-            return ::cudaLaunchHostFunc(stream, HostFnAdaptor::hostFunction, new HostFnAdaptor{fn, userData});
-#    else
-            // Emulate cudaLaunchHostFunc using cudaStreamAddCallback with a callback adaptor.
-            return ::cudaStreamAddCallback(stream, HostFnAdaptor::streamCallback, new HostFnAdaptor{fn, userData}, 0);
-#    endif
-        }
-
-        static inline Error_t malloc(void** devPtr, size_t size)
-        {
-            return ::cudaMalloc(devPtr, size);
-        }
-
-        static inline Error_t malloc3D(PitchedPtr_t* pitchedDevPtr, Extent_t extent)
-        {
-            return ::cudaMalloc3D(pitchedDevPtr, extent);
-        }
-
-        static inline Error_t mallocAsync(
-            [[maybe_unused]] void** devPtr,
-            [[maybe_unused]] size_t size,
-            [[maybe_unused]] Stream_t stream)
-        {
-#    if CUDART_VERSION >= 11020
-            return ::cudaMallocAsync(devPtr, size, stream);
-#    else
-            // Not implemented.
-            return errorUnknown;
-#    endif
-        }
-
-        static inline Error_t mallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height)
-        {
-            return ::cudaMallocPitch(devPtr, pitch, width, height);
-        }
-
-        static inline Error_t memGetInfo(size_t* free, size_t* total)
-        {
-            return ::cudaMemGetInfo(free, total);
-        }
-
-        static inline Error_t memcpy(void* dst, void const* src, size_t count, MemcpyKind_t kind)
-        {
-            return ::cudaMemcpy(dst, src, count, kind);
-        }
-
-        static inline Error_t memcpy2DAsync(
-            void* dst,
-            size_t dpitch,
-            void const* src,
-            size_t spitch,
-            size_t width,
-            size_t height,
-            MemcpyKind_t kind,
-            Stream_t stream)
-        {
-            return ::cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
-        }
-
-        static inline Error_t memcpy3DAsync(Memcpy3DParms_t const* p, Stream_t stream)
-        {
-            return ::cudaMemcpy3DAsync(p, stream);
-        }
-
-        static inline Error_t memcpyAsync(void* dst, void const* src, size_t count, MemcpyKind_t kind, Stream_t stream)
-        {
-            return ::cudaMemcpyAsync(dst, src, count, kind, stream);
-        }
-
-        static inline Error_t memset2DAsync(
-            void* devPtr,
-            size_t pitch,
-            int value,
-            size_t width,
-            size_t height,
-            Stream_t stream)
-        {
-            return ::cudaMemset2DAsync(devPtr, pitch, value, width, height, stream);
-        }
-
-        static inline Error_t memset3DAsync(PitchedPtr_t pitchedDevPtr, int value, Extent_t extent, Stream_t stream)
-        {
-            return ::cudaMemset3DAsync(pitchedDevPtr, value, extent, stream);
-        }
-
-        static inline Error_t memsetAsync(void* devPtr, int value, size_t count, Stream_t stream)
-        {
-            return ::cudaMemsetAsync(devPtr, value, count, stream);
-        }
-
-        static inline Error_t setDevice(int device)
-        {
-            return ::cudaSetDevice(device);
-        }
-
-        static inline Error_t streamCreate(Stream_t* pStream)
-        {
-            return ::cudaStreamCreate(pStream);
-        }
-
-        static inline Error_t streamCreateWithFlags(Stream_t* pStream, Flag_t flags)
-        {
-            return ::cudaStreamCreateWithFlags(pStream, flags);
-        }
-
-        static inline Error_t streamDestroy(Stream_t stream)
-        {
-            return ::cudaStreamDestroy(stream);
-        }
-
-        static inline Error_t streamQuery(Stream_t stream)
-        {
-            return ::cudaStreamQuery(stream);
-        }
-
-        static inline Error_t streamSynchronize(Stream_t stream)
-        {
-            return ::cudaStreamSynchronize(stream);
-        }
-
-        static inline Error_t streamWaitEvent(Stream_t stream, Event_t event, Flag_t flags)
-        {
-            return ::cudaStreamWaitEvent(stream, event, flags);
-        }
-
-        static inline PitchedPtr_t makePitchedPtr(void* d, size_t p, size_t xsz, size_t ysz)
-        {
-            return ::make_cudaPitchedPtr(d, p, xsz, ysz);
-        }
-
-        static inline Pos_t makePos(size_t x, size_t y, size_t z)
-        {
-            return ::make_cudaPos(x, y, z);
-        }
-
-        static inline Extent_t makeExtent(size_t w, size_t h, size_t d)
-        {
-            return ::make_cudaExtent(w, h, d);
-        }
-    };
-
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/core/ApiHipRt.hpp b/include/alpaka/core/ApiHipRt.hpp
deleted file mode 100644
index d765246..0000000
--- a/include/alpaka/core/ApiHipRt.hpp
+++ /dev/null
@@ -1,441 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <boost/predef.h>
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#    include <hip/hip_runtime_api.h>
-#    include <hip/hip_version.h>
-
-namespace alpaka
-{
-    struct ApiHipRt
-    {
-        // Names
-        static constexpr char name[] = "Hip";
-        static constexpr auto version = BOOST_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0);
-
-        // Types
-        using DeviceAttr_t = ::hipDeviceAttribute_t;
-        using DeviceProp_t = ::hipDeviceProp_t;
-        using Error_t = ::hipError_t;
-        using Event_t = ::hipEvent_t;
-        using Extent_t = ::hipExtent;
-        using Flag_t = unsigned int;
-        using FuncAttributes_t = ::hipFuncAttributes;
-        using HostFn_t = void (*)(void* data); // same as hipHostFn_t
-        using Limit_t = ::hipLimit_t;
-        using Memcpy3DParms_t = ::hipMemcpy3DParms;
-        using MemcpyKind_t = ::hipMemcpyKind;
-        using PitchedPtr_t = ::hipPitchedPtr;
-        using Pos_t = ::hipPos;
-        using Stream_t = ::hipStream_t;
-
-        // Constants
-        static constexpr Error_t success = ::hipSuccess;
-        static constexpr Error_t errorNotReady = ::hipErrorNotReady;
-        static constexpr Error_t errorHostMemoryAlreadyRegistered = ::hipErrorHostMemoryAlreadyRegistered;
-        static constexpr Error_t errorHostMemoryNotRegistered = ::hipErrorHostMemoryNotRegistered;
-        static constexpr Error_t errorUnsupportedLimit = ::hipErrorUnsupportedLimit;
-        static constexpr Error_t errorUnknown = ::hipErrorUnknown;
-
-        static constexpr Flag_t eventDefault = hipEventDefault;
-        static constexpr Flag_t eventBlockingSync = hipEventBlockingSync;
-        static constexpr Flag_t eventDisableTiming = hipEventDisableTiming;
-        static constexpr Flag_t eventInterprocess = hipEventInterprocess;
-
-        static constexpr Flag_t hostMallocDefault = hipHostMallocDefault;
-        static constexpr Flag_t hostMallocMapped = hipHostMallocMapped;
-        static constexpr Flag_t hostMallocPortable = hipHostMallocPortable;
-        static constexpr Flag_t hostMallocWriteCombined = hipHostMallocWriteCombined;
-        static constexpr Flag_t hostMallocCoherent = hipHostMallocCoherent;
-        static constexpr Flag_t hostMallocNonCoherent = hipHostMallocNonCoherent;
-
-        static constexpr Flag_t hostRegisterDefault = hipHostRegisterDefault;
-        static constexpr Flag_t hostRegisterPortable = hipHostRegisterPortable;
-        static constexpr Flag_t hostRegisterMapped = hipHostRegisterMapped;
-        static constexpr Flag_t hostRegisterIoMemory = hipHostRegisterIoMemory;
-
-        static constexpr MemcpyKind_t memcpyDefault = ::hipMemcpyDefault;
-        static constexpr MemcpyKind_t memcpyDeviceToDevice = ::hipMemcpyDeviceToDevice;
-        static constexpr MemcpyKind_t memcpyDeviceToHost = ::hipMemcpyDeviceToHost;
-        static constexpr MemcpyKind_t memcpyHostToDevice = ::hipMemcpyHostToDevice;
-
-        static constexpr Flag_t streamDefault = hipStreamDefault;
-        static constexpr Flag_t streamNonBlocking = hipStreamNonBlocking;
-
-        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimX = ::hipDeviceAttributeMaxBlockDimX;
-        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimY = ::hipDeviceAttributeMaxBlockDimY;
-        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimZ = ::hipDeviceAttributeMaxBlockDimZ;
-        static constexpr DeviceAttr_t deviceAttributeMaxGridDimX = ::hipDeviceAttributeMaxGridDimX;
-        static constexpr DeviceAttr_t deviceAttributeMaxGridDimY = ::hipDeviceAttributeMaxGridDimY;
-        static constexpr DeviceAttr_t deviceAttributeMaxGridDimZ = ::hipDeviceAttributeMaxGridDimZ;
-        static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock
-            = ::hipDeviceAttributeMaxSharedMemoryPerBlock;
-        static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::hipDeviceAttributeMaxThreadsPerBlock;
-        static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::hipDeviceAttributeMultiprocessorCount;
-        static constexpr DeviceAttr_t deviceAttributeWarpSize = ::hipDeviceAttributeWarpSize;
-
-#    if HIP_VERSION >= 40'500'000
-        static constexpr Limit_t limitPrintfFifoSize = ::hipLimitPrintfFifoSize;
-#    else
-        static constexpr Limit_t limitPrintfFifoSize
-            = static_cast<Limit_t>(0x01); // Implemented only in ROCm 4.5.0 and later.
-#    endif
-        static constexpr Limit_t limitMallocHeapSize = ::hipLimitMallocHeapSize;
-
-        // Host function helper
-        // Encapsulates the different function signatures used by hipStreamAddCallback and hipLaunchHostFn, and the
-        // different calling conventions used by CUDA (__stdcall on Win32) and HIP (standard).
-        struct HostFnAdaptor
-        {
-            HostFn_t func_;
-            void* data_;
-
-            static void hostFunction(void* data)
-            {
-                auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
-                ptr->func_(ptr->data_);
-                delete ptr;
-            }
-
-            static void streamCallback(Stream_t, Error_t, void* data)
-            {
-                auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
-                ptr->func_(ptr->data_);
-                delete ptr;
-            }
-        };
-
-        // Runtime API
-        static inline Error_t deviceGetAttribute(int* value, DeviceAttr_t attr, int device)
-        {
-            return ::hipDeviceGetAttribute(value, attr, device);
-        }
-
-        static inline Error_t deviceGetLimit(size_t* pValue, Limit_t limit)
-        {
-#    if HIP_VERSION < 40'500'000
-            if(limit == limitPrintfFifoSize)
-            {
-                // Implemented only in ROCm 4.5.0 and later.
-                return errorUnsupportedLimit;
-            }
-#    endif
-            return ::hipDeviceGetLimit(pValue, limit);
-        }
-
-        static inline Error_t deviceReset()
-        {
-            return ::hipDeviceReset();
-        }
-
-        static inline Error_t deviceSetLimit(Limit_t /* limit */, size_t /* value */)
-        {
-            // Not implemented.
-            return errorUnsupportedLimit;
-        }
-
-        static inline Error_t deviceSynchronize()
-        {
-            return ::hipDeviceSynchronize();
-        }
-
-        static inline Error_t eventCreate(Event_t* event)
-        {
-            return ::hipEventCreate(event);
-        }
-
-        static inline Error_t eventCreateWithFlags(Event_t* event, Flag_t flags)
-        {
-            return ::hipEventCreateWithFlags(event, flags);
-        }
-
-        static inline Error_t eventDestroy(Event_t event)
-        {
-            return ::hipEventDestroy(event);
-        }
-
-        static inline Error_t eventQuery(Event_t event)
-        {
-            return ::hipEventQuery(event);
-        }
-
-        static inline Error_t eventRecord(Event_t event, Stream_t stream)
-        {
-            return ::hipEventRecord(event, stream);
-        }
-
-        static inline Error_t eventSynchronize(Event_t event)
-        {
-            return ::hipEventSynchronize(event);
-        }
-
-        static inline Error_t free(void* devPtr)
-        {
-            return ::hipFree(devPtr);
-        }
-
-        static inline Error_t freeAsync([[maybe_unused]] void* devPtr, [[maybe_unused]] Stream_t stream)
-        {
-            // stream-ordered memory operations are fully implemented only in ROCm 5.3.0 and later.
-#    if HIP_VERSION >= 50'300'000
-            // hipFreeAsync fails on a null pointer deallocation
-            if(devPtr)
-            {
-                return ::hipFreeAsync(devPtr, stream);
-            }
-            else
-            {
-                return ::hipSuccess;
-            }
-#    else
-            // Not implemented.
-            return errorUnknown;
-#    endif
-        }
-
-        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, void const* func)
-        {
-            return ::hipFuncGetAttributes(attr, func);
-        }
-
-        template<typename T>
-        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
-        {
-#    if BOOST_COMP_GNUC
-#        pragma GCC diagnostic push
-#        pragma GCC diagnostic ignored "-Wconditionally-supported"
-#    endif
-            return ::hipFuncGetAttributes(attr, reinterpret_cast<void const*>(func));
-#    if BOOST_COMP_GNUC
-#        pragma GCC diagnostic pop
-#    endif
-        }
-
-        static inline Error_t getDeviceCount(int* count)
-        {
-            return ::hipGetDeviceCount(count);
-        }
-
-        static inline Error_t getDeviceProperties(DeviceProp_t* prop, int device)
-        {
-            return ::hipGetDeviceProperties(prop, device);
-        }
-
-        static inline char const* getErrorName(Error_t error)
-        {
-            return ::hipGetErrorName(error);
-        }
-
-        static inline char const* getErrorString(Error_t error)
-        {
-            return ::hipGetErrorString(error);
-        }
-
-        static inline Error_t getLastError()
-        {
-            return ::hipGetLastError();
-        }
-
-        static inline Error_t getSymbolAddress(void** devPtr, void const* symbol)
-        {
-            return ::hipGetSymbolAddress(devPtr, symbol);
-        }
-
-        template<class T>
-        static inline Error_t getSymbolAddress(void** devPtr, T const& symbol)
-        {
-            return ::hipGetSymbolAddress(devPtr, symbol);
-        }
-
-        static inline Error_t hostGetDevicePointer(void** pDevice, void* pHost, Flag_t flags)
-        {
-            return ::hipHostGetDevicePointer(pDevice, pHost, flags);
-        }
-
-        static inline Error_t hostFree(void* ptr)
-        {
-            return ::hipHostFree(ptr);
-        }
-
-        static inline Error_t hostMalloc(void** ptr, size_t size, Flag_t flags)
-        {
-            return ::hipHostMalloc(ptr, size, flags);
-        }
-
-        static inline Error_t hostRegister(void* ptr, size_t size, Flag_t flags)
-        {
-            return ::hipHostRegister(ptr, size, flags);
-        }
-
-        static inline Error_t hostUnregister(void* ptr)
-        {
-            return ::hipHostUnregister(ptr);
-        }
-
-        static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
-        {
-            // hipLaunchHostFunc is implemented only in ROCm 5.4.0 and later.
-#    if HIP_VERSION >= 50'400'000
-            // Wrap the host function using the proper calling convention.
-            return ::hipLaunchHostFunc(stream, HostFnAdaptor::hostFunction, new HostFnAdaptor{fn, userData});
-#    else
-            // Emulate hipLaunchHostFunc using hipStreamAddCallback with a callback adaptor.
-            return ::hipStreamAddCallback(stream, HostFnAdaptor::streamCallback, new HostFnAdaptor{fn, userData}, 0);
-#    endif
-        }
-
-        static inline Error_t malloc(void** devPtr, size_t size)
-        {
-            return ::hipMalloc(devPtr, size);
-        }
-
-        static inline Error_t malloc3D(PitchedPtr_t* pitchedDevPtr, Extent_t extent)
-        {
-            return ::hipMalloc3D(pitchedDevPtr, extent);
-        }
-
-        static inline Error_t mallocAsync(
-            [[maybe_unused]] void** devPtr,
-            [[maybe_unused]] size_t size,
-            [[maybe_unused]] Stream_t stream)
-        {
-            // stream-ordered memory operations are fully implemented only in ROCm 5.3.0 and later.
-#    if HIP_VERSION >= 50'600'000
-            return ::hipMallocAsync(devPtr, size, stream);
-#    elif HIP_VERSION >= 50'300'000
-            // before ROCm 5.6.0, hipMallocAsync fails for an allocation of 0 bytes
-            if(size > 0)
-            {
-                return ::hipMallocAsync(devPtr, size, stream);
-            }
-            else
-            {
-                // make sure the pointer can safely be passed to hipFreeAsync
-                *devPtr = nullptr;
-                return ::hipSuccess;
-            }
-#    else
-            // Not implemented.
-            return errorUnknown;
-#    endif
-        }
-
-        static inline Error_t mallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height)
-        {
-            return ::hipMallocPitch(devPtr, pitch, width, height);
-        }
-
-        static inline Error_t memGetInfo(size_t* free, size_t* total)
-        {
-            return ::hipMemGetInfo(free, total);
-        }
-
-        static inline Error_t memcpy(void* dst, void const* src, size_t count, MemcpyKind_t kind)
-        {
-            return ::hipMemcpy(dst, src, count, kind);
-        }
-
-        static inline Error_t memcpy2DAsync(
-            void* dst,
-            size_t dpitch,
-            void const* src,
-            size_t spitch,
-            size_t width,
-            size_t height,
-            MemcpyKind_t kind,
-            Stream_t stream)
-        {
-            return ::hipMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
-        }
-
-        static inline Error_t memcpy3DAsync(Memcpy3DParms_t const* p, Stream_t stream)
-        {
-            return ::hipMemcpy3DAsync(p, stream);
-        }
-
-        static inline Error_t memcpyAsync(void* dst, void const* src, size_t count, MemcpyKind_t kind, Stream_t stream)
-        {
-            return ::hipMemcpyAsync(dst, src, count, kind, stream);
-        }
-
-        static inline Error_t memset2DAsync(
-            void* devPtr,
-            size_t pitch,
-            int value,
-            size_t width,
-            size_t height,
-            Stream_t stream)
-        {
-            return ::hipMemset2DAsync(devPtr, pitch, value, width, height, stream);
-        }
-
-        static inline Error_t memset3DAsync(PitchedPtr_t pitchedDevPtr, int value, Extent_t extent, Stream_t stream)
-        {
-            return ::hipMemset3DAsync(pitchedDevPtr, value, extent, stream);
-        }
-
-        static inline Error_t memsetAsync(void* devPtr, int value, size_t count, Stream_t stream)
-        {
-            return ::hipMemsetAsync(devPtr, value, count, stream);
-        }
-
-        static inline Error_t setDevice(int device)
-        {
-            return ::hipSetDevice(device);
-        }
-
-        static inline Error_t streamCreate(Stream_t* pStream)
-        {
-            return ::hipStreamCreate(pStream);
-        }
-
-        static inline Error_t streamCreateWithFlags(Stream_t* pStream, Flag_t flags)
-        {
-            return ::hipStreamCreateWithFlags(pStream, flags);
-        }
-
-        static inline Error_t streamDestroy(Stream_t stream)
-        {
-            return ::hipStreamDestroy(stream);
-        }
-
-        static inline Error_t streamQuery(Stream_t stream)
-        {
-            return ::hipStreamQuery(stream);
-        }
-
-        static inline Error_t streamSynchronize(Stream_t stream)
-        {
-            return ::hipStreamSynchronize(stream);
-        }
-
-        static inline Error_t streamWaitEvent(Stream_t stream, Event_t event, Flag_t flags)
-        {
-            return ::hipStreamWaitEvent(stream, event, flags);
-        }
-
-        static inline PitchedPtr_t makePitchedPtr(void* d, size_t p, size_t xsz, size_t ysz)
-        {
-            return ::make_hipPitchedPtr(d, p, xsz, ysz);
-        }
-
-        static inline Pos_t makePos(size_t x, size_t y, size_t z)
-        {
-            return ::make_hipPos(x, y, z);
-        }
-
-        static inline Extent_t makeExtent(size_t w, size_t h, size_t d)
-        {
-            return ::make_hipExtent(w, h, d);
-        }
-    };
-
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/core/Assert.hpp b/include/alpaka/core/Assert.hpp
deleted file mode 100644
index 7ad2a2b..0000000
--- a/include/alpaka/core/Assert.hpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-#include <cassert>
-#include <type_traits>
-
-//! The assert can be explicit disabled by defining NDEBUG
-#define ALPAKA_ASSERT(...) assert(__VA_ARGS__)
-
-//! Macro which expands to a noop.
-//! Macro enforces an semicolon after the call.
-#define ALPAKA_NOOP(...)                                                                                              \
-    do                                                                                                                \
-    {                                                                                                                 \
-    } while(false)
-
-//! ALPAKA_ASSERT_ACC_IMPL is an assert-like macro.
-//! It can be disabled setting the ALPAKA_DISABLE_ASSERT_ACC preprocessor symbol or the NDEBUG preprocessor symbol.
-#if !defined(ALPAKA_DISABLE_ASSERT_ACC)
-#    define ALPAKA_ASSERT_ACC_IMPL(...) ALPAKA_ASSERT(__VA_ARGS__)
-#else
-#    define ALPAKA_ASSERT_ACC_IMPL(...) ALPAKA_NOOP(__VA_ARGS__)
-#endif
-
-//! ALPAKA_ASSERT_ACC is an assert-like macro.
-//!
-//! In device code for a GPU or SYCL backend it can be disabled setting the ALPAKA_DISABLE_ASSERT_ACC preprocessor
-//! symbol or the NDEBUG preprocessor symbol. In device code for a native C++ CPU backend and in host code, it is
-//! equivalent to ALPAKA_ASSERT, and can be disabled setting the NDEBUG preprocessor symbol.
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)
-// CUDA device code
-#    define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
-#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)
-// HIP/ROCm device code
-#    define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
-#elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(__SYCL_DEVICE_ONLY__)
-// SYCL/oneAPI device code
-#    if defined(SYCL_EXT_ONEAPI_ASSERT)
-#        define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
-#    else
-#        define ALPAKA_ASSERT_ACC(...) ALPAKA_NOOP(__VA_ARGS__)
-#    endif
-// add here any other #elif conditions for non-CPU backends
-// ...
-#else
-// CPU backend, or host code
-#    define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT(__VA_ARGS__)
-#endif
-
-namespace alpaka::core
-{
-    namespace detail
-    {
-        template<typename TArg>
-        struct AssertValueUnsigned
-        {
-            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static constexpr auto assertValueUnsigned(
-                [[maybe_unused]] TArg const& arg)
-            {
-                if constexpr(std::is_signed_v<TArg>)
-                    ALPAKA_ASSERT_ACC(arg >= 0);
-
-                // Nothing to do for unsigned types.
-            }
-        };
-    } // namespace detail
-
-    //! This method checks integral values if they are greater or equal zero.
-    //! The implementation prevents warnings for checking this for unsigned types.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TArg>
-    ALPAKA_FN_HOST_ACC constexpr auto assertValueUnsigned(TArg const& arg) -> void
-    {
-        detail::AssertValueUnsigned<TArg>::assertValueUnsigned(arg);
-    }
-
-    namespace detail
-    {
-        template<typename TLhs, typename TRhs>
-        struct AssertGreaterThan
-        {
-            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static constexpr auto assertGreaterThan(
-                [[maybe_unused]] TRhs const& rhs)
-            {
-                if constexpr(std::is_signed_v<TRhs> || (TLhs::value != 0u))
-                    ALPAKA_ASSERT_ACC(TLhs::value > rhs);
-
-                // Nothing to do for unsigned types comparing to zero.
-            }
-        };
-    } // namespace detail
-
-    //! This function asserts that the integral value TLhs is greater than TRhs.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TLhs, typename TRhs>
-    ALPAKA_FN_HOST_ACC constexpr auto assertGreaterThan(TRhs const& rhs) -> void
-    {
-        detail::AssertGreaterThan<TLhs, TRhs>::assertGreaterThan(rhs);
-    }
-} // namespace alpaka::core
diff --git a/include/alpaka/core/BarrierThread.hpp b/include/alpaka/core/BarrierThread.hpp
deleted file mode 100644
index ff38eb3..0000000
--- a/include/alpaka/core/BarrierThread.hpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Uncomment this to disable the standard spinlock behaviour of the threads
-// #define ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-
-#include "alpaka/block/sync/Traits.hpp"
-#include "alpaka/core/Common.hpp"
-
-#include <condition_variable>
-#include <mutex>
-#ifndef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-#    include <atomic>
-#    include <thread>
-#endif
-
-namespace alpaka::core
-{
-    namespace threads
-    {
-        //! A self-resetting barrier.
-        template<typename TIdx>
-        class BarrierThread final
-        {
-        public:
-            explicit BarrierThread(TIdx const& threadCount)
-                : m_threadCount(threadCount)
-                , m_curThreadCount(threadCount)
-                , m_generation(0)
-            {
-            }
-
-            //! Waits for all the other threads to reach the barrier.
-            auto wait() -> void
-            {
-                TIdx const generationWhenEnteredTheWait = m_generation;
-#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-                std::unique_lock<std::mutex> lock(m_mtxBarrier);
-#endif
-                if(--m_curThreadCount == 0)
-                {
-                    m_curThreadCount = m_threadCount;
-                    ++m_generation;
-#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-                    m_cvAllThreadsReachedBarrier.notify_all();
-#endif
-                }
-                else
-                {
-#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-                    m_cvAllThreadsReachedBarrier.wait(
-                        lock,
-                        [this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
-#else
-                    while(generationWhenEnteredTheWait == m_generation)
-                    {
-                        std::this_thread::yield();
-                    }
-#endif
-                }
-            }
-
-        private:
-#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-            std::mutex m_mtxBarrier;
-            std::condition_variable m_cvAllThreadsReachedBarrier;
-#endif
-            const TIdx m_threadCount;
-#ifdef ALPAKA_THREAD_BARRIER_DISABLE_SPINLOCK
-            TIdx m_curThreadCount;
-            TIdx m_generation;
-#else
-            std::atomic<TIdx> m_curThreadCount;
-            std::atomic<TIdx> m_generation;
-#endif
-        };
-
-        namespace detail
-        {
-            template<typename TOp>
-            struct AtomicOp;
-
-            template<>
-            struct AtomicOp<BlockCount>
-            {
-                void operator()(std::atomic<int>& result, bool value)
-                {
-                    result += static_cast<int>(value);
-                }
-            };
-
-            template<>
-            struct AtomicOp<BlockAnd>
-            {
-                void operator()(std::atomic<int>& result, bool value)
-                {
-                    result &= static_cast<int>(value);
-                }
-            };
-
-            template<>
-            struct AtomicOp<BlockOr>
-            {
-                void operator()(std::atomic<int>& result, bool value)
-                {
-                    result |= static_cast<int>(value);
-                }
-            };
-        } // namespace detail
-
-        //! A self-resetting barrier with barrier.
-        template<typename TIdx>
-        class BarrierThreadWithPredicate final
-        {
-        public:
-            explicit BarrierThreadWithPredicate(TIdx const& threadCount)
-                : m_threadCount(threadCount)
-                , m_curThreadCount(threadCount)
-                , m_generation(0)
-            {
-            }
-
-            //! Waits for all the other threads to reach the barrier.
-            template<typename TOp>
-            ALPAKA_FN_HOST auto wait(int predicate) -> int
-            {
-                TIdx const generationWhenEnteredTheWait = m_generation;
-                std::unique_lock<std::mutex> lock(m_mtxBarrier);
-
-                auto const generationMod2 = m_generation % static_cast<TIdx>(2u);
-                if(m_curThreadCount == m_threadCount)
-                {
-                    m_result[generationMod2] = TOp::InitialValue;
-                }
-
-                std::atomic<int>& result(m_result[generationMod2]);
-                bool const predicateBool(predicate != 0);
-
-                detail::AtomicOp<TOp>()(result, predicateBool);
-
-                if(--m_curThreadCount == 0)
-                {
-                    m_curThreadCount = m_threadCount;
-                    ++m_generation;
-                    m_cvAllThreadsReachedBarrier.notify_all();
-                }
-                else
-                {
-                    m_cvAllThreadsReachedBarrier.wait(
-                        lock,
-                        [this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
-                }
-                return m_result[generationMod2];
-            }
-
-        private:
-            std::mutex m_mtxBarrier;
-            std::condition_variable m_cvAllThreadsReachedBarrier;
-            const TIdx m_threadCount;
-            TIdx m_curThreadCount;
-            TIdx m_generation;
-            std::atomic<int> m_result[2];
-        };
-    } // namespace threads
-} // namespace alpaka::core
diff --git a/include/alpaka/core/BoostPredef.hpp b/include/alpaka/core/BoostPredef.hpp
deleted file mode 100644
index bcd2d35..0000000
--- a/include/alpaka/core/BoostPredef.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Jeffrey Kelling,
- *                Bernhard Manfred Gruber, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <boost/predef.h>
-
-#ifdef __INTEL_COMPILER
-#    warning                                                                                                          \
-        "The Intel Classic compiler (icpc) is no longer supported. Please upgrade to the Intel LLVM compiler (ipcx)."
-#endif
-
-//---------------------------------------HIP-----------------------------------
-// __HIP__ is defined by both hip-clang and vanilla clang in HIP mode.
-// https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#compiler-defines-summary
-#if !defined(BOOST_LANG_HIP)
-#    if defined(__HIP__)
-/* BOOST_LANG_CUDA is enabled when either __CUDACC__ (nvcc) or __CUDA__ (clang) are defined. This occurs when
-   nvcc / clang encounter a CUDA source file. Since there are no HIP source files we treat every source file
-   as HIP when we are using a HIP-capable compiler. */
-#        include <hip/hip_version.h>
-// HIP doesn't give us a patch level for the last entry, just a gitdate
-#        define BOOST_LANG_HIP BOOST_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0)
-#    else
-#        define BOOST_LANG_HIP BOOST_VERSION_NUMBER_NOT_AVAILABLE
-#    endif
-#endif
-
-// HSA device architecture detection (HSA generated via HIP(clang))
-#if !defined(BOOST_ARCH_HSA)
-#    if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1 && defined(__HIP__)
-// __HIP_DEVICE_COMPILE__ does not represent feature capability of target device like CUDA_ARCH.
-// For feature detection there are special macros, see ROCm's HIP porting guide.
-#        define BOOST_ARCH_HSA BOOST_VERSION_NUMBER_AVAILABLE
-#    else
-#        define BOOST_ARCH_HSA BOOST_VERSION_NUMBER_NOT_AVAILABLE
-#    endif
-#endif
-
-// HIP compiler detection
-#if !defined(BOOST_COMP_HIP)
-#    if defined(__HIP__) // Defined by hip-clang and vanilla clang in HIP mode.
-#        include <hip/hip_version.h>
-// HIP doesn't give us a patch level for the last entry, just a gitdate
-#        define BOOST_COMP_HIP BOOST_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0)
-#    else
-#        define BOOST_COMP_HIP BOOST_VERSION_NUMBER_NOT_AVAILABLE
-#    endif
-#endif
-
-// clang CUDA compiler detection
-// Currently __CUDA__ is only defined by clang when compiling CUDA code.
-#if defined(__clang__) && defined(__CUDA__)
-#    define BOOST_COMP_CLANG_CUDA BOOST_COMP_CLANG
-#else
-#    define BOOST_COMP_CLANG_CUDA BOOST_VERSION_NUMBER_NOT_AVAILABLE
-#endif
-
-// PGI and NV HPC SDK compiler detection
-// As of Boost 1.74, Boost.Predef's compiler detection is a bit weird. Recent PGI compilers will be identified as
-// BOOST_COMP_PGI_EMULATED. Boost.Predef has lackluster front-end support and mistakes the EDG front-end
-// for an actual compiler.
-// TODO: Whenever you look at this code please check whether https://github.com/boostorg/predef/issues/28 and
-// https://github.com/boostorg/predef/issues/51 have been resolved.
-// BOOST_COMP_PGI_EMULATED is defined by boost instead of BOOST_COMP_PGI
-#if defined(BOOST_COMP_PGI) && defined(BOOST_COMP_PGI_EMULATED)
-#    undef BOOST_COMP_PGI
-#    define BOOST_COMP_PGI BOOST_COMP_PGI_EMULATED
-#endif
-
-// Intel LLVM compiler detection
-#if !defined(BOOST_COMP_ICPX)
-#    if defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER)
-// The version string for icpx 2023.1.0 is 20230100. In Boost.Predef this becomes (53,1,0).
-#        define BOOST_COMP_ICPX BOOST_PREDEF_MAKE_YYYYMMDD(__INTEL_LLVM_COMPILER)
-#    endif
-#endif
diff --git a/include/alpaka/core/CallbackThread.hpp b/include/alpaka/core/CallbackThread.hpp
deleted file mode 100644
index 91ecf78..0000000
--- a/include/alpaka/core/CallbackThread.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright 2022 Antonio Di Pilato
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-#include <cassert>
-#include <condition_variable>
-#include <functional>
-#include <future>
-#include <iostream>
-#include <mutex>
-#include <queue>
-#include <thread>
-
-namespace alpaka::core
-{
-    class CallbackThread
-    {
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-        // A custom class is used because std::function<F> requires F to be copyable, and std::packaged_task provides a
-        // std::future which will keep the task alive and we cannot control the moment the future is set.
-        //! \todo with C++23 std::move_only_function should be used
-        struct Task
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-        {
-            virtual ~Task() = default;
-            virtual void run() = 0;
-        };
-
-        template<typename Function>
-        struct FunctionHolder : Task
-        {
-            Function m_func;
-
-            template<typename FunctionFwd>
-            explicit FunctionHolder(FunctionFwd&& func) : m_func{std::forward<FunctionFwd>(func)}
-            {
-            }
-
-            void run() override
-            {
-                // if m_func throws, let it propagate
-                m_func();
-            }
-        };
-
-        using TaskPackage = std::pair<std::unique_ptr<Task>, std::promise<void>>;
-
-    public:
-        ~CallbackThread()
-        {
-            {
-                std::unique_lock<std::mutex> lock{m_mutex};
-                m_stop = true;
-                m_cond.notify_one();
-            }
-
-            if(m_thread.joinable())
-            {
-                if(std::this_thread::get_id() == m_thread.get_id())
-                {
-                    std::cerr << "ERROR in ~CallbackThread: thread joins itself" << std::endl;
-                    std::abort();
-                }
-                m_thread.join();
-            }
-        }
-
-        //! It is guaranteed that the task is fully destroyed before the future's result is set.
-        //! @{
-        template<typename NullaryFunction>
-        auto submit(NullaryFunction&& nf) -> std::future<void>
-        {
-            using DecayedFunction = std::decay_t<NullaryFunction>;
-            static_assert(
-                std::is_void_v<std::invoke_result_t<DecayedFunction>>,
-                "Submitted function must not have any arguments and return void.");
-
-            // FunctionHolder stores a copy of the user's task, but may be constructed from an expiring value to avoid
-            // the copy. We do NOT store a reference to the users task, which could dangle if the user isn't careful.
-            auto tp = std::pair(
-                std::unique_ptr<Task>(new FunctionHolder<DecayedFunction>{std::forward<NullaryFunction>(nf)}),
-                std::promise<void>{});
-            auto f = tp.second.get_future();
-            {
-                std::unique_lock<std::mutex> lock{m_mutex};
-                m_tasks.emplace(std::move(tp));
-                if(!m_thread.joinable())
-                    startWorkerThread();
-                m_cond.notify_one();
-            }
-
-            return f;
-        }
-
-        //! @}
-
-        //! @return True if queue is empty and no task is executed else false.
-        //! If only one tasks is enqueued and the task is executed the task will see the queue as not empty.
-        //! During the destruction of this single enqueued task the queue will already be accounted as empty.
-        [[nodiscard]] auto empty()
-        {
-            std::unique_lock<std::mutex> lock{m_mutex};
-            return m_tasks.empty();
-        }
-
-    private:
-        std::thread m_thread;
-        std::condition_variable m_cond;
-        std::mutex m_mutex;
-        bool m_stop{false};
-        std::queue<TaskPackage> m_tasks;
-
-        auto startWorkerThread() -> void
-        {
-            m_thread = std::thread(
-                [this]
-                {
-                    while(true)
-                    {
-                        std::promise<void> taskPromise;
-                        std::exception_ptr eptr;
-                        {
-                            // Task is destroyed before promise is updated but after the queue state is up to date.
-                            std::unique_ptr<Task> task = nullptr;
-                            {
-                                std::unique_lock<std::mutex> lock{m_mutex};
-                                m_cond.wait(lock, [this] { return m_stop || !m_tasks.empty(); });
-
-                                if(m_stop && m_tasks.empty())
-                                    break;
-
-                                task = std::move(m_tasks.front().first);
-                                taskPromise = std::move(m_tasks.front().second);
-                            }
-                            assert(task);
-                            try
-                            {
-                                task->run();
-                            }
-                            catch(...)
-                            {
-                                eptr = std::current_exception();
-                            }
-                            {
-                                std::unique_lock<std::mutex> lock{m_mutex};
-                                // Pop empty data from the queue, task and promise will be destroyed later in a
-                                // well-defined order.
-                                m_tasks.pop();
-                            }
-                            // Task will be destroyed here, the queue status is already updated.
-                        }
-                        // In case the executed tasks is the last task in the queue the waiting threads will see the
-                        // queue as empty.
-                        if(eptr)
-                            taskPromise.set_exception(std::move(eptr));
-                        else
-                            taskPromise.set_value();
-                    }
-                });
-        }
-    };
-} // namespace alpaka::core
diff --git a/include/alpaka/core/ClipCast.hpp b/include/alpaka/core/ClipCast.hpp
deleted file mode 100644
index aa8c712..0000000
--- a/include/alpaka/core/ClipCast.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/meta/Integral.hpp"
-
-#include <algorithm>
-#include <limits>
-
-namespace alpaka::core
-{
-    //! \return The input casted and clipped to T.
-    template<typename T, typename V>
-    auto clipCast(V const& val) -> T
-    {
-        static_assert(
-            std::is_integral_v<T> && std::is_integral_v<V>,
-            "clipCast can not be called with non-integral types!");
-
-        constexpr auto max = static_cast<V>(std::numeric_limits<alpaka::meta::LowerMax<T, V>>::max());
-        constexpr auto min = static_cast<V>(std::numeric_limits<alpaka::meta::HigherMin<T, V>>::min());
-
-        return static_cast<T>(std::max(min, std::min(max, val)));
-    }
-} // namespace alpaka::core
diff --git a/include/alpaka/core/Common.hpp b/include/alpaka/core/Common.hpp
deleted file mode 100644
index 3b181ee..0000000
--- a/include/alpaka/core/Common.hpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/* Copyright 2024 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, René Widera, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Debug.hpp"
-
-// Boost.Uuid errors with VS2017 when intrin.h is not included
-#if defined(_MSC_VER) && _MSC_VER >= 1910
-#    include <intrin.h>
-#endif
-
-#if BOOST_LANG_HIP
-// HIP defines some keywords like __forceinline__ in header files.
-#    include <hip/hip_runtime.h>
-#endif
-
-//! All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC.
-//!
-//! \code{.cpp}
-//! Usage:
-//! ALPAKA_FN_ACC
-//! auto add(std::int32_t a, std::int32_t b)
-//! -> std::int32_t;
-//! \endcode
-//! @{
-#if BOOST_LANG_CUDA || BOOST_LANG_HIP
-#    if defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) || defined(ALPAKA_ACC_GPU_HIP_ONLY_MODE)
-#        define ALPAKA_FN_ACC __device__
-#    else
-#        define ALPAKA_FN_ACC __device__ __host__
-#    endif
-#    define ALPAKA_FN_HOST_ACC __device__ __host__
-#    define ALPAKA_FN_HOST __host__
-#else
-#    define ALPAKA_FN_ACC
-#    define ALPAKA_FN_HOST_ACC
-#    define ALPAKA_FN_HOST
-#endif
-//! @}
-
-//! All functions marked with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC that are exported to / imported from different
-//! translation units have to be attributed with ALPAKA_FN_EXTERN. Note that this needs to be applied to both the
-//! declaration and the definition.
-//!
-//! Usage:
-//! ALPAKA_FN_ACC ALPAKA_FN_EXTERN auto add(std::int32_t a, std::int32_t b) -> std::int32_t;
-//!
-//! Warning: If this is used together with the SYCL back-end make sure that your SYCL runtime supports generic
-//! address spaces. Otherwise it is forbidden to use pointers as parameter or return type for functions marked
-//! with ALPAKA_FN_EXTERN.
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-/*
-   This is required by the SYCL standard, section 5.10.1 "SYCL functions and member functions linkage":
-
-   The default behavior in SYCL applications is that all the definitions and declarations of the functions and member
-   functions are available to the SYCL compiler, in the same translation unit. When this is not the case, all the
-   symbols that need to be exported to a SYCL library or from a C++ library to a SYCL application need to be defined
-   using the macro: SYCL_EXTERNAL.
-*/
-#    define ALPAKA_FN_EXTERN SYCL_EXTERNAL
-#else
-#    define ALPAKA_FN_EXTERN
-#endif
-
-//! Disable nvcc warning:
-//! 'calling a __host__ function from __host__ __device__ function.'
-//! Usage:
-//! ALPAKA_NO_HOST_ACC_WARNING
-//! ALPAKA_FN_HOST_ACC function_declaration()
-//! WARNING: Only use this method if there is no other way.
-//! Most cases can be solved by #if BOOST_ARCH_PTX or #if BOOST_LANG_CUDA.
-#if(BOOST_LANG_CUDA && !BOOST_COMP_CLANG_CUDA)
-#    if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
-#        define ALPAKA_NO_HOST_ACC_WARNING __pragma(hd_warning_disable)
-#    else
-#        define ALPAKA_NO_HOST_ACC_WARNING _Pragma("hd_warning_disable")
-#    endif
-#else
-#    define ALPAKA_NO_HOST_ACC_WARNING
-#endif
-
-//! Macro defining the inline function attribute.
-//!
-//! The macro should stay on the left hand side of keywords, e.g. 'static', 'constexpr', 'explicit' or the return type.
-#if BOOST_LANG_CUDA || BOOST_LANG_HIP
-#    define ALPAKA_FN_INLINE __forceinline__
-#elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
-// TODO: With C++20 [[msvc::forceinline]] can be used.
-#    define ALPAKA_FN_INLINE __forceinline
-#else
-// For gcc, clang, and clang-based compilers like Intel icpx
-#    define ALPAKA_FN_INLINE [[gnu::always_inline]] inline
-#endif
-
-//! This macro defines a variable lying in global accelerator device memory.
-//!
-//! Example:
-//!   ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, int> variable;
-//!
-//! Those variables behave like ordinary variables when used in file-scope,
-//! but inside kernels the get() method must be used to access the variable.
-//! They are declared inline to resolve to a single instance across multiple
-//! translation units.
-//! Like ordinary variables, only one definition is allowed (ODR)
-//! Failure to do so might lead to linker errors.
-//!
-//! In contrast to ordinary variables, you can not define such variables
-//! as static compilation unit local variables with internal linkage
-//! because this is forbidden by CUDA.
-//!
-//! \attention It is not allowed to initialize the variable together with the declaration.
-//!            To initialize the variable alpaka::memcpy must be used.
-//! \code{.cpp}
-//! ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, int> foo;
-//!
-//! struct DeviceMemoryKernel
-//! {
-//!    ALPAKA_NO_HOST_ACC_WARNING
-//!    template<typename TAcc>
-//!    ALPAKA_FN_ACC void operator()(TAcc const& acc) const
-//!    {
-//!      auto a = foo<TAcc>.get();
-//!    }
-//!  }
-//!
-//! void initFoo() {
-//!     auto extent = alpaka::Vec<alpaka::DimInt<1u>, size_t>{1};
-//!     int initialValue = 42;
-//!     alpaka::ViewPlainPtr<DevHost, int, alpaka::DimInt<1u>, size_t> bufHost(&initialValue, devHost, extent);
-//!     alpaka::memcpy(queue, foo<Acc>, bufHost, extent);
-//! }
-//! \endcode
-#if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX)              \
-    || BOOST_LANG_HIP)
-#    if defined(__CUDACC_RDC__) || defined(__CLANG_RDC__)
-#        define ALPAKA_STATIC_ACC_MEM_GLOBAL                                                                          \
-            template<typename TAcc>                                                                                   \
-            __device__ inline
-#    else
-#        define ALPAKA_STATIC_ACC_MEM_GLOBAL                                                                          \
-            template<typename TAcc>                                                                                   \
-            __device__ static
-#    endif
-#else
-#    define ALPAKA_STATIC_ACC_MEM_GLOBAL                                                                              \
-        template<typename TAcc>                                                                                       \
-        inline
-#endif
-
-//! This macro defines a variable lying in constant accelerator device memory.
-//!
-//! Example:
-//!   ALPAKA_STATIC_ACC_MEM_CONSTANT alpaka::DevGlobal<TAcc, const int> variable;
-//!
-//! Those variables behave like ordinary variables when used in file-scope,
-//! but inside kernels the get() method must be used to access the variable.
-//! They are declared inline to resolve to a single instance across multiple
-//! translation units.
-//! Like ordinary variables, only one definition is allowed (ODR)
-//! Failure to do so might lead to linker errors.
-//!
-//! In contrast to ordinary variables, you can not define such variables
-//! as static compilation unit local variables with internal linkage
-//! because this is forbidden by CUDA.
-//!
-//! \attention It is not allowed to initialize the variable together with the declaration.
-//!            To initialize the variable alpaka::memcpy must be used.
-//! \code{.cpp}
-//! ALPAKA_STATIC_ACC_MEM_CONSTANT alpaka::DevGlobal<TAcc, const int> foo;
-//!
-//! struct DeviceMemoryKernel
-//! {
-//!    ALPAKA_NO_HOST_ACC_WARNING
-//!    template<typename TAcc>
-//!    ALPAKA_FN_ACC void operator()(TAcc const& acc) const
-//!    {
-//!      auto a = foo<TAcc>.get();
-//!    }
-//!  }
-//!
-//! void initFoo() {
-//!     auto extent = alpaka::Vec<alpaka::DimInt<1u>, size_t>{1};
-//!     int initialValue = 42;
-//!     alpaka::ViewPlainPtr<DevHost, int, alpaka::DimInt<1u>, size_t> bufHost(&initialValue, devHost, extent);
-//!     alpaka::memcpy(queue, foo<Acc>, bufHost, extent);
-//! }
-//! \endcode
-#if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX)              \
-    || BOOST_LANG_HIP)
-#    if defined(__CUDACC_RDC__) || defined(__CLANG_RDC__)
-#        define ALPAKA_STATIC_ACC_MEM_CONSTANT                                                                        \
-            template<typename TAcc>                                                                                   \
-            __constant__ inline
-#    else
-#        define ALPAKA_STATIC_ACC_MEM_CONSTANT                                                                        \
-            template<typename TAcc>                                                                                   \
-            __constant__ static
-#    endif
-#else
-#    define ALPAKA_STATIC_ACC_MEM_CONSTANT                                                                            \
-        template<typename TAcc>                                                                                       \
-        inline
-#endif
-
-//! This macro disables memory optimizations for annotated device memory.
-//!
-//! Example:
-//!   ALPAKA_DEVICE_VOLATILE float* ptr;
-//!
-//! This is useful for pointers, (shared) variables and shared memory which are used in combination with
-//! the alpaka::mem_fence() function. It ensures that memory annotated with this macro will always be written directly
-//! to memory (and not to a register or cache because of compiler optimizations).
-#if(BOOST_LANG_CUDA && BOOST_ARCH_PTX)                                                                                \
-    || (BOOST_LANG_HIP && defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1)
-#    define ALPAKA_DEVICE_VOLATILE volatile
-#else
-#    define ALPAKA_DEVICE_VOLATILE
-#endif
diff --git a/include/alpaka/core/Concepts.hpp b/include/alpaka/core/Concepts.hpp
deleted file mode 100644
index 443f347..0000000
--- a/include/alpaka/core/Concepts.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka::concepts
-{
-    //! Tag used in class inheritance hierarchies that describes that a specific concept (TConcept)
-    //! is implemented by the given base class (TBase).
-    template<typename TConcept, typename TBase>
-    struct Implements
-    {
-    };
-
-    //! Checks whether the concept is implemented by the given class
-    template<typename TConcept, typename TDerived>
-    struct ImplementsConcept
-    {
-        template<typename TBase>
-        static auto implements(Implements<TConcept, TBase>&) -> std::true_type;
-        static auto implements(...) -> std::false_type;
-
-        static constexpr auto value = decltype(implements(std::declval<TDerived&>()))::value;
-    };
-
-    namespace detail
-    {
-        //! Returns the type that implements the given concept in the inheritance hierarchy.
-        template<typename TConcept, typename TDerived, typename Sfinae = void>
-        struct ImplementationBaseType;
-
-        //! Base case for types that do not inherit from "Implements<TConcept, ...>" is the type itself.
-        template<typename TConcept, typename TDerived>
-        struct ImplementationBaseType<
-            TConcept,
-            TDerived,
-            std::enable_if_t<!ImplementsConcept<TConcept, TDerived>::value>>
-        {
-            using type = TDerived;
-        };
-
-        //! For types that inherit from "Implements<TConcept, ...>" it finds the base class (TBase) which
-        //! implements the concept.
-        template<typename TConcept, typename TDerived>
-        struct ImplementationBaseType<
-            TConcept,
-            TDerived,
-            std::enable_if_t<ImplementsConcept<TConcept, TDerived>::value>>
-        {
-            template<typename TBase>
-            static auto implementer(Implements<TConcept, TBase>&) -> TBase;
-
-            using type = decltype(implementer(std::declval<TDerived&>()));
-
-            static_assert(
-                std::is_base_of_v<type, TDerived>,
-                "The type implementing the concept has to be a publicly accessible base class!");
-        };
-    } // namespace detail
-
-    //! Returns the type that implements the given concept in the inheritance hierarchy.
-    template<typename TConcept, typename TDerived>
-    using ImplementationBase = typename detail::ImplementationBaseType<TConcept, TDerived>::type;
-} // namespace alpaka::concepts
diff --git a/include/alpaka/core/Cuda.hpp b/include/alpaka/core/Cuda.hpp
deleted file mode 100644
index 8332ad3..0000000
--- a/include/alpaka/core/Cuda.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/CudaHipCommon.hpp"
-
-#include <iostream>
-#include <stdexcept>
-#include <string>
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-namespace alpaka::cuda::detail
-{
-    //! CUDA driver API error checking with log and exception, ignoring specific error values
-    ALPAKA_FN_HOST inline auto cudaDrvCheck(CUresult const& error, char const* desc, char const* file, int const& line)
-        -> void
-    {
-        if(error == CUDA_SUCCESS)
-            return;
-
-        char const* cu_err_name = nullptr;
-        char const* cu_err_string = nullptr;
-        CUresult cu_result_name = cuGetErrorName(error, &cu_err_name);
-        CUresult cu_result_string = cuGetErrorString(error, &cu_err_string);
-        std::string sError = std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '";
-        if(cu_result_name == CUDA_SUCCESS && cu_result_string == CUDA_SUCCESS)
-        {
-            sError += std::string(cu_err_name) + "': '" + std::string(cu_err_string) + "'!";
-        }
-        else
-        {
-            // cuGetError*() failed, so append corresponding error message
-            if(cu_result_name == CUDA_ERROR_INVALID_VALUE)
-            {
-                sError += " cuGetErrorName: 'Invalid Value'!";
-            }
-            if(cu_result_string == CUDA_ERROR_INVALID_VALUE)
-            {
-                sError += " cuGetErrorString: 'Invalid Value'!";
-            }
-        }
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-        std::cerr << sError << std::endl;
-#    endif
-        ALPAKA_DEBUG_BREAK;
-        throw std::runtime_error(sError);
-    }
-} // namespace alpaka::cuda::detail
-
-//! CUDA driver error checking with log and exception.
-#    define ALPAKA_CUDA_DRV_CHECK(cmd) ::alpaka::cuda::detail::cudaDrvCheck(cmd, #cmd, __FILE__, __LINE__)
-
-#    include "alpaka/core/UniformCudaHip.hpp"
-
-#endif
diff --git a/include/alpaka/core/CudaHipCommon.hpp b/include/alpaka/core/CudaHipCommon.hpp
deleted file mode 100644
index b3fdd7d..0000000
--- a/include/alpaka/core/CudaHipCommon.hpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
-                  Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/elem/Traits.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/meta/Concatenate.hpp"
-#include "alpaka/meta/TypeListOps.hpp"
-#include "alpaka/offset/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <tuple>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-#        include <cuda.h>
-#        include <cuda_runtime.h>
-#    endif
-
-#    ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-#        include <hip/hip_runtime.h>
-#    endif
-
-namespace alpaka
-{
-    namespace detail
-    {
-        using CudaHipBuiltinTypes1 = std::
-            tuple<char1, double1, float1, int1, long1, longlong1, short1, uchar1, uint1, ulong1, ulonglong1, ushort1>;
-        using CudaHipBuiltinTypes2 = std::
-            tuple<char2, double2, float2, int2, long2, longlong2, short2, uchar2, uint2, ulong2, ulonglong2, ushort2>;
-        using CudaHipBuiltinTypes3 = std::tuple<
-            char3,
-            dim3,
-            double3,
-            float3,
-            int3,
-            long3,
-            longlong3,
-            short3,
-            uchar3,
-            uint3,
-            ulong3,
-            ulonglong3,
-            ushort3
-// CUDA built-in variables have special types in clang native CUDA compilation
-// defined in cuda_builtin_vars.h
-#    if BOOST_COMP_CLANG_CUDA
-            ,
-            __cuda_builtin_threadIdx_t,
-            __cuda_builtin_blockIdx_t,
-            __cuda_builtin_blockDim_t,
-            __cuda_builtin_gridDim_t
-#    endif
-            >;
-        using CudaHipBuiltinTypes4 = std::
-            tuple<char4, double4, float4, int4, long4, longlong4, short4, uchar4, uint4, ulong4, ulonglong4, ushort4>;
-        using CudaHipBuiltinTypes = meta::
-            Concatenate<CudaHipBuiltinTypes1, CudaHipBuiltinTypes2, CudaHipBuiltinTypes3, CudaHipBuiltinTypes4>;
-
-        template<typename T>
-        inline constexpr auto isCudaHipBuiltInType = meta::Contains<CudaHipBuiltinTypes, T>::value;
-    } // namespace detail
-
-#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-    namespace cuda::trait
-    {
-        template<typename T>
-        inline constexpr auto isCudaBuiltInType = alpaka::detail::isCudaHipBuiltInType<T>;
-    } // namespace cuda::trait
-#    endif
-
-#    ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-    namespace hip::trait
-    {
-        template<typename T>
-        inline constexpr auto isHipBuiltInType = alpaka::detail::isCudaHipBuiltInType<T>;
-    } // namespace hip::trait
-#    endif
-
-    namespace trait
-    {
-        //! The CUDA/HIP vectors 1D dimension get trait specialization.
-        template<typename T>
-        struct DimType<T, std::enable_if_t<meta::Contains<alpaka::detail::CudaHipBuiltinTypes1, T>::value>>
-        {
-            using type = DimInt<1u>;
-        };
-
-        //! The CUDA/HIP vectors 2D dimension get trait specialization.
-        template<typename T>
-        struct DimType<T, std::enable_if_t<meta::Contains<alpaka::detail::CudaHipBuiltinTypes2, T>::value>>
-        {
-            using type = DimInt<2u>;
-        };
-
-        //! The CUDA/HIP vectors 3D dimension get trait specialization.
-        template<typename T>
-        struct DimType<T, std::enable_if_t<meta::Contains<alpaka::detail::CudaHipBuiltinTypes3, T>::value>>
-        {
-            using type = DimInt<3u>;
-        };
-
-        //! The CUDA/HIP vectors 4D dimension get trait specialization.
-        template<typename T>
-        struct DimType<T, std::enable_if_t<meta::Contains<alpaka::detail::CudaHipBuiltinTypes4, T>::value>>
-        {
-            using type = DimInt<4u>;
-        };
-
-        //! The CUDA/HIP vectors elem type trait specialization.
-        template<typename T>
-        struct ElemType<T, std::enable_if_t<alpaka::detail::isCudaHipBuiltInType<T>>>
-        {
-            using type = decltype(std::declval<T>().x);
-        };
-
-        template<typename TCudaHipBuiltin>
-        struct GetExtents<TCudaHipBuiltin, std::enable_if_t<alpaka::detail::isCudaHipBuiltInType<TCudaHipBuiltin>>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto operator()(TCudaHipBuiltin const& value) const
-                -> Vec<Dim<TCudaHipBuiltin>, Idx<TCudaHipBuiltin>>
-            {
-                constexpr auto dim = Dim<TCudaHipBuiltin>::value;
-                if constexpr(dim == 1)
-                    return {value.x};
-                else if constexpr(dim == 2)
-                    return {value.y, value.x};
-                else if constexpr(dim == 3)
-                    return {value.z, value.y, value.x};
-                else if constexpr(dim == 4)
-                    return {value.w, value.z, value.y, value.x};
-                else
-                    static_assert(sizeof(value) == 0, "Not implemented");
-
-                ALPAKA_UNREACHABLE({});
-            }
-        };
-
-        template<typename TCudaHipBuiltin>
-        struct GetOffsets<TCudaHipBuiltin, std::enable_if_t<alpaka::detail::isCudaHipBuiltInType<TCudaHipBuiltin>>>
-            : GetExtents<TCudaHipBuiltin>
-        {
-        };
-
-        //! The CUDA/HIP vectors idx type trait specialization.
-        template<typename TIdx>
-        struct IdxType<TIdx, std::enable_if_t<alpaka::detail::isCudaHipBuiltInType<TIdx>>>
-        {
-            using type = std::size_t;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/core/Debug.hpp b/include/alpaka/core/Debug.hpp
deleted file mode 100644
index dc70ed5..0000000
--- a/include/alpaka/core/Debug.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2022 Alexander Matthes, Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-#include <iostream>
-#include <string>
-#include <utility>
-
-//! The no debug level.
-#define ALPAKA_DEBUG_DISABLED 0
-//! The minimal debug level.
-#define ALPAKA_DEBUG_MINIMAL 1
-//! The full debug level.
-#define ALPAKA_DEBUG_FULL 2
-
-#ifndef ALPAKA_DEBUG
-//! Set the minimum log level if it is not defined.
-#    define ALPAKA_DEBUG ALPAKA_DEBUG_DISABLED
-#endif
-
-namespace alpaka::core::detail
-{
-    //! Scope logger.
-    class ScopeLogStdOut final
-    {
-    public:
-        explicit ScopeLogStdOut(std::string sScope) : m_sScope(std::move(sScope))
-        {
-            std::cout << "[+] " << m_sScope << std::endl;
-        }
-
-        ScopeLogStdOut(ScopeLogStdOut const&) = delete;
-        ScopeLogStdOut(ScopeLogStdOut&&) = delete;
-        auto operator=(ScopeLogStdOut const&) -> ScopeLogStdOut& = delete;
-        auto operator=(ScopeLogStdOut&&) -> ScopeLogStdOut& = delete;
-
-        ~ScopeLogStdOut()
-        {
-            std::cout << "[-] " << m_sScope << std::endl;
-        }
-
-    private:
-        std::string const m_sScope;
-    };
-} // namespace alpaka::core::detail
-
-// Define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE.
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-#    define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE ::alpaka::core::detail::ScopeLogStdOut const scopeLogStdOut(__func__)
-#else
-#    define ALPAKA_DEBUG_MINIMAL_LOG_SCOPE
-#endif
-
-// Define ALPAKA_DEBUG_FULL_LOG_SCOPE.
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-#    define ALPAKA_DEBUG_FULL_LOG_SCOPE ::alpaka::core::detail::ScopeLogStdOut const scopeLogStdOut(__func__)
-#else
-#    define ALPAKA_DEBUG_FULL_LOG_SCOPE
-#endif
-
-// Define ALPAKA_DEBUG_BREAK.
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG
-#        define ALPAKA_DEBUG_BREAK ::__builtin_trap()
-#    elif BOOST_COMP_MSVC
-#        define ALPAKA_DEBUG_BREAK ::__debugbreak()
-#    else
-#        define ALPAKA_DEBUG_BREAK
-  // #error debug-break for current compiler not implemented!
-#    endif
-#else
-#    define ALPAKA_DEBUG_BREAK
-#endif
diff --git a/include/alpaka/core/Decay.hpp b/include/alpaka/core/Decay.hpp
deleted file mode 100644
index 6b978f5..0000000
--- a/include/alpaka/core/Decay.hpp
+++ /dev/null
@@ -1,16 +0,0 @@
-/* Copyright 2023 Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //! Provides a decaying wrapper around std::is_same. Example: is_decayed_v<volatile float, float> returns true.
-    template<typename T, typename U>
-    inline constexpr auto is_decayed_v = std::is_same_v<std::decay_t<T>, std::decay_t<U>>;
-} // namespace alpaka
diff --git a/include/alpaka/core/DemangleTypeNames.hpp b/include/alpaka/core/DemangleTypeNames.hpp
deleted file mode 100644
index 5650054..0000000
--- a/include/alpaka/core/DemangleTypeNames.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright 2022 Andrea Bocci, Antonio Di Pilato
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-#include <boost/core/demangle.hpp>
-
-namespace alpaka::core
-{
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wexit-time-destructors"
-#    pragma clang diagnostic ignored "-Wmissing-variable-declarations"
-#endif
-    template<typename T>
-    inline const std::string demangled = boost::core::demangle(typeid(T).name());
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-} // namespace alpaka::core
diff --git a/include/alpaka/core/Hip.hpp b/include/alpaka/core/Hip.hpp
deleted file mode 100644
index 2c2e425..0000000
--- a/include/alpaka/core/Hip.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/CudaHipCommon.hpp"
-#include "alpaka/core/UniformCudaHip.hpp"
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-#    if !BOOST_LANG_HIP && !defined(ALPAKA_HOST_ONLY)
-#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#    endif
-#endif
diff --git a/include/alpaka/core/OmpSchedule.hpp b/include/alpaka/core/OmpSchedule.hpp
deleted file mode 100644
index 722b77b..0000000
--- a/include/alpaka/core/OmpSchedule.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-#ifdef _OPENMP
-#    include <omp.h>
-#endif
-
-#include <cstdint>
-
-namespace alpaka::omp
-{
-    //! Representation of OpenMP schedule information: kind and chunk size. This class can be used regardless of
-    //! whether OpenMP is enabled.
-    struct Schedule
-    {
-        //! Schedule kinds corresponding to arguments of OpenMP schedule clause
-        //!
-        //! Kinds also present in omp_sched_t enum have the same integer values.
-        //! It is enum, not enum class, for shorter usage as omp::Schedule::[kind] and to keep interface of 0.6.0.
-        enum Kind
-        {
-            // Corresponds to not setting schedule
-            NoSchedule,
-            Static = 1u,
-            Dynamic = 2u,
-            Guided = 3u,
-            // Auto supported since OpenMP 3.0
-#if defined _OPENMP && _OPENMP >= 200805
-            Auto = 4u,
-#endif
-            Runtime = 5u
-        };
-
-        //! Schedule kind.
-        Kind kind;
-
-        //! Chunk size. Same as in OpenMP, value 0 corresponds to default chunk size. Using int and not a
-        //! fixed-width type to match OpenMP API.
-        int chunkSize;
-
-        //! Create a schedule with the given kind and chunk size
-        ALPAKA_FN_HOST constexpr Schedule(Kind myKind = NoSchedule, int myChunkSize = 0)
-            : kind(myKind)
-            , chunkSize(myChunkSize)
-        {
-        }
-    };
-
-    //! Get the OpenMP schedule that is applied when the runtime schedule is used.
-    //!
-    //! For OpenMP >= 3.0 returns the value of the internal control variable run-sched-var.
-    //! Without OpenMP or with OpenMP < 3.0, returns the default schedule.
-    //!
-    //! \return Schedule object.
-    ALPAKA_FN_HOST inline auto getSchedule()
-    {
-        // Getting a runtime schedule requires OpenMP 3.0 or newer
-#if defined _OPENMP && _OPENMP >= 200805
-        omp_sched_t ompKind;
-        int chunkSize = 0;
-        omp_get_schedule(&ompKind, &chunkSize);
-        return Schedule{static_cast<Schedule::Kind>(ompKind), chunkSize};
-#else
-        return Schedule{};
-#endif
-    }
-
-    //! Set the OpenMP schedule that is applied when the runtime schedule is used for future parallel regions.
-    //!
-    //! For OpenMP >= 3.0 sets the value of the internal control variable run-sched-var according to the given
-    //! schedule. Without OpenMP or with OpenMP < 3.0, does nothing.
-    //!
-    //! Note that calling from inside a parallel region does not have an immediate effect.
-    ALPAKA_FN_HOST inline void setSchedule(Schedule schedule)
-    {
-        if((schedule.kind != Schedule::NoSchedule) && (schedule.kind != Schedule::Runtime))
-        {
-#if defined _OPENMP && _OPENMP >= 200805
-            omp_set_schedule(static_cast<omp_sched_t>(schedule.kind), schedule.chunkSize);
-#endif
-        }
-    }
-} // namespace alpaka::omp
diff --git a/include/alpaka/core/Positioning.hpp b/include/alpaka/core/Positioning.hpp
deleted file mode 100644
index 8f3d9b8..0000000
--- a/include/alpaka/core/Positioning.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz, René Widera
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-namespace alpaka
-{
-    //! Defines the parallelism hierarchy levels of alpaka
-    namespace hierarchy
-    {
-        struct Grids
-        {
-        };
-
-        struct Blocks
-        {
-        };
-
-        struct Threads
-        {
-        };
-    } // namespace hierarchy
-
-    //! Defines the origins available for getting extent and indices of kernel executions.
-    namespace origin
-    {
-        //! This type is used to get the extents/indices relative to the grid.
-        struct Grid;
-        //! This type is used to get the extent/indices relative to a/the current block.
-        struct Block;
-        //! This type is used to get the extents relative to the thread.
-        struct Thread;
-    } // namespace origin
-
-    //! Defines the units available for getting extent and indices of kernel executions.
-    namespace unit
-    {
-        //! This type is used to get the extent/indices in units of blocks.
-        struct Blocks;
-        //! This type is used to get the extent/indices in units of threads.
-        struct Threads;
-        //! This type is used to get the extents/indices in units of elements.
-        struct Elems;
-    } // namespace unit
-
-    using namespace origin;
-    using namespace unit;
-} // namespace alpaka
diff --git a/include/alpaka/core/RemoveRestrict.hpp b/include/alpaka/core/RemoveRestrict.hpp
deleted file mode 100644
index 316630f..0000000
--- a/include/alpaka/core/RemoveRestrict.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright 2021 Rene Widera
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-namespace alpaka
-{
-    //! Removes __restrict__ from a type
-    template<typename T>
-    struct remove_restrict
-    {
-        using type = T;
-    };
-
-#if BOOST_COMP_MSVC
-    template<typename T>
-    struct remove_restrict<T* __restrict>
-    {
-        using type = T*;
-    };
-#else
-    template<typename T>
-    struct remove_restrict<T* __restrict__>
-    {
-        using type = T*;
-    };
-#endif
-
-    //! Helper to remove __restrict__ from a type
-    template<typename T>
-    using remove_restrict_t = typename remove_restrict<T>::type;
-} // namespace alpaka
diff --git a/include/alpaka/core/RuntimeMacros.hpp b/include/alpaka/core/RuntimeMacros.hpp
deleted file mode 100644
index 80faa33..0000000
--- a/include/alpaka/core/RuntimeMacros.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2022  Andrea Bocci, Mehmet Yusufoglu, René Widera, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Implementation details
-#include "alpaka/core/Sycl.hpp"
-
-//! ALPAKA_THROW_ACC either aborts(terminating the program and creating a core dump) or throws std::runtime_error
-//! depending on the Acc. The std::runtime_error exception can be catched in the catch block.
-//!
-//! For CUDA __trap function is used which triggers std::runtime_error but can be catched during wait not exec.
-//! For HIP abort() function is used and calls __builtin_trap()
-//! For Sycl assert(false) is not used since it can be disabled -DNDEBUG compile option. abort() is used although it
-//! generates a runtime error instead of aborting in GPUs: "Caught synchronous SYCL exception: Unresolved Symbol
-//! <abort> -999 (Unknown PI error)."
-//!
-//! The OpenMP specification mandates that exceptions thrown by some thread must be handled by the same thread.
-//! Therefore std::runtime_error thrown by ALPAKA_THROW_ACC aborts the the program for OpenMP backends. If needed
-//! the SIGABRT signal can be catched by signal handler.
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)
-#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
-        {                                                                                                             \
-            printf(                                                                                                   \
-                "alpaka encountered a user-defined error condition while running on the CUDA back-end:\n%s",          \
-                (MSG));                                                                                               \
-            __trap();                                                                                                 \
-        }
-#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)
-#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
-        {                                                                                                             \
-            printf(                                                                                                   \
-                "alpaka encountered a user-defined error condition while running on the HIP back-end:\n%s",           \
-                (MSG));                                                                                               \
-            abort();                                                                                                  \
-        }
-#elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(__SYCL_DEVICE_ONLY__)
-#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
-        {                                                                                                             \
-            printf(                                                                                                   \
-                "alpaka encountered a user-defined error condition while running on the SYCL back-end:\n%s",          \
-                (MSG));                                                                                               \
-            abort();                                                                                                  \
-        }
-#else
-#    define ALPAKA_THROW_ACC(MSG)                                                                                     \
-        {                                                                                                             \
-            printf("alpaka encountered a user-defined error condition:\n%s", (MSG));                                  \
-            throw std::runtime_error(MSG);                                                                            \
-        }
-#endif
diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp
deleted file mode 100644
index c29fccd..0000000
--- a/include/alpaka/core/Sycl.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/elem/Traits.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/meta/IntegerSequence.hpp"
-#include "alpaka/offset/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <array>
-#include <cstddef>
-#include <cstdio> // the #define printf(...) breaks <cstdio> if it is included afterwards
-#include <iostream>
-#include <stdexcept>
-#include <string>
-#include <type_traits>
-#include <utility>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-// if SYCL is enabled with the AMD backend the printf will be killed because of missing compiler support
-#    ifdef __AMDGCN__
-#        define printf(...)
-#    else
-
-#        ifdef __SYCL_DEVICE_ONLY__
-using AlpakaFormat = char const* [[clang::opencl_constant]];
-#        else
-using AlpakaFormat = char const*;
-#        endif
-
-#        if BOOST_COMP_CLANG
-#            pragma clang diagnostic push
-#            pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
-#        endif
-
-#        define printf(FORMAT, ...)                                                                                   \
-            do                                                                                                        \
-            {                                                                                                         \
-                static auto const format = AlpakaFormat{FORMAT};                                                      \
-                sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__);                                       \
-            } while(false)
-
-#        if BOOST_COMP_CLANG
-#            pragma clang diagnostic pop
-#        endif
-
-#    endif
-
-// SYCL vector types trait specializations.
-namespace alpaka
-{
-    namespace detail
-    {
-        // Remove std::is_same boilerplate
-        template<typename T, typename... Ts>
-        struct is_any : std::bool_constant<(std::is_same_v<T, Ts> || ...)>
-        {
-        };
-    } // namespace detail
-
-    //! In contrast to CUDA SYCL doesn't know 1D vectors. It does
-    //! support OpenCL's data types which have additional requirements
-    //! on top of those in the C++ standard. Note that SYCL's equivalent
-    //! to CUDA's dim3 type is a different class type and thus not used
-    //! here.
-    template<typename T>
-    struct IsSyclBuiltInType
-        : detail::is_any<
-              T,
-              // built-in scalar types - these are the standard C++ built-in types, std::size_t, std::byte and
-              // sycl::half
-              sycl::half,
-
-              // 2 component vector types
-              sycl::char2,
-              sycl::uchar2,
-              sycl::short2,
-              sycl::ushort2,
-              sycl::int2,
-              sycl::uint2,
-              sycl::long2,
-              sycl::ulong2,
-              sycl::float2,
-              sycl::double2,
-              sycl::half2,
-
-              // 3 component vector types
-              sycl::char3,
-              sycl::uchar3,
-              sycl::short3,
-              sycl::ushort3,
-              sycl::int3,
-              sycl::uint3,
-              sycl::long3,
-              sycl::ulong3,
-              sycl::float3,
-              sycl::double3,
-              sycl::half3,
-
-              // 4 component vector types
-              sycl::char4,
-              sycl::uchar4,
-              sycl::short4,
-              sycl::ushort4,
-              sycl::int4,
-              sycl::uint4,
-              sycl::long4,
-              sycl::ulong4,
-              sycl::float4,
-              sycl::double4,
-              sycl::half4,
-
-              // 8 component vector types
-              sycl::char8,
-              sycl::uchar8,
-              sycl::short8,
-              sycl::ushort8,
-              sycl::int8,
-              sycl::uint8,
-              sycl::long8,
-              sycl::ulong8,
-              sycl::float8,
-              sycl::double8,
-              sycl::half8,
-
-              // 16 component vector types
-              sycl::char16,
-              sycl::uchar16,
-              sycl::short16,
-              sycl::ushort16,
-              sycl::int16,
-              sycl::uint16,
-              sycl::long16,
-              sycl::ulong16,
-              sycl::float16,
-              sycl::double16,
-              sycl::half16>
-    {
-    };
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    //! SYCL's types get trait specialization.
-    template<typename T>
-    struct DimType<T, std::enable_if_t<IsSyclBuiltInType<T>::value>>
-    {
-        using type = std::conditional_t<std::is_scalar_v<T>, DimInt<std::size_t{1}>, DimInt<T::size()>>;
-    };
-
-    //! The SYCL vectors' elem type trait specialization.
-    template<typename T>
-    struct ElemType<T, std::enable_if_t<IsSyclBuiltInType<T>::value>>
-    {
-        using type = std::conditional_t<std::is_scalar_v<T>, T, typename T::element_type>;
-    };
-
-    //! The SYCL vectors' extent get trait specialization.
-    template<typename T>
-    struct GetExtents<T, std::enable_if_t<IsSyclBuiltInType<T>::value>>
-    {
-        auto operator()(T const& value) const
-        {
-            if constexpr(std::is_scalar_v<T>)
-                return value;
-            else
-                return impl(value, std::make_index_sequence<Dim<T>::value>{});
-        }
-
-    private:
-        template<std::size_t... Is>
-        auto impl(T const& value, std::index_sequence<Is...>) const
-        {
-            return Vec{value.template swizzle<Is>()...};
-        }
-    };
-
-    //! The SYCL vectors' offset get trait specialization.
-    template<typename T>
-    struct GetOffsets<T, std::enable_if_t<IsSyclBuiltInType<T>::value>> : GetExtents<T>
-    {
-    };
-
-    //! The SYCL vectors' idx type trait specialization.
-    template<typename TIdx>
-    struct IdxType<TIdx, std::enable_if_t<IsSyclBuiltInType<TIdx>::value>>
-    {
-        using type = std::size_t;
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/core/ThreadPool.hpp b/include/alpaka/core/ThreadPool.hpp
deleted file mode 100644
index b59555a..0000000
--- a/include/alpaka/core/ThreadPool.hpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber, Jeffrey Kelling
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-#include <atomic>
-#include <future>
-#include <mutex>
-#include <optional>
-#include <queue>
-#include <vector>
-
-namespace alpaka::core::detail
-{
-    //! A thread pool yielding when there is not enough work to be done.
-    struct ThreadPool final
-    {
-        using Task = std::packaged_task<void()>;
-
-        //! Creates a thread pool with a given thread count
-        explicit ThreadPool(std::size_t threadCount)
-        {
-            if(threadCount < 1)
-                throw std::invalid_argument("The argument 'threadCount' has to be greate or equal to one!");
-            m_threads.reserve(threadCount);
-            for(std::size_t i = 0; i < threadCount; ++i)
-                m_threads.emplace_back([this] { threadFunc(); });
-        }
-
-        //! Destroys the thread pool, blocking until all enqueued work is done.
-        ~ThreadPool()
-        {
-            m_stop = true; // Signal that concurrent executors should not perform any new work
-            for(auto& t : m_threads)
-            {
-                if(std::this_thread::get_id() == t.get_id())
-                {
-                    std::cerr << "ERROR in ThreadPool joins itself" << std::endl;
-                    std::abort();
-                }
-                t.join();
-            }
-        }
-
-        //! Runs the given function on one of the pool in First In First Out (FIFO) order.
-        //!
-        //! \param task Function object to be called on the pool. Takes an arbitrary number of arguments. Must return
-        //!             void.
-        //! \param args Arguments for task, cannot be moved. If such parameters must be used, use a lambda and capture
-        //!             via move then move the lambda.
-        //! \return     A future to the created task.
-        template<typename TFnObj, typename... TArgs>
-        auto enqueueTask(TFnObj&& task, TArgs&&... args) -> std::future<void>
-        {
-#if BOOST_COMP_MSVC
-// MSVC 14.39.33519 is throwing an error because the noexcept type deduction is not defined in original C++17
-// error C2065: 'task': undeclared identifier
-// see: https://stackoverflow.com/a/72467726
-#    define ALPAKA_NOEXCEPT(...)
-#else
-#    define ALPAKA_NOEXCEPT(...) noexcept(__VA_ARGS__)
-#endif
-            auto ptask
-                = Task{[=, t = std::forward<TFnObj>(task)]() ALPAKA_NOEXCEPT(noexcept(task(args...))) { t(args...); }};
-#undef ALPAKA_NOEXCEPT
-
-            auto future = ptask.get_future();
-            {
-                std::lock_guard<std::mutex> lock{m_mutex};
-                m_tasks.push(std::move(ptask));
-            }
-            return future;
-        }
-
-    private:
-        void threadFunc()
-        {
-            while(!m_stop.load(std::memory_order_relaxed))
-            {
-                std::optional<Task> task;
-                {
-                    std::lock_guard<std::mutex> lock{m_mutex};
-                    if(!m_tasks.empty())
-                    {
-                        task = std::move(m_tasks.front());
-                        m_tasks.pop();
-                    }
-                }
-                if(task)
-                    (*task)();
-                else
-                    std::this_thread::yield();
-            }
-        }
-
-        std::vector<std::thread> m_threads;
-        std::queue<Task> m_tasks; // TODO(bgruber): we could consider a lock-free queue here
-        std::mutex m_mutex;
-        std::atomic<bool> m_stop = false;
-    };
-} // namespace alpaka::core::detail
diff --git a/include/alpaka/core/UniformCudaHip.hpp b/include/alpaka/core/UniformCudaHip.hpp
deleted file mode 100644
index 0896f9d..0000000
--- a/include/alpaka/core/UniformCudaHip.hpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
- * Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-
-#include <initializer_list>
-#include <stdexcept>
-#include <string>
-#include <tuple>
-#include <type_traits>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka::uniform_cuda_hip::detail
-{
-    //! CUDA/HIP runtime API error checking with log and exception, ignoring specific error values
-    template<typename TApi, bool TThrow>
-    ALPAKA_FN_HOST inline void rtCheck(
-        typename TApi::Error_t const& error,
-        char const* desc,
-        char const* file,
-        int const& line) noexcept(!TThrow)
-    {
-        if(error != TApi::success)
-        {
-            auto const sError = std::string{
-                std::string(file) + "(" + std::to_string(line) + ") " + std::string(desc) + " : '"
-                + TApi::getErrorName(error) + "': '" + std::string(TApi::getErrorString(error)) + "'!"};
-
-            if constexpr(!TThrow || ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
-                std::cerr << sError << std::endl;
-
-            ALPAKA_DEBUG_BREAK;
-            // reset the last error to allow user side error handling. Using std::ignore to discard unneeded
-            // return values is suggested by the C++ core guidelines.
-            std::ignore = TApi::getLastError();
-
-            if constexpr(TThrow)
-                throw std::runtime_error(sError);
-        }
-    }
-
-    //! CUDA/HIP runtime API error checking with log and exception, ignoring specific error values
-    template<typename TApi, bool TThrow>
-    ALPAKA_FN_HOST inline void rtCheckIgnore(
-        typename TApi::Error_t const& error,
-        char const* cmd,
-        char const* file,
-        int const& line,
-        std::initializer_list<typename TApi::Error_t> ignoredErrorCodes) noexcept(!TThrow)
-    {
-        if(error != TApi::success)
-        {
-            // If the error code is not one of the ignored ones.
-            if(std::find(std::cbegin(ignoredErrorCodes), std::cend(ignoredErrorCodes), error)
-               == std::cend(ignoredErrorCodes))
-            {
-                using namespace std::literals;
-                rtCheck<TApi, TThrow>(error, ("'"s + std::string(cmd) + "' returned error "s).c_str(), file, line);
-            }
-            else
-            {
-                // reset the last error to avoid propagation to the next CUDA/HIP API call. Using std::ignore
-                // to discard unneeded return values is recommended by the C++ core guidelines.
-                std::ignore = TApi::getLastError();
-            }
-        }
-    }
-
-    //! CUDA/HIP runtime API last error checking with log and exception.
-    template<typename TApi, bool TThrow>
-    ALPAKA_FN_HOST inline void rtCheckLastError(char const* desc, char const* file, int const& line) noexcept(!TThrow)
-    {
-        typename TApi::Error_t const error(TApi::getLastError());
-        rtCheck<TApi, TThrow>(error, desc, file, line);
-    }
-} // namespace alpaka::uniform_cuda_hip::detail
-
-#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(cmd, throw, ...)                                                    \
-        do                                                                                                            \
-        {                                                                                                             \
-            ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, throw>(                                        \
-                "'" #cmd "' A previous API call (not this one) set the error ",                                       \
-                __FILE__,                                                                                             \
-                __LINE__);                                                                                            \
-            ::alpaka::uniform_cuda_hip::detail::rtCheckIgnore<TApi, throw>(                                           \
-                cmd,                                                                                                  \
-                #cmd,                                                                                                 \
-                __FILE__,                                                                                             \
-                __LINE__,                                                                                             \
-                {__VA_ARGS__});                                                                                       \
-        } while(0)
-
-//! CUDA/HIP runtime error checking with log and exception, ignoring specific error values
-#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(cmd, ...)                                                         \
-        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(cmd, true, __VA_ARGS__)
-
-//! CUDA/HIP runtime error checking with log and exception.
-#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cmd) ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(cmd, true, )
-
-//! CUDA/HIP runtime error checking with log and exception, ignoring specific error values
-#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE_NOEXCEPT(cmd, ...)                                                \
-        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(cmd, false, __VA_ARGS__)
-
-//! CUDA/HIP runtime error checking with log.
-#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cmd) ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(cmd, false, )
-#endif
diff --git a/include/alpaka/core/Unreachable.hpp b/include/alpaka/core/Unreachable.hpp
deleted file mode 100644
index 7b1b9ff..0000000
--- a/include/alpaka/core/Unreachable.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright 2022 Jan Stephan, Jeffrey Kelling
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-//! Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches. It will issue
-//! a false warning about a missing return statement unless it is told that the following code section is unreachable.
-//!
-//! \param x A dummy value for the expected return type of the calling function.
-#if(BOOST_COMP_NVCC && BOOST_ARCH_PTX)
-#    if BOOST_LANG_CUDA >= BOOST_VERSION_NUMBER(11, 3, 0)
-#        define ALPAKA_UNREACHABLE(...) __builtin_unreachable()
-#    else
-#        define ALPAKA_UNREACHABLE(...) return __VA_ARGS__
-#    endif
-#elif BOOST_COMP_MSVC
-#    define ALPAKA_UNREACHABLE(...) __assume(false)
-#elif BOOST_COMP_GNUC || BOOST_COMP_CLANG
-#    define ALPAKA_UNREACHABLE(...) __builtin_unreachable()
-#else
-#    define ALPAKA_UNREACHABLE(...)
-#endif
diff --git a/include/alpaka/core/Unroll.hpp b/include/alpaka/core/Unroll.hpp
deleted file mode 100644
index 10794e6..0000000
--- a/include/alpaka/core/Unroll.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright 2021 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-//! Suggests unrolling of the directly following loop to the compiler.
-//!
-//! Usage:
-//!  `ALPAKA_UNROLL
-//!  for(...){...}`
-// \TODO: Implement for other compilers.
-#if BOOST_ARCH_PTX
-#    define ALPAKA_UNROLL_STRINGIFY(x) #x
-#    define ALPAKA_UNROLL(...) _Pragma(ALPAKA_UNROLL_STRINGIFY(unroll __VA_ARGS__))
-#elif BOOST_COMP_IBM || BOOST_COMP_SUNPRO || BOOST_COMP_HPACC
-#    define ALPAKA_UNROLL_STRINGIFY(x) #x
-#    define ALPAKA_UNROLL(...) _Pragma(ALPAKA_UNROLL_STRINGIFY(unroll(__VA_ARGS__)))
-#elif BOOST_COMP_PGI
-#    define ALPAKA_UNROLL(...) _Pragma("unroll")
-#else
-#    define ALPAKA_UNROLL(...)
-#endif
diff --git a/include/alpaka/core/Utility.hpp b/include/alpaka/core/Utility.hpp
deleted file mode 100644
index 2610027..0000000
--- a/include/alpaka/core/Utility.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-#include <type_traits>
-#include <utility>
-
-namespace alpaka::core
-{
-    //! convert any type to a reference type
-    //
-    // This function is equivalent to std::declval() but can be used
-    // within an alpaka accelerator kernel too.
-    // This function can be used only within std::decltype().
-#if BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA || BOOST_COMP_HIP
-    template<class T>
-    ALPAKA_FN_HOST_ACC std::add_rvalue_reference_t<T> declval();
-#else
-    using std::declval;
-#endif
-
-    /// Returns the ceiling of a / b, as integer.
-    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
-    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
-    {
-        return (a + b - 1) / b;
-    }
-
-    /// Computes the nth power of base, in integers.
-    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
-    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto intPow(Integral base, Integral n) -> Integral
-    {
-        if(n == 0)
-            return 1;
-        auto r = base;
-        for(Integral i = 1; i < n; i++)
-            r *= base;
-        return r;
-    }
-
-    /// Computes the floor of the nth root of value, in integers.
-    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
-    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto nthRootFloor(Integral value, Integral n) -> Integral
-    {
-        // adapted from: https://en.wikipedia.org/wiki/Integer_square_root
-        Integral L = 0;
-        Integral R = value + 1;
-        while(L != R - 1)
-        {
-            Integral const M = (L + R) / 2;
-            if(intPow(M, n) <= value)
-                L = M;
-            else
-                R = M;
-        }
-        return L;
-    }
-
-} // namespace alpaka::core
diff --git a/include/alpaka/core/Vectorize.hpp b/include/alpaka/core/Vectorize.hpp
deleted file mode 100644
index 55f0e6f..0000000
--- a/include/alpaka/core/Vectorize.hpp
+++ /dev/null
@@ -1,358 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-#include <cstddef>
-#include <cstdint>
-
-//! Suggests vectorization of the directly following loop to the compiler.
-//!
-//! Usage:
-//!  `ALPAKA_VECTORIZE_HINT
-//!  for(...){...}`
-// \TODO: Implement for other compilers.
-// See: http://stackoverflow.com/questions/2706286/pragmas-swp-ivdep-prefetch-support-in-various-compilers
-/*#if BOOST_COMP_HPACC
-    #define ALPAKA_VECTORIZE_HINT(...)  _Pragma("ivdep")
-#elif BOOST_COMP_PGI
-    #define ALPAKA_VECTORIZE_HINT(...)  _Pragma("vector")
-#elif BOOST_COMP_MSVC
-    #define ALPAKA_VECTORIZE_HINT(...)  __pragma(loop(ivdep))
-#elif BOOST_COMP_GNUC
-    #define ALPAKA_VECTORIZE_HINT(...)  _Pragma("GCC ivdep")
-#else
-    #define ALPAKA_VECTORIZE_HINT(...)
-#endif*/
-
-namespace alpaka::core::vectorization
-{
-    // The alignment required to enable optimal performance dependant on the target architecture.
-    constexpr std::size_t defaultAlignment =
-#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__MIC__)
-        64u
-#elif defined(__AVX__) || defined(__AVX2__)
-        32u
-#else
-        16u
-#endif
-        ;
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    // By default there is no vectorization.
-    template<typename TElem>
-    struct GetVectorizationSizeElems
-    {
-        static constexpr std::size_t value = 1u;
-    };
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    template<>
-    struct GetVectorizationSizeElems<double>
-    {
-        static constexpr std::size_t value =
-#if defined(__AVX512F__) || defined(__MIC__)
-            // addition (AVX512F,KNC): vaddpd / _mm512_add_pd
-            // subtraction (AVX512F,KNC): vsubpd / _mm512_sub_pd
-            // multiplication (AVX512F,KNC): vmulpd / _mm512_mul_pd
-            8u;
-#elif defined(__AVX__)
-            // addition (AVX): vaddpd / _mm256_add_pd
-            // subtraction (AVX): vsubpd / _mm256_sub_pd
-            // multiplication (AVX): vmulpd / _mm256_mul_pd
-            4u;
-#elif defined(__SSE2__)
-            // addition (SSE2): addpd / _mm_add_pd
-            // subtraction (SSE2): subpd / _mm_sub_pd
-            // multiplication (SSE2): mulpd / _mm_mul_pd
-            2u;
-#elif defined(__ARM_NEON__)
-            // No support for double precision vectorization!
-            1u;
-#elif defined(__ALTIVEC__)
-            2u;
-#else
-            1u;
-#endif
-    };
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    template<>
-    struct GetVectorizationSizeElems<float>
-    {
-        static constexpr std::size_t value =
-#if defined(__AVX512F__) || defined(__MIC__)
-            // addition (AVX512F,KNC): vaddps / _mm512_add_ps
-            // subtraction (AVX512F,KNC): vsubps / _mm512_sub_ps
-            // multiplication (AVX512F,KNC): vmulps / _mm512_mul_ps
-            16u;
-#elif defined(__AVX__)
-            // addition (AVX): vaddps / _mm256_add_ps
-            // subtraction (AVX): vsubps / _mm256_sub_ps
-            // multiplication (AVX): vmulps / _mm256_mul_ps
-            8u;
-#elif defined(__SSE__)
-            // addition (SSE): addps / _mm_add_ps
-            // subtraction (SSE): subps / _mm_sub_ps
-            // multiplication (SSE): mulps / _mm_mul_ps
-            4u;
-#elif defined(__ARM_NEON__)
-            4u;
-#elif defined(__ALTIVEC__)
-            4u;
-#else
-            1u;
-#endif
-    };
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    template<>
-    struct GetVectorizationSizeElems<std::int8_t>
-    {
-        static constexpr std::size_t value =
-#if defined(__AVX512BW__)
-            // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
-            // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
-            // multiplication: -
-            64u;
-#elif defined(__AVX2__)
-            // addition (AVX2): vpaddb / _mm256_add_epi8
-            // subtraction (AVX2): vpsubb / _mm256_sub_epi8
-            // multiplication: -
-            32u;
-#elif defined(__SSE2__)
-            // addition (SSE2): paddb / _mm_add_epi8
-            // subtraction (SSE2): psubb / _mm_sub_epi8
-            // multiplication: -
-            16u;
-#elif defined(__ARM_NEON__)
-            16u;
-#elif defined(__ALTIVEC__)
-            16u;
-#elif defined(__CUDA_ARCH__)
-            // addition: __vadd4
-            // subtraction: __vsub4
-            // multiplication: -
-            4u;
-#else
-            1u;
-#endif
-    };
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    template<>
-    struct GetVectorizationSizeElems<std::uint8_t>
-    {
-        static constexpr std::size_t value =
-#if defined(__AVX512BW__)
-            // addition (AVX512BW): vpaddb / _mm512_mask_add_epi8
-            // subtraction (AVX512BW): vpsubb / _mm512_sub_epi8
-            // multiplication: -
-            64u;
-#elif defined(__AVX2__)
-            // addition (AVX2): vpaddb / _mm256_add_epi8
-            // subtraction (AVX2): vpsubb / _mm256_sub_epi8
-            // multiplication: -
-            32u;
-#elif defined(__SSE2__)
-            // addition (SSE2): paddb / _mm_add_epi8
-            // subtraction (SSE2): psubb / _mm_sub_epi8
-            // multiplication: -
-            16u;
-#elif defined(__ARM_NEON__)
-            16u;
-#elif defined(__ALTIVEC__)
-            16u;
-#elif defined(__CUDA_ARCH__)
-            // addition: __vadd4
-            // subtraction: __vsub4
-            // multiplication: -
-            4u;
-#else
-            1u;
-#endif
-    };
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    template<>
-    struct GetVectorizationSizeElems<std::int16_t>
-    {
-        static constexpr std::size_t value =
-#if defined(__AVX512BW__)
-            // addition (AVX512BW): vpaddw / _mm512_mask_add_epi16
-            // subtraction (AVX512BW): vpsubw / _mm512_mask_sub_epi16
-            // multiplication (AVX512BW): vpmullw / _mm512_mask_mullo_epi16
-            32u;
-#elif defined(__AVX2__)
-            // addition (AVX2): vpaddw / _mm256_add_epi16
-            // subtraction (AVX2): vpsubw / _mm256_sub_epi16
-            // multiplication (AVX2): vpmullw / _mm256_mullo_epi16
-            16u;
-#elif defined(__SSE2__)
-            // addition (SSE2): paddw / _mm_add_epi16
-            // subtraction (SSE2): psubw / _mm_sub_epi16
-            // multiplication (SSE2): pmullw / _mm_mullo_epi16
-            8u;
-#elif defined(__ARM_NEON__)
-            8u;
-#elif defined(__ALTIVEC__)
-            8u;
-#elif defined(__CUDA_ARCH__)
-            // addition: __vadd2
-            // subtraction: __vsub2
-            // multiplication: -
-            2u;
-#else
-            1u;
-#endif
-    };
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    template<>
-    struct GetVectorizationSizeElems<std::uint16_t>
-    {
-        static constexpr std::size_t value =
-#if defined(__AVX512BW__)
-            // addition (AVX512BW): vpaddusw / _mm512_mask_adds_epu16
-            // subtraction (AVX512BW): vpsubw / _mm512_subs_epu16
-            // multiplication: ?
-            32u;
-#elif defined(__AVX2__)
-            // addition (AVX2): vpaddusw / _mm256_adds_epu16
-            // subtraction (AVX2): vpsubusw / _mm256_subs_epu16
-            // multiplication: ?
-            16u;
-#elif defined(__SSE2__)
-            // addition (SSE2): paddusw / _mm_adds_epu16
-            // subtraction (SSE2): psubusw / _mm_subs_epu16
-            // multiplication: ?
-            8u;
-#elif defined(__ARM_NEON__)
-            8u;
-#elif defined(__ALTIVEC__)
-            8u;
-#elif defined(__CUDA_ARCH__)
-            // addition: __vadd2
-            // subtraction: __vsub2
-            // multiplication: -
-            2u;
-#else
-            1u;
-#endif
-    };
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    template<>
-    struct GetVectorizationSizeElems<std::int32_t>
-    {
-        static constexpr std::size_t value =
-#if defined(__AVX512F__) || defined(__MIC__)
-            // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
-            // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
-            // multiplication (AVX512F,KNC): vpmulld / _mm512_mask_mullo_epi32
-            16u;
-#elif defined(__AVX2__)
-            // addition (AVX2): vpaddd / _mm256_add_epi32
-            // subtraction (AVX2): vpsubd / _mm256_sub_epi32
-            // multiplication (AVX2): vpmulld / _mm256_mullo_epi32
-            8u;
-#elif defined(__SSE2__)
-            // addition (SSE2): paddd / _mm_add_epi32
-            // subtraction (SSE2): psubd / _mm_sub_epi32
-            // multiplication (SSE4.1): pmulld / _mm_mullo_epi32
-            4u;
-#elif defined(__ARM_NEON__)
-            4u;
-#elif defined(__ALTIVEC__)
-            4u;
-#else
-            1u;
-#endif
-    };
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    template<>
-    struct GetVectorizationSizeElems<std::uint32_t>
-    {
-        static constexpr std::size_t value =
-#if defined(__AVX512F__) || defined(__MIC__)
-            // addition (AVX512F,KNC): vpaddd / _mm512_mask_add_epi32
-            // subtraction (AVX512F,KNC): vpsubd / _mm512_mask_sub_epi32
-            // multiplication: ?
-            16u;
-#elif defined(__AVX2__)
-            // addition (AVX2): vpaddd / _mm256_add_epi32
-            // subtraction (AVX2): vpsubd / _mm256_sub_epi32
-            // multiplication: ?
-            8u;
-#elif defined(__SSE2__)
-            // addition (SSE2): paddd / _mm_add_epi32
-            // subtraction (SSE2): psubd / _mm_sub_epi32
-            // multiplication: ?
-            4u;
-#elif defined(__ARM_NEON__)
-            4u;
-#elif defined(__ALTIVEC__)
-            4u;
-#else
-            1u;
-#endif
-    };
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    template<>
-    struct GetVectorizationSizeElems<std::int64_t>
-    {
-        static constexpr std::size_t value =
-#if defined(__AVX512F__)
-            // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
-            // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
-            // multiplication (AVX512DQ): vpmullq / _mm512_mask_mullo_epi64
-            8u;
-#elif defined(__AVX2__)
-            // addition (AVX2): vpaddq / _mm256_add_epi64
-            // subtraction (AVX2): vpsubq / _mm256_sub_epi64
-            // multiplication: -
-            4u;
-#elif defined(__SSE2__)
-            // addition (SSE2): paddq / _mm_add_epi64
-            // subtraction (SSE2): psubq / _mm_sub_epi64
-            // multiplication: -
-            2u;
-#elif defined(__ARM_NEON__)
-            2u;
-#else
-            1u;
-#endif
-    };
-
-    // Number of elements of the given type that can be processed in parallel in a vector register.
-    template<>
-    struct GetVectorizationSizeElems<std::uint64_t>
-    {
-        static constexpr std::size_t value =
-#if defined(__AVX512F__)
-            // addition (AVX512F): vpaddq / _mm512_mask_add_epi64
-            // subtraction (AVX512F): vpsubq / _mm512_mask_sub_epi64
-            // multiplication: ?
-            8u;
-#elif defined(__AVX2__)
-            // addition (AVX2): vpaddq / _mm256_add_epi64
-            // subtraction (AVX2): vpsubq / _mm256_sub_epi64
-            // multiplication: ?
-            4u;
-#elif defined(__SSE2__)
-            // addition (SSE2): paddq / _mm_add_epi64
-            // subtraction (SSE2): psubq / _mm_sub_epi64
-            // multiplication: ?
-            2u;
-#elif defined(__ARM_NEON__)
-            2u;
-#else
-            1u;
-#endif
-    };
-} // namespace alpaka::core::vectorization
diff --git a/include/alpaka/dev/DevCpu.hpp b/include/alpaka/dev/DevCpu.hpp
deleted file mode 100644
index e36c263..0000000
--- a/include/alpaka/dev/DevCpu.hpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Copyright 2024 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber,
- *                Antonio Di Pilato, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dev/common/QueueRegistry.hpp"
-#include "alpaka/dev/cpu/SysInfo.hpp"
-#include "alpaka/mem/buf/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-#include "alpaka/queue/Properties.hpp"
-#include "alpaka/queue/QueueGenericThreadsBlocking.hpp"
-#include "alpaka/queue/QueueGenericThreadsNonBlocking.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/queue/cpu/IGenericThreadsQueue.hpp"
-#include "alpaka/traits/Traits.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <algorithm>
-#include <cstddef>
-#include <map>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <vector>
-
-namespace alpaka
-{
-    class DevCpu;
-
-    namespace cpu
-    {
-        using ICpuQueue = IGenericThreadsQueue<DevCpu>;
-    } // namespace cpu
-
-    namespace trait
-    {
-        template<typename TPlatform, typename TSfinae>
-        struct GetDevByIdx;
-    } // namespace trait
-    struct PlatformCpu;
-
-    //! The CPU device.
-    namespace cpu::detail
-    {
-        //! The CPU device implementation.
-        using DevCpuImpl = alpaka::detail::QueueRegistry<cpu::ICpuQueue>;
-    } // namespace cpu::detail
-
-    //! The CPU device handle.
-    class DevCpu
-        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevCpu>
-        , public concepts::Implements<ConceptDev, DevCpu>
-    {
-        friend struct trait::GetDevByIdx<PlatformCpu>;
-
-    protected:
-        DevCpu() : m_spDevCpuImpl(std::make_shared<cpu::detail::DevCpuImpl>())
-        {
-        }
-
-    public:
-        auto operator==(DevCpu const&) const -> bool
-        {
-            return true;
-        }
-
-        auto operator!=(DevCpu const& rhs) const -> bool
-        {
-            return !((*this) == rhs);
-        }
-
-        [[nodiscard]] ALPAKA_FN_HOST auto getAllQueues() const -> std::vector<std::shared_ptr<cpu::ICpuQueue>>
-        {
-            return m_spDevCpuImpl->getAllExistingQueues();
-        }
-
-        //! Registers the given queue on this device.
-        //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
-        ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<cpu::ICpuQueue> spQueue) const -> void
-        {
-            m_spDevCpuImpl->registerQueue(spQueue);
-        }
-
-        [[nodiscard]] auto getNativeHandle() const noexcept
-        {
-            return 0;
-        }
-
-    private:
-        std::shared_ptr<cpu::detail::DevCpuImpl> m_spDevCpuImpl;
-    };
-
-    namespace trait
-    {
-        //! The CPU device name get trait specialization.
-        template<>
-        struct GetName<DevCpu>
-        {
-            ALPAKA_FN_HOST static auto getName(DevCpu const& /* dev */) -> std::string
-            {
-                return cpu::detail::getCpuName();
-            }
-        };
-
-        //! The CPU device available memory get trait specialization.
-        template<>
-        struct GetMemBytes<DevCpu>
-        {
-            ALPAKA_FN_HOST static auto getMemBytes(DevCpu const& /* dev */) -> std::size_t
-            {
-                return cpu::detail::getTotalGlobalMemSizeBytes();
-            }
-        };
-
-        //! The CPU device free memory get trait specialization.
-        template<>
-        struct GetFreeMemBytes<DevCpu>
-        {
-            ALPAKA_FN_HOST static auto getFreeMemBytes(DevCpu const& /* dev */) -> std::size_t
-            {
-                return cpu::detail::getFreeGlobalMemSizeBytes();
-            }
-        };
-
-        //! The CPU device warp size get trait specialization.
-        template<>
-        struct GetWarpSizes<DevCpu>
-        {
-            ALPAKA_FN_HOST static auto getWarpSizes(DevCpu const& /* dev */) -> std::vector<std::size_t>
-            {
-                return {1u};
-            }
-        };
-
-        //! The CPU device preferred warp size get trait specialization.
-        template<>
-        struct GetPreferredWarpSize<DevCpu>
-        {
-            ALPAKA_FN_HOST static constexpr auto getPreferredWarpSize(DevCpu const& /* dev */) -> std::size_t
-            {
-                return 1u;
-            }
-        };
-
-        //! The CPU device reset trait specialization.
-        template<>
-        struct Reset<DevCpu>
-        {
-            ALPAKA_FN_HOST static auto reset(DevCpu const& /* dev */) -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-                // The CPU does nothing on reset.
-            }
-        };
-
-        //! The CPU device native handle type trait specialization.
-        template<>
-        struct NativeHandle<DevCpu>
-        {
-            [[nodiscard]] static auto getNativeHandle(DevCpu const& dev)
-            {
-                return dev.getNativeHandle();
-            }
-        };
-    } // namespace trait
-
-    template<typename TElem, typename TDim, typename TIdx>
-    class BufCpu;
-
-    namespace trait
-    {
-        //! The CPU device memory buffer type trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct BufType<DevCpu, TElem, TDim, TIdx>
-        {
-            using type = BufCpu<TElem, TDim, TIdx>;
-        };
-
-        //! The CPU device platform type trait specialization.
-        template<>
-        struct PlatformType<DevCpu>
-        {
-            using type = PlatformCpu;
-        };
-    } // namespace trait
-
-    using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking<DevCpu>;
-    using QueueCpuBlocking = QueueGenericThreadsBlocking<DevCpu>;
-
-    namespace trait
-    {
-        template<>
-        struct QueueType<DevCpu, Blocking>
-        {
-            using type = QueueCpuBlocking;
-        };
-
-        template<>
-        struct QueueType<DevCpu, NonBlocking>
-        {
-            using type = QueueCpuNonBlocking;
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/dev/DevCpuSycl.hpp b/include/alpaka/dev/DevCpuSycl.hpp
deleted file mode 100644
index bc88ce9..0000000
--- a/include/alpaka/dev/DevCpuSycl.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
-
-namespace alpaka
-{
-    using DevCpuSycl = DevGenericSycl<TagCpuSycl>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/dev/DevCudaRt.hpp b/include/alpaka/dev/DevCudaRt.hpp
deleted file mode 100644
index 92dcba3..0000000
--- a/include/alpaka/dev/DevCudaRt.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiCudaRt.hpp"
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-namespace alpaka
-{
-    //! The CUDA RT device handle.
-    using DevCudaRt = DevUniformCudaHipRt<ApiCudaRt>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/dev/DevFpgaSyclIntel.hpp b/include/alpaka/dev/DevFpgaSyclIntel.hpp
deleted file mode 100644
index c0c66ef..0000000
--- a/include/alpaka/dev/DevFpgaSyclIntel.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
-
-namespace alpaka
-{
-    using DevFpgaSyclIntel = DevGenericSycl<TagFpgaSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp
deleted file mode 100644
index efbcad9..0000000
--- a/include/alpaka/dev/DevGenericSycl.hpp
+++ /dev/null
@@ -1,282 +0,0 @@
-/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/mem/buf/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-#include "alpaka/queue/Properties.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/queue/sycl/QueueGenericSyclBase.hpp"
-#include "alpaka/traits/Traits.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <algorithm>
-#include <cstddef>
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <shared_mutex>
-#include <string>
-#include <utility>
-#include <vector>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    namespace trait
-    {
-        template<typename TPlatform, typename TSfinae>
-        struct GetDevByIdx;
-    } // namespace trait
-
-    template<typename TTag>
-    using QueueGenericSyclBlocking = detail::QueueGenericSyclBase<TTag, true>;
-
-    template<typename TTag>
-    using QueueGenericSyclNonBlocking = detail::QueueGenericSyclBase<TTag, false>;
-
-    template<typename TTag>
-    struct PlatformGenericSycl;
-
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    class BufGenericSycl;
-
-    namespace detail
-    {
-        class DevGenericSyclImpl
-        {
-        public:
-            DevGenericSyclImpl(sycl::device device, sycl::context context)
-                : m_device{std::move(device)}
-                , m_context{std::move(context)}
-            {
-            }
-
-            // Don't call this without locking first!
-            auto clean_queues() -> void
-            {
-                // Clean up dead queues
-                auto const start = std::begin(m_queues);
-                auto const old_end = std::end(m_queues);
-                auto const new_end = std::remove_if(start, old_end, [](auto q_ptr) { return q_ptr.expired(); });
-                m_queues.erase(new_end, old_end);
-            }
-
-            auto register_queue(std::shared_ptr<QueueGenericSyclImpl> const& queue) -> void
-            {
-                std::lock_guard<std::shared_mutex> lock{m_mutex};
-
-                clean_queues();
-                m_queues.emplace_back(queue);
-            }
-
-            auto register_dependency(sycl::event event) -> void
-            {
-                std::shared_lock<std::shared_mutex> lock{m_mutex};
-
-                for(auto& q_ptr : m_queues)
-                {
-                    if(auto ptr = q_ptr.lock(); ptr != nullptr)
-                        ptr->register_dependency(event);
-                }
-            }
-
-            auto wait()
-            {
-                std::shared_lock<std::shared_mutex> lock{m_mutex};
-
-                for(auto& q_ptr : m_queues)
-                {
-                    if(auto ptr = q_ptr.lock(); ptr != nullptr)
-                        ptr->wait();
-                }
-            }
-
-            auto get_device() const -> sycl::device
-            {
-                return m_device;
-            }
-
-            auto get_context() const -> sycl::context
-            {
-                return m_context;
-            }
-
-        private:
-            sycl::device m_device;
-            sycl::context m_context;
-            std::vector<std::weak_ptr<QueueGenericSyclImpl>> m_queues;
-            std::shared_mutex mutable m_mutex;
-        };
-    } // namespace detail
-
-    //! The SYCL device handle.
-    template<typename TTag>
-    class DevGenericSycl
-        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevGenericSycl<TTag>>
-        , public concepts::Implements<ConceptDev, DevGenericSycl<TTag>>
-    {
-        friend struct trait::GetDevByIdx<PlatformGenericSycl<TTag>>;
-
-    public:
-        DevGenericSycl(sycl::device device, sycl::context context)
-            : m_impl{std::make_shared<detail::DevGenericSyclImpl>(std::move(device), std::move(context))}
-        {
-        }
-
-        friend auto operator==(DevGenericSycl const& lhs, DevGenericSycl const& rhs) -> bool
-        {
-            return (lhs.m_impl == rhs.m_impl);
-        }
-
-        friend auto operator!=(DevGenericSycl const& lhs, DevGenericSycl const& rhs) -> bool
-        {
-            return !(lhs == rhs);
-        }
-
-        [[nodiscard]] auto getNativeHandle() const -> std::pair<sycl::device, sycl::context>
-        {
-            return std::make_pair(m_impl->get_device(), m_impl->get_context());
-        }
-
-        std::shared_ptr<detail::DevGenericSyclImpl> m_impl;
-    };
-
-    namespace trait
-    {
-        //! The SYCL device name get trait specialization.
-        template<typename TTag>
-        struct GetName<DevGenericSycl<TTag>>
-        {
-            static auto getName(DevGenericSycl<TTag> const& dev) -> std::string
-            {
-                auto const device = dev.getNativeHandle().first;
-                return device.template get_info<sycl::info::device::name>();
-            }
-        };
-
-        //! The SYCL device available memory get trait specialization.
-        template<typename TTag>
-        struct GetMemBytes<DevGenericSycl<TTag>>
-        {
-            static auto getMemBytes(DevGenericSycl<TTag> const& dev) -> std::size_t
-            {
-                auto const device = dev.getNativeHandle().first;
-                return device.template get_info<sycl::info::device::global_mem_size>();
-            }
-        };
-
-        //! The SYCL device free memory get trait specialization.
-        template<typename TTag>
-        struct GetFreeMemBytes<DevGenericSycl<TTag>>
-        {
-            static auto getFreeMemBytes(DevGenericSycl<TTag> const& /* dev */) -> std::size_t
-            {
-                static_assert(
-                    !sizeof(PlatformGenericSycl<TTag>),
-                    "Querying free device memory not supported for SYCL devices.");
-                return std::size_t{};
-            }
-        };
-
-        //! The SYCL device warp size get trait specialization.
-        template<typename TTag>
-        struct GetWarpSizes<DevGenericSycl<TTag>>
-        {
-            static auto getWarpSizes(DevGenericSycl<TTag> const& dev) -> std::vector<std::size_t>
-            {
-                auto const device = dev.getNativeHandle().first;
-                std::vector<std::size_t> warp_sizes = device.template get_info<sycl::info::device::sub_group_sizes>();
-                // The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently does not
-                auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64);
-                if(find64 != warp_sizes.end())
-                    warp_sizes.erase(find64);
-                // Sort the warp sizes in decreasing order
-                std::sort(warp_sizes.begin(), warp_sizes.end(), std::greater<>{});
-                return warp_sizes;
-            }
-        };
-
-        //! The SYCL device preferred warp size get trait specialization.
-        template<typename TTag>
-        struct GetPreferredWarpSize<DevGenericSycl<TTag>>
-        {
-            static auto getPreferredWarpSize(DevGenericSycl<TTag> const& dev) -> std::size_t
-            {
-                return GetWarpSizes<DevGenericSycl<TTag>>::getWarpSizes(dev).front();
-            }
-        };
-
-        //! The SYCL device reset trait specialization.
-        template<typename TTag>
-        struct Reset<DevGenericSycl<TTag>>
-        {
-            static auto reset(DevGenericSycl<TTag> const&) -> void
-            {
-                static_assert(
-                    !sizeof(PlatformGenericSycl<TTag>),
-                    "Explicit device reset not supported for SYCL devices");
-            }
-        };
-
-        //! The SYCL device native handle trait specialization.
-        template<typename TTag>
-        struct NativeHandle<DevGenericSycl<TTag>>
-        {
-            [[nodiscard]] static auto getNativeHandle(DevGenericSycl<TTag> const& dev)
-            {
-                return dev.getNativeHandle();
-            }
-        };
-
-        //! The SYCL device memory buffer type trait specialization.
-        template<typename TElem, typename TDim, typename TIdx, typename TTag>
-        struct BufType<DevGenericSycl<TTag>, TElem, TDim, TIdx>
-        {
-            using type = BufGenericSycl<TElem, TDim, TIdx, TTag>;
-        };
-
-        //! The SYCL device platform type trait specialization.
-        template<typename TTag>
-        struct PlatformType<DevGenericSycl<TTag>>
-        {
-            using type = PlatformGenericSycl<TTag>;
-        };
-
-        //! The thread SYCL device wait specialization.
-        template<typename TTag>
-        struct CurrentThreadWaitFor<DevGenericSycl<TTag>>
-        {
-            static auto currentThreadWaitFor(DevGenericSycl<TTag> const& dev) -> void
-            {
-                dev.m_impl->wait();
-            }
-        };
-
-        //! The SYCL blocking queue trait specialization.
-        template<typename TTag>
-        struct QueueType<DevGenericSycl<TTag>, Blocking>
-        {
-            using type = QueueGenericSyclBlocking<TTag>;
-        };
-
-        //! The SYCL non-blocking queue trait specialization.
-        template<typename TTag>
-        struct QueueType<DevGenericSycl<TTag>, NonBlocking>
-        {
-            using type = QueueGenericSyclNonBlocking<TTag>;
-        };
-
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/dev/DevGpuSyclIntel.hpp b/include/alpaka/dev/DevGpuSyclIntel.hpp
deleted file mode 100644
index 2850126..0000000
--- a/include/alpaka/dev/DevGpuSyclIntel.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
-
-namespace alpaka
-{
-    using DevGpuSyclIntel = DevGenericSycl<TagGpuSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/dev/DevHipRt.hpp b/include/alpaka/dev/DevHipRt.hpp
deleted file mode 100644
index 819c2f5..0000000
--- a/include/alpaka/dev/DevHipRt.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiHipRt.hpp"
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-namespace alpaka
-{
-    //! The HIP RT device handle.
-    using DevHipRt = DevUniformCudaHipRt<ApiHipRt>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/dev/DevUniformCudaHipRt.hpp b/include/alpaka/dev/DevUniformCudaHipRt.hpp
deleted file mode 100644
index 876d8ca..0000000
--- a/include/alpaka/dev/DevUniformCudaHipRt.hpp
+++ /dev/null
@@ -1,269 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, Jakob Krude, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
- *                Antonio Di Pilato, Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiCudaRt.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dev/common/QueueRegistry.hpp"
-#include "alpaka/mem/buf/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-#include "alpaka/queue/Properties.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
-#include "alpaka/traits/Traits.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <cstddef>
-#include <string>
-#include <vector>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    namespace trait
-    {
-        template<typename TPlatform, typename TSfinae>
-        struct GetDevByIdx;
-    } // namespace trait
-
-    namespace uniform_cuda_hip::detail
-    {
-        template<typename TApi, bool TBlocking>
-        class QueueUniformCudaHipRt;
-    } // namespace uniform_cuda_hip::detail
-
-    template<typename TApi>
-    using QueueUniformCudaHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, true>;
-
-    template<typename TApi>
-    using QueueUniformCudaHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, false>;
-
-    template<typename TApi>
-    struct PlatformUniformCudaHipRt;
-
-    template<typename TApi, typename TElem, typename TDim, typename TIdx>
-    struct BufUniformCudaHipRt;
-
-    //! The CUDA/HIP RT device handle.
-    template<typename TApi>
-    class DevUniformCudaHipRt
-        : public concepts::Implements<ConceptCurrentThreadWaitFor, DevUniformCudaHipRt<TApi>>
-        , public concepts::Implements<ConceptDev, DevUniformCudaHipRt<TApi>>
-    {
-        friend struct trait::GetDevByIdx<PlatformUniformCudaHipRt<TApi>>;
-
-        using IDeviceQueue = uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl<TApi>;
-
-    protected:
-        DevUniformCudaHipRt() : m_QueueRegistry{std::make_shared<alpaka::detail::QueueRegistry<IDeviceQueue>>()}
-        {
-        }
-
-    public:
-        ALPAKA_FN_HOST auto operator==(DevUniformCudaHipRt const& rhs) const -> bool
-        {
-            return m_iDevice == rhs.m_iDevice;
-        }
-
-        ALPAKA_FN_HOST auto operator!=(DevUniformCudaHipRt const& rhs) const -> bool
-        {
-            return !((*this) == rhs);
-        }
-
-        [[nodiscard]] auto getNativeHandle() const noexcept -> int
-        {
-            return m_iDevice;
-        }
-
-        [[nodiscard]] ALPAKA_FN_HOST auto getAllQueues() const -> std::vector<std::shared_ptr<IDeviceQueue>>
-        {
-            return m_QueueRegistry->getAllExistingQueues();
-        }
-
-        //! Registers the given queue on this device.
-        //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
-        ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<IDeviceQueue> spQueue) const -> void
-        {
-            m_QueueRegistry->registerQueue(spQueue);
-        }
-
-    private:
-        DevUniformCudaHipRt(int iDevice)
-            : m_iDevice(iDevice)
-            , m_QueueRegistry(std::make_shared<alpaka::detail::QueueRegistry<IDeviceQueue>>())
-        {
-        }
-
-        int m_iDevice;
-
-        std::shared_ptr<alpaka::detail::QueueRegistry<IDeviceQueue>> m_QueueRegistry;
-    };
-
-    namespace trait
-    {
-        //! The CUDA/HIP RT device name get trait specialization.
-        template<typename TApi>
-        struct GetName<DevUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto getName(DevUniformCudaHipRt<TApi> const& dev) -> std::string
-            {
-                // There is cuda/hip-DeviceGetAttribute as faster alternative to cuda/hip-GetDeviceProperties to get a
-                // single device property but it has no option to get the name
-                typename TApi::DeviceProp_t devProp;
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, dev.getNativeHandle()));
-
-                return std::string(devProp.name);
-            }
-        };
-
-        //! The CUDA/HIP RT device available memory get trait specialization.
-        template<typename TApi>
-        struct GetMemBytes<DevUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto getMemBytes(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
-            {
-                // Set the current device to wait for.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
-
-                std::size_t freeInternal(0u);
-                std::size_t totalInternal(0u);
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memGetInfo(&freeInternal, &totalInternal));
-
-                return totalInternal;
-            }
-        };
-
-        //! The CUDA/HIP RT device free memory get trait specialization.
-        template<typename TApi>
-        struct GetFreeMemBytes<DevUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto getFreeMemBytes(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
-            {
-                // Set the current device to wait for.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
-
-                std::size_t freeInternal(0u);
-                std::size_t totalInternal(0u);
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memGetInfo(&freeInternal, &totalInternal));
-
-                return freeInternal;
-            }
-        };
-
-        //! The CUDA/HIP RT device warp size get trait specialization.
-        template<typename TApi>
-        struct GetWarpSizes<DevUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto getWarpSizes(DevUniformCudaHipRt<TApi> const& dev) -> std::vector<std::size_t>
-            {
-                return {GetPreferredWarpSize<DevUniformCudaHipRt<TApi>>::getPreferredWarpSize(dev)};
-            }
-        };
-
-        //! The CUDA/HIP RT preferred device warp size get trait specialization.
-        template<typename TApi>
-        struct GetPreferredWarpSize<DevUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto getPreferredWarpSize(DevUniformCudaHipRt<TApi> const& dev) -> std::size_t
-            {
-                int warpSize = 0;
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    TApi::deviceGetAttribute(&warpSize, TApi::deviceAttributeWarpSize, dev.getNativeHandle()));
-                return static_cast<std::size_t>(warpSize);
-            }
-        };
-
-#    ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-        //! The CUDA RT preferred device warp size get trait specialization.
-        template<>
-        struct GetPreferredWarpSize<DevUniformCudaHipRt<ApiCudaRt>>
-        {
-            ALPAKA_FN_HOST static constexpr auto getPreferredWarpSize(DevUniformCudaHipRt<ApiCudaRt> const& /* dev */)
-                -> std::size_t
-            {
-                // All CUDA GPUs to date have a warp size of 32 threads.
-                return 32u;
-            }
-        };
-#    endif // ALPAKA_ACC_GPU_CUDA_ENABLED
-
-        //! The CUDA/HIP RT device reset trait specialization.
-        template<typename TApi>
-        struct Reset<DevUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto reset(DevUniformCudaHipRt<TApi> const& dev) -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                // Set the current device to wait for.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceReset());
-            }
-        };
-
-        //! The CUDA/HIP RT device native handle trait specialization.
-        template<typename TApi>
-        struct NativeHandle<DevUniformCudaHipRt<TApi>>
-        {
-            [[nodiscard]] static auto getNativeHandle(DevUniformCudaHipRt<TApi> const& dev)
-            {
-                return dev.getNativeHandle();
-            }
-        };
-
-        //! The CUDA/HIP RT device memory buffer type trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct BufType<DevUniformCudaHipRt<TApi>, TElem, TDim, TIdx>
-        {
-            using type = BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>;
-        };
-
-        //! The CUDA/HIP RT device platform type trait specialization.
-        template<typename TApi>
-        struct PlatformType<DevUniformCudaHipRt<TApi>>
-        {
-            using type = PlatformUniformCudaHipRt<TApi>;
-        };
-
-        //! The thread CUDA/HIP device wait specialization.
-        //!
-        //! Blocks until the device has completed all preceding requested tasks.
-        //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
-        template<typename TApi>
-        struct CurrentThreadWaitFor<DevUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto currentThreadWaitFor(DevUniformCudaHipRt<TApi> const& dev) -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                // Set the current device to wait for.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceSynchronize());
-            }
-        };
-
-        template<typename TApi>
-        struct QueueType<DevUniformCudaHipRt<TApi>, Blocking>
-        {
-            using type = QueueUniformCudaHipRtBlocking<TApi>;
-        };
-
-        template<typename TApi>
-        struct QueueType<DevUniformCudaHipRt<TApi>, NonBlocking>
-        {
-            using type = QueueUniformCudaHipRtNonBlocking<TApi>;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/dev/Traits.hpp b/include/alpaka/dev/Traits.hpp
deleted file mode 100644
index a3954f2..0000000
--- a/include/alpaka/dev/Traits.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#include <algorithm>
-#include <cctype>
-#include <cstddef>
-#include <string>
-#include <vector>
-
-namespace alpaka
-{
-    //! The device traits.
-    namespace trait
-    {
-        //! The device type trait.
-        template<typename T, typename TSfinae = void>
-        struct DevType;
-
-        //! The device get trait.
-        template<typename T, typename TSfinae = void>
-        struct GetDev;
-
-        //! The device name get trait.
-        template<typename TDev, typename TSfinae = void>
-        struct GetName;
-
-        //! The device memory size get trait.
-        template<typename TDev, typename TSfinae = void>
-        struct GetMemBytes;
-
-        //! The device free memory size get trait.
-        template<typename T, typename TSfinae = void>
-        struct GetFreeMemBytes;
-
-        //! The device warp size get trait.
-        template<typename T, typename TSfinae = void>
-        struct GetWarpSizes;
-
-        //! The device preferred warp size get trait.
-        template<typename T, typename TSfinae = void>
-        struct GetPreferredWarpSize;
-
-        //! The device reset trait.
-        template<typename T, typename TSfinae = void>
-        struct Reset;
-    } // namespace trait
-
-    //! The device type trait alias template to remove the ::type.
-    template<typename T>
-    using Dev = typename trait::DevType<T>::type;
-
-    struct ConceptGetDev;
-
-    struct ConceptDev;
-
-    //! True if TDev is a device, i.e. if it implements the ConceptDev concept.
-    template<typename TDev>
-    inline constexpr bool isDevice = concepts::ImplementsConcept<ConceptDev, std::decay_t<TDev>>::value;
-
-    //! \return The device this object is bound to.
-    template<typename T>
-    ALPAKA_FN_HOST auto getDev(T const& t)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptGetDev, T>;
-        return trait::GetDev<ImplementationBase>::getDev(t);
-    }
-
-    namespace detail
-    {
-        inline auto trim(std::string s) -> std::string
-        {
-            auto const pred = [](char c) { return !std::isspace(c); };
-            s.erase(std::find_if(rbegin(s), rend(s), pred).base(), end(s));
-            s.erase(begin(s), std::find_if(begin(s), end(s), pred));
-            return s;
-        }
-    } // namespace detail
-
-    //! \return The device name with leading/trailing space characters trimmed off.
-    template<typename TDev>
-    ALPAKA_FN_HOST auto getName(TDev const& dev) -> std::string
-    {
-        return detail::trim(trait::GetName<TDev>::getName(dev));
-    }
-
-    //! \return The memory on the device in Bytes. Returns 0 if querying memory
-    //!  is not supported.
-    template<typename TDev>
-    ALPAKA_FN_HOST auto getMemBytes(TDev const& dev) -> std::size_t
-    {
-        return trait::GetMemBytes<TDev>::getMemBytes(dev);
-    }
-
-    //! \return The free memory on the device in Bytes.
-    //
-    //! \note Do not use this query if getMemBytes returned 0.
-    template<typename TDev>
-    ALPAKA_FN_HOST auto getFreeMemBytes(TDev const& dev) -> std::size_t
-    {
-        return trait::GetFreeMemBytes<TDev>::getFreeMemBytes(dev);
-    }
-
-    //! \return The supported warp sizes on the device in number of threads.
-    template<typename TDev>
-    ALPAKA_FN_HOST auto getWarpSizes(TDev const& dev) -> std::vector<std::size_t>
-    {
-        return trait::GetWarpSizes<TDev>::getWarpSizes(dev);
-    }
-
-    //! \return The preferred warp size on the device in number of threads.
-    template<typename TDev>
-    ALPAKA_FN_HOST constexpr auto getPreferredWarpSize(TDev const& dev) -> std::size_t
-    {
-        return trait::GetPreferredWarpSize<TDev>::getPreferredWarpSize(dev);
-    }
-
-    //! Resets the device.
-    //! What this method does is dependent on the accelerator.
-    template<typename TDev>
-    ALPAKA_FN_HOST auto reset(TDev const& dev) -> void
-    {
-        trait::Reset<TDev>::reset(dev);
-    }
-
-    namespace trait
-    {
-        //! Get device type
-        template<typename TDev>
-        struct DevType<TDev, std::enable_if_t<concepts::ImplementsConcept<ConceptDev, TDev>::value>>
-        {
-            using type = typename concepts::ImplementationBase<ConceptDev, TDev>;
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/dev/common/QueueRegistry.hpp b/include/alpaka/dev/common/QueueRegistry.hpp
deleted file mode 100644
index 62055fc..0000000
--- a/include/alpaka/dev/common/QueueRegistry.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jeffrey Kelling
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-#include <deque>
-#include <functional>
-#include <memory>
-#include <mutex>
-
-namespace alpaka::detail
-{
-    //! The CPU/GPU device queue registry implementation.
-    //!
-    //! @tparam TQueue queue implementation
-    template<typename TQueue>
-    struct QueueRegistry
-    {
-        ALPAKA_FN_HOST auto getAllExistingQueues() const -> std::vector<std::shared_ptr<TQueue>>
-        {
-            std::vector<std::shared_ptr<TQueue>> vspQueues;
-
-            std::lock_guard<std::mutex> lk(m_Mutex);
-            vspQueues.reserve(std::size(m_queues));
-
-            for(auto it = std::begin(m_queues); it != std::end(m_queues);)
-            {
-                auto spQueue = it->lock();
-                if(spQueue)
-                {
-                    vspQueues.emplace_back(std::move(spQueue));
-                    ++it;
-                }
-                else
-                {
-                    it = m_queues.erase(it);
-                }
-            }
-            return vspQueues;
-        }
-
-        //! Registers the given queue on this device.
-        //! NOTE: Every queue has to be registered for correct functionality of device wait operations!
-        ALPAKA_FN_HOST auto registerQueue(std::shared_ptr<TQueue> const& spQueue) const -> void
-        {
-            std::lock_guard<std::mutex> lk(m_Mutex);
-
-            // Register this queue on the device.
-            m_queues.push_back(spQueue);
-        }
-
-    private:
-        std::mutex mutable m_Mutex;
-        std::deque<std::weak_ptr<TQueue>> mutable m_queues;
-    };
-} // namespace alpaka::detail
diff --git a/include/alpaka/dev/cpu/SysInfo.hpp b/include/alpaka/dev/cpu/SysInfo.hpp
deleted file mode 100644
index 1dc989f..0000000
--- a/include/alpaka/dev/cpu/SysInfo.hpp
+++ /dev/null
@@ -1,237 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Daniel Vollmer, Erik Zenker, René Widera, Bernhard Manfred Gruber, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-#if BOOST_OS_WINDOWS || BOOST_OS_CYGWIN
-#    ifndef NOMINMAX
-#        define NOMINMAX
-#    endif
-#    ifndef WIN32_LEAN_AND_MEAN
-#        define WIN32_LEAN_AND_MEAN
-#    endif
-// We could use some more macros to reduce the number of sub-headers included, but this would restrict user code.
-#    include <windows.h>
-#elif BOOST_OS_UNIX || BOOST_OS_MACOS
-#    include <sys/param.h>
-#    include <sys/types.h>
-#    include <unistd.h>
-
-#    include <cstdint>
-#    if BOOST_OS_BSD || BOOST_OS_MACOS
-#        include <sys/sysctl.h>
-#    endif
-#endif
-
-#if BOOST_OS_LINUX
-#    include <fstream>
-#endif
-
-#include <cstring>
-#include <stdexcept>
-#include <string>
-
-#if BOOST_ARCH_X86
-#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_PGI
-#        include <cpuid.h>
-#    elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
-#        include <intrin.h>
-#    endif
-#endif
-
-namespace alpaka::cpu::detail
-{
-    constexpr int NO_CPUID = 0;
-    constexpr int UNKNOWN_CPU = 0;
-    constexpr int UNKNOWN_COMPILER = 1;
-#if BOOST_ARCH_X86
-#    if BOOST_COMP_GNUC || BOOST_COMP_CLANG || BOOST_COMP_PGI
-    inline auto cpuid(std::uint32_t level, std::uint32_t subfunction, std::uint32_t ex[4]) -> void
-    {
-        __cpuid_count(level, subfunction, ex[0], ex[1], ex[2], ex[3]);
-    }
-
-#    elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
-    inline auto cpuid(std::uint32_t level, std::uint32_t subfunction, std::uint32_t ex[4]) -> void
-    {
-        __cpuidex(reinterpret_cast<int*>(ex), level, subfunction);
-    }
-#    else
-    inline auto cpuid(std::uint32_t, std::uint32_t, std::uint32_t ex[4]) -> void
-    {
-        ex[0] = ex[2] = ex[3] = NO_CPUID;
-        ex[1] = UNKNOWN_COMPILER;
-    }
-#    endif
-#else
-    inline auto cpuid(std::uint32_t, std::uint32_t, std::uint32_t ex[4]) -> void
-    {
-        ex[0] = ex[2] = ex[3] = NO_CPUID;
-        ex[1] = UNKNOWN_CPU;
-    }
-#endif
-    //! \return The name of the CPU the code is running on.
-    inline auto getCpuName() -> std::string
-    {
-        // Get extended ids.
-        std::uint32_t ex[4] = {0};
-        cpuid(0x8000'0000, 0, ex);
-        std::uint32_t const nExIds(ex[0]);
-
-        if(!nExIds)
-        {
-            switch(ex[1])
-            {
-            case UNKNOWN_COMPILER:
-                return "<unknown: compiler>";
-            case UNKNOWN_CPU:
-                return "<unknown: CPU>";
-            default:
-                return "<unknown>";
-            }
-        }
-#if BOOST_ARCH_X86
-        // Get the information associated with each extended ID.
-        char cpuBrandString[0x40] = {0};
-        for(std::uint32_t i(0x8000'0000); i <= nExIds; ++i)
-        {
-            cpuid(i, 0, ex);
-
-            // Interpret CPU brand string and cache information.
-            if(i == 0x8000'0002)
-            {
-                std::memcpy(cpuBrandString, ex, sizeof(ex));
-            }
-            else if(i == 0x8000'0003)
-            {
-                std::memcpy(cpuBrandString + 16, ex, sizeof(ex));
-            }
-            else if(i == 0x8000'0004)
-            {
-                std::memcpy(cpuBrandString + 32, ex, sizeof(ex));
-            }
-        }
-        return std::string(cpuBrandString);
-#else
-        return std::string("unknown");
-#endif
-    }
-
-    //! \return Pagesize in bytes used by the system.
-    inline size_t getPageSize()
-    {
-#if BOOST_OS_WINDOWS || BOOST_OS_CYGWIN
-        SYSTEM_INFO si;
-        GetSystemInfo(&si);
-        return si.dwPageSize;
-#elif BOOST_OS_UNIX || BOOST_OS_MACOS
-#    if defined(_SC_PAGESIZE)
-        return static_cast<std::size_t>(sysconf(_SC_PAGESIZE));
-#    else
-        // this is legacy and only used as fallback
-        return = static_cast<size_t>(getpagesize());
-#    endif
-#else
-#    error "getPageSize not implemented for this system!"
-        return 0;
-#endif
-    }
-
-    //! \return The total number of bytes of global memory.
-    //! Adapted from David Robert Nadeau:
-    //! http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system
-    inline auto getTotalGlobalMemSizeBytes() -> std::size_t
-    {
-#if BOOST_OS_WINDOWS
-        MEMORYSTATUSEX status;
-        status.dwLength = sizeof(status);
-        GlobalMemoryStatusEx(&status);
-        return static_cast<std::size_t>(status.ullTotalPhys);
-
-#elif BOOST_OS_CYGWIN
-        // New 64-bit MEMORYSTATUSEX isn't available.
-        MEMORYSTATUS status;
-        status.dwLength = sizeof(status);
-        GlobalMemoryStatus(&status);
-        return static_cast<std::size_t>(status.dwTotalPhys);
-
-#elif BOOST_OS_UNIX || BOOST_OS_MACOS
-        // Unix : Prefer sysctl() over sysconf() except sysctl() with HW_REALMEM and HW_PHYSMEM which are not
-        // always reliable
-#    if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
-        int mib[2]
-            = { CTL_HW,
-#        if defined(HW_MEMSIZE) // OSX
-                HW_MEMSIZE
-#        elif defined(HW_PHYSMEM64) // NetBSD, OpenBSD.
-                HW_PHYSMEM64
-#        endif
-              };
-        std::uint64_t size(0);
-        std::size_t sizeLen{sizeof(size)};
-        if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
-            throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
-        return static_cast<std::size_t>(size);
-
-#    elif defined(_SC_AIX_REALMEM) // AIX.
-        return static_cast<std::size_t>(sysconf(_SC_AIX_REALMEM)) * static_cast<std::size_t>(1024);
-
-#    elif defined(_SC_PHYS_PAGES) // Linux, FreeBSD, OpenBSD, Solaris.
-        return static_cast<std::size_t>(sysconf(_SC_PHYS_PAGES)) * getPageSize();
-
-#    elif defined(CTL_HW)                                                                                             \
-        && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) // FreeBSD, DragonFly BSD, NetBSD, OpenBSD, and OSX.
-        int mib[2]
-            = { CTL_HW,
-#        if defined(HW_REALMEM) // FreeBSD.
-                HW_REALMEM
-#        elif defined(HW_PYSMEM) // Others.
-                HW_PHYSMEM
-#        endif
-              };
-        std::uint32_t size(0);
-        std::size_t const sizeLen{sizeof(size)};
-        if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
-            throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
-        return static_cast<std::size_t>(size);
-#    endif
-
-#else
-#    error "getTotalGlobalMemSizeBytes not implemented for this system!"
-#endif
-    }
-
-    //! \return The free number of bytes of global memory.
-    //! \throws std::logic_error if not implemented on the system and std::runtime_error on other errors.
-    inline auto getFreeGlobalMemSizeBytes() -> std::size_t
-    {
-#if BOOST_OS_WINDOWS
-        MEMORYSTATUSEX status;
-        status.dwLength = sizeof(status);
-        GlobalMemoryStatusEx(&status);
-        return static_cast<std::size_t>(status.ullAvailPhys);
-#elif BOOST_OS_LINUX
-#    if defined(_SC_AVPHYS_PAGES)
-        return static_cast<std::size_t>(sysconf(_SC_AVPHYS_PAGES)) * getPageSize();
-#    else
-        // this is legacy and only used as fallback
-        return static_cast<std::size_t>(get_avphys_pages()) * getPageSize();
-#    endif
-#elif BOOST_OS_MACOS
-        int free_pages = 0;
-        std::size_t len = sizeof(free_pages);
-        if(sysctlbyname("vm.page_free_count", &free_pages, &len, nullptr, 0) < 0)
-        {
-            throw std::logic_error("getFreeGlobalMemSizeBytes failed calling sysctl(vm.page_free_count)!");
-        }
-
-        return static_cast<std::size_t>(free_pages) * getPageSize();
-#else
-#    error "getFreeGlobalMemSizeBytes not implemented for this system!"
-#endif
-    }
-
-} // namespace alpaka::cpu::detail
diff --git a/include/alpaka/dev/cpu/Wait.hpp b/include/alpaka/dev/cpu/Wait.hpp
deleted file mode 100644
index 1983674..0000000
--- a/include/alpaka/dev/cpu/Wait.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Rene Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/event/EventCpu.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-namespace alpaka::trait
-{
-    //! The CPU device thread wait specialization.
-    //!
-    //! Blocks until the device has completed all preceding requested tasks.
-    //! Tasks that are enqueued or queues that are created after this call is made are not waited for.
-    template<>
-    struct CurrentThreadWaitFor<DevCpu>
-    {
-        ALPAKA_FN_HOST static auto currentThreadWaitFor(DevCpu const& dev) -> void
-        {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-            generic::currentThreadWaitForDevice(dev);
-        }
-    };
-} // namespace alpaka::trait
diff --git a/include/alpaka/dim/DimArithmetic.hpp b/include/alpaka/dim/DimArithmetic.hpp
deleted file mode 100644
index f0b0edc..0000000
--- a/include/alpaka/dim/DimArithmetic.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dim/DimIntegralConst.hpp"
-
-#include <type_traits>
-
-namespace alpaka::trait
-{
-    //! The arithmetic type dimension getter trait specialization.
-    template<typename T>
-    struct DimType<T, std::enable_if_t<std::is_arithmetic_v<T>>>
-    {
-        using type = DimInt<1u>;
-    };
-} // namespace alpaka::trait
diff --git a/include/alpaka/dim/DimIntegralConst.hpp b/include/alpaka/dim/DimIntegralConst.hpp
deleted file mode 100644
index 69c85b5..0000000
--- a/include/alpaka/dim/DimIntegralConst.hpp
+++ /dev/null
@@ -1,16 +0,0 @@
-/* Copyright 2020 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dim/Traits.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-    // N(th) dimension(s).
-    template<std::size_t N>
-    using DimInt = std::integral_constant<std::size_t, N>;
-} // namespace alpaka
diff --git a/include/alpaka/dim/Traits.hpp b/include/alpaka/dim/Traits.hpp
deleted file mode 100644
index 706b0a7..0000000
--- a/include/alpaka/dim/Traits.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2020 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-namespace alpaka
-{
-    //! The dimension trait.
-    namespace trait
-    {
-        //! The dimension getter type trait.
-        template<typename T, typename TSfinae = void>
-        struct DimType;
-    } // namespace trait
-
-    //! The dimension type trait alias template to remove the ::type.
-    template<typename T>
-    using Dim = typename trait::DimType<T>::type;
-} // namespace alpaka
diff --git a/include/alpaka/elem/Traits.hpp b/include/alpaka/elem/Traits.hpp
deleted file mode 100644
index 690ce76..0000000
--- a/include/alpaka/elem/Traits.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //! The element trait.
-    namespace trait
-    {
-        //! The element type trait.
-        template<typename TView, typename TSfinae = void>
-        struct ElemType;
-    } // namespace trait
-
-    //! The element type trait alias template to remove the ::type.
-    template<typename TView>
-    using Elem = std::remove_volatile_t<typename trait::ElemType<TView>::type>;
-
-    // Trait specializations for unsigned integral types.
-    namespace trait
-    {
-        //! The fundamental type elem type trait specialization.
-        template<typename T>
-        struct ElemType<T, std::enable_if_t<std::is_fundamental_v<T>>>
-        {
-            using type = T;
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/event/EventCpu.hpp b/include/alpaka/event/EventCpu.hpp
deleted file mode 100644
index d883621..0000000
--- a/include/alpaka/event/EventCpu.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/* Copyright 2020 Jeffrey Kelling, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/event/EventGenericThreads.hpp"
-
-namespace alpaka
-{
-    using EventCpu = EventGenericThreads<DevCpu>;
-} // namespace alpaka
diff --git a/include/alpaka/event/EventCpuSycl.hpp b/include/alpaka/event/EventCpuSycl.hpp
deleted file mode 100644
index 91a9517..0000000
--- a/include/alpaka/event/EventCpuSycl.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/event/EventGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
-
-namespace alpaka
-{
-    using EventCpuSycl = EventGenericSycl<TagCpuSycl>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/event/EventCudaRt.hpp b/include/alpaka/event/EventCudaRt.hpp
deleted file mode 100644
index 4dfba7c..0000000
--- a/include/alpaka/event/EventCudaRt.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiCudaRt.hpp"
-#include "alpaka/event/EventUniformCudaHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-namespace alpaka
-{
-    //! The CUDA RT device event.
-    using EventCudaRt = EventUniformCudaHipRt<ApiCudaRt>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/event/EventFpgaSyclIntel.hpp b/include/alpaka/event/EventFpgaSyclIntel.hpp
deleted file mode 100644
index 3646fe7..0000000
--- a/include/alpaka/event/EventFpgaSyclIntel.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/event/EventGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
-
-namespace alpaka
-{
-    using EventFpgaSyclIntel = EventGenericSycl<TagFpgaSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/event/EventGenericSycl.hpp b/include/alpaka/event/EventGenericSycl.hpp
deleted file mode 100644
index 7ea8538..0000000
--- a/include/alpaka/event/EventGenericSycl.hpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/event/Traits.hpp"
-#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
-#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <functional>
-#include <memory>
-#include <stdexcept>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    //! The SYCL device event.
-    template<typename TTag>
-    class EventGenericSycl final
-    {
-    public:
-        explicit EventGenericSycl(DevGenericSycl<TTag> const& dev) : m_dev{dev}
-        {
-        }
-
-        friend auto operator==(EventGenericSycl const& lhs, EventGenericSycl const& rhs) -> bool
-        {
-            return (lhs.m_event == rhs.m_event);
-        }
-
-        friend auto operator!=(EventGenericSycl const& lhs, EventGenericSycl const& rhs) -> bool
-        {
-            return !(lhs == rhs);
-        }
-
-        [[nodiscard]] auto getNativeHandle() const
-        {
-            return m_event;
-        }
-
-        void setEvent(sycl::event const& event)
-        {
-            m_event = event;
-        }
-
-        DevGenericSycl<TTag> m_dev;
-
-    private:
-        sycl::event m_event{};
-    };
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    //! The SYCL device event device get trait specialization.
-    template<typename TTag>
-    struct GetDev<EventGenericSycl<TTag>>
-    {
-        static auto getDev(EventGenericSycl<TTag> const& event) -> DevGenericSycl<TTag>
-        {
-            return event.m_dev;
-        }
-    };
-
-    //! The SYCL device event test trait specialization.
-    template<typename TTag>
-    struct IsComplete<EventGenericSycl<TTag>>
-    {
-        static auto isComplete(EventGenericSycl<TTag> const& event)
-        {
-            auto const status
-                = event.getNativeHandle().template get_info<sycl::info::event::command_execution_status>();
-            return (status == sycl::info::event_command_status::complete);
-        }
-    };
-
-    //! The SYCL queue enqueue trait specialization.
-    template<typename TTag>
-    struct Enqueue<QueueGenericSyclNonBlocking<TTag>, EventGenericSycl<TTag>>
-    {
-        static auto enqueue(QueueGenericSyclNonBlocking<TTag>& queue, EventGenericSycl<TTag>& event)
-        {
-            event.setEvent(queue.m_spQueueImpl->get_last_event());
-        }
-    };
-
-    //! The SYCL queue enqueue trait specialization.
-    template<typename TTag>
-    struct Enqueue<QueueGenericSyclBlocking<TTag>, EventGenericSycl<TTag>>
-    {
-        static auto enqueue(QueueGenericSyclBlocking<TTag>& queue, EventGenericSycl<TTag>& event)
-        {
-            event.setEvent(queue.m_spQueueImpl->get_last_event());
-        }
-    };
-
-    //! The SYCL device event thread wait trait specialization.
-    //!
-    //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
-    //! completed. If the event is not enqueued to a queue the method returns immediately.
-    template<typename TTag>
-    struct CurrentThreadWaitFor<EventGenericSycl<TTag>>
-    {
-        static auto currentThreadWaitFor(EventGenericSycl<TTag> const& event)
-        {
-            event.getNativeHandle().wait_and_throw();
-        }
-    };
-
-    //! The SYCL queue event wait trait specialization.
-    template<typename TTag>
-    struct WaiterWaitFor<QueueGenericSyclNonBlocking<TTag>, EventGenericSycl<TTag>>
-    {
-        static auto waiterWaitFor(QueueGenericSyclNonBlocking<TTag>& queue, EventGenericSycl<TTag> const& event)
-        {
-            queue.m_spQueueImpl->register_dependency(event.getNativeHandle());
-        }
-    };
-
-    //! The SYCL queue event wait trait specialization.
-    template<typename TTag>
-    struct WaiterWaitFor<QueueGenericSyclBlocking<TTag>, EventGenericSycl<TTag>>
-    {
-        static auto waiterWaitFor(QueueGenericSyclBlocking<TTag>& queue, EventGenericSycl<TTag> const& event)
-        {
-            queue.m_spQueueImpl->register_dependency(event.getNativeHandle());
-        }
-    };
-
-    //! The SYCL device event wait trait specialization.
-    //!
-    //! Any future work submitted in any queue of this device will wait for event to complete before beginning
-    //! execution.
-    template<typename TTag>
-    struct WaiterWaitFor<DevGenericSycl<TTag>, EventGenericSycl<TTag>>
-    {
-        static auto waiterWaitFor(DevGenericSycl<TTag>& dev, EventGenericSycl<TTag> const& event)
-        {
-            dev.m_impl->register_dependency(event.getNativeHandle());
-        }
-    };
-
-    //! The SYCL device event native handle trait specialization.
-    template<typename TTag>
-    struct NativeHandle<EventGenericSycl<TTag>>
-    {
-        [[nodiscard]] static auto getNativeHandle(EventGenericSycl<TTag> const& event)
-        {
-            return event.getNativeHandle();
-        }
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/event/EventGenericThreads.hpp b/include/alpaka/event/EventGenericThreads.hpp
deleted file mode 100644
index b588839..0000000
--- a/include/alpaka/event/EventGenericThreads.hpp
+++ /dev/null
@@ -1,395 +0,0 @@
-/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Utility.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/event/Traits.hpp"
-#include "alpaka/queue/QueueGenericThreadsBlocking.hpp"
-#include "alpaka/queue/QueueGenericThreadsNonBlocking.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <condition_variable>
-#include <future>
-#include <mutex>
-#include <utility>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-#    include <iostream>
-#endif
-
-namespace alpaka
-{
-    namespace generic::detail
-    {
-        //! The CPU device event implementation.
-        template<typename TDev>
-        class EventGenericThreadsImpl final
-            : public concepts::Implements<ConceptCurrentThreadWaitFor, EventGenericThreadsImpl<TDev>>
-        {
-        public:
-            EventGenericThreadsImpl(TDev dev) noexcept : m_dev(std::move(dev))
-            {
-            }
-
-            EventGenericThreadsImpl(EventGenericThreadsImpl<TDev> const&) = delete;
-            auto operator=(EventGenericThreadsImpl<TDev> const&) -> EventGenericThreadsImpl<TDev>& = delete;
-
-            auto isReady() noexcept -> bool
-            {
-                return (m_LastReadyEnqueueCount == m_enqueueCount);
-            }
-
-            auto wait(std::size_t const& enqueueCount, std::unique_lock<std::mutex>& lk) const noexcept -> void
-            {
-                ALPAKA_ASSERT(enqueueCount <= m_enqueueCount);
-
-                while(enqueueCount > m_LastReadyEnqueueCount)
-                {
-                    auto future = m_future;
-                    lk.unlock();
-                    future.get();
-                    lk.lock();
-                }
-            }
-
-            TDev const m_dev; //!< The device this event is bound to.
-
-            std::mutex mutable m_mutex; //!< The mutex used to synchronize access to the event.
-            std::shared_future<void> m_future; //!< The future signaling the event completion.
-            std::size_t m_enqueueCount = 0u; //!< The number of times this event has been enqueued.
-            std::size_t m_LastReadyEnqueueCount = 0u; //!< The time this event has been ready the last time.
-                                                      //!< Ready means that the event was not waiting within a queue
-                                                      //!< (not enqueued or already completed). If m_enqueueCount ==
-                                                      //!< m_LastReadyEnqueueCount, the event is currently not enqueued
-        };
-    } // namespace generic::detail
-
-    //! The CPU device event.
-    template<typename TDev>
-    class EventGenericThreads final
-        : public concepts::Implements<ConceptCurrentThreadWaitFor, EventGenericThreads<TDev>>
-        , public concepts::Implements<ConceptGetDev, EventGenericThreads<TDev>>
-    {
-    public:
-        //! \param bBusyWaiting Unused. EventGenericThreads never does busy waiting.
-        EventGenericThreads(TDev const& dev, [[maybe_unused]] bool bBusyWaiting = true)
-            : m_spEventImpl(std::make_shared<generic::detail::EventGenericThreadsImpl<TDev>>(dev))
-        {
-        }
-
-        auto operator==(EventGenericThreads<TDev> const& rhs) const -> bool
-        {
-            return (m_spEventImpl == rhs.m_spEventImpl);
-        }
-
-        auto operator!=(EventGenericThreads<TDev> const& rhs) const -> bool
-        {
-            return !((*this) == rhs);
-        }
-
-    public:
-        std::shared_ptr<generic::detail::EventGenericThreadsImpl<TDev>> m_spEventImpl;
-    };
-
-    namespace trait
-    {
-        //! The CPU device event device type trait specialization.
-        template<typename TDev>
-        struct DevType<EventGenericThreads<TDev>>
-        {
-            using type = TDev;
-        };
-
-        //! The CPU device event device get trait specialization.
-        template<typename TDev>
-        struct GetDev<EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto getDev(EventGenericThreads<TDev> const& event) -> TDev
-            {
-                return event.m_spEventImpl->m_dev;
-            }
-        };
-
-        //! The CPU device event test trait specialization.
-        template<typename TDev>
-        struct IsComplete<EventGenericThreads<TDev>>
-        {
-            //! \return If the event is not waiting within a queue (not enqueued or already handled).
-            ALPAKA_FN_HOST static auto isComplete(EventGenericThreads<TDev> const& event) -> bool
-            {
-                std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-                return event.m_spEventImpl->isReady();
-            }
-        };
-
-        //! The CPU non-blocking device queue enqueue trait specialization.
-        template<typename TDev>
-        struct Enqueue<alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>, EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                [[maybe_unused]] alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>& queueImpl,
-                EventGenericThreads<TDev>& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Copy the shared pointer of the event implementation.
-                // This is forwarded to the lambda that is enqueued into the queue to ensure that the event
-                // implementation is alive as long as it is enqueued.
-                auto spEventImpl = event.m_spEventImpl;
-
-                // Setting the event state and enqueuing it has to be atomic.
-                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                ++spEventImpl->m_enqueueCount;
-
-                auto const enqueueCount = spEventImpl->m_enqueueCount;
-
-                // Enqueue a task that only resets the events flag if it is completed.
-                spEventImpl->m_future = queueImpl.m_workerThread.submit(
-                    [spEventImpl, enqueueCount]() mutable
-                    {
-                        std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
-
-                        // Nothing to do if it has been re-enqueued to a later position in the queue.
-                        if(enqueueCount == spEventImpl->m_enqueueCount)
-                        {
-                            spEventImpl->m_LastReadyEnqueueCount
-                                = std::max(enqueueCount, spEventImpl->m_LastReadyEnqueueCount);
-                        }
-                    });
-            }
-        };
-
-        //! The CPU non-blocking device queue enqueue trait specialization.
-        template<typename TDev>
-        struct Enqueue<QueueGenericThreadsNonBlocking<TDev>, EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueGenericThreadsNonBlocking<TDev>& queue,
-                EventGenericThreads<TDev>& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                alpaka::enqueue(*queue.m_spQueueImpl, event);
-            }
-        };
-
-        //! The CPU blocking device queue enqueue trait specialization.
-        template<typename TDev>
-        struct Enqueue<alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>, EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>& queueImpl,
-                EventGenericThreads<TDev>& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                std::promise<void> promise;
-                {
-                    std::lock_guard<std::mutex> lk(queueImpl.m_mutex);
-
-                    queueImpl.m_bCurrentlyExecutingTask = true;
-
-                    auto& eventImpl(*event.m_spEventImpl);
-
-                    {
-                        // Setting the event state and enqueuing it has to be atomic.
-                        std::lock_guard<std::mutex> evLk(eventImpl.m_mutex);
-
-                        ++eventImpl.m_enqueueCount;
-                        // NOTE: Difference to non-blocking version: directly set the event state instead of enqueuing.
-                        eventImpl.m_LastReadyEnqueueCount = eventImpl.m_enqueueCount;
-
-                        eventImpl.m_future = promise.get_future();
-                    }
-
-                    queueImpl.m_bCurrentlyExecutingTask = false;
-                }
-                promise.set_value();
-            }
-        };
-
-        //! The CPU blocking device queue enqueue trait specialization.
-        template<typename TDev>
-        struct Enqueue<QueueGenericThreadsBlocking<TDev>, EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueGenericThreadsBlocking<TDev>& queue,
-                EventGenericThreads<TDev>& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                alpaka::enqueue(*queue.m_spQueueImpl, event);
-            }
-        };
-    } // namespace trait
-
-    namespace trait
-    {
-        namespace generic
-        {
-            template<typename TDev>
-            ALPAKA_FN_HOST auto currentThreadWaitForDevice(TDev const& dev) -> void
-            {
-                // Get all the queues on the device at the time of invocation.
-                // All queues added afterwards are ignored.
-                auto vQueues = dev.getAllQueues();
-                // Furthermore there should not even be a chance to enqueue something between getting the queues and
-                // adding our wait events!
-                std::vector<EventGenericThreads<TDev>> vEvents;
-                for(auto&& spQueue : vQueues)
-                {
-                    vEvents.emplace_back(dev);
-                    spQueue->enqueue(vEvents.back());
-                }
-
-                // Now wait for all the events.
-                for(auto&& event : vEvents)
-                {
-                    wait(event);
-                }
-            }
-        } // namespace generic
-
-        //! The CPU device event thread wait trait specialization.
-        //!
-        //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
-        //! completed. If the event is not enqueued to a queue the method returns immediately.
-        template<typename TDev>
-        struct CurrentThreadWaitFor<EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto currentThreadWaitFor(EventGenericThreads<TDev> const& event) -> void
-            {
-                wait(*event.m_spEventImpl);
-            }
-        };
-
-        //! The CPU device event implementation thread wait trait specialization.
-        //!
-        //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
-        //! completed. If the event is not enqueued to a queue the method returns immediately.
-        //!
-        //! NOTE: This method is for internal usage only.
-        template<typename TDev>
-        struct CurrentThreadWaitFor<alpaka::generic::detail::EventGenericThreadsImpl<TDev>>
-        {
-            ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                alpaka::generic::detail::EventGenericThreadsImpl<TDev> const& eventImpl) -> void
-            {
-                std::unique_lock<std::mutex> lk(eventImpl.m_mutex);
-
-                auto const enqueueCount = eventImpl.m_enqueueCount;
-                eventImpl.wait(enqueueCount, lk);
-            }
-        };
-
-        //! The CPU non-blocking device queue event wait trait specialization.
-        template<typename TDev>
-        struct WaiterWaitFor<
-            alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>,
-            EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto waiterWaitFor(
-                alpaka::generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>& queueImpl,
-                EventGenericThreads<TDev> const& event) -> void
-            {
-                // Copy the shared pointer of the event implementation.
-                // This is forwarded to the lambda that is enqueued into the queue to ensure that the event
-                // implementation is alive as long as it is enqueued.
-                auto spEventImpl = event.m_spEventImpl;
-
-                std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-                if(!spEventImpl->isReady())
-                {
-                    auto oldFuture = spEventImpl->m_future;
-
-                    // Enqueue a task that waits for the given future of the event.
-                    queueImpl.m_workerThread.submit([oldFuture]() { oldFuture.get(); });
-                }
-            }
-        };
-
-        //! The CPU non-blocking device queue event wait trait specialization.
-        template<typename TDev>
-        struct WaiterWaitFor<QueueGenericThreadsNonBlocking<TDev>, EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto waiterWaitFor(
-                QueueGenericThreadsNonBlocking<TDev>& queue,
-                EventGenericThreads<TDev> const& event) -> void
-            {
-                wait(*queue.m_spQueueImpl, event);
-            }
-        };
-
-        //! The CPU blocking device queue event wait trait specialization.
-        template<typename TDev>
-        struct WaiterWaitFor<alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>, EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto waiterWaitFor(
-                alpaka::generic::detail::QueueGenericThreadsBlockingImpl<TDev>& /* queueImpl */,
-                EventGenericThreads<TDev> const& event) -> void
-            {
-                // NOTE: Difference to non-blocking version: directly wait for event.
-                wait(*event.m_spEventImpl);
-            }
-        };
-
-        //! The CPU blocking device queue event wait trait specialization.
-        template<typename TDev>
-        struct WaiterWaitFor<QueueGenericThreadsBlocking<TDev>, EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto waiterWaitFor(
-                QueueGenericThreadsBlocking<TDev>& queue,
-                EventGenericThreads<TDev> const& event) -> void
-            {
-                wait(*queue.m_spQueueImpl, event);
-            }
-        };
-
-        //! The CPU non-blocking device event wait trait specialization.
-        //!
-        //! Any future work submitted in any queue of this device will wait for event to complete before beginning
-        //! execution.
-        template<typename TDev>
-        struct WaiterWaitFor<TDev, EventGenericThreads<TDev>>
-        {
-            ALPAKA_FN_HOST static auto waiterWaitFor(TDev& dev, EventGenericThreads<TDev> const& event) -> void
-            {
-                // Get all the queues on the device at the time of invocation.
-                // All queues added afterwards are ignored.
-                auto vspQueues(dev.getAllQueues());
-
-                // Let all the queues wait for this event.
-                // Furthermore there should not even be a chance to enqueue something between getting the queues and
-                // adding our wait events!
-                for(auto&& spQueue : vspQueues)
-                {
-                    spQueue->wait(event);
-                }
-            }
-        };
-
-        //! The CPU non-blocking device queue thread wait trait specialization.
-        //!
-        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
-        //! tasks (kernels, data copies, ...)
-        template<typename TDev>
-        struct CurrentThreadWaitFor<QueueGenericThreadsNonBlocking<TDev>>
-        {
-            ALPAKA_FN_HOST static auto currentThreadWaitFor(QueueGenericThreadsNonBlocking<TDev> const& queue) -> void
-            {
-                // Enqueue a dummy tasks into the worker thread of the queue will provide a future we can wait for.
-                // Previously we enqueued an event into the queue but this will not guarantee that queue is empty
-                // after the event is finished because the event handling can be finished before the event task is
-                // fully removed from the queue.
-                auto f = queue.m_spQueueImpl->m_workerThread.submit([]() noexcept {});
-                f.wait();
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/event/EventGpuSyclIntel.hpp b/include/alpaka/event/EventGpuSyclIntel.hpp
deleted file mode 100644
index 508fb57..0000000
--- a/include/alpaka/event/EventGpuSyclIntel.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/event/EventGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
-
-namespace alpaka
-{
-    using EventGpuSyclIntel = EventGenericSycl<TagGpuSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/event/EventHipRt.hpp b/include/alpaka/event/EventHipRt.hpp
deleted file mode 100644
index 06c9bd1..0000000
--- a/include/alpaka/event/EventHipRt.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiHipRt.hpp"
-#include "alpaka/event/EventUniformCudaHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-namespace alpaka
-{
-    //! The HIP RT device event.
-    using EventHipRt = EventUniformCudaHipRt<ApiHipRt>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/event/EventUniformCudaHipRt.hpp b/include/alpaka/event/EventUniformCudaHipRt.hpp
deleted file mode 100644
index 63f1f2f..0000000
--- a/include/alpaka/event/EventUniformCudaHipRt.hpp
+++ /dev/null
@@ -1,263 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/event/Traits.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <functional>
-#include <memory>
-#include <stdexcept>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    namespace uniform_cuda_hip::detail
-    {
-        //! The CUDA/HIP RT device event implementation.
-        template<typename TApi>
-        class EventUniformCudaHipImpl final
-        {
-        public:
-            ALPAKA_FN_HOST EventUniformCudaHipImpl(DevUniformCudaHipRt<TApi> const& dev, bool bBusyWait)
-                : m_dev(dev)
-                , m_UniformCudaHipEvent()
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_dev.getNativeHandle()));
-
-                // Create the event on the current device with the specified flags. Valid flags include:
-                // - cuda/hip-EventDefault: Default event creation flag.
-                // - cuda/hip-EventBlockingSync : Specifies that event should use blocking synchronization.
-                //   A host thread that uses cuda/hip-EventSynchronize() to wait on an event created with this flag
-                //   will block until the event actually completes.
-                // - cuda/hip-EventDisableTiming : Specifies that the created event does not need to record timing
-                // data.
-                //   Events created with this flag specified and the cuda/hip-EventBlockingSync flag not specified
-                //   will provide the best performance when used with cudaStreamWaitEvent() and cudaEventQuery().
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::eventCreateWithFlags(
-                    &m_UniformCudaHipEvent,
-                    (bBusyWait ? TApi::eventDefault : TApi::eventBlockingSync) | TApi::eventDisableTiming));
-            }
-
-            EventUniformCudaHipImpl(EventUniformCudaHipImpl const&) = delete;
-            auto operator=(EventUniformCudaHipImpl const&) -> EventUniformCudaHipImpl& = delete;
-
-            ALPAKA_FN_HOST ~EventUniformCudaHipImpl()
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // In case event has been recorded but has not yet been completed when cuda/hip-EventDestroy() is
-                // called, the function will return immediately and the resources associated with event will be
-                // released automatically once the device has completed event.
-                // -> No need to synchronize here.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::eventDestroy(m_UniformCudaHipEvent));
-            }
-
-            [[nodiscard]] auto getNativeHandle() const noexcept
-            {
-                return m_UniformCudaHipEvent;
-            }
-
-        public:
-            DevUniformCudaHipRt<TApi> const m_dev; //!< The device this event is bound to.
-
-        private:
-            typename TApi::Event_t m_UniformCudaHipEvent;
-        };
-    } // namespace uniform_cuda_hip::detail
-
-    //! The CUDA/HIP RT device event.
-    template<typename TApi>
-    class EventUniformCudaHipRt final
-        : public concepts::Implements<ConceptCurrentThreadWaitFor, EventUniformCudaHipRt<TApi>>
-        , public concepts::Implements<ConceptGetDev, EventUniformCudaHipRt<TApi>>
-    {
-    public:
-        ALPAKA_FN_HOST EventUniformCudaHipRt(DevUniformCudaHipRt<TApi> const& dev, bool bBusyWait = true)
-            : m_spEventImpl(std::make_shared<uniform_cuda_hip::detail::EventUniformCudaHipImpl<TApi>>(dev, bBusyWait))
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-        }
-
-        ALPAKA_FN_HOST auto operator==(EventUniformCudaHipRt<TApi> const& rhs) const -> bool
-        {
-            return (m_spEventImpl == rhs.m_spEventImpl);
-        }
-
-        ALPAKA_FN_HOST auto operator!=(EventUniformCudaHipRt<TApi> const& rhs) const -> bool
-        {
-            return !((*this) == rhs);
-        }
-
-        [[nodiscard]] auto getNativeHandle() const noexcept
-        {
-            return m_spEventImpl->getNativeHandle();
-        }
-
-    public:
-        std::shared_ptr<uniform_cuda_hip::detail::EventUniformCudaHipImpl<TApi>> m_spEventImpl;
-    };
-
-    namespace trait
-    {
-        //! The CUDA/HIP RT device event device type trait specialization.
-        template<typename TApi>
-        struct DevType<EventUniformCudaHipRt<TApi>>
-        {
-            using type = DevUniformCudaHipRt<TApi>;
-        };
-
-        //! The CUDA/HIP RT device event device get trait specialization.
-        template<typename TApi>
-        struct GetDev<EventUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto getDev(EventUniformCudaHipRt<TApi> const& event) -> DevUniformCudaHipRt<TApi>
-            {
-                return event.m_spEventImpl->m_dev;
-            }
-        };
-
-        //! The CUDA/HIP RT device event test trait specialization.
-        template<typename TApi>
-        struct IsComplete<EventUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto isComplete(EventUniformCudaHipRt<TApi> const& event) -> bool
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Query is allowed even for events on non current device.
-                typename TApi::Error_t ret = TApi::success;
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
-                    ret = TApi::eventQuery(event.getNativeHandle()),
-                    TApi::errorNotReady);
-                return (ret == TApi::success);
-            }
-        };
-
-        //! The CUDA/HIP RT queue enqueue trait specialization.
-        template<typename TApi>
-        struct Enqueue<QueueUniformCudaHipRtNonBlocking<TApi>, EventUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
-                EventUniformCudaHipRt<TApi>& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::eventRecord(event.getNativeHandle(), queue.getNativeHandle()));
-            }
-        };
-
-        //! The CUDA/HIP RT queue enqueue trait specialization.
-        template<typename TApi>
-        struct Enqueue<QueueUniformCudaHipRtBlocking<TApi>, EventUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                EventUniformCudaHipRt<TApi>& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::eventRecord(event.getNativeHandle(), queue.getNativeHandle()));
-            }
-        };
-
-        //! The CUDA/HIP RT device event thread wait trait specialization.
-        //!
-        //! Waits until the event itself and therefore all tasks preceding it in the queue it is enqueued to have been
-        //! completed. If the event is not enqueued to a queue the method returns immediately.
-        template<typename TApi>
-        struct CurrentThreadWaitFor<EventUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto currentThreadWaitFor(EventUniformCudaHipRt<TApi> const& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Sync is allowed even for events on non current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::eventSynchronize(event.getNativeHandle()));
-            }
-        };
-
-        //! The CUDA/HIP RT queue event wait trait specialization.
-        template<typename TApi>
-        struct WaiterWaitFor<QueueUniformCudaHipRtNonBlocking<TApi>, EventUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto waiterWaitFor(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
-                EventUniformCudaHipRt<TApi> const& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    TApi::streamWaitEvent(queue.getNativeHandle(), event.getNativeHandle(), 0));
-            }
-        };
-
-        //! The CUDA/HIP RT queue event wait trait specialization.
-        template<typename TApi>
-        struct WaiterWaitFor<QueueUniformCudaHipRtBlocking<TApi>, EventUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto waiterWaitFor(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                EventUniformCudaHipRt<TApi> const& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    TApi::streamWaitEvent(queue.getNativeHandle(), event.getNativeHandle(), 0));
-            }
-        };
-
-        //! The CUDA/HIP RT device event wait trait specialization.
-        //!
-        //! Any future work submitted in any queue of this device will wait for event to complete before beginning
-        //! execution.
-        template<typename TApi>
-        struct WaiterWaitFor<DevUniformCudaHipRt<TApi>, EventUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto waiterWaitFor(
-                DevUniformCudaHipRt<TApi>& dev,
-                EventUniformCudaHipRt<TApi> const& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
-
-                // Get all the queues on the device at the time of invocation.
-                // All queues added afterwards are ignored.
-                auto vQueues = dev.getAllQueues();
-                for(auto&& spQueue : vQueues)
-                {
-                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                        TApi::streamWaitEvent(spQueue->getNativeHandle(), event.getNativeHandle(), 0));
-                }
-            }
-        };
-
-        //! The CUDA/HIP RT event native handle trait specialization.
-        template<typename TApi>
-        struct NativeHandle<EventUniformCudaHipRt<TApi>>
-        {
-            [[nodiscard]] static auto getNativeHandle(EventUniformCudaHipRt<TApi> const& event)
-            {
-                return event.getNativeHandle();
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/event/Traits.hpp b/include/alpaka/event/Traits.hpp
deleted file mode 100644
index 7acb7ab..0000000
--- a/include/alpaka/event/Traits.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/dev/Traits.hpp"
-
-namespace alpaka
-{
-    //! The event management traits.
-    namespace trait
-    {
-        //! The event type trait.
-        template<typename T, typename TSfinae = void>
-        struct EventType;
-
-        //! The event tester trait.
-        template<typename TEvent, typename TSfinae = void>
-        struct IsComplete;
-    } // namespace trait
-
-    //! The event type trait alias template to remove the ::type.
-    template<typename T>
-    using Event = typename trait::EventType<T>::type;
-
-    //! Tests if the given event has already been completed.
-    //!
-    //! \warning This function is allowed to return false negatives. An already completed event can reported as
-    //! uncompleted because the status information are not fully propagated by the used alpaka backend.
-    //! \return true event is finished/complete else false.
-    template<typename TEvent>
-    ALPAKA_FN_HOST auto isComplete(TEvent const& event) -> bool
-    {
-        return trait::IsComplete<TEvent>::isComplete(event);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/example/ExampleDefaultAcc.hpp b/include/alpaka/example/ExampleDefaultAcc.hpp
deleted file mode 100644
index 22f77f9..0000000
--- a/include/alpaka/example/ExampleDefaultAcc.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#include "alpaka/alpaka.hpp"
-
-#pragma once
-
-namespace alpaka
-{
-    //! Alias for the default accelerator used by examples. From a list of
-    //! all accelerators the first one which is enabled is chosen.
-    //! AccCpuSerial is selected last.
-    template<class TDim, class TIdx>
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-    using ExampleDefaultAcc = alpaka::AccGpuCudaRt<TDim, TIdx>;
-#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-    using ExampleDefaultAcc = alpaka::AccGpuHipRt<TDim, TIdx>;
-#elif defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
-    using ExampleDefaultAcc = alpaka::AccCpuOmp2Blocks<TDim, TIdx>;
-#elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
-    using ExampleDefaultAcc = alpaka::AccCpuTbbBlocks<TDim, TIdx>;
-#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED)
-    using ExampleDefaultAcc = alpaka::AccCpuOmp2Threads<TDim, TIdx>;
-#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
-    using ExampleDefaultAcc = alpaka::AccCpuThreads<TDim, TIdx>;
-#elif defined(ALPAKA_ACC_SYCL_ENABLED)
-#    if defined(ALPAKA_SYCL_ONEAPI_CPU)
-    using ExampleDefaultAcc = alpaka::AccCpuSycl<TDim, TIdx>;
-#    elif defined(ALPAKA_SYCL_ONEAPI_FPGA)
-    using ExampleDefaultAcc = alpaka::AccFpgaSyclIntel<TDim, TIdx>;
-#    elif defined(ALPAKA_SYCL_ONEAPI_GPU)
-    using ExampleDefaultAcc = alpaka::AccGpuSyclIntel<TDim, TIdx>;
-#    endif
-#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    using ExampleDefaultAcc = alpaka::AccCpuSerial<TDim, TIdx>;
-#else
-    class ExampleDefaultAcc;
-#    warning "No supported backend selected."
-#endif
-} // namespace alpaka
diff --git a/include/alpaka/example/ExecuteForEachAccTag.hpp b/include/alpaka/example/ExecuteForEachAccTag.hpp
deleted file mode 100644
index 1eae3d8..0000000
--- a/include/alpaka/example/ExecuteForEachAccTag.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#include "alpaka/alpaka.hpp"
-
-#include <functional>
-#include <tuple>
-#include <utility>
-
-#pragma once
-
-namespace alpaka
-{
-    //! execute a callable for each active accelerator tag
-    //
-    // @param callable callable which can be invoked with an accelerator tag
-    // @return disjunction of all invocation results
-    //
-    template<typename TCallable>
-    inline auto executeForEachAccTag(TCallable&& callable)
-    {
-        // Execute the callable once for each enabled accelerator.
-        // Pass the tag as first argument to the callable.
-        return std::apply([=](auto const&... tags) { return (callable(tags) || ...); }, alpaka::EnabledAccTags{});
-    }
-} // namespace alpaka
diff --git a/include/alpaka/exec/ElementIndex.hpp b/include/alpaka/exec/ElementIndex.hpp
deleted file mode 100644
index 061c597..0000000
--- a/include/alpaka/exec/ElementIndex.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma once
-
-namespace alpaka
-{
-
-    /* ElementIndex
-     *
-     * An aggregate that containes the `.global` and `.local` indices of an element along a given dimension.
-     */
-
-    template<typename TIdx>
-    struct ElementIndex
-    {
-        TIdx global; // Index of the element along a given dimension, relative to the whole problem space.
-        TIdx local; // Index of the element along a given dimension, relative to the current group.
-    };
-
-} // namespace alpaka
diff --git a/include/alpaka/exec/IndependentElements.hpp b/include/alpaka/exec/IndependentElements.hpp
deleted file mode 100644
index 447fa7e..0000000
--- a/include/alpaka/exec/IndependentElements.hpp
+++ /dev/null
@@ -1,454 +0,0 @@
-#pragma once
-
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/idx/Accessors.hpp"
-
-#include <algorithm>
-#include <ciso646> // workaround for MSVC in c++17 mode - TODO: remove once we move to c++20
-#include <cstddef>
-#include <type_traits>
-
-namespace alpaka
-{
-
-    namespace detail
-    {
-
-        /* IndependentGroupsAlong
-         *
-         * `IndependentGroupsAlong<TAcc, Dim>(acc, groups)` returns a one-dimensional iteratable range than spans the
-         * group indices from 0 to `groups`; the groups are assigned to the blocks along the `Dim` dimension. If
-         * `groups` is not specified, it defaults to the number of blocks along the `Dim` dimension.
-         *
-         * `independentGroupsAlong<Dim>(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc, Dim>(acc, ...)`
-         * that can infer the accelerator type from the argument.
-         *
-         * In a 1-dimensional kernel, `independentGroups(acc, ...)` is a shorthand for `IndependentGroupsAlong<TAcc,
-         * 0>(acc, ...)`.
-         *
-         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
-         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
-         * when converting CUDA or HIP code, `independentGroupsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
-         * `IndependentGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
-         *
-         * `independentGroupsAlong<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
-         * threads in a block see the same loop iterations, while threads in different blocks may see a different
-         * number of iterations.
-         * If the work division has more blocks than the required number of groups, the first blocks will perform one
-         * iteration of the loop, while the other blocks will exit the loop immediately.
-         * If the work division has less blocks than the required number of groups, some of the blocks will perform
-         * more than one iteration, in order to cover then whole problem space.
-         *
-         * For example,
-         *
-         *   for (auto group: independentGroupsAlong<Dim>(acc, 7))
-         *
-         * will return the group range from 0 to 6, distributed across all blocks in the work division.
-         * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
-         * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from
-         * 0 to 6 will process one group while block 7 will no process any.
-         * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
-         * loop, in order to cover then whole problem space. For example if the work division has 4 blocks, block 0
-         * will process the groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6,
-         * and block 3 will process group 3.
-         */
-
-        template<
-            typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-        class IndependentGroupsAlong
-        {
-        public:
-            using Idx = alpaka::Idx<TAcc>;
-
-            ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc)
-                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
-                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
-                , extent_{stride_}
-            {
-            }
-
-            ALPAKA_FN_ACC inline IndependentGroupsAlong(TAcc const& acc, Idx groups)
-                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
-                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
-                , extent_{groups}
-            {
-            }
-
-            class const_iterator;
-            using iterator = const_iterator;
-
-            ALPAKA_FN_ACC inline const_iterator begin() const
-            {
-                return const_iterator(stride_, extent_, first_);
-            }
-
-            ALPAKA_FN_ACC inline const_iterator end() const
-            {
-                return const_iterator(stride_, extent_, extent_);
-            }
-
-            class const_iterator
-            {
-                friend class IndependentGroupsAlong;
-
-                ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
-                    : stride_{stride}
-                    , extent_{extent}
-                    , first_{std::min(first, extent)}
-                {
-                }
-
-            public:
-                ALPAKA_FN_ACC inline Idx operator*() const
-                {
-                    return first_;
-                }
-
-                // pre-increment the iterator
-                ALPAKA_FN_ACC inline const_iterator& operator++()
-                {
-                    // increment the first-element-in-block index by the grid stride
-                    first_ += stride_;
-                    if(first_ < extent_)
-                        return *this;
-
-                    // the iterator has reached or passed the end of the extent, clamp it to the extent
-                    first_ = extent_;
-                    return *this;
-                }
-
-                // post-increment the iterator
-                ALPAKA_FN_ACC inline const_iterator operator++(int)
-                {
-                    const_iterator old = *this;
-                    ++(*this);
-                    return old;
-                }
-
-                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
-                {
-                    return (first_ == other.first_);
-                }
-
-                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
-                {
-                    return not(*this == other);
-                }
-
-            private:
-                // non-const to support iterator copy and assignment
-                Idx stride_;
-                Idx extent_;
-                // modified by the pre/post-increment operator
-                Idx first_;
-            };
-
-        private:
-            Idx const first_;
-            Idx const stride_;
-            Idx const extent_;
-        };
-
-    } // namespace detail
-
-    /* independentGroups
-     *
-     * `independentGroups(acc, groups)` returns a one-dimensional iteratable range than spans the group indices from 0
-     * to `groups`. If `groups` is not specified, it defaults to the number of blocks.
-     *
-     * `independentGroups(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, 0>(acc, ...)`.
-     *
-     * `independentGroups(acc, ...)` should be called consistently by all the threads in a block. All threads in a
-     * block see the same loop iterations, while threads in different blocks may see a different number of iterations.
-     * If the work division has more blocks than the required number of groups, the first blocks will perform one
-     * iteration of the loop, while the other blocks will exit the loop immediately.
-     * If the work division has less blocks than the required number of groups, some of the blocks will perform more
-     * than one iteration, in order to cover then whole problem space.
-     *
-     * For example,
-     *
-     *   for (auto group: independentGroups(acc, 7))
-     *
-     * will return the group range from 0 to 6, distributed across all blocks in the work division.
-     * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
-     * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
-     * will process one group while block 7 will no process any.
-     * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
-     * loop, in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will
-     * process the groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block
-     * 3 will process group 3.
-     *
-     * Note that `independentGroups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
-     * use
-     *   - `independentGroupsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
-     *   - `independentGroupsAlongX(acc, ...)`, `independentGroupsAlongY(acc, ...)`, or `independentGroupsAlongZ(acc,
-     *     ...)` to loop along the fastest, second-fastest, or third-fastest dimension.
-     */
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
-    ALPAKA_FN_ACC inline auto independentGroups(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::IndependentGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
-    }
-
-    /* independentGroupsAlong<Dim>
-     *
-     * `independentGroupsAlong<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupsAlong<TAcc, Dim>(acc, ...)`
-     * that can infer the accelerator type from the argument.
-     */
-
-    template<
-        std::size_t Dim,
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-    ALPAKA_FN_ACC inline auto independentGroupsAlong(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::IndependentGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
-    }
-
-    /* independentGroupsAlongX, Y, Z
-     *
-     * Like `independentGroups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
-     * dimensions.
-     */
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
-    ALPAKA_FN_ACC inline auto independentGroupsAlongX(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
-    }
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
-    ALPAKA_FN_ACC inline auto independentGroupsAlongY(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
-    }
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
-    ALPAKA_FN_ACC inline auto independentGroupsAlongZ(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::IndependentGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
-    }
-
-    namespace detail
-    {
-
-        /* IndependentGroupElementsAlong
-         *
-         * `independentGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `IndependentGroupElementsAlong<TAcc,
-         * Dim>(acc, ...)` that can infer the accelerator type from the argument.
-         */
-
-        template<
-            typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-        class IndependentGroupElementsAlong
-        {
-        public:
-            using Idx = alpaka::Idx<TAcc>;
-
-            ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc)
-                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
-                , thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
-                , stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
-                , extent_{stride_}
-            {
-            }
-
-            ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx extent)
-                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
-                , thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
-                , stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
-                , extent_{extent}
-            {
-            }
-
-            ALPAKA_FN_ACC inline IndependentGroupElementsAlong(TAcc const& acc, Idx first, Idx extent)
-                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
-                , thread_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_ + first}
-                , stride_{alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[Dim] * elements_}
-                , extent_{extent}
-            {
-            }
-
-            class const_iterator;
-            using iterator = const_iterator;
-
-            ALPAKA_FN_ACC inline const_iterator begin() const
-            {
-                return const_iterator(elements_, stride_, extent_, thread_);
-            }
-
-            ALPAKA_FN_ACC inline const_iterator end() const
-            {
-                return const_iterator(elements_, stride_, extent_, extent_);
-            }
-
-            class const_iterator
-            {
-                friend class IndependentGroupElementsAlong;
-
-                ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
-                    : elements_{elements}
-                    ,
-                    // we need to reduce the stride by on element range because index_ is later increased with each
-                    // increment
-                    stride_{stride - elements}
-                    , extent_{extent}
-                    , index_{std::min(first, extent)}
-                {
-                }
-
-            public:
-                ALPAKA_FN_ACC inline Idx operator*() const
-                {
-                    return index_;
-                }
-
-                // pre-increment the iterator
-                ALPAKA_FN_ACC inline const_iterator& operator++()
-                {
-                    ++indexElem_;
-                    ++index_;
-                    if(indexElem_ >= elements_)
-                    {
-                        indexElem_ = 0;
-                        index_ += stride_;
-                    }
-                    if(index_ >= extent_)
-                        index_ = extent_;
-
-                    return *this;
-                }
-
-                // post-increment the iterator
-                ALPAKA_FN_ACC inline const_iterator operator++(int)
-                {
-                    const_iterator old = *this;
-                    ++(*this);
-                    return old;
-                }
-
-                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
-                {
-                    return (*(*this) == *other);
-                }
-
-                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
-                {
-                    return not(*this == other);
-                }
-
-            private:
-                // non-const to support iterator copy and assignment
-                Idx elements_;
-                Idx stride_;
-                Idx extent_;
-                // modified by the pre/post-increment operator
-                Idx index_;
-                Idx indexElem_ = 0;
-            };
-
-        private:
-            Idx const elements_;
-            Idx const thread_;
-            Idx const stride_;
-            Idx const extent_;
-        };
-
-    } // namespace detail
-
-    /* independentGroupElements
-     */
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
-    ALPAKA_FN_ACC inline auto independentGroupElements(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::IndependentGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
-    }
-
-    /* independentGroupElementsAlong<Dim>
-     *
-     * `independentGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::IndependentGroupElementsAlong<TAcc,
-     * Dim>(acc, ...)` that can infer the accelerator type from the argument.
-     */
-
-    template<
-        std::size_t Dim,
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-    ALPAKA_FN_ACC inline auto independentGroupElementsAlong(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::IndependentGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
-    }
-
-    /* independentGroupElementsAlongX, Y, Z
-     *
-     * Like `independentGroupElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
-     * dimensions.
-     */
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
-    ALPAKA_FN_ACC inline auto independentGroupElementsAlongX(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(
-            acc,
-            static_cast<Idx>(args)...);
-    }
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
-    ALPAKA_FN_ACC inline auto independentGroupElementsAlongY(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(
-            acc,
-            static_cast<Idx>(args)...);
-    }
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
-    ALPAKA_FN_ACC inline auto independentGroupElementsAlongZ(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::IndependentGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(
-            acc,
-            static_cast<Idx>(args)...);
-    }
-
-} // namespace alpaka
diff --git a/include/alpaka/exec/Once.hpp b/include/alpaka/exec/Once.hpp
deleted file mode 100644
index 8a2f2cb..0000000
--- a/include/alpaka/exec/Once.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2024 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/idx/Accessors.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-
-    /* oncePerGrid
-     *
-     * `oncePerGrid(acc)` returns true for a single thread within the kernel execution grid.
-     *
-     * Usually the condition is true for block 0 and thread 0, but these indices should not be relied upon.
-     */
-
-    template<typename TAcc, typename = std::enable_if_t<isAccelerator<TAcc>>>
-    ALPAKA_FN_ACC inline constexpr bool oncePerGrid(TAcc const& acc)
-    {
-        using Dim = alpaka::Dim<TAcc>;
-        using Idx = alpaka::Idx<TAcc>;
-        using Vec = alpaka::Vec<Dim, Idx>;
-
-        // Workaround for a weird bug in oneAPI 2024.x targetting the CPU backend and FPGA emulator.
-        if constexpr(accMatchesTags<TAcc, TagCpuSycl, TagFpgaSyclIntel>)
-        {
-            // SYCL accelerator specific code
-            return acc.m_item_workdiv.get_global_linear_id() == 0;
-        }
-
-        return getIdx<Grid, Threads>(acc) == Vec::zeros();
-    }
-
-    /* oncePerBlock
-     *
-     * `oncePerBlock(acc)` returns true for a single thread within the block.
-     *
-     * Usually the condition is true for thread 0, but this index should not be relied upon.
-     */
-
-    template<typename TAcc, typename = std::enable_if_t<isAccelerator<TAcc>>>
-    ALPAKA_FN_ACC inline constexpr bool oncePerBlock(TAcc const& acc)
-    {
-        return getIdx<Block, Threads>(acc) == Vec<Dim<TAcc>, Idx<TAcc>>::zeros();
-    }
-
-} // namespace alpaka
diff --git a/include/alpaka/exec/UniformElements.hpp b/include/alpaka/exec/UniformElements.hpp
deleted file mode 100644
index 2bfbc94..0000000
--- a/include/alpaka/exec/UniformElements.hpp
+++ /dev/null
@@ -1,1145 +0,0 @@
-#pragma once
-
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/core/Utility.hpp"
-#include "alpaka/exec/ElementIndex.hpp"
-#include "alpaka/idx/Accessors.hpp"
-
-#include <algorithm>
-#include <ciso646> // workaround for MSVC in c++17 mode - TODO: remove once we move to c++20
-#include <cstddef>
-#include <type_traits>
-
-namespace alpaka
-{
-
-    namespace detail
-    {
-
-        /* UniformElementsAlong
-         *
-         * `UniformElementsAlong<TAcc, Dim>(acc [, first], extent)` returns a one-dimensional iteratable range that
-         * spans the element indices from `first` (inclusive) to `extent` (exlusive) along the `Dim` dimension. If
-         * `first` is not specified, it defaults to 0. If `extent` is not specified, it defaults to the kernel grid
-         * size along the `Dim` dimension.
-         *
-         * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc, Dim>(acc, ...)` that
-         * can infer the accelerator type from the argument.
-         *
-         * In a 1-dimensional kernel, `uniformElements(acc, ...)` is a shorthand for `UniformElementsAlong<TAcc,
-         * 0>(acc, ...)`.
-         *
-         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
-         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
-         * when converting CUDA or HIP code, `uniformElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
-         * `UniformElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
-         *
-         * To cover the problem space, different threads may execute a different number of iterations. As a result, it
-         * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
-         * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
-         * loop over each group's elements, and synchronise only in the outer loop:
-         *
-         *  for (auto group : uniformGroupsAlong<Dim>(acc, extent)) {
-         *    for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
-         *       // first part of the computation
-         *       // no synchronisations here
-         *       ...
-         *    }
-         *    // wait for all threads to complete the first part
-         *    alpaka::syncBlockThreads();
-         *    for (auto element : uniformGroupElementsAlong<Dim>(acc, group, extent)) {
-         *       // second part of the computation
-         *       // no synchronisations here
-         *       ...
-         *    }
-         *    // wait for all threads to complete the second part
-         *    alpaka::syncBlockThreads();
-         *    ...
-         *  }
-         *
-         * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
-         * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
-         * example, the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and
-         * check the element index explicitly inside the loop:
-         *
-         *  for (auto element : uniformElementsAlong<N-1>(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
-         *    bool flag = false;
-         *    if (element < extent) {
-         *      // do some work and compute a result flag only for the valid elements
-         *      flag = do_some_work();
-         *    }
-         *    // check if any valid element had a positive result
-         *    if (alpaka::warp::any(acc, flag)) {
-         *      // ...
-         *    }
-         *  }
-         *
-         * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
-         * `N-1`.
-         */
-
-        template<
-            typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-        class UniformElementsAlong
-        {
-        public:
-            using Idx = alpaka::Idx<TAcc>;
-
-            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc)
-                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
-                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
-                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
-                , extent_{stride_}
-            {
-            }
-
-            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx extent)
-                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
-                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
-                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
-                , extent_{extent}
-            {
-            }
-
-            ALPAKA_FN_ACC inline UniformElementsAlong(TAcc const& acc, Idx first, Idx extent)
-                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
-                , first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_ + first}
-                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[Dim] * elements_}
-                , extent_{extent}
-            {
-            }
-
-            class const_iterator;
-            using iterator = const_iterator;
-
-            ALPAKA_FN_ACC inline const_iterator begin() const
-            {
-                return const_iterator(elements_, stride_, extent_, first_);
-            }
-
-            ALPAKA_FN_ACC inline const_iterator end() const
-            {
-                return const_iterator(elements_, stride_, extent_, extent_);
-            }
-
-            class const_iterator
-            {
-                friend class UniformElementsAlong;
-
-                ALPAKA_FN_ACC inline const_iterator(Idx elements, Idx stride, Idx extent, Idx first)
-                    : elements_{elements}
-                    ,
-                    // we need to reduce the stride by on element range because index_ is later increased with each
-                    // increment
-                    stride_{stride - elements}
-                    , extent_{extent}
-                    , index_{std::min(first, extent)}
-                {
-                }
-
-            public:
-                ALPAKA_FN_ACC inline Idx operator*() const
-                {
-                    return index_;
-                }
-
-                // pre-increment the iterator
-                ALPAKA_FN_ACC inline const_iterator& operator++()
-                {
-                    // increment the index along the elements processed by the current thread
-                    ++indexElem_;
-                    ++index_;
-                    if(indexElem_ >= elements_)
-                    {
-                        indexElem_ = 0;
-                        index_ += stride_;
-                    }
-                    if(index_ >= extent_)
-                        index_ = extent_;
-
-                    return *this;
-                }
-
-                // post-increment the iterator
-                ALPAKA_FN_ACC inline const_iterator operator++(int)
-                {
-                    const_iterator old = *this;
-                    ++(*this);
-                    return old;
-                }
-
-                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
-                {
-                    return (*(*this) == *other);
-                }
-
-                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
-                {
-                    return not(*this == other);
-                }
-
-            private:
-                // non-const to support iterator copy and assignment
-                Idx elements_;
-                Idx stride_;
-                Idx extent_;
-                // modified by the pre/post-increment operator
-                Idx index_;
-                Idx indexElem_ = 0;
-            };
-
-        private:
-            Idx const elements_;
-            Idx const first_;
-            Idx const stride_;
-            Idx const extent_;
-        };
-
-    } // namespace detail
-
-    /* uniformElements
-     *
-     * `uniformElements(acc [, first], extent)` returns a one-dimensional iteratable range that spans the element
-     * indices from `first` (inclusive) to `extent` (exlusive). If `first` is not specified, it defaults to 0. If
-     * `extent` is not specified, it defaults to the kernel grid size.
-     *
-     * `uniformElements(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, 0>(acc, ...)`.
-     *
-     * To cover the problem space, different threads may execute a different number of iterations. As a result, it is
-     * not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If a
-     * block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner loop
-     * over each group's elements, and synchronise only in the outer loop:
-     *
-     *  for (auto group : uniformGroups(acc, extent)) {
-     *    for (auto element : uniformGroupElements(acc, group, extent)) {
-     *       // first part of the computation
-     *       // no synchronisations here
-     *       ...
-     *    }
-     *    // wait for all threads to complete the first part
-     *    alpaka::syncBlockThreads();
-     *    for (auto element : uniformGroupElements(acc, group, extent)) {
-     *       // second part of the computation
-     *       // no synchronisations here
-     *       ...
-     *    }
-     *    // wait for all threads to complete the second part
-     *    alpaka::syncBlockThreads();
-     *    ...
-     *  }
-     *
-     * Warp-level primitives require that all threads in the warp execute the same function. If `extent` is not a
-     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
-     * the kernel may hang. To avoid this problem, round up `extent` to a multiple of the warp size, and check the
-     * element index explicitly inside the loop:
-     *
-     *  for (auto element : uniformElements(acc, round_up_by(extent, alpaka::warp::getSize(acc)))) {
-     *    bool flag = false;
-     *    if (element < extent) {
-     *      // do some work and compute a result flag only for elements up to extent
-     *      flag = do_some_work();
-     *    }
-     *    // check if any valid element had a positive result
-     *    if (alpaka::warp::any(acc, flag)) {
-     *      // ...
-     *    }
-     *  }
-     *
-     * Note that `uniformElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
-     * use
-     *   - `uniformElementsND(acc, ...)` to cover an N-dimensional problem space with a single loop;
-     *   - `uniformElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
-     *   - `uniformElementsAlongX(acc, ...)`, `uniformElementsAlongY(acc, ...)`, or `uniformElementsAlongZ(acc, ...)`
-     *     to loop along the fastest, second-fastest, or third-fastest dimension.
-     */
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
-    ALPAKA_FN_ACC inline auto uniformElements(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
-    }
-
-    /* uniformElementsAlong<Dim>
-     *
-     * `uniformElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformElementsAlong<TAcc, Dim>(acc, ...)`
-     * that can infer the accelerator type from the argument.
-     */
-
-    template<
-        std::size_t Dim,
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-    ALPAKA_FN_ACC inline auto uniformElementsAlong(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
-    }
-
-    /* uniformElementsAlongX, Y, Z
-     *
-     * Like `uniformElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
-     * dimensions.
-     */
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
-    ALPAKA_FN_ACC inline auto uniformElementsAlongX(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
-    }
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
-    ALPAKA_FN_ACC inline auto uniformElementsAlongY(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
-    }
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
-    ALPAKA_FN_ACC inline auto uniformElementsAlongZ(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
-    }
-
-    namespace detail
-    {
-
-        /* UniformElementsND
-         *
-         * `UniformElementsND(acc, extent)` returns an N-dimensional iteratable range that spans the element indices
-         * required to cover the given problem size, indicated by `extent`.
-         *
-         * `uniformElementsND(acc, ...)` is an alias for `UniformElementsND<TAcc>(acc, ...)`.
-         *
-         * To cover the problem space, different threads may execute a different number of iterations. As a result, it
-         * is not safe to call `alpaka::syncBlockThreads()` and other block-level synchronisations within this loop. If
-         * a block synchronisation is needed, one should split the loop into an outer loop over the groups and an inner
-         * loop over each group's elements, and synchronise only in the outer loop:
-         *
-         *  for (auto group0 : uniformGroupsAlong<0>(acc, extent[0])) {
-         *    for (auto group1 : uniformGroupsAlong<1>(acc, extent[1])) {
-         *      for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
-         *        for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
-         *           // first part of the computation
-         *           // no synchronisations here
-         *           ...
-         *        }
-         *      }
-         *      // wait for all threads to complete the first part
-         *      alpaka::syncBlockThreads();
-         *      for (auto element0 : uniformGroupElementsAlong<0>(acc, group0, extent[0])) {
-         *        for (auto element1 : uniformGroupElementsAlong<1>(acc, group1, extent[1])) {
-         *           // second part of the computation
-         *           // no synchronisations here
-         *           ...
-         *        }
-         *      }
-         *      // wait for all threads to complete the second part
-         *      alpaka::syncBlockThreads();
-         *      ...
-         *    }
-         *  }
-         *
-         * For more details, see `UniformElementsAlong<TAcc, Dim>(acc, ...)`.
-         */
-
-        template<
-            typename TAcc,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
-        class UniformElementsND
-        {
-        public:
-            using Dim = alpaka::Dim<TAcc>;
-            using Idx = alpaka::Idx<TAcc>;
-            using Vec = alpaka::Vec<Dim, Idx>;
-
-            ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc)
-                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
-                , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
-                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
-                , extent_{stride_}
-            {
-            }
-
-            ALPAKA_FN_ACC inline UniformElementsND(TAcc const& acc, Vec extent)
-                : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}
-                , thread_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}
-                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}
-                , extent_{extent}
-            {
-            }
-
-            // tag used to construct an end iterator
-            struct at_end_t
-            {
-            };
-
-            class const_iterator;
-            using iterator = const_iterator;
-
-            ALPAKA_FN_ACC inline const_iterator begin() const
-            {
-                // check that all dimensions of the current thread index are within the extent
-                if((thread_ < extent_).all())
-                {
-                    // construct an iterator pointing to the first element to be processed by the current thread
-                    return const_iterator{this, thread_};
-                }
-                else
-                {
-                    // construct an end iterator, pointing post the end of the extent
-                    return const_iterator{this, at_end_t{}};
-                }
-            }
-
-            ALPAKA_FN_ACC inline const_iterator end() const
-            {
-                // construct an end iterator, pointing post the end of the extent
-                return const_iterator{this, at_end_t{}};
-            }
-
-            class const_iterator
-            {
-                friend class UniformElementsND;
-
-            public:
-                ALPAKA_FN_ACC inline Vec operator*() const
-                {
-                    return index_;
-                }
-
-                // pre-increment the iterator
-                ALPAKA_FN_ACC inline constexpr const_iterator operator++()
-                {
-                    increment();
-                    return *this;
-                }
-
-                // post-increment the iterator
-                ALPAKA_FN_ACC inline constexpr const_iterator operator++(int)
-                {
-                    const_iterator old = *this;
-                    increment();
-                    return old;
-                }
-
-                ALPAKA_FN_ACC inline constexpr bool operator==(const_iterator const& other) const
-                {
-                    return (index_ == other.index_);
-                }
-
-                ALPAKA_FN_ACC inline constexpr bool operator!=(const_iterator const& other) const
-                {
-                    return not(*this == other);
-                }
-
-            private:
-                // construct an iterator pointing to the first element to be processed by the current thread
-                ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, Vec first)
-                    : loop_{loop}
-                    , first_{alpaka::elementwise_min(first, loop->extent_)}
-                    , range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}
-                    , index_{first_}
-                {
-                }
-
-                // construct an end iterator, pointing post the end of the extent
-                ALPAKA_FN_ACC inline const_iterator(UniformElementsND const* loop, at_end_t const&)
-                    : loop_{loop}
-                    , first_{loop_->extent_}
-                    , range_{loop_->extent_}
-                    , index_{loop_->extent_}
-                {
-                }
-
-                template<size_t I>
-                ALPAKA_FN_ACC inline constexpr bool nth_elements_loop()
-                {
-                    bool overflow = false;
-                    ++index_[I];
-                    if(index_[I] >= range_[I])
-                    {
-                        index_[I] = first_[I];
-                        overflow = true;
-                    }
-                    return overflow;
-                }
-
-                template<size_t N>
-                ALPAKA_FN_ACC inline constexpr bool do_elements_loops()
-                {
-                    if constexpr(N == 0)
-                    {
-                        // overflow
-                        return true;
-                    }
-                    else
-                    {
-                        if(not nth_elements_loop<N - 1>())
-                        {
-                            return false;
-                        }
-                        else
-                        {
-                            return do_elements_loops<N - 1>();
-                        }
-                    }
-                    ALPAKA_UNREACHABLE(false);
-                }
-
-                template<size_t I>
-                ALPAKA_FN_ACC inline constexpr bool nth_strided_loop()
-                {
-                    bool overflow = false;
-                    first_[I] += loop_->stride_[I];
-                    if(first_[I] >= loop_->extent_[I])
-                    {
-                        first_[I] = loop_->thread_[I];
-                        overflow = true;
-                    }
-                    index_[I] = first_[I];
-                    range_[I] = std::min(first_[I] + loop_->elements_[I], loop_->extent_[I]);
-                    return overflow;
-                }
-
-                template<size_t N>
-                ALPAKA_FN_ACC inline constexpr bool do_strided_loops()
-                {
-                    if constexpr(N == 0)
-                    {
-                        // overflow
-                        return true;
-                    }
-                    else
-                    {
-                        if(not nth_strided_loop<N - 1>())
-                        {
-                            return false;
-                        }
-                        else
-                        {
-                            return do_strided_loops<N - 1>();
-                        }
-                    }
-                    ALPAKA_UNREACHABLE(false);
-                }
-
-                // increment the iterator
-                ALPAKA_FN_ACC inline constexpr void increment()
-                {
-                    // linear N-dimensional loops over the elements associated to the thread;
-                    // do_elements_loops<>() returns true if any of those loops overflows
-                    if(not do_elements_loops<Dim::value>())
-                    {
-                        // the elements loops did not overflow, return the next index
-                        return;
-                    }
-
-                    // strided N-dimensional loop over the threads in the kernel launch grid;
-                    // do_strided_loops<>() returns true if any of those loops overflows
-                    if(not do_strided_loops<Dim::value>())
-                    {
-                        // the strided loops did not overflow, return the next index
-                        return;
-                    }
-
-                    // the iterator has reached or passed the end of the extent, clamp it to the extent
-                    first_ = loop_->extent_;
-                    range_ = loop_->extent_;
-                    index_ = loop_->extent_;
-                }
-
-                // const pointer to the UniformElementsND that the iterator refers to
-                UniformElementsND const* loop_;
-
-                // modified by the pre/post-increment operator
-                Vec first_; // first element processed by this thread
-                Vec range_; // last element processed by this thread
-                Vec index_; // current element processed by this thread
-            };
-
-        private:
-            Vec const elements_;
-            Vec const thread_;
-            Vec const stride_;
-            Vec const extent_;
-        };
-
-    } // namespace detail
-
-    /* uniformElementsND
-     *
-     * `uniformElementsND(acc, ...)` is a shorthand for `detail::UniformElementsND<TAcc>(acc, ...)`.
-     */
-
-    template<
-        typename TAcc,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
-    ALPAKA_FN_ACC inline auto uniformElementsND(TAcc const& acc)
-    {
-        return detail::UniformElementsND<TAcc>(acc);
-    }
-
-    template<
-        typename TAcc,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
-    ALPAKA_FN_ACC inline auto uniformElementsND(
-        TAcc const& acc,
-        alpaka::Vec<alpaka::Dim<TAcc>, alpaka::Idx<TAcc>> extent)
-    {
-        return detail::UniformElementsND<TAcc>(acc, extent);
-    }
-
-    namespace detail
-    {
-
-        /* UniformGroupsAlong
-         *
-         * `UniformGroupsAlong<Dim>(acc, elements)` returns a one-dimensional iteratable range than spans the group
-         * indices required to cover the given problem size along the `Dim` dimension, in units of the block size.
-         * `elements` indicates the total number of elements, across all groups; if not specified, it defaults to the
-         * kernel grid size along the `Dim` dimension.
-         *
-         * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupsAlong<TAcc, Dim>(acc, ...)` that can
-         * infer the accelerator type from the argument.
-         *
-         * In a 1-dimensional kernel, `uniformGroups(acc, ...)` is a shorthand for `UniformGroupsAlong<Tacc, 0>(acc,
-         * ...)`.
-         *
-         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
-         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
-         * when converting CUDA or HIP code, `uniformGroupsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
-         * `UniformGroupsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
-         *
-         * `uniformGroupsAlong<Dim>(acc, ...)` should be called consistently by all the threads in a block. All
-         * threads in a block see the same loop iterations, while threads in different blocks may see a different
-         * number of iterations. If the work division has more blocks than the required number of groups, the first
-         * blocks will perform one iteration of the loop, while the other blocks will exit the loop immediately. If the
-         * work division has less blocks than the required number of groups, some of the blocks will perform more than
-         * one iteration, in order to cover then whole problem space.
-         *
-         * If the problem size is not a multiple of the block size, the last group will process a number of elements
-         * smaller than the block size. However, also in this case all threads in the block will execute the same
-         * number of iterations of this loop: this makes it safe to use block-level synchronisations in the loop body.
-         * It is left to the inner loop (or the user) to ensure that only the correct number of threads process any
-         * data; this logic is implemented by `uniformGroupElementsAlong<Dim>(acc, group, elements)`.
-         *
-         * For example, if the block size is 64 and there are 400 elements
-         *
-         *   for (auto group: uniformGroupsAlong<Dim>(acc, 400)
-         *
-         * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
-         * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last
-         * group, group 6, should cover the elements from 384 to 399. All the threads of the block will process this
-         * last group; it is up to the inner loop to not process the non-existing elements after 399.
-         *
-         * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the
-         * other blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from
-         * 0 to 6 will process one group while block 7 will no process any.
-         *
-         * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
-         * loop, in order to cover then whole problem space. For example if the work division has 4 blocks, block 0
-         * will process the groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6,
-         * and block 3 will process group 3.
-         *
-         * See `UniformElementsAlong<TAcc, Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
-         * `uniformGroupElementsAlong<Dim>`.
-         */
-
-        template<
-            typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-        class UniformGroupsAlong
-        {
-        public:
-            using Idx = alpaka::Idx<TAcc>;
-
-            ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc)
-                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
-                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
-                , extent_{stride_}
-            {
-            }
-
-            // extent is the total number of elements (not blocks)
-            ALPAKA_FN_ACC inline UniformGroupsAlong(TAcc const& acc, Idx extent)
-                : first_{alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
-                , stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[Dim]}
-                , extent_{alpaka::core::divCeil(extent, alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim])}
-            {
-            }
-
-            class const_iterator;
-            using iterator = const_iterator;
-
-            ALPAKA_FN_ACC inline const_iterator begin() const
-            {
-                return const_iterator(stride_, extent_, first_);
-            }
-
-            ALPAKA_FN_ACC inline const_iterator end() const
-            {
-                return const_iterator(stride_, extent_, extent_);
-            }
-
-            class const_iterator
-            {
-                friend class UniformGroupsAlong;
-
-                ALPAKA_FN_ACC inline const_iterator(Idx stride, Idx extent, Idx first)
-                    : stride_{stride}
-                    , extent_{extent}
-                    , first_{std::min(first, extent)}
-                {
-                }
-
-            public:
-                ALPAKA_FN_ACC inline Idx operator*() const
-                {
-                    return first_;
-                }
-
-                // pre-increment the iterator
-                ALPAKA_FN_ACC inline const_iterator& operator++()
-                {
-                    // increment the first-element-in-block index by the grid stride
-                    first_ += stride_;
-                    if(first_ < extent_)
-                        return *this;
-
-                    // the iterator has reached or passed the end of the extent, clamp it to the extent
-                    first_ = extent_;
-                    return *this;
-                }
-
-                // post-increment the iterator
-                ALPAKA_FN_ACC inline const_iterator operator++(int)
-                {
-                    const_iterator old = *this;
-                    ++(*this);
-                    return old;
-                }
-
-                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
-                {
-                    return (first_ == other.first_);
-                }
-
-                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
-                {
-                    return not(*this == other);
-                }
-
-            private:
-                // non-const to support iterator copy and assignment
-                Idx stride_;
-                Idx extent_;
-                // modified by the pre/post-increment operator
-                Idx first_;
-            };
-
-        private:
-            Idx const first_;
-            Idx const stride_;
-            Idx const extent_;
-        };
-
-    } // namespace detail
-
-    /* uniformGroups
-     *
-     * `uniformGroups(acc, elements)` returns a one-dimensional iteratable range than spans the group indices required
-     * to cover the given problem size, in units of the block size. `elements` indicates the total number of elements,
-     * across all groups; if not specified, it defaults to the kernel grid size.
-     *
-     * `uniformGroups(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, 0>(acc, ...)`.
-     *
-     * `uniformGroups(acc, ...)` should be called consistently by all the threads in a block. All threads in a block
-     * see the same loop iterations, while threads in different blocks may see a different number of iterations. If the
-     * work division has more blocks than the required number of groups, the first blocks will perform one iteration of
-     * the loop, while the other blocks will exit the loop immediately. If the work division has less blocks than the
-     * required number of groups, some of the blocks will perform more than one iteration, in order to cover then whole
-     * problem space.
-     *
-     * If the problem size is not a multiple of the block size, the last group will process a number of elements
-     * smaller than the block size. However, also in this case all threads in the block will execute the same number of
-     * iterations of this loop: this makes it safe to use block-level synchronisations in the loop body. It is left to
-     * the inner loop (or the user) to ensure that only the correct number of threads process any data; this logic is
-     * implemented by `uniformGroupElements(acc, group, elements)`.
-     *
-     * For example, if the block size is 64 and there are 400 elements
-     *
-     *   for (auto group: uniformGroups(acc, 400)
-     *
-     * will return the group range from 0 to 6, distributed across all blocks in the work division: group 0 should
-     * cover the elements from 0 to 63, group 1 should cover the elements from 64 to 127, etc., until the last group,
-     * group 6, should cover the elements from 384 to 399. All the threads of the block will process this last group;
-     * it is up to the inner loop to not process the non-existing elements after 399.
-     *
-     * If the work division has more than 7 blocks, the first 7 will perform one iteration of the loop, while the other
-     * blocks will exit the loop immediately. For example if the work division has 8 blocks, the blocks from 0 to 6
-     * will process one group while block 7 will no process any.
-     *
-     * If the work division has less than 7 blocks, some of the blocks will perform more than one iteration of the
-     * loop, in order to cover then whole problem space. For example if the work division has 4 blocks, block 0 will
-     * process the groups 0 and 4, block 1 will process groups 1 and 5, group 2 will process groups 2 and 6, and block
-     * 3 will process group 3.
-     *
-     * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
-     *
-     * Note that `uniformGroups(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional kernels,
-     * use
-     *   - `uniformGroupsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension `Dim`;
-     *   - `uniformGroupsAlongX(acc, ...)`, `uniformGroupsAlongY(acc, ...)`, or `uniformGroupsAlongZ(acc, ...)` to loop
-     *     along the fastest, second-fastest, or third-fastest dimension.
-     */
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
-    ALPAKA_FN_ACC inline auto uniformGroups(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformGroupsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
-    }
-
-    /* uniformGroupsAlong<Dim>
-     *
-     * `uniformGroupsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupsAlong<TAcc, Dim>(acc, ...)` that
-     * can infer the accelerator type from the argument.
-     */
-
-    template<
-        std::size_t Dim,
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-    ALPAKA_FN_ACC inline auto uniformGroupsAlong(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformGroupsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
-    }
-
-    /* uniformGroupsAlongX, Y, Z
-     *
-     * Like `uniformGroups` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
-     * dimensions.
-     */
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
-    ALPAKA_FN_ACC inline auto uniformGroupsAlongX(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
-    }
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
-    ALPAKA_FN_ACC inline auto uniformGroupsAlongY(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
-    }
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
-    ALPAKA_FN_ACC inline auto uniformGroupsAlongZ(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformGroupsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
-    }
-
-    namespace detail
-    {
-
-        /* UniformGroupElementsAlong
-         *
-         * `UniformGroupElementsAlong<TAcc, Dim>(acc, group, elements)` returns a one-dimensional iteratable range that
-         * spans all the elements within the given `group` along dimension `Dim`, as obtained from
-         * `UniformGroupsAlong<Dim>`, up to `elements` (exclusive). `elements` indicates the total number of elements
-         * across all groups; if not specified, it defaults to the kernel grid size.
-         *
-         * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `UniformGroupElementsAlong<TAcc, Dim>(acc,
-         * ...)` that can infer the accelerator type from the argument.
-         *
-         * In a 1-dimensional kernel, `uniformGroupElements(acc, ...)` is a shorthand for
-         * `UniformGroupElementsAlong<0>(acc, ...)`.
-         *
-         * In an N-dimensional kernel, dimension 0 is the one that increases more slowly (e.g. the outer loop),
-         * followed by dimension 1, up to dimension N-1 that increases fastest (e.g. the inner loop). For convenience
-         * when converting CUDA or HIP code, `uniformGroupElementsAlongX(acc, ...)`, `Y` and `Z` are shorthands for
-         * `UniformGroupElementsAlong<TAcc, N-1>(acc, ...)`, `<N-2>` and `<N-3>`.
-         *
-         * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local`
-         * indices of the corresponding element. The global index spans a subset of the range from 0 to `elements`
-         * (excluded), while the local index spans the range from 0 to the block size (excluded).
-         *
-         * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if
-         * the global element index reaches `elements`.
-         *
-         * If the problem size is not a multiple of the block size, different threads may execute a different number of
-         * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
-         * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
-         * See `UniformElementsAlong<Dim>(acc, ...)` for a concrete example using `uniformGroupsAlong<Dim>` and
-         * `uniformGroupElementsAlong<Dim>`.
-         *
-         * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
-         * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for
-         * example, the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and
-         * check the element index explicitly inside the loop:
-         *
-         *  for (auto element : uniformGroupElementsAlong<N-1>(acc, group, round_up_by(elements,
-         * alpaka::warp::getSize(acc)))) { bool flag = false; if (element < elements) {
-         *      // do some work and compute a result flag only for the valid elements
-         *      flag = do_some_work();
-         *    }
-         *    // check if any valid element had a positive result
-         *    if (alpaka::warp::any(acc, flag)) {
-         *      // ...
-         *    }
-         *  }
-         *
-         * Note that the use of warp-level primitives is usually suitable only for the fastest-looping dimension,
-         * `N-1`.
-         */
-
-        template<
-            typename TAcc,
-            std::size_t Dim,
-            typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-        class UniformGroupElementsAlong
-        {
-        public:
-            using Idx = alpaka::Idx<TAcc>;
-
-            ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block)
-                : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
-                , local_{alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim] * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
-                , range_{local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim]}
-            {
-            }
-
-            ALPAKA_FN_ACC inline UniformGroupElementsAlong(TAcc const& acc, Idx block, Idx extent)
-                : first_{block * alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[Dim]}
-                , local_{std::min(
-                      extent - first_,
-                      alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[Dim]
-                          * alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
-                , range_{
-                      std::min(extent - first_, local_ + alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[Dim])}
-            {
-            }
-
-            class const_iterator;
-            using iterator = const_iterator;
-
-            ALPAKA_FN_ACC inline const_iterator begin() const
-            {
-                return const_iterator(local_, first_, range_);
-            }
-
-            ALPAKA_FN_ACC inline const_iterator end() const
-            {
-                return const_iterator(range_, first_, range_);
-            }
-
-            class const_iterator
-            {
-                friend class UniformGroupElementsAlong;
-
-                ALPAKA_FN_ACC inline const_iterator(Idx local, Idx first, Idx range)
-                    : index_{local}
-                    , first_{first}
-                    , range_{range}
-                {
-                }
-
-            public:
-                ALPAKA_FN_ACC inline ElementIndex<Idx> operator*() const
-                {
-                    return ElementIndex<Idx>{index_ + first_, index_};
-                }
-
-                // pre-increment the iterator
-                ALPAKA_FN_ACC inline const_iterator& operator++()
-                {
-                    // increment the index along the elements processed by the current thread
-                    ++index_;
-                    if(index_ < range_)
-                        return *this;
-
-                    // the iterator has reached or passed the end of the extent, clamp it to the extent
-                    index_ = range_;
-                    return *this;
-                }
-
-                // post-increment the iterator
-                ALPAKA_FN_ACC inline const_iterator operator++(int)
-                {
-                    const_iterator old = *this;
-                    ++(*this);
-                    return old;
-                }
-
-                ALPAKA_FN_ACC inline bool operator==(const_iterator const& other) const
-                {
-                    return (index_ == other.index_);
-                }
-
-                ALPAKA_FN_ACC inline bool operator!=(const_iterator const& other) const
-                {
-                    return not(*this == other);
-                }
-
-            private:
-                // modified by the pre/post-increment operator
-                Idx index_;
-                // non-const to support iterator copy and assignment
-                Idx first_;
-                Idx range_;
-            };
-
-        private:
-            Idx const first_;
-            Idx const local_;
-            Idx const range_;
-        };
-
-    } // namespace detail
-
-    /* uniformGroupElements
-     *
-     * `uniformGroupElements(acc, group, elements)` returns a one-dimensional iteratable range that spans all the
-     * elements within the given `group`, as obtained from `uniformGroups`, up to `elements` (exclusive). `elements`
-     * indicates the total number of elements across all groups; if not specified, it defaults to the kernel grid size.
-     *
-     * `uniformGroupElements(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<0>(acc, ...)`.
-     *
-     * Iterating over the range yields values of type `ElementIndex`, that provide the `.global` and `.local` indices
-     * of the corresponding element. The global index spans a subset of the range from 0 to `elements` (excluded),
-     * while the local index spans the range from 0 to the block size (excluded).
-     *
-     * The loop will perform a number of iterations up to the number of elements per thread, stopping earlier if the
-     * global element index reaches `elements`.
-     *
-     * If the problem size is not a multiple of the block size, different threads may execute a different number of
-     * iterations. As a result, it is not safe to call `alpaka::syncBlockThreads()` within this loop. If a block
-     * synchronisation is needed, one should split the loop, and synchronise the threads between the loops.
-     * See `uniformElements(acc, ...)` for a concrete example using `uniformGroups` and `uniformGroupElements`.
-     *
-     * Warp-level primitives require that all threads in the warp execute the same function. If `elements` is not a
-     * multiple of the warp size, some of the warps may be incomplete, leading to undefined behaviour - for example,
-     * the kernel may hang. To avoid this problem, round up `elements` to a multiple of the warp size, and check the
-     * element index explicitly inside the loop:
-     *
-     *  for (auto element : uniformGroupElements(acc, group, round_up_by(elements, alpaka::warp::getSize(acc)))) {
-     *    bool flag = false;
-     *    if (element < elements) {
-     *      // do some work and compute a result flag only for the valid elements
-     *      flag = do_some_work();
-     *    }
-     *    // check if any valid element had a positive result
-     *    if (alpaka::warp::any(acc, flag)) {
-     *      // ...
-     *    }
-     *  }
-     *
-     * Note that `uniformGroupElements(acc, ...)` is only suitable for one-dimensional kernels. For N-dimensional
-     * kernels, use
-     *   - `detail::UniformGroupElementsAlong<Dim>(acc, ...)` to perform the iteration explicitly along dimension
-     *     `Dim`;
-     *   - `uniformGroupElementsAlongX(acc, ...)`, `uniformGroupElementsAlongY(acc, ...)`, or
-     *     `uniformGroupElementsAlongZ(acc, ...)` to loop along the fastest, second-fastest, or third-fastest
-     *     dimension.
-     */
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value == 1>>
-    ALPAKA_FN_ACC inline auto uniformGroupElements(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformGroupElementsAlong<TAcc, 0>(acc, static_cast<Idx>(args)...);
-    }
-
-    /* uniformGroupElementsAlong<Dim>
-     *
-     * `uniformGroupElementsAlong<Dim>(acc, ...)` is a shorthand for `detail::UniformGroupElementsAlong<TAcc,
-     * Dim>(acc, ...)` that can infer the accelerator type from the argument.
-     */
-
-    template<
-        std::size_t Dim,
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and alpaka::Dim<TAcc>::value >= Dim>>
-    ALPAKA_FN_ACC inline auto uniformGroupElementsAlong(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformGroupElementsAlong<TAcc, Dim>(acc, static_cast<Idx>(args)...);
-    }
-
-    /* uniformGroupElementsAlongX, Y, Z
-     *
-     * Like `uniformGroupElements` for N-dimensional kernels, along the fastest, second-fastest, and third-fastest
-     * dimensions.
-     */
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
-    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongX(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 1>(acc, static_cast<Idx>(args)...);
-    }
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 1)>>
-    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongY(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 2>(acc, static_cast<Idx>(args)...);
-    }
-
-    template<
-        typename TAcc,
-        typename... TArgs,
-        typename = std::enable_if_t<alpaka::isAccelerator<TAcc> and (alpaka::Dim<TAcc>::value > 2)>>
-    ALPAKA_FN_ACC inline auto uniformGroupElementsAlongZ(TAcc const& acc, TArgs... args)
-    {
-        using Idx = alpaka::Idx<TAcc>;
-        return detail::UniformGroupElementsAlong<TAcc, alpaka::Dim<TAcc>::value - 3>(acc, static_cast<Idx>(args)...);
-    }
-
-} // namespace alpaka
diff --git a/include/alpaka/extent/Traits.hpp b/include/alpaka/extent/Traits.hpp
deleted file mode 100644
index 460269f..0000000
--- a/include/alpaka/extent/Traits.hpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Unreachable.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/meta/Fold.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <functional>
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    //! The extent traits.
-    namespace trait
-    {
-        //! The extent get trait.
-        //!
-        //! If not specialized explicitly it returns 1.
-        template<typename TIdxIntegralConst, typename TExtent, typename TSfinae = void>
-        struct [[deprecated("Specialize GetExtents instead")]] GetExtent
-        {
-            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static auto getExtent(TExtent const&) -> Idx<TExtent>
-            {
-                return static_cast<Idx<TExtent>>(1);
-            } // namespace trait
-        }; // namespace alpaka
-
-        //! The GetExtents trait for getting the extents of an object as an alpaka::Vec.
-        template<typename TExtent, typename TSfinae = void>
-        struct GetExtents;
-    } // namespace trait
-
-    //! \return The extent in the given dimension.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<std::size_t Tidx, typename TExtent>
-    [[deprecated("use getExtents(extent)[Tidx] instead")]] ALPAKA_FN_HOST_ACC auto getExtent(
-        TExtent const& extent = TExtent()) -> Idx<TExtent>
-    {
-#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-        return trait::GetExtent<DimInt<Tidx>, TExtent>::getExtent(extent);
-#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-    }
-
-    //! \return The extents of the given object.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    ALPAKA_FN_HOST_ACC auto getExtents(T const& object) -> Vec<Dim<T>, Idx<T>>
-    {
-        return trait::GetExtents<T>{}(object);
-    }
-
-    //! \tparam T has to specialize GetExtent.
-    //! \return The extents of the given object.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    [[deprecated("use getExtents() instead")]] ALPAKA_FN_HOST_ACC constexpr auto getExtentVec(T const& object = {})
-        -> Vec<Dim<T>, Idx<T>>
-    {
-        return getExtents(object);
-    }
-
-    //! \tparam T has to specialize GetExtent.
-    //! \return The extent but only the last TDim elements.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TDim, typename T>
-    ALPAKA_FN_HOST_ACC constexpr auto getExtentVecEnd(T const& object = {}) -> Vec<TDim, Idx<T>>
-    {
-        static_assert(TDim::value <= Dim<T>::value, "Cannot get more items than the extent holds");
-
-        [[maybe_unused]] auto const e = getExtents(object);
-        Vec<TDim, Idx<T>> v{};
-        if constexpr(TDim::value > 0)
-        {
-            for(unsigned i = 0; i < TDim::value; i++)
-                v[i] = e[(Dim<T>::value - TDim::value) + i];
-        }
-        return v;
-    }
-
-    //! \return The width.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TExtent>
-    ALPAKA_FN_HOST_ACC auto getWidth(TExtent const& extent = TExtent()) -> Idx<TExtent>
-    {
-        if constexpr(Dim<TExtent>::value >= 1)
-            return getExtents(extent)[Dim<TExtent>::value - 1u];
-        else
-            return 1;
-
-        ALPAKA_UNREACHABLE({});
-    }
-
-    //! \return The height.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TExtent>
-    ALPAKA_FN_HOST_ACC auto getHeight(TExtent const& extent = TExtent()) -> Idx<TExtent>
-    {
-        if constexpr(Dim<TExtent>::value >= 2)
-            return getExtents(extent)[Dim<TExtent>::value - 2u];
-        else
-            return 1;
-
-        ALPAKA_UNREACHABLE({});
-    }
-
-    //! \return The depth.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TExtent>
-    ALPAKA_FN_HOST_ACC auto getDepth(TExtent const& extent = TExtent()) -> Idx<TExtent>
-    {
-        if constexpr(Dim<TExtent>::value >= 3)
-            return getExtents(extent)[Dim<TExtent>::value - 3u];
-        else
-            return 1;
-
-        ALPAKA_UNREACHABLE({});
-    }
-
-    //! \return The product of the extents of the given object.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    ALPAKA_FN_HOST_ACC auto getExtentProduct(T const& object) -> Idx<T>
-    {
-        return getExtents(object).prod();
-    }
-
-    namespace trait
-    {
-        //! The Vec extent get trait specialization.
-        template<typename TDim, typename TVal>
-        struct GetExtents<Vec<TDim, TVal>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC constexpr auto operator()(Vec<TDim, TVal> const& extent) const -> Vec<TDim, TVal>
-            {
-                return extent;
-            }
-        };
-
-        template<typename Integral>
-        struct GetExtents<Integral, std::enable_if_t<std::is_integral_v<Integral>>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto operator()(Integral i) const
-            {
-                return Vec{i};
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/idx/Accessors.hpp b/include/alpaka/idx/Accessors.hpp
deleted file mode 100644
index f329728..0000000
--- a/include/alpaka/idx/Accessors.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-#include "alpaka/workdiv/Traits.hpp"
-
-#include <utility>
-
-namespace alpaka
-{
-    //! Get the indices requested.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TOrigin, typename TUnit, typename TIdx, typename TWorkDiv>
-    ALPAKA_FN_HOST_ACC auto getIdx(TIdx const& idx, TWorkDiv const& workDiv) -> Vec<Dim<TWorkDiv>, Idx<TIdx>>
-    {
-        return trait::GetIdx<TIdx, TOrigin, TUnit>::getIdx(idx, workDiv);
-    }
-
-    //! Get the indices requested.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TOrigin, typename TUnit, typename TIdxWorkDiv>
-    ALPAKA_FN_HOST_ACC auto getIdx(TIdxWorkDiv const& idxWorkDiv) -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
-    {
-        return trait::GetIdx<TIdxWorkDiv, TOrigin, TUnit>::getIdx(idxWorkDiv, idxWorkDiv);
-    }
-
-    namespace trait
-    {
-        //! The grid block index get trait specialization for classes with IdxGbBase member type.
-        template<typename TIdxGb>
-        struct GetIdx<TIdxGb, origin::Grid, unit::Blocks>
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptIdxGb, TIdxGb>;
-
-            //! \return The index of the current thread in the grid.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<typename TWorkDiv>
-            ALPAKA_FN_HOST_ACC static auto getIdx(TIdxGb const& idx, TWorkDiv const& workDiv)
-                -> Vec<Dim<ImplementationBase>, Idx<ImplementationBase>>
-            {
-                return trait::GetIdx<ImplementationBase, origin::Grid, unit::Blocks>::getIdx(idx, workDiv);
-            }
-        };
-
-        //! The block thread index get trait specialization for classes with IdxBtBase member type.
-        template<typename TIdxBt>
-        struct GetIdx<TIdxBt, origin::Block, unit::Threads>
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptIdxBt, TIdxBt>;
-
-            //! \return The index of the current thread in the grid.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<typename TWorkDiv>
-            ALPAKA_FN_HOST_ACC static auto getIdx(TIdxBt const& idx, TWorkDiv const& workDiv)
-                -> Vec<Dim<ImplementationBase>, Idx<ImplementationBase>>
-            {
-                return trait::GetIdx<ImplementationBase, origin::Block, unit::Threads>::getIdx(idx, workDiv);
-            }
-        };
-
-        //! The grid thread index get trait specialization.
-        template<typename TIdx>
-        struct GetIdx<TIdx, origin::Grid, unit::Threads>
-        {
-            //! \return The index of the current thread in the grid.
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<typename TWorkDiv>
-            ALPAKA_FN_HOST_ACC static auto getIdx(TIdx const& idx, TWorkDiv const& workDiv)
-            {
-                return alpaka::getIdx<origin::Grid, unit::Blocks>(idx, workDiv)
-                           * getWorkDiv<origin::Block, unit::Threads>(workDiv)
-                       + alpaka::getIdx<origin::Block, unit::Threads>(idx, workDiv);
-            }
-        };
-    } // namespace trait
-
-    //! Get the index of the first element this thread computes.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TIdxWorkDiv, typename TGridThreadIdx, typename TThreadElemExtent>
-    ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(
-        [[maybe_unused]] TIdxWorkDiv const& idxWorkDiv,
-        TGridThreadIdx const& gridThreadIdx,
-        TThreadElemExtent const& threadElemExtent) -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
-    {
-        return gridThreadIdx * threadElemExtent;
-    }
-
-    //! Get the index of the first element this thread computes.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TIdxWorkDiv, typename TGridThreadIdx>
-    ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(TIdxWorkDiv const& idxWorkDiv, TGridThreadIdx const& gridThreadIdx)
-        -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
-    {
-        auto const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(idxWorkDiv));
-        return getIdxThreadFirstElem(idxWorkDiv, gridThreadIdx, threadElemExtent);
-    }
-
-    //! Get the index of the first element this thread computes.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TIdxWorkDiv>
-    ALPAKA_FN_HOST_ACC auto getIdxThreadFirstElem(TIdxWorkDiv const& idxWorkDiv)
-        -> Vec<Dim<TIdxWorkDiv>, Idx<TIdxWorkDiv>>
-    {
-        auto const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(idxWorkDiv));
-        return getIdxThreadFirstElem(idxWorkDiv, gridThreadIdx);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/idx/MapIdx.hpp b/include/alpaka/idx/MapIdx.hpp
deleted file mode 100644
index f081252..0000000
--- a/include/alpaka/idx/MapIdx.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Jan Stephan, Jeffrey Kelling, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Unreachable.hpp"
-#include "alpaka/vec/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //! Maps an N-dimensional index to an N-dimensional position. At least one dimension must always be 1 or zero.
-    //!
-    //! \tparam TDimOut Dimension of the index vector to map to.
-    //! \param in The index vector to map from.
-    //! \param extent The extents of the input or output space, whichever has more than 1 dimensions.
-    ALPAKA_NO_HOST_ACC_WARNING template<
-        std::size_t TDimOut,
-        std::size_t TDimIn,
-        std::size_t TDimExtents,
-        typename TElem>
-    ALPAKA_FN_HOST_ACC auto mapIdx(Vec<DimInt<TDimIn>, TElem> const& in, Vec<DimInt<TDimExtents>, TElem> const& extent)
-        -> Vec<DimInt<TDimOut>, TElem>
-    {
-        if constexpr(TDimOut == 0 || TDimIn == 0)
-            return Vec<DimInt<TDimOut>, TElem>::zeros();
-        else if constexpr(TDimOut == TDimIn)
-            return in;
-        else if constexpr(TDimOut == 1)
-        {
-            TElem out = in[0];
-            for(std::size_t d = 1; d < TDimIn; ++d)
-                out = static_cast<TElem>(out * extent[d] + in[d]);
-            return {out};
-        }
-        else if constexpr(TDimIn == 1)
-        {
-            auto flat = in.front();
-            Vec<DimInt<TDimOut>, TElem> out;
-            for(std::size_t d = TDimOut - 1u; d > 0; d--)
-            {
-                out[d] = static_cast<TElem>(flat % extent[d]);
-                flat /= extent[d];
-            }
-            out.front() = static_cast<TElem>(flat);
-            return out;
-        }
-        else
-            static_assert(!sizeof(TElem), "Not implemented");
-
-        ALPAKA_UNREACHABLE({});
-    }
-
-    //! Maps an N dimensional index to a N dimensional position based on the pitches of a view without padding or a
-    //! byte view. At least one dimension must always be 1 or zero.
-    //!
-    //! \tparam TDimOut Dimension of the index vector to map to.
-    //! \param in The index vector to map from.
-    //! \param pitches The pitches of the input or output space, whichever has more than 1 dimensions.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<std::size_t TDimOut, std::size_t TDimIn, std::size_t TidxDimPitch, typename TElem>
-    ALPAKA_FN_HOST_ACC auto mapIdxPitchBytes(
-        Vec<DimInt<TDimIn>, TElem> const& in,
-        Vec<DimInt<TidxDimPitch>, TElem> const& pitches) -> Vec<DimInt<TDimOut>, TElem>
-    {
-        if constexpr(TDimOut == 0 || TDimIn == 0)
-            return Vec<DimInt<TDimOut>, TElem>::zeros();
-        else if constexpr(TDimOut == TDimIn)
-            return in;
-        else if constexpr(TDimOut == 1)
-        {
-            using DimMinusOne = DimInt<TDimIn - 1>;
-            return {in.back() + (subVecBegin<DimMinusOne>(pitches) * subVecBegin<DimMinusOne>(in)).sum()};
-        }
-        else if constexpr(TDimIn == 1)
-        {
-            auto result = Vec<DimInt<TDimOut>, TElem>::zeros();
-
-            TElem out = in.front();
-            for(std::size_t d = 0; d < TDimOut - 1u; ++d)
-            {
-                result[d] = static_cast<TElem>(out / pitches[d]);
-                out %= pitches[d];
-            }
-            result.back() = out;
-
-            return result;
-        }
-        else
-            static_assert(!sizeof(TElem), "Not implemented");
-
-        ALPAKA_UNREACHABLE({});
-    }
-} // namespace alpaka
diff --git a/include/alpaka/idx/Traits.hpp b/include/alpaka/idx/Traits.hpp
deleted file mode 100644
index 88e2365..0000000
--- a/include/alpaka/idx/Traits.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    struct ConceptIdxBt
-    {
-    };
-
-    struct ConceptIdxGb
-    {
-    };
-
-    //! The idx trait.
-    namespace trait
-    {
-        //! The idx type trait.
-        template<typename T, typename TSfinae = void>
-        struct IdxType;
-    } // namespace trait
-
-    template<typename T>
-    using Idx = typename trait::IdxType<T>::type;
-
-    namespace trait
-    {
-        //! The arithmetic idx type trait specialization.
-        template<typename T>
-        struct IdxType<T, std::enable_if_t<std::is_arithmetic_v<T>>>
-        {
-            using type = std::decay_t<T>;
-        };
-
-        //! The index get trait.
-        template<typename TIdx, typename TOrigin, typename TUnit, typename TSfinae = void>
-        struct GetIdx;
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
deleted file mode 100644
index 54ef780..0000000
--- a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2023 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/core/Sycl.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka::bt
-{
-    //! The SYCL accelerator ND index provider.
-    template<typename TDim, typename TIdx>
-    class IdxBtGenericSycl : public concepts::Implements<ConceptIdxBt, IdxBtGenericSycl<TDim, TIdx>>
-    {
-    public:
-        using IdxBtBase = IdxBtGenericSycl;
-
-        explicit IdxBtGenericSycl(sycl::nd_item<TDim::value> work_item) : m_item_bt{work_item}
-        {
-        }
-
-        sycl::nd_item<TDim::value> m_item_bt;
-    };
-} // namespace alpaka::bt
-
-namespace alpaka::trait
-{
-    //! The SYCL accelerator index dimension get trait specialization.
-    template<typename TDim, typename TIdx>
-    struct DimType<bt::IdxBtGenericSycl<TDim, TIdx>>
-    {
-        using type = TDim;
-    };
-
-    //! The SYCL accelerator block thread index get trait specialization.
-    template<typename TDim, typename TIdx>
-    struct GetIdx<bt::IdxBtGenericSycl<TDim, TIdx>, origin::Block, unit::Threads>
-    {
-        //! \return The index of the current thread in the block.
-        template<typename TWorkDiv>
-        static auto getIdx(bt::IdxBtGenericSycl<TDim, TIdx> const& idx, TWorkDiv const&) -> Vec<TDim, TIdx>
-        {
-            if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>{static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
-            else if constexpr(TDim::value == 2)
-            {
-                return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(idx.m_item_bt.get_local_id(1)),
-                    static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
-            }
-            else
-            {
-                return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(idx.m_item_bt.get_local_id(2)),
-                    static_cast<TIdx>(idx.m_item_bt.get_local_id(1)),
-                    static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
-            }
-        }
-    };
-
-    //! The SYCL accelerator block thread index idx type trait specialization.
-    template<typename TDim, typename TIdx>
-    struct IdxType<bt::IdxBtGenericSycl<TDim, TIdx>>
-    {
-        using type = TIdx;
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/idx/bt/IdxBtLinear.hpp b/include/alpaka/idx/bt/IdxBtLinear.hpp
deleted file mode 100644
index 53f876c..0000000
--- a/include/alpaka/idx/bt/IdxBtLinear.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2022 Axel Huebl, Jeffrey Kelling, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/idx/MapIdx.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-#include "alpaka/workdiv/Traits.hpp"
-
-namespace alpaka
-{
-    namespace bt
-    {
-        //! General ND bt index provider based on a linear index.
-        template<typename TDim, typename TIdx>
-        class IdxBtLinear : public concepts::Implements<ConceptIdxBt, IdxBtLinear<TDim, TIdx>>
-        {
-        public:
-            IdxBtLinear(TIdx blockThreadIdx) : m_blockThreadIdx(blockThreadIdx)
-            {
-            }
-
-            const TIdx m_blockThreadIdx;
-        };
-    } // namespace bt
-
-    namespace trait
-    {
-        //! The IdxBtLinear index dimension get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<bt::IdxBtLinear<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The IdxBtLinear block thread index get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetIdx<bt::IdxBtLinear<TDim, TIdx>, origin::Block, unit::Threads>
-        {
-            //! \return The index of the current thread in the block.
-            template<typename TWorkDiv>
-            static auto getIdx(bt::IdxBtLinear<TDim, TIdx> const& idx, TWorkDiv const& workDiv) -> Vec<TDim, TIdx>
-            {
-                return mapIdx<TDim::value>(
-                    Vec<DimInt<1u>, TIdx>(idx.m_blockThreadIdx),
-                    getWorkDiv<Block, Threads>(workDiv));
-            }
-        };
-
-        template<typename TIdx>
-        struct GetIdx<bt::IdxBtLinear<DimInt<1u>, TIdx>, origin::Block, unit::Threads>
-        {
-            //! \return The index of the current thread in the block.
-            template<typename TWorkDiv>
-            static auto getIdx(bt::IdxBtLinear<DimInt<1u>, TIdx> const& idx, TWorkDiv const&) -> Vec<DimInt<1u>, TIdx>
-            {
-                return idx.m_blockThreadIdx;
-            }
-        };
-
-        //! The IdxBtLinear block thread index idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<bt::IdxBtLinear<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/idx/bt/IdxBtOmp.hpp b/include/alpaka/idx/bt/IdxBtOmp.hpp
deleted file mode 100644
index df5a96a..0000000
--- a/include/alpaka/idx/bt/IdxBtOmp.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/idx/MapIdx.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-#include "alpaka/workdiv/Traits.hpp"
-
-#ifdef _OPENMP
-
-#    include <omp.h>
-
-namespace alpaka
-{
-    namespace bt
-    {
-        //! The OpenMP accelerator index provider.
-        template<typename TDim, typename TIdx>
-        class IdxBtOmp : public concepts::Implements<ConceptIdxBt, IdxBtOmp<TDim, TIdx>>
-        {
-        };
-    } // namespace bt
-
-    namespace trait
-    {
-        //! The OpenMP accelerator index dimension get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<bt::IdxBtOmp<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The OpenMP accelerator block thread index get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetIdx<bt::IdxBtOmp<TDim, TIdx>, origin::Block, unit::Threads>
-        {
-            //! \return The index of the current thread in the block.
-            template<typename TWorkDiv>
-            static auto getIdx(bt::IdxBtOmp<TDim, TIdx> const& /* idx */, TWorkDiv const& workDiv) -> Vec<TDim, TIdx>
-            {
-                // We assume that the thread id is positive.
-                ALPAKA_ASSERT_ACC(::omp_get_thread_num() >= 0);
-                // \TODO: Would it be faster to precompute the index and cache it inside an array?
-                return mapIdx<TDim::value>(
-                    Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(::omp_get_thread_num())),
-                    getWorkDiv<Block, Threads>(workDiv));
-            }
-        };
-
-        template<typename TIdx>
-        struct GetIdx<bt::IdxBtOmp<DimInt<1u>, TIdx>, origin::Block, unit::Threads>
-        {
-            //! \return The index of the current thread in the block.
-            template<typename TWorkDiv>
-            static auto getIdx(bt::IdxBtOmp<DimInt<1u>, TIdx> const& /* idx */, TWorkDiv const&)
-                -> Vec<DimInt<1u>, TIdx>
-            {
-                return Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(::omp_get_thread_num()));
-            }
-        };
-
-        //! The OpenMP accelerator block thread index idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<bt::IdxBtOmp<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp b/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
deleted file mode 100644
index 4d94d0f..0000000
--- a/include/alpaka/idx/bt/IdxBtRefThreadIdMap.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <map>
-#include <thread>
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-
-namespace alpaka
-{
-    namespace bt
-    {
-        //! The threads accelerator index provider.
-        template<typename TDim, typename TIdx>
-        class IdxBtRefThreadIdMap : public concepts::Implements<ConceptIdxBt, IdxBtRefThreadIdMap<TDim, TIdx>>
-        {
-        public:
-            using ThreadIdToIdxMap = std::map<std::thread::id, Vec<TDim, TIdx>>;
-
-            ALPAKA_FN_HOST IdxBtRefThreadIdMap(ThreadIdToIdxMap const& mThreadToIndices)
-                : m_threadToIndexMap(mThreadToIndices)
-            {
-            }
-
-            ALPAKA_FN_HOST IdxBtRefThreadIdMap(IdxBtRefThreadIdMap const&) = delete;
-            ALPAKA_FN_HOST auto operator=(IdxBtRefThreadIdMap const&) -> IdxBtRefThreadIdMap& = delete;
-
-        public:
-            ThreadIdToIdxMap const& m_threadToIndexMap; //!< The mapping of thread id's to thread indices.
-        };
-    } // namespace bt
-
-    namespace trait
-    {
-        //! The CPU threads accelerator index dimension get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<bt::IdxBtRefThreadIdMap<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU threads accelerator block thread index get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetIdx<bt::IdxBtRefThreadIdMap<TDim, TIdx>, origin::Block, unit::Threads>
-        {
-            //! \return The index of the current thread in the block.
-            template<typename TWorkDiv>
-            ALPAKA_FN_HOST static auto getIdx(
-                bt::IdxBtRefThreadIdMap<TDim, TIdx> const& idx,
-                TWorkDiv const& /* workDiv */) -> Vec<TDim, TIdx>
-            {
-                auto const threadId = std::this_thread::get_id();
-                auto const threadEntry = idx.m_threadToIndexMap.find(threadId);
-                ALPAKA_ASSERT(threadEntry != std::end(idx.m_threadToIndexMap));
-                return threadEntry->second;
-            }
-        };
-
-        //! The CPU threads accelerator block thread index idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<bt::IdxBtRefThreadIdMap<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp b/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index ff0366f..0000000
--- a/include/alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
- * Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    namespace bt
-    {
-        //! The CUDA/HIP accelerator ND index provider.
-        template<typename TDim, typename TIdx>
-        class IdxBtUniformCudaHipBuiltIn
-            : public concepts::Implements<ConceptIdxBt, IdxBtUniformCudaHipBuiltIn<TDim, TIdx>>
-        {
-        };
-    } // namespace bt
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-    namespace trait
-    {
-        //! The GPU CUDA/HIP accelerator index dimension get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The GPU CUDA/HIP accelerator block thread index get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetIdx<bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>, origin::Block, unit::Threads>
-        {
-            //! \return The index of the current thread in the block.
-            template<typename TWorkDiv>
-            __device__ static auto getIdx(bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx> const& /* idx */, TWorkDiv const&)
-                -> Vec<TDim, TIdx>
-            {
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return castVec<TIdx>(getOffsetVecEnd<TDim>(threadIdx));
-#        else
-                return getOffsetVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                    static_cast<TIdx>(hipThreadIdx_z),
-                    static_cast<TIdx>(hipThreadIdx_y),
-                    static_cast<TIdx>(hipThreadIdx_x)));
-#        endif
-            }
-        };
-
-        //! The GPU CUDA/HIP accelerator block thread index idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<bt::IdxBtUniformCudaHipBuiltIn<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-
-#    endif
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/idx/bt/IdxBtZero.hpp b/include/alpaka/idx/bt/IdxBtZero.hpp
deleted file mode 100644
index be90326..0000000
--- a/include/alpaka/idx/bt/IdxBtZero.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-namespace alpaka
-{
-    namespace bt
-    {
-        //! A zero block thread index provider.
-        template<typename TDim, typename TIdx>
-        class IdxBtZero : public concepts::Implements<ConceptIdxBt, IdxBtZero<TDim, TIdx>>
-        {
-        };
-    } // namespace bt
-
-    namespace trait
-    {
-        //! The zero block thread index provider dimension get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<bt::IdxBtZero<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The zero block thread index provider block thread index get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetIdx<bt::IdxBtZero<TDim, TIdx>, origin::Block, unit::Threads>
-        {
-            //! \return The index of the current thread in the block.
-            template<typename TWorkDiv>
-            ALPAKA_FN_HOST static auto getIdx(
-                bt::IdxBtZero<TDim, TIdx> const& /* idx */,
-                TWorkDiv const& /* workDiv */) -> Vec<TDim, TIdx>
-            {
-                return Vec<TDim, TIdx>::zeros();
-            }
-        };
-
-        //! The zero block thread index idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<bt::IdxBtZero<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
deleted file mode 100644
index 42547ef..0000000
--- a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2023 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/core/Sycl.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka::gb
-{
-    //! The SYCL accelerator ND index provider.
-    template<typename TDim, typename TIdx>
-    class IdxGbGenericSycl : public concepts::Implements<ConceptIdxGb, IdxGbGenericSycl<TDim, TIdx>>
-    {
-    public:
-        using IdxGbBase = IdxGbGenericSycl;
-
-        explicit IdxGbGenericSycl(sycl::nd_item<TDim::value> work_item) : m_item_gb{work_item}
-        {
-        }
-
-        sycl::nd_item<TDim::value> m_item_gb;
-    };
-} // namespace alpaka::gb
-
-namespace alpaka::trait
-{
-    //! The SYCL accelerator index dimension get trait specialization.
-    template<typename TDim, typename TIdx>
-    struct DimType<gb::IdxGbGenericSycl<TDim, TIdx>>
-    {
-        using type = TDim;
-    };
-
-    //! The SYCL accelerator grid block index get trait specialization.
-    template<typename TDim, typename TIdx>
-    struct GetIdx<gb::IdxGbGenericSycl<TDim, TIdx>, origin::Grid, unit::Blocks>
-    {
-        //! \return The index of the current block in the grid.
-        template<typename TWorkDiv>
-        static auto getIdx(gb::IdxGbGenericSycl<TDim, TIdx> const& idx, TWorkDiv const&)
-        {
-            if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>(static_cast<TIdx>(idx.m_item_gb.get_group(0)));
-            else if constexpr(TDim::value == 2)
-            {
-                return Vec<TDim, TIdx>(
-                    static_cast<TIdx>(idx.m_item_gb.get_group(1)),
-                    static_cast<TIdx>(idx.m_item_gb.get_group(0)));
-            }
-            else
-            {
-                return Vec<TDim, TIdx>(
-                    static_cast<TIdx>(idx.m_item_gb.get_group(2)),
-                    static_cast<TIdx>(idx.m_item_gb.get_group(1)),
-                    static_cast<TIdx>(idx.m_item_gb.get_group(0)));
-            }
-        }
-    };
-
-    //! The SYCL accelerator grid block index idx type trait specialization.
-    template<typename TDim, typename TIdx>
-    struct IdxType<gb::IdxGbGenericSycl<TDim, TIdx>>
-    {
-        using type = TIdx;
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/idx/gb/IdxGbLinear.hpp b/include/alpaka/idx/gb/IdxGbLinear.hpp
deleted file mode 100644
index d35eb50..0000000
--- a/include/alpaka/idx/gb/IdxGbLinear.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright 2022 Jeffrey Kelling, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/idx/MapIdx.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-#include "alpaka/workdiv/Traits.hpp"
-
-namespace alpaka
-{
-    namespace gb
-    {
-        //! General ND index provider based on a linear index.
-        template<typename TDim, typename TIdx>
-        class IdxGbLinear : public concepts::Implements<ConceptIdxGb, IdxGbLinear<TDim, TIdx>>
-        {
-        public:
-            IdxGbLinear(TIdx const& teamOffset = static_cast<TIdx>(0u)) : m_gridBlockIdx(teamOffset)
-            {
-            }
-
-            TIdx const m_gridBlockIdx;
-        };
-    } // namespace gb
-
-    namespace trait
-    {
-        //! The IdxGbLinear index dimension get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<gb::IdxGbLinear<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The IdxGbLinear grid block index get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetIdx<gb::IdxGbLinear<TDim, TIdx>, origin::Grid, unit::Blocks>
-        {
-            //! \return The index of the current block in the grid.
-            template<typename TWorkDiv>
-            static auto getIdx(gb::IdxGbLinear<TDim, TIdx> const& idx, TWorkDiv const& workDiv) -> Vec<TDim, TIdx>
-            {
-                // \TODO: Would it be faster to precompute the index and cache it inside an array?
-                return mapIdx<TDim::value>(
-                    Vec<DimInt<1u>, TIdx>(idx.m_gridBlockIdx),
-                    getWorkDiv<Grid, Blocks>(workDiv));
-            }
-        };
-
-        template<typename TIdx>
-        struct GetIdx<gb::IdxGbLinear<DimInt<1u>, TIdx>, origin::Grid, unit::Blocks>
-        {
-            //! \return The index of the current block in the grid.
-            template<typename TWorkDiv>
-            static auto getIdx(gb::IdxGbLinear<DimInt<1u>, TIdx> const& idx, TWorkDiv const&) -> Vec<DimInt<1u>, TIdx>
-            {
-                return idx.m_gridBlockIdx;
-            }
-        };
-
-        //! The IdxGbLinear grid block index idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<gb::IdxGbLinear<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/idx/gb/IdxGbRef.hpp b/include/alpaka/idx/gb/IdxGbRef.hpp
deleted file mode 100644
index 6e3d9a6..0000000
--- a/include/alpaka/idx/gb/IdxGbRef.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-namespace alpaka
-{
-    namespace gb
-    {
-        //! A IdxGbRef grid block index.
-        template<typename TDim, typename TIdx>
-        class IdxGbRef : public concepts::Implements<ConceptIdxGb, IdxGbRef<TDim, TIdx>>
-        {
-        public:
-            IdxGbRef(Vec<TDim, TIdx> const& gridBlockIdx) : m_gridBlockIdx(gridBlockIdx)
-            {
-            }
-
-            Vec<TDim, TIdx> const& m_gridBlockIdx;
-        };
-    } // namespace gb
-
-    namespace trait
-    {
-        //! The IdxGbRef grid block index dimension get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<gb::IdxGbRef<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The IdxGbRef grid block index grid block index get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetIdx<gb::IdxGbRef<TDim, TIdx>, origin::Grid, unit::Blocks>
-        {
-            //! \return The index of the current block in the grid.
-            template<typename TWorkDiv>
-            ALPAKA_FN_HOST static auto getIdx(gb::IdxGbRef<TDim, TIdx> const& idx, TWorkDiv const& /* workDiv */)
-                -> Vec<TDim, TIdx>
-            {
-                return idx.m_gridBlockIdx;
-            }
-        };
-
-        //! The IdxGbRef grid block index idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<gb::IdxGbRef<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp b/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index a643533..0000000
--- a/include/alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Matthias Werner, Jan Stephan, Andrea Bocci, Bernhard
- * Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    namespace gb
-    {
-        //! The CUDA/HIP accelerator ND index provider.
-        template<typename TDim, typename TIdx>
-        class IdxGbUniformCudaHipBuiltIn
-            : public concepts::Implements<ConceptIdxGb, IdxGbUniformCudaHipBuiltIn<TDim, TIdx>>
-        {
-        };
-    } // namespace gb
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-    namespace trait
-    {
-        //! The GPU CUDA/HIP accelerator index dimension get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The GPU CUDA/HIP accelerator grid block index get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetIdx<gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>, origin::Grid, unit::Blocks>
-        {
-            //! \return The index of the current block in the grid.
-            template<typename TWorkDiv>
-            __device__ static auto getIdx(gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx> const& /* idx */, TWorkDiv const&)
-                -> Vec<TDim, TIdx>
-            {
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return castVec<TIdx>(getOffsetVecEnd<TDim>(blockIdx));
-#        else
-                return getOffsetVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                    static_cast<TIdx>(hipBlockIdx_z),
-                    static_cast<TIdx>(hipBlockIdx_y),
-                    static_cast<TIdx>(hipBlockIdx_x)));
-#        endif
-            }
-        };
-
-        //! The GPU CUDA/HIP accelerator grid block index idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<gb::IdxGbUniformCudaHipBuiltIn<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-
-#    endif
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/intrinsic/IntrinsicCpu.hpp b/include/alpaka/intrinsic/IntrinsicCpu.hpp
deleted file mode 100644
index 5db927b..0000000
--- a/include/alpaka/intrinsic/IntrinsicCpu.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Unreachable.hpp"
-#include "alpaka/intrinsic/IntrinsicFallback.hpp"
-#include "alpaka/intrinsic/Traits.hpp"
-
-#include <bitset>
-#include <climits>
-#if __has_include(<version>) // Not part of the C++17 standard but all major standard libraries include this
-#    include <version>
-#endif
-#ifdef __cpp_lib_bitops
-#    include <bit>
-#endif
-
-#if BOOST_COMP_MSVC
-#    include <intrin.h>
-#endif
-
-namespace alpaka
-{
-    //! The CPU intrinsic.
-    class IntrinsicCpu : public concepts::Implements<ConceptIntrinsic, IntrinsicCpu>
-    {
-    };
-
-    namespace trait
-    {
-        template<>
-        struct Popcount<IntrinsicCpu>
-        {
-            template<typename UnsignedIntegral>
-            static auto popcount(IntrinsicCpu const& /*intrinsic*/, UnsignedIntegral value) -> std::int32_t
-            {
-#ifdef __cpp_lib_bitops
-                return std::popcount(value);
-#elif BOOST_COMP_GNUC || BOOST_COMP_CLANG
-                if constexpr(sizeof(UnsignedIntegral) == 8)
-                    return __builtin_popcountll(value);
-                else
-                    return __builtin_popcount(value);
-#elif BOOST_COMP_MSVC
-                if constexpr(sizeof(UnsignedIntegral) == 8)
-                    return static_cast<std::int32_t>(__popcnt64(value));
-                else
-                    return __popcnt(value);
-#else
-                // Fallback to standard library
-                return static_cast<std::int32_t>(std::bitset<sizeof(UnsignedIntegral) * CHAR_BIT>(value).count());
-#endif
-                ALPAKA_UNREACHABLE(0);
-            }
-        };
-
-        template<>
-        struct Ffs<IntrinsicCpu>
-        {
-            template<typename Integral>
-            static auto ffs(IntrinsicCpu const& /*intrinsic*/, Integral value) -> std::int32_t
-            {
-#ifdef __cpp_lib_bitops
-                return value == 0 ? 0 : std::countr_zero(static_cast<std::make_unsigned_t<Integral>>(value)) + 1;
-#elif BOOST_COMP_GNUC || BOOST_COMP_CLANG
-                if constexpr(sizeof(Integral) == 8)
-                    return __builtin_ffsll(value);
-                else
-                    return __builtin_ffs(value);
-#elif BOOST_COMP_MSVC
-                // Implementation based on
-                // https://gitlab.freedesktop.org/cairo/cairo/commit/f5167dc2e1a13d8c4e5d66d7178a24b9b5e7ac7a
-                unsigned long index = 0u;
-                if constexpr(sizeof(Integral) == 8)
-                    return _BitScanForward64(&index, value) == 0 ? 0 : static_cast<std::int32_t>(index + 1u);
-                else
-                    return _BitScanForward(&index, value) == 0 ? 0 : static_cast<std::int32_t>(index + 1u);
-#else
-                return alpaka::detail::ffsFallback(value);
-#endif
-                ALPAKA_UNREACHABLE(0);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/intrinsic/IntrinsicFallback.hpp b/include/alpaka/intrinsic/IntrinsicFallback.hpp
deleted file mode 100644
index 1e9f3a4..0000000
--- a/include/alpaka/intrinsic/IntrinsicFallback.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2022 Sergei Bastrakov, Jeffrey Kelling, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/intrinsic/Traits.hpp"
-
-namespace alpaka
-{
-    namespace detail
-    {
-        //! Fallback implementation of popcount.
-        template<typename TValue>
-        static auto popcountFallback(TValue value) -> std::int32_t
-        {
-            TValue count = 0;
-            while(value != 0)
-            {
-                count += value & 1u;
-                value >>= 1u;
-            }
-            return static_cast<std::int32_t>(count);
-        }
-
-        //! Fallback implementation of ffs.
-        template<typename TValue>
-        static auto ffsFallback(TValue value) -> std::int32_t
-        {
-            if(value == 0)
-                return 0;
-            std::int32_t result = 1;
-            while((value & 1) == 0)
-            {
-                value >>= 1;
-                result++;
-            }
-            return result;
-        }
-    } // namespace detail
-
-    //! The Fallback intrinsic.
-    class IntrinsicFallback : public concepts::Implements<ConceptIntrinsic, IntrinsicFallback>
-    {
-    };
-
-    namespace trait
-    {
-        template<>
-        struct Popcount<IntrinsicFallback>
-        {
-            static auto popcount(IntrinsicFallback const& /*intrinsic*/, std::uint32_t value) -> std::int32_t
-            {
-                return alpaka::detail::popcountFallback(value);
-            }
-
-            static auto popcount(IntrinsicFallback const& /*intrinsic*/, std::uint64_t value) -> std::int32_t
-            {
-                return alpaka::detail::popcountFallback(value);
-            }
-        };
-
-        template<>
-        struct Ffs<IntrinsicFallback>
-        {
-            static auto ffs(IntrinsicFallback const& /*intrinsic*/, std::int32_t value) -> std::int32_t
-            {
-                return alpaka::detail::ffsFallback(value);
-            }
-
-            static auto ffs(IntrinsicFallback const& /*intrinsic*/, std::int64_t value) -> std::int32_t
-            {
-                return alpaka::detail::ffsFallback(value);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
deleted file mode 100644
index 395043a..0000000
--- a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2022 Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/intrinsic/IntrinsicFallback.hpp"
-#include "alpaka/intrinsic/Traits.hpp"
-
-#include <cstdint>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    //! The SYCL intrinsic.
-    class IntrinsicGenericSycl : public concepts::Implements<ConceptIntrinsic, IntrinsicGenericSycl>
-    {
-    };
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    template<>
-    struct Popcount<IntrinsicGenericSycl>
-    {
-        static auto popcount(IntrinsicGenericSycl const&, std::uint32_t value) -> std::int32_t
-        {
-            return static_cast<std::int32_t>(sycl::popcount(value));
-        }
-
-        static auto popcount(IntrinsicGenericSycl const&, std::uint64_t value) -> std::int32_t
-        {
-            return static_cast<std::int32_t>(sycl::popcount(value));
-        }
-    };
-
-    template<>
-    struct Ffs<IntrinsicGenericSycl>
-    {
-        static auto ffs(IntrinsicGenericSycl const&, std::int32_t value) -> std::int32_t
-        {
-            // There is no FFS operation in SYCL but we can emulate it using popcount.
-            return (value == 0) ? 0 : sycl::popcount(value ^ ~(-value));
-        }
-
-        static auto ffs(IntrinsicGenericSycl const&, std::int64_t value) -> std::int32_t
-        {
-            // There is no FFS operation in SYCL but we can emulate it using popcount.
-            return (value == 0l) ? 0 : static_cast<std::int32_t>(sycl::popcount(value ^ ~(-value)));
-        }
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp b/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index c73f973..0000000
--- a/include/alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2022 Sergei Bastrakov, Andrea Bocci, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/intrinsic/Traits.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    //! The GPU CUDA/HIP intrinsic.
-    class IntrinsicUniformCudaHipBuiltIn
-        : public concepts::Implements<ConceptIntrinsic, IntrinsicUniformCudaHipBuiltIn>
-    {
-    };
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-    namespace trait
-    {
-        template<>
-        struct Popcount<IntrinsicUniformCudaHipBuiltIn>
-        {
-            __device__ static auto popcount(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::uint32_t value)
-                -> std::int32_t
-            {
-#        if BOOST_COMP_CLANG && BOOST_LANG_CUDA
-                return __popc(static_cast<int>(value));
-#        else
-                return static_cast<std::int32_t>(__popc(static_cast<unsigned int>(value)));
-#        endif
-            }
-
-            __device__ static auto popcount(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::uint64_t value)
-                -> std::int32_t
-            {
-#        if BOOST_COMP_CLANG && BOOST_LANG_CUDA
-                return __popcll(static_cast<long long>(value));
-#        else
-                return static_cast<std::int32_t>(__popcll(static_cast<unsigned long long>(value)));
-#        endif
-            }
-        };
-
-        template<>
-        struct Ffs<IntrinsicUniformCudaHipBuiltIn>
-        {
-            __device__ static auto ffs(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::int32_t value)
-                -> std::int32_t
-            {
-                return static_cast<std::int32_t>(__ffs(static_cast<int>(value)));
-            }
-
-            __device__ static auto ffs(IntrinsicUniformCudaHipBuiltIn const& /*intrinsic*/, std::int64_t value)
-                -> std::int32_t
-            {
-                return static_cast<std::int32_t>(__ffsll(static_cast<long long>(value)));
-            }
-        };
-    } // namespace trait
-
-#    endif
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/intrinsic/Traits.hpp b/include/alpaka/intrinsic/Traits.hpp
deleted file mode 100644
index 8aea0a4..0000000
--- a/include/alpaka/intrinsic/Traits.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#include <cstdint>
-#include <type_traits>
-
-namespace alpaka
-{
-    struct ConceptIntrinsic
-    {
-    };
-
-    //! The intrinsics traits.
-    namespace trait
-    {
-        //! The popcount trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct Popcount;
-
-        //! The ffs trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct Ffs;
-    } // namespace trait
-
-    //! Returns the number of 1 bits in the given 32-bit value.
-    //!
-    //! \tparam TIntrinsic The intrinsic implementation type.
-    //! \param intrinsic The intrinsic implementation.
-    //! \param value The input value.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TIntrinsic>
-    ALPAKA_FN_ACC auto popcount(TIntrinsic const& intrinsic, std::uint32_t value) -> std::int32_t
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>;
-        return trait::Popcount<ImplementationBase>::popcount(intrinsic, value);
-    }
-
-    //! Returns the number of 1 bits in the given 64-bit value.
-    //!
-    //! \tparam TIntrinsic The intrinsic implementation type.
-    //! \param intrinsic The intrinsic implementation.
-    //! \param value The input value.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TIntrinsic>
-    ALPAKA_FN_ACC auto popcount(TIntrinsic const& intrinsic, std::uint64_t value) -> std::int32_t
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>;
-        return trait::Popcount<ImplementationBase>::popcount(intrinsic, value);
-    }
-
-    //! Returns the 1-based position of the least significant bit set to 1
-    //! in the given 32-bit value. Returns 0 for input value 0.
-    //!
-    //! \tparam TIntrinsic The intrinsic implementation type.
-    //! \param intrinsic The intrinsic implementation.
-    //! \param value The input value.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TIntrinsic>
-    ALPAKA_FN_ACC auto ffs(TIntrinsic const& intrinsic, std::int32_t value) -> std::int32_t
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>;
-        return trait::Ffs<ImplementationBase>::ffs(intrinsic, value);
-    }
-
-    //! Returns the 1-based position of the least significant bit set to 1
-    //! in the given 64-bit value. Returns 0 for input value 0.
-    //!
-    //! \tparam TIntrinsic The intrinsic implementation type.
-    //! \param intrinsic The intrinsic implementation.
-    //! \param value The input value.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TIntrinsic>
-    ALPAKA_FN_ACC auto ffs(TIntrinsic const& intrinsic, std::int64_t value) -> std::int32_t
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptIntrinsic, TIntrinsic>;
-        return trait::Ffs<ImplementationBase>::ffs(intrinsic, value);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/kernel/KernelFunctionAttributes.hpp b/include/alpaka/kernel/KernelFunctionAttributes.hpp
deleted file mode 100644
index 0371430..0000000
--- a/include/alpaka/kernel/KernelFunctionAttributes.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright 2022 René Widera, Mehmet Yusufoglu
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <cstddef>
-
-namespace alpaka
-{
-    //! Kernel function attributes struct. Attributes are filled by calling the API of the accelerator using the kernel
-    //! function as an argument. In case of a CPU backend, maxThreadsPerBlock is set to 1 and other values remain zero
-    //! since there are no correponding API functions to get the values.
-    struct KernelFunctionAttributes
-    {
-        std::size_t constSizeBytes{0};
-        std::size_t localSizeBytes{0};
-        std::size_t sharedSizeBytes{0};
-        int maxDynamicSharedSizeBytes{0};
-        int numRegs{0};
-        // This field is ptx or isa version if the backend is GPU
-        int asmVersion{0};
-        int maxThreadsPerBlock{0};
-    };
-} // namespace alpaka
diff --git a/include/alpaka/kernel/SyclSubgroupSize.hpp b/include/alpaka/kernel/SyclSubgroupSize.hpp
deleted file mode 100644
index 1c7124b..0000000
--- a/include/alpaka/kernel/SyclSubgroupSize.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright 2023 Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    ifdef __SYCL_DEVICE_ONLY__
-
-#        if(__SYCL_TARGET_INTEL_GPU_BDW__) || /* Broadwell Intel graphics architecture */                             \
-            (__SYCL_TARGET_INTEL_GPU_SKL__) || /* Skylake Intel graphics architecture */                              \
-            (__SYCL_TARGET_INTEL_GPU_KBL__) || /* Kaby Lake Intel graphics architecture */                            \
-            (__SYCL_TARGET_INTEL_GPU_CFL__) || /* Coffee Lake Intel graphics architecture */                          \
-            (__SYCL_TARGET_INTEL_GPU_APL__) || /* Apollo Lake Intel graphics architecture */                          \
-            (__SYCL_TARGET_INTEL_GPU_GLK__) || /* Gemini Lake Intel graphics architecture */                          \
-            (__SYCL_TARGET_INTEL_GPU_WHL__) || /* Whiskey Lake Intel graphics architecture */                         \
-            (__SYCL_TARGET_INTEL_GPU_AML__) || /* Amber Lake Intel graphics architecture */                           \
-            (__SYCL_TARGET_INTEL_GPU_CML__) || /* Comet Lake Intel graphics architecture */                           \
-            (__SYCL_TARGET_INTEL_GPU_ICLLP__) || /* Ice Lake Intel graphics architecture */                           \
-            (__SYCL_TARGET_INTEL_GPU_EHL__) || /* Elkhart Lake or Jasper Lake Intel graphics architecture */          \
-            (__SYCL_TARGET_INTEL_GPU_TGLLP__) || /* Tiger Lake Intel graphics architecture */                         \
-            (__SYCL_TARGET_INTEL_GPU_RKL__) || /* Rocket Lake Intel graphics architecture */                          \
-            (__SYCL_TARGET_INTEL_GPU_ADL_S__) || /* Alder Lake S or Raptor Lake S Intel graphics architecture */      \
-            (__SYCL_TARGET_INTEL_GPU_ADL_P__) || /* Alder Lake P Intel graphics architecture */                       \
-            (__SYCL_TARGET_INTEL_GPU_ADL_N__) || /* Alder Lake N Intel graphics architecture */                       \
-            (__SYCL_TARGET_INTEL_GPU_DG1__) || /* DG1 Intel graphics architecture */                                  \
-            (__SYCL_TARGET_INTEL_GPU_ACM_G10__) || /* Alchemist G10 Intel graphics architecture */                    \
-            (__SYCL_TARGET_INTEL_GPU_ACM_G11__) || /* Alchemist G11 Intel graphics architecture */                    \
-            (__SYCL_TARGET_INTEL_GPU_ACM_G12__) || /* Alchemist G12 Intel graphics architecture */                    \
-            (__SYCL_TARGET_INTEL_GPU_MTL_U__) || /* Meteor Lake U/S or Arrow Lake U/S Intel graphics architecture */  \
-            (__SYCL_TARGET_INTEL_GPU_MTL_H__) || /* Meteor Lake H Intel graphics architecture */                      \
-            (__SYCL_TARGET_INTEL_GPU_ARL_H__) || /* Arrow Lake H Intel graphics architecture */                       \
-            (__SYCL_TARGET_INTEL_GPU_BMG_G21__) || /* Battlemage G21 Intel graphics architecture */                   \
-            (__SYCL_TARGET_INTEL_GPU_LNL_M__) /* Lunar Lake Intel graphics architecture */
-
-#            define SYCL_SUBGROUP_SIZE (8 | 16 | 32)
-
-#        elif(__SYCL_TARGET_INTEL_GPU_PVC__) || /* Ponte Vecchio Intel graphics architecture */                       \
-            (__SYCL_TARGET_INTEL_GPU_PVC_VG__) /* Ponte Vecchio VG Intel graphics architecture */
-
-#            define SYCL_SUBGROUP_SIZE (16 | 32)
-
-#        elif(__SYCL_TARGET_INTEL_X86_64__) /* generate code ahead of time for x86_64 CPUs */
-
-#            define SYCL_SUBGROUP_SIZE (4 | 8 | 16 | 32 | 64)
-
-#        elif(__SYCL_TARGET_NVIDIA_GPU_SM50__) || /* NVIDIA Maxwell architecture (compute capability 5.0) */          \
-            (__SYCL_TARGET_NVIDIA_GPU_SM52__) || /* NVIDIA Maxwell architecture (compute capability 5.2) */           \
-            (__SYCL_TARGET_NVIDIA_GPU_SM53__) || /* NVIDIA Jetson TX1 / Nano (compute capability 5.3) */              \
-            (__SYCL_TARGET_NVIDIA_GPU_SM60__) || /* NVIDIA Pascal architecture (compute capability 6.0) */            \
-            (__SYCL_TARGET_NVIDIA_GPU_SM61__) || /* NVIDIA Pascal architecture (compute capability 6.1) */            \
-            (__SYCL_TARGET_NVIDIA_GPU_SM62__) || /* NVIDIA Jetson TX2 (compute capability 6.2) */                     \
-            (__SYCL_TARGET_NVIDIA_GPU_SM70__) || /* NVIDIA Volta architecture (compute capability 7.0) */             \
-            (__SYCL_TARGET_NVIDIA_GPU_SM72__) || /* NVIDIA Jetson AGX (compute capability 7.2) */                     \
-            (__SYCL_TARGET_NVIDIA_GPU_SM75__) || /* NVIDIA Turing architecture (compute capability 7.5) */            \
-            (__SYCL_TARGET_NVIDIA_GPU_SM80__) || /* NVIDIA Ampere architecture (compute capability 8.0) */            \
-            (__SYCL_TARGET_NVIDIA_GPU_SM86__) || /* NVIDIA Ampere architecture (compute capability 8.6) */            \
-            (__SYCL_TARGET_NVIDIA_GPU_SM87__) || /* NVIDIA Jetson/Drive AGX Orin (compute capability 8.7) */          \
-            (__SYCL_TARGET_NVIDIA_GPU_SM89__) || /* NVIDIA Ada Lovelace arch. (compute capability 8.9) */             \
-            (__SYCL_TARGET_NVIDIA_GPU_SM90__) /* NVIDIA Hopper architecture (compute capability 9.0) */
-
-#            define SYCL_SUBGROUP_SIZE (32)
-
-#        elif(__SYCL_TARGET_AMD_GPU_GFX700__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                 \
-            (__SYCL_TARGET_AMD_GPU_GFX701__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                  \
-            (__SYCL_TARGET_AMD_GPU_GFX702__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                  \
-            (__SYCL_TARGET_AMD_GPU_GFX801__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */             \
-            (__SYCL_TARGET_AMD_GPU_GFX802__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */             \
-            (__SYCL_TARGET_AMD_GPU_GFX803__) || /* AMD GCN 4.0 Arctic Islands architecture (gfx 8.0) */               \
-            (__SYCL_TARGET_AMD_GPU_GFX805__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */             \
-            (__SYCL_TARGET_AMD_GPU_GFX810__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.1) */             \
-            (__SYCL_TARGET_AMD_GPU_GFX900__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                         \
-            (__SYCL_TARGET_AMD_GPU_GFX902__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                         \
-            (__SYCL_TARGET_AMD_GPU_GFX904__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                         \
-            (__SYCL_TARGET_AMD_GPU_GFX906__) || /* AMD GCN 5.1 Vega II architecture (gfx 9.0) */                      \
-            (__SYCL_TARGET_AMD_GPU_GFX908__) || /* AMD CDNA 1.0 Arcturus architecture (gfx 9.0) */                    \
-            (__SYCL_TARGET_AMD_GPU_GFX909__) || /* AMD GCN 5.0 Raven 2 architecture (gfx 9.0) */                      \
-            (__SYCL_TARGET_AMD_GPU_GFX90A__) || /* AMD CDNA 2.0 Aldebaran architecture (gfx 9.0) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX90C__) || /* AMD GCN 5.1 Renoir architecture (gfx 9.0) */                       \
-            (__SYCL_TARGET_AMD_GPU_GFX940__) || /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */               \
-            (__SYCL_TARGET_AMD_GPU_GFX941__) || /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */               \
-            (__SYCL_TARGET_AMD_GPU_GFX942__) /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */
-
-#            define SYCL_SUBGROUP_SIZE (64)
-
-#        elif(__SYCL_TARGET_AMD_GPU_GFX1010__) || /* AMD RDNA 1.0 Navi 10 architecture (gfx 10.1) */                  \
-            (__SYCL_TARGET_AMD_GPU_GFX1011__) || /* AMD RDNA 1.0 Navi 12 architecture (gfx 10.1) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1012__) || /* AMD RDNA 1.0 Navi 14 architecture (gfx 10.1) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1013__) || /* AMD RDNA 2.0 Oberon architecture (gfx 10.1) */                    \
-            (__SYCL_TARGET_AMD_GPU_GFX1030__) || /* AMD RDNA 2.0 Navi 21 architecture (gfx 10.3) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1031__) || /* AMD RDNA 2.0 Navi 22 architecture (gfx 10.3) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1032__) || /* AMD RDNA 2.0 Navi 23 architecture (gfx 10.3) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1033__) || /* AMD RDNA 2.0 Van Gogh architecture (gfx 10.3) */                  \
-            (__SYCL_TARGET_AMD_GPU_GFX1034__) || /* AMD RDNA 2.0 Navi 24 architecture (gfx 10.3) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1035__) || /* AMD RDNA 2.0 Rembrandt Mobile architecture (gfx 10.3) */          \
-            (__SYCL_TARGET_AMD_GPU_GFX1036__) || /* AMD RDNA 2.0 Raphael architecture (gfx 10.3) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1100__) || /* AMD RDNA 3.0 Navi 31 architecture (gfx 11.0) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1101__) || /* AMD RDNA 3.0 Navi 32 architecture (gfx 11.0) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1102__) || /* AMD RDNA 3.0 Navi 33 architecture (gfx 11.0) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1103__) || /* AMD RDNA 3.0 Phoenix mobile architecture (gfx 11.0) */            \
-            (__SYCL_TARGET_AMD_GPU_GFX1150__) || /* AMD RDNA 3.5 Strix Point architecture (gfx 11.5) */               \
-            (__SYCL_TARGET_AMD_GPU_GFX1151__) || /* AMD RDNA 3.5 Strix Halo architecture (gfx 11.5) */                \
-            (__SYCL_TARGET_AMD_GPU_GFX1200__) || /* AMD RDNA 4.0 Navi 44 architecture (gfx 12.0) */                   \
-            (__SYCL_TARGET_AMD_GPU_GFX1201__) /* AMD RDNA 4.0 Navi 48 architecture (gfx 12.0) */
-
-// starting from gfx10, HIP supports only wavefront size 32
-#            define SYCL_SUBGROUP_SIZE (32)
-
-#        else // __SYCL_TARGET_*
-
-#            define SYCL_SUBGROUP_SIZE (0) /* unknown target */
-
-#        endif // __SYCL_TARGET_*
-
-#    else
-
-#        define SYCL_SUBGROUP_SIZE (0) /* host compilation */
-
-#    endif // __SYCL_DEVICE_ONLY__
-
-#endif // ALPAKA_ACC_SYCL_ENABLED
diff --git a/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp b/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
deleted file mode 100644
index f0d6056..0000000
--- a/include/alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp
+++ /dev/null
@@ -1,991 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bert Wesarg, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/acc/AccCpuOmp2Blocks.hpp"
-#include "alpaka/core/Decay.hpp"
-#include "alpaka/core/OmpSchedule.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/idx/MapIdx.hpp"
-#include "alpaka/kernel/KernelFunctionAttributes.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/PlatformCpu.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-#include <functional>
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#include <utility>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-#    include <iostream>
-#endif
-
-#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wswitch-default"
-#    endif
-
-#    if _OPENMP < 200203
-#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#    endif
-
-#    include <omp.h>
-
-namespace alpaka
-{
-    namespace detail
-    {
-        //! Executor of parallel OpenMP loop with the given schedule
-        //!
-        //! Is explicitly specialized for all supported schedule kinds to help code optimization by compilers.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        //! \tparam TScheduleKind The schedule kind value.
-        template<typename TKernel, typename TSchedule, omp::Schedule::Kind TScheduleKind>
-        struct ParallelForImpl;
-
-        //! Executor of parallel OpenMP loop with no schedule set
-        //!
-        //! Does not use chunk size.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule>
-        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::NoSchedule>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const&,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const&)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        /* Implementations for Static, Dynamic and Guided follow the same pattern.
-         * There are two specializations of ParallelForImpl for compile-time dispatch depending on whether the
-         * OmpSchedule trait is specialized.
-         * The no trait case is further compile-time dispatched with a helper ParallelForStaticImpl.
-         * It is based on whether ompScheduleChunkSize member is available.
-         */
-
-        //! Executor of parallel OpenMP loop with the static schedule
-        //!
-        //! Specialization for kernels specializing the OmpSchedule trait.
-        //!
-        //! \tparam TKernel The kernel type.
-        template<typename TKernel>
-        struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Static>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            //! \param schedule The schedule object.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const&,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                omp::Schedule const& schedule)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait schedule(static, schedule.chunkSize)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait schedule(static, schedule.chunkSize)
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        //! Helper executor of parallel OpenMP loop with the static schedule
-        //!
-        //! Generel implementation is for TKernel types without member ompScheduleChunkSize.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule, typename TSfinae = void>
-        struct ParallelForStaticImpl
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const&,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const&)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait schedule(static)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait schedule(static)
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        //! Helper type to check if TKernel has member ompScheduleChunkSize
-        //!
-        //! Is void for those types, ill-formed otherwise.
-        //!
-        //! \tparam TKernel The kernel type.
-        template<typename TKernel>
-        using HasScheduleChunkSize = std::void_t<decltype(TKernel::ompScheduleChunkSize)>;
-
-        //! Helper executor of parallel OpenMP loop with the static schedule
-        //!
-        //! Specialization for kernels with ompScheduleChunkSize member.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule>
-        struct ParallelForStaticImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param kernel The kernel instance reference
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const& kernel,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const&)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait schedule(static, kernel.ompScheduleChunkSize)
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        //! Executor of parallel OpenMP loop with the static schedule
-        //!
-        //! Specialization for kernels not specializing the OmpSchedule trait.
-        //! Falls back to ParallelForStaticImpl for further dispatch.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule>
-        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Static> : ParallelForStaticImpl<TKernel, TSchedule>
-        {
-        };
-
-        //! Executor of parallel OpenMP loop with the dynamic schedule
-        //!
-        //! Specialization for kernels specializing the OmpSchedule trait.
-        //!
-        //! \tparam TKernel The kernel type.
-        template<typename TKernel>
-        struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Dynamic>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            //! \param schedule The schedule object.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const&,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                omp::Schedule const& schedule)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait schedule(dynamic, schedule.chunkSize)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait schedule(dynamic, schedule.chunkSize)
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        //! Helper executor of parallel OpenMP loop with the dynamic schedule
-        //!
-        //! Generel implementation is for TKernel types without member ompScheduleChunkSize.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule, typename TSfinae = void>
-        struct ParallelForDynamicImpl
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const&,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const&)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait schedule(dynamic)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait schedule(dynamic)
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        //! Helper executor of parallel OpenMP loop with the dynamic schedule
-        //!
-        //! Specialization for kernels with ompScheduleChunkSize member.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule>
-        struct ParallelForDynamicImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param kernel The kernel instance reference
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const& kernel,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const&)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait schedule(dynamic, kernel.ompScheduleChunkSize)
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        //! Executor of parallel OpenMP loop with the dynamic schedule
-        //!
-        //! Specialization for kernels not specializing the OmpSchedule trait.
-        //! Falls back to ParallelForDynamicImpl for further dispatch.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule>
-        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Dynamic> : ParallelForDynamicImpl<TKernel, TSchedule>
-        {
-        };
-
-        //! Executor of parallel OpenMP loop with the guided schedule
-        //!
-        //! Specialization for kernels specializing the OmpSchedule trait.
-        //!
-        //! \tparam TKernel The kernel type.
-        template<typename TKernel>
-        struct ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Guided>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            //! \param schedule The schedule object.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const&,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                omp::Schedule const& schedule)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait schedule(guided, schedule.chunkSize)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait schedule(guided, schedule.chunkSize)
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        //! Helper executor of parallel OpenMP loop with the guided schedule
-        //!
-        //! Generel implementation is for TKernel types without member ompScheduleChunkSize.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule, typename TSfinae = void>
-        struct ParallelForGuidedImpl
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const&,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const&)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait schedule(guided)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait schedule(guided)
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        //! Helper executor of parallel OpenMP loop with the guided schedule
-        //!
-        //! Specialization for kernels with ompScheduleChunkSize member.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule>
-        struct ParallelForGuidedImpl<TKernel, TSchedule, HasScheduleChunkSize<TKernel>>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param kernel The kernel instance reference
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const& kernel,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const&)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait schedule(guided, kernel.ompScheduleChunkSize)
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        //! Executor of parallel OpenMP loop with the guided schedule
-        //!
-        //! Specialization for kernels not specializing the OmpSchedule trait.
-        //! Falls back to ParallelForGuidedImpl for further dispatch.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule>
-        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Guided> : ParallelForGuidedImpl<TKernel, TSchedule>
-        {
-        };
-
-#    if _OPENMP >= 200805
-        //! Executor of parallel OpenMP loop with auto schedule set
-        //!
-        //! Does not use chunk size.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule>
-        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Auto>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const&,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const&)
-            {
-#        pragma omp for nowait schedule(auto)
-                for(TIdx i = 0; i < numIterations; ++i)
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-#    endif
-
-        //! Executor of parallel OpenMP loop with runtime schedule set
-        //!
-        //! Does not use chunk size.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule>
-        struct ParallelForImpl<TKernel, TSchedule, omp::Schedule::Runtime>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const&,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const&)
-            {
-#    if _OPENMP < 200805 // For OpenMP < 3.0 you have to declare the loop index (a signed integer) outside of the loop
-                         // header.
-                std::intmax_t iNumBlocksInGrid(static_cast<std::intmax_t>(numIterations));
-                std::intmax_t i;
-#        pragma omp for nowait schedule(runtime)
-                for(i = 0; i < iNumBlocksInGrid; ++i)
-#    else
-#        pragma omp for nowait schedule(runtime)
-                for(TIdx i = 0; i < numIterations; ++i)
-#    endif
-                {
-                    // Make another lambda to work around #1288
-                    auto wrappedLoopBody = [&loopBody](auto idx) { loopBody(idx); };
-                    wrappedLoopBody(i);
-                }
-            }
-        };
-
-        //! Executor of parallel OpenMP loop
-        //!
-        //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
-        //! The default implementation is for the kernels that do not set schedule in any way, compile-time dispatch.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule, typename TSfinae = void>
-        struct ParallelFor
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param kernel The kernel instance reference
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            //! \param schedule The schedule object.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const& kernel,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const& schedule)
-            {
-                // Forward to ParallelForImpl that performs dispatch by by chunk size
-                ParallelForImpl<TKernel, TSchedule, omp::Schedule::NoSchedule>{}(
-                    kernel,
-                    std::forward<TLoopBody>(loopBody),
-                    numIterations,
-                    schedule);
-            }
-        };
-
-        //! Executor of parallel OpenMP loop
-        //!
-        //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
-        //! Specialization for kernels specializing the OmpSchedule trait, run-time dispatch.
-        //!
-        //! \tparam TKernel The kernel type.
-        template<typename TKernel>
-        struct ParallelFor<TKernel, omp::Schedule>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param kernel The kernel instance reference
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            //! \param schedule The schedule object.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const& kernel,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                omp::Schedule const& schedule)
-            {
-                // Forward to ParallelForImpl that performs dispatch by by chunk size
-                switch(schedule.kind)
-                {
-                case omp::Schedule::NoSchedule:
-                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::NoSchedule>{}(
-                        kernel,
-                        std::forward<TLoopBody>(loopBody),
-                        numIterations,
-                        schedule);
-                    break;
-                case omp::Schedule::Static:
-                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Static>{}(
-                        kernel,
-                        std::forward<TLoopBody>(loopBody),
-                        numIterations,
-                        schedule);
-                    break;
-                case omp::Schedule::Dynamic:
-                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Dynamic>{}(
-                        kernel,
-                        std::forward<TLoopBody>(loopBody),
-                        numIterations,
-                        schedule);
-                    break;
-                case omp::Schedule::Guided:
-                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Guided>{}(
-                        kernel,
-                        std::forward<TLoopBody>(loopBody),
-                        numIterations,
-                        schedule);
-                    break;
-#    if _OPENMP >= 200805
-                case omp::Schedule::Auto:
-                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Auto>{}(
-                        kernel,
-                        std::forward<TLoopBody>(loopBody),
-                        numIterations,
-                        schedule);
-                    break;
-#    endif
-                case omp::Schedule::Runtime:
-                    ParallelForImpl<TKernel, omp::Schedule, omp::Schedule::Runtime>{}(
-                        kernel,
-                        std::forward<TLoopBody>(loopBody),
-                        numIterations,
-                        schedule);
-                    break;
-                }
-            }
-        };
-
-        //! Helper type to check if TSchedule is a type originating from OmpSchedule trait definition
-        //!
-        //! \tparam TSchedule The schedule type.
-        template<typename TSchedule>
-        using IsOmpScheduleTraitSpecialized
-            = std::integral_constant<bool, std::is_same<TSchedule, omp::Schedule>::value>;
-
-        //! Helper type to check if member ompScheduleKind of TKernel should be used
-        //!
-        //! For that it has to be present, and no OmpSchedule trait specialized.
-        //! Is void for those types, ill-formed otherwise.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type.
-        template<typename TKernel, typename TSchedule>
-        using UseScheduleKind
-            = std::enable_if_t<sizeof(TKernel::ompScheduleKind) && !IsOmpScheduleTraitSpecialized<TSchedule>::value>;
-
-        //! Executor of parallel OpenMP loop
-        //!
-        //! Performs dispatch based on schedule kind and forwards to the corresponding ParallelForImpl.
-        //! Specialization for kernels with ompScheduleKind member, compile-time dispatch.
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        template<typename TKernel, typename TSchedule>
-        struct ParallelFor<TKernel, TSchedule, UseScheduleKind<TKernel, TSchedule>>
-        {
-            //! Run parallel OpenMP loop
-            //!
-            //! \tparam TLoopBody The loop body functor type.
-            //! \tparam TIdx The index type.
-            //!
-            //! \param kernel The kernel instance reference
-            //! \param loopBody The loop body functor instance, takes iteration index as input.
-            //! \param numIterations The number of loop iterations.
-            //! \param schedule The schedule object.
-            template<typename TLoopBody, typename TIdx>
-            ALPAKA_FN_HOST void operator()(
-                TKernel const& kernel,
-                TLoopBody&& loopBody,
-                TIdx const numIterations,
-                TSchedule const& schedule)
-            {
-                // Forward to ParallelForImpl that performs dispatch by by chunk size
-                ParallelForImpl<TKernel, TSchedule, TKernel::ompScheduleKind>{}(
-                    kernel,
-                    std::forward<TLoopBody>(loopBody),
-                    numIterations,
-                    schedule);
-            }
-        };
-
-        //! Run parallel OpenMP loop
-        //!
-        //! \tparam TKernel The kernel type.
-        //! \tparam TLoopBody The loop body functor type.
-        //! \tparam TIdx The index type.
-        //! \tparam TSchedule The schedule type (not necessarily omp::Schedule).
-        //!
-        //! \param kernel The kernel instance reference,
-        //!        not perfect=forwarded to shorten SFINAE internally.
-        //! \param loopBody The loop body functor instance, takes iteration index as input.
-        //! \param numIterations The number of loop iterations.
-        //! \param schedule The schedule object.
-        template<typename TKernel, typename TLoopBody, typename TIdx, typename TSchedule>
-        ALPAKA_FN_HOST ALPAKA_FN_INLINE void parallelFor(
-            TKernel const& kernel,
-            TLoopBody&& loopBody,
-            TIdx const numIterations,
-            TSchedule const& schedule)
-        {
-            // Forward to ParallelFor that performs first a dispatch by schedule kind, and then by chunk size
-            ParallelFor<TKernel, TSchedule>{}(kernel, std::forward<TLoopBody>(loopBody), numIterations, schedule);
-        }
-
-    } // namespace detail
-
-    //! The CPU OpenMP 2.0 block accelerator execution task.
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelCpuOmp2Blocks final : public WorkDivMembers<TDim, TIdx>
-    {
-    public:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST TaskKernelCpuOmp2Blocks(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
-            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
-            , m_kernelFnObj(kernelFnObj)
-            , m_args(std::forward<TArgs>(args)...)
-        {
-            static_assert(
-                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
-                "The work division and the execution task have to be of the same dimensionality!");
-        }
-
-        //! Executes the kernel function object.
-        ALPAKA_FN_HOST auto operator()() const -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
-            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
-            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
-
-            // Get the size of the block shared dynamic memory.
-            auto const blockSharedMemDynSizeBytes = std::apply(
-                [&](std::decay_t<TArgs> const&... args)
-                {
-                    return getBlockSharedMemDynSizeBytes<AccCpuOmp2Blocks<TDim, TIdx>>(
-                        m_kernelFnObj,
-                        blockThreadExtent,
-                        threadElemExtent,
-                        args...);
-                },
-                m_args);
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
-                      << std::endl;
-#    endif
-
-            // The number of blocks in the grid.
-            TIdx const numBlocksInGrid(gridBlockExtent.prod());
-
-            // Get the OpenMP schedule information for the given kernel and parameter types
-            auto const schedule = std::apply(
-                [&](std::decay_t<TArgs> const&... args) {
-                    return getOmpSchedule<AccCpuOmp2Blocks<TDim, TIdx>>(
-                        m_kernelFnObj,
-                        blockThreadExtent,
-                        threadElemExtent,
-                        args...);
-                },
-                m_args);
-
-            if(::omp_in_parallel() != 0)
-            {
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__ << " already within a parallel region." << std::endl;
-#    endif
-                parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
-            }
-            else
-            {
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__ << " opening new parallel region." << std::endl;
-#    endif
-#    pragma omp parallel
-                parallelFn(blockSharedMemDynSizeBytes, numBlocksInGrid, gridBlockExtent, schedule);
-            }
-        }
-
-    private:
-        template<typename TSchedule>
-        ALPAKA_FN_HOST auto parallelFn(
-            std::size_t const& blockSharedMemDynSizeBytes,
-            TIdx const& numBlocksInGrid,
-            Vec<TDim, TIdx> const& gridBlockExtent,
-            TSchedule const& schedule) const -> void
-        {
-#    pragma omp single nowait
-            {
-                // The OpenMP runtime does not create a parallel region when either:
-                // * only one thread is required in the num_threads clause
-                // * or only one thread is available
-                // In all other cases we expect to be in a parallel region now.
-                if((numBlocksInGrid > 1) && (::omp_get_max_threads() > 1) && (::omp_in_parallel() == 0))
-                {
-                    throw std::runtime_error("The OpenMP 2.0 runtime did not create a parallel region!");
-                }
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                std::cout << __func__ << " omp_get_num_threads: " << ::omp_get_num_threads() << std::endl;
-#    endif
-            }
-
-            AccCpuOmp2Blocks<TDim, TIdx> acc(
-                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
-                blockSharedMemDynSizeBytes);
-
-            // Body of the OpenMP parallel loop to be executed.
-            // Index type is auto since we have a difference for OpenMP 2.0 and later ones
-            auto loopBody = [&](auto currentIndex)
-            {
-#    if _OPENMP < 200805
-                auto const i_tidx = static_cast<TIdx>(currentIndex); // for issue #840
-                auto const index = Vec<DimInt<1u>, TIdx>(i_tidx); // for issue #840
-#    else
-                auto const index = Vec<DimInt<1u>, TIdx>(currentIndex); // for issue #840
-#    endif
-                acc.m_gridBlockIdx = mapIdx<TDim::value>(index, gridBlockExtent);
-
-                std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
-
-                // After a block has been processed, the shared memory has to be deleted.
-                freeSharedVars(acc);
-            };
-
-            detail::parallelFor(m_kernelFnObj, loopBody, numBlocksInGrid, schedule);
-        }
-
-        TKernelFnObj m_kernelFnObj;
-        std::tuple<std::decay_t<TArgs>...> m_args;
-    };
-
-    namespace trait
-    {
-        //! The CPU OpenMP 2.0 grid block execution task accelerator type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct AccType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = AccCpuOmp2Blocks<TDim, TIdx>;
-        };
-
-        //! The CPU OpenMP 2.0 grid block execution task device type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DevType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU OpenMP 2.0 grid block execution task dimension getter trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DimType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU OpenMP 2.0 grid block execution task platform type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct PlatformType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = PlatformCpu;
-        };
-
-        //! The CPU OpenMP 2.0 block execution task idx type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct IdxType<TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TIdx;
-        };
-
-        //! \brief Specialisation of the class template FunctionAttributes
-        //! \tparam TDev The device type.
-        //! \tparam TDim The dimensionality of the accelerator device properties.
-        //! \tparam TIdx The idx type of the accelerator device properties.
-        //! \tparam TKernelFn Kernel function object type.
-        //! \tparam TArgs Kernel function object argument types as a parameter pack.
-        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuOmp2Blocks<TDim, TIdx>, TDev, TKernelFn, TArgs...>
-        {
-            //! \param dev The device instance
-            //! \param kernelFn The kernel function object which should be executed.
-            //! \param args The kernel invocation arguments.
-            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
-            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
-            ALPAKA_FN_HOST static auto getFunctionAttributes(
-                TDev const& dev,
-                [[maybe_unused]] TKernelFn const& kernelFn,
-                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
-            {
-                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
-
-                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
-                // properties function.
-                auto const& props = alpaka::getAccDevProps<AccCpuOmp2Blocks<TDim, TIdx>>(dev);
-                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
-                kernelFunctionAttributes.maxDynamicSharedSizeBytes
-                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
-                return kernelFunctionAttributes;
-            }
-        };
-
-    } // namespace trait
-} // namespace alpaka
-
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
-
-#endif
diff --git a/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp b/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
deleted file mode 100644
index 6b08e96..0000000
--- a/include/alpaka/kernel/TaskKernelCpuOmp2Threads.hpp
+++ /dev/null
@@ -1,232 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bert Wesarg, René Widera, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/acc/AccCpuOmp2Threads.hpp"
-#include "alpaka/core/Decay.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/kernel/KernelFunctionAttributes.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/meta/NdLoop.hpp"
-#include "alpaka/platform/PlatformCpu.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-#include <functional>
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-#    include <iostream>
-#endif
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-
-#    if _OPENMP < 200203
-#        error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#    endif
-
-#    include <omp.h>
-
-namespace alpaka
-{
-    //! The CPU OpenMP 2.0 thread accelerator execution task.
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelCpuOmp2Threads final : public WorkDivMembers<TDim, TIdx>
-    {
-    public:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST TaskKernelCpuOmp2Threads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
-            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
-            , m_kernelFnObj(kernelFnObj)
-            , m_args(std::forward<TArgs>(args)...)
-        {
-            static_assert(
-                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
-                "The work division and the execution task have to be of the same dimensionality!");
-        }
-
-        //! Executes the kernel function object.
-        ALPAKA_FN_HOST auto operator()() const -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
-            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
-            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
-
-            // Get the size of the block shared dynamic memory.
-            auto const blockSharedMemDynSizeBytes = std::apply(
-                [&](std::decay_t<TArgs> const&... args)
-                {
-                    return getBlockSharedMemDynSizeBytes<AccCpuOmp2Threads<TDim, TIdx>>(
-                        m_kernelFnObj,
-                        blockThreadExtent,
-                        threadElemExtent,
-                        args...);
-                },
-                m_args);
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
-                      << std::endl;
-#    endif
-
-            AccCpuOmp2Threads<TDim, TIdx> acc(
-                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
-                blockSharedMemDynSizeBytes);
-
-            // The number of threads in this block.
-            TIdx const blockThreadCount(blockThreadExtent.prod());
-            [[maybe_unused]] int const iBlockThreadCount(static_cast<int>(blockThreadCount));
-
-            if(::omp_in_parallel() != 0)
-            {
-                throw std::runtime_error(
-                    "The OpenMP 2.0 thread backend can not be used within an existing parallel region!");
-            }
-
-            // Force the environment to use the given number of threads.
-            int const ompIsDynamic(::omp_get_dynamic());
-            ::omp_set_dynamic(0);
-
-            // Execute the blocks serially.
-            meta::ndLoopIncIdx(
-                gridBlockExtent,
-                [&](Vec<TDim, TIdx> const& gridBlockIdx)
-                {
-                    acc.m_gridBlockIdx = gridBlockIdx;
-
-// Execute the threads in parallel.
-
-// Parallel execution of the threads in a block is required because when syncBlockThreads is called all of them have to
-// be done with their work up to this line. So we have to spawn one OS thread per thread in a block. 'omp for' is not
-// useful because it is meant for cases where multiple iterations are executed by one thread but in our case a 1:1
-// mapping is required. Therefore we use 'omp parallel' with the specified number of threads in a block.
-#    pragma omp parallel num_threads(iBlockThreadCount)
-                    {
-                        // The guard is for gcc internal compiler error, as discussed in #735
-                        if constexpr((!BOOST_COMP_GNUC) || (BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(8, 1, 0)))
-                        {
-#    pragma omp single nowait
-                            {
-                                // The OpenMP runtime does not create a parallel region when only one thread is
-                                // required in the num_threads clause. In all other cases we expect to be in a parallel
-                                // region now.
-                                if((iBlockThreadCount > 1) && (::omp_in_parallel() == 0))
-                                {
-                                    throw std::runtime_error(
-                                        "The OpenMP 2.0 runtime did not create a parallel region!");
-                                }
-
-                                int const numThreads = ::omp_get_num_threads();
-                                if(numThreads != iBlockThreadCount)
-                                {
-                                    throw std::runtime_error(
-                                        "The OpenMP 2.0 runtime did not use the number of threads "
-                                        "that had been required!");
-                                }
-                            }
-                        }
-
-                        std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
-
-                        // Wait for all threads to finish before deleting the shared memory.
-                        // This is done by default if the omp 'nowait' clause is missing on the omp parallel directive
-                        // syncBlockThreads(acc);
-                    }
-
-                    // After a block has been processed, the shared memory has to be deleted.
-                    freeSharedVars(acc);
-                });
-
-            // Reset the dynamic thread number setting.
-            ::omp_set_dynamic(ompIsDynamic);
-        }
-
-    private:
-        TKernelFnObj m_kernelFnObj;
-        std::tuple<std::decay_t<TArgs>...> m_args;
-    };
-
-    namespace trait
-    {
-        //! The CPU OpenMP 2.0 block thread execution task accelerator type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct AccType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = AccCpuOmp2Threads<TDim, TIdx>;
-        };
-
-        //! The CPU OpenMP 2.0 block thread execution task device type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DevType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU OpenMP 2.0 block thread execution task dimension getter trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DimType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU OpenMP 2.0 block thread execution task platform type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct PlatformType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = PlatformCpu;
-        };
-
-        //! The CPU OpenMP 2.0 block thread execution task idx type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct IdxType<TaskKernelCpuOmp2Threads<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TIdx;
-        };
-
-        //! \brief Specialisation of the class template FunctionAttributes
-        //! \tparam TDev The device type.
-        //! \tparam TDim The dimensionality of the accelerator device properties.
-        //! \tparam TIdx The idx type of the accelerator device properties.
-        //! \tparam TKernelFn Kernel function object type.
-        //! \tparam TArgs Kernel function object argument types as a parameter pack.
-        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuOmp2Threads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
-        {
-            //! \param dev The device instance
-            //! \param kernelFn The kernel function object which should be executed.
-            //! \param args The kernel invocation arguments.
-            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
-            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
-            ALPAKA_FN_HOST static auto getFunctionAttributes(
-                TDev const& dev,
-                [[maybe_unused]] TKernelFn const& kernelFn,
-                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
-            {
-                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
-
-                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
-                // properties function.
-                auto const& props = alpaka::getAccDevProps<AccCpuOmp2Threads<TDim, TIdx>>(dev);
-                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
-                kernelFunctionAttributes.maxDynamicSharedSizeBytes
-                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
-                return kernelFunctionAttributes;
-            }
-        };
-
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/kernel/TaskKernelCpuSerial.hpp b/include/alpaka/kernel/TaskKernelCpuSerial.hpp
deleted file mode 100644
index a9a370d..0000000
--- a/include/alpaka/kernel/TaskKernelCpuSerial.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/acc/AccCpuSerial.hpp"
-#include "alpaka/core/Decay.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/kernel/KernelFunctionAttributes.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/meta/NdLoop.hpp"
-#include "alpaka/platform/PlatformCpu.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-#include <functional>
-#include <tuple>
-#include <type_traits>
-#include <utility>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-#    include <iostream>
-#endif
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-
-namespace alpaka
-{
-    //! The CPU serial execution task implementation.
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelCpuSerial final : public WorkDivMembers<TDim, TIdx>
-    {
-    public:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST TaskKernelCpuSerial(TWorkDiv&& workDiv, TKernelFnObj kernelFnObj, TArgs&&... args)
-            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
-            , m_kernelFnObj(std::move(kernelFnObj))
-            , m_args(std::forward<TArgs>(args)...)
-        {
-            static_assert(
-                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
-                "The work division and the execution task have to be of the same dimensionality!");
-        }
-
-        //! Executes the kernel function object.
-        ALPAKA_FN_HOST auto operator()() const -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
-            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
-            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
-
-            // Get the size of the block shared dynamic memory.
-            auto const blockSharedMemDynSizeBytes = std::apply(
-                [&](std::decay_t<TArgs> const&... args)
-                {
-                    return getBlockSharedMemDynSizeBytes<AccCpuSerial<TDim, TIdx>>(
-                        m_kernelFnObj,
-                        blockThreadExtent,
-                        threadElemExtent,
-                        args...);
-                },
-                m_args);
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
-                      << std::endl;
-#    endif
-
-            AccCpuSerial<TDim, TIdx> acc(
-                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
-                blockSharedMemDynSizeBytes);
-
-            // Execute the blocks serially.
-            meta::ndLoopIncIdx(
-                gridBlockExtent,
-                [&](Vec<TDim, TIdx> const& blockThreadIdx)
-                {
-                    acc.m_gridBlockIdx = blockThreadIdx;
-
-                    std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
-
-                    // After a block has been processed, the shared memory has to be deleted.
-                    freeSharedVars(acc);
-                });
-        }
-
-    private:
-        TKernelFnObj m_kernelFnObj;
-        std::tuple<std::decay_t<TArgs>...> m_args;
-    };
-
-    namespace trait
-    {
-        //! The CPU serial execution task accelerator type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct AccType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = AccCpuSerial<TDim, TIdx>;
-        };
-
-        //! The CPU serial execution task device type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DevType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU serial execution task dimension getter trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DimType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU serial execution task platform type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct PlatformType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = PlatformCpu;
-        };
-
-        //! The CPU serial execution task idx type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct IdxType<TaskKernelCpuSerial<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TIdx;
-        };
-
-        //! \brief Specialisation of the class template FunctionAttributes
-        //! \tparam TDev The device type.
-        //! \tparam TDim The dimensionality of the accelerator device properties.
-        //! \tparam TIdx The idx type of the accelerator device properties.
-        //! \tparam TKernelFn Kernel function object type.
-        //! \tparam TArgs Kernel function object argument types as a parameter pack.
-        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuSerial<TDim, TIdx>, TDev, TKernelFn, TArgs...>
-        {
-            //! \param dev The device instance
-            //! \param kernelFn The kernel function object which should be executed.
-            //! \param args The kernel invocation arguments.
-            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
-            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
-            ALPAKA_FN_HOST static auto getFunctionAttributes(
-                TDev const& dev,
-                [[maybe_unused]] TKernelFn const& kernelFn,
-                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
-            {
-                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
-
-                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
-                // properties function.
-                auto const& props = alpaka::getAccDevProps<AccCpuSerial<TDim, TIdx>>(dev);
-                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
-                kernelFunctionAttributes.maxDynamicSharedSizeBytes
-                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
-                return kernelFunctionAttributes;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/kernel/TaskKernelCpuSycl.hpp b/include/alpaka/kernel/TaskKernelCpuSycl.hpp
deleted file mode 100644
index b811a63..0000000
--- a/include/alpaka/kernel/TaskKernelCpuSycl.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/kernel/TaskKernelGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
-
-namespace alpaka
-{
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    using TaskKernelCpuSycl
-        = TaskKernelGenericSycl<TagCpuSycl, AccCpuSycl<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp b/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
deleted file mode 100644
index 4ca90dd..0000000
--- a/include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Erik Zenker, René Widera, Felice Pantaleo, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/acc/AccCpuTbbBlocks.hpp"
-#include "alpaka/core/Decay.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/idx/MapIdx.hpp"
-#include "alpaka/kernel/KernelFunctionAttributes.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/meta/NdLoop.hpp"
-#include "alpaka/platform/PlatformCpu.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-#include <functional>
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-#    include <iostream>
-#endif
-
-#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-
-#    include <tbb/blocked_range.h>
-#    include <tbb/parallel_for.h>
-#    include <tbb/task_group.h>
-
-namespace alpaka
-{
-    //! The CPU TBB block accelerator execution task.
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelCpuTbbBlocks final : public WorkDivMembers<TDim, TIdx>
-    {
-    public:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST TaskKernelCpuTbbBlocks(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
-            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
-            , m_kernelFnObj(kernelFnObj)
-            , m_args(std::forward<TArgs>(args)...)
-        {
-            static_assert(
-                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
-                "The work division and the execution task have to be of the same dimensionality!");
-        }
-
-        //! Executes the kernel function object.
-        ALPAKA_FN_HOST auto operator()() const -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
-            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
-            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
-
-            // Get the size of the block shared dynamic memory.
-            auto const blockSharedMemDynSizeBytes = std::apply(
-                [&](std::decay_t<TArgs> const&... args)
-                {
-                    return getBlockSharedMemDynSizeBytes<AccCpuTbbBlocks<TDim, TIdx>>(
-                        m_kernelFnObj,
-                        blockThreadExtent,
-                        threadElemExtent,
-                        args...);
-                },
-                m_args);
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            std::cout << __func__ << " blockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
-                      << std::endl;
-#    endif
-
-            // The number of blocks in the grid.
-            TIdx const numBlocksInGrid = gridBlockExtent.prod();
-
-            tbb::this_task_arena::isolate(
-                [&]
-                {
-                    tbb::parallel_for(
-                        static_cast<TIdx>(0),
-                        static_cast<TIdx>(numBlocksInGrid),
-                        [&](TIdx i)
-                        {
-                            AccCpuTbbBlocks<TDim, TIdx> acc(
-                                *static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
-                                blockSharedMemDynSizeBytes);
-
-                            acc.m_gridBlockIdx
-                                = mapIdx<TDim::value>(Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(i)), gridBlockExtent);
-
-                            std::apply(m_kernelFnObj, std::tuple_cat(std::tie(acc), m_args));
-
-                            freeSharedVars(acc);
-                        });
-                });
-        }
-
-    private:
-        TKernelFnObj m_kernelFnObj;
-        std::tuple<std::decay_t<TArgs>...> m_args;
-    };
-
-    namespace trait
-    {
-        //! The CPU TBB block execution task accelerator type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct AccType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = AccCpuTbbBlocks<TDim, TIdx>;
-        };
-
-        //! The CPU TBB block execution task device type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DevType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU TBB block execution task dimension getter trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DimType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU TBB block execution task platform type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct PlatformType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = PlatformCpu;
-        };
-
-        //! The CPU TBB block execution task idx type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct IdxType<TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TIdx;
-        };
-
-        //! \brief Specialisation of the class template FunctionAttributes
-        //! \tparam TDev The device type.
-        //! \tparam TDim The dimensionality of the accelerator device properties.
-        //! \tparam TIdx The idx type of the accelerator device properties.
-        //! \tparam TKernelFn Kernel function object type.
-        //! \tparam TArgs Kernel function object argument types as a parameter pack.
-        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuTbbBlocks<TDim, TIdx>, TDev, TKernelFn, TArgs...>
-        {
-            //! \param dev The device instance
-            //! \param kernelFn The kernel function object which should be executed.
-            //! \param args The kernel invocation arguments.
-            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
-            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
-            ALPAKA_FN_HOST static auto getFunctionAttributes(
-                TDev const& dev,
-                [[maybe_unused]] TKernelFn const& kernelFn,
-                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
-            {
-                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
-
-                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
-                // properties function.
-                auto const& props = alpaka::getAccDevProps<AccCpuTbbBlocks<TDim, TIdx>>(dev);
-                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
-                kernelFunctionAttributes.maxDynamicSharedSizeBytes
-                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
-                return kernelFunctionAttributes;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/kernel/TaskKernelCpuThreads.hpp b/include/alpaka/kernel/TaskKernelCpuThreads.hpp
deleted file mode 100644
index 850b661..0000000
--- a/include/alpaka/kernel/TaskKernelCpuThreads.hpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-// Specialized traits.
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-// Implementation details.
-#include "alpaka/acc/AccCpuThreads.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Decay.hpp"
-#include "alpaka/core/ThreadPool.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/kernel/KernelFunctionAttributes.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/meta/NdLoop.hpp"
-#include "alpaka/platform/PlatformCpu.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-#include <algorithm>
-#include <functional>
-#include <future>
-#include <thread>
-#include <tuple>
-#include <type_traits>
-#include <vector>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-#    include <iostream>
-#endif
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-
-namespace alpaka
-{
-    //! The CPU threads execution task.
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelCpuThreads final : public WorkDivMembers<TDim, TIdx>
-    {
-    private:
-        // When using the thread pool the threads are yielding because this is faster.
-        // Using condition variables and going to sleep is very costly for real threads.
-        // Especially when the time to wait is really short (syncBlockThreads) yielding is much faster.
-        using ThreadPool = alpaka::core::detail::ThreadPool;
-
-    public:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST TaskKernelCpuThreads(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
-            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
-            , m_kernelFnObj(kernelFnObj)
-            , m_args(std::forward<TArgs>(
-                  args)...) // FIXME(bgruber): this does not forward, since TArgs is not a deduced template parameter
-        {
-            static_assert(
-                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
-                "The work division and the execution task have to be of the same dimensionality!");
-        }
-
-        //! Executes the kernel function object.
-        ALPAKA_FN_HOST auto operator()() const -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            std::apply([&](auto const&... args) { runWithArgs(args...); }, m_args);
-        }
-
-    private:
-        ALPAKA_FN_HOST auto runWithArgs(std::decay_t<TArgs> const&... args) const -> void
-        {
-            auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(*this);
-            auto const blockThreadExtent = getWorkDiv<Block, Threads>(*this);
-            auto const threadElemExtent = getWorkDiv<Thread, Elems>(*this);
-
-            // Get the size of the block shared dynamic memory.
-            auto const smBytes = getBlockSharedMemDynSizeBytes<AccCpuThreads<TDim, TIdx>>(
-                m_kernelFnObj,
-                blockThreadExtent,
-                threadElemExtent,
-                args...);
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            std::cout << __func__ << " smBytes: " << smBytes << " B" << std::endl;
-#    endif
-            AccCpuThreads<TDim, TIdx> acc(*static_cast<WorkDivMembers<TDim, TIdx> const*>(this), smBytes);
-
-            auto const threadsPerBlock = blockThreadExtent.prod();
-            ThreadPool threadPool(static_cast<std::size_t>(threadsPerBlock));
-
-            // Execute the blocks serially.
-            meta::ndLoopIncIdx(
-                gridBlockExtent,
-                [&](Vec<TDim, TIdx> const& gridBlockIdx)
-                { runBlock(acc, gridBlockIdx, blockThreadExtent, threadPool, m_kernelFnObj, args...); });
-        }
-
-        //! The function executed for each grid block.
-        ALPAKA_FN_HOST static auto runBlock(
-            AccCpuThreads<TDim, TIdx>& acc,
-            Vec<TDim, TIdx> const& gridBlockIdx,
-            Vec<TDim, TIdx> const& blockThreadExtent,
-            ThreadPool& threadPool,
-            TKernelFnObj const& kernelFnObj,
-            std::decay_t<TArgs> const&... args) -> void
-        {
-            std::vector<std::future<void>> futuresInBlock;
-            acc.m_gridBlockIdx = gridBlockIdx;
-
-            // Execute the threads of this block in parallel.
-            meta::ndLoopIncIdx(
-                blockThreadExtent,
-                [&](Vec<TDim, TIdx> const& blockThreadIdx)
-                {
-                    // copy blockThreadIdx because it will get changed for the next iteration/thread.
-                    futuresInBlock.emplace_back(threadPool.enqueueTask(
-                        [&, blockThreadIdx] { runThread(acc, blockThreadIdx, kernelFnObj, args...); }));
-                });
-
-            // Wait for the completion of the block thread kernels.
-            for(auto& t : futuresInBlock)
-                t.wait();
-
-            // Clean up.
-            futuresInBlock.clear();
-            acc.m_threadToIndexMap.clear();
-            freeSharedVars(acc); // After a block has been processed, the shared memory has to be deleted.
-        }
-
-        //! The thread entry point on the accelerator.
-        ALPAKA_FN_HOST static auto runThread(
-            AccCpuThreads<TDim, TIdx>& acc,
-            Vec<TDim, TIdx> const& blockThreadIdx,
-            TKernelFnObj const& kernelFnObj,
-            std::decay_t<TArgs> const&... args) -> void
-        {
-            // We have to store the thread data before the kernel is calling any of the methods of this class depending
-            // on them.
-            auto const threadId = std::this_thread::get_id();
-
-            if(blockThreadIdx.sum() == 0)
-            {
-                acc.m_idMasterThread = threadId;
-            }
-
-            {
-                // Save the thread id, and index.
-                std::lock_guard<std::mutex> lock(acc.m_mtxMapInsert);
-                acc.m_threadToIndexMap.emplace(threadId, blockThreadIdx);
-            }
-
-            // Sync all threads so that the maps with thread id's are complete and not changed after here.
-            syncBlockThreads(acc);
-
-            // Execute the kernel itself.
-            kernelFnObj(std::as_const(acc), args...);
-
-            // We have to sync all threads here because if a thread would finish before all threads have been started,
-            // a new thread could get the recycled (then duplicate) thread id!
-            syncBlockThreads(acc);
-        }
-
-        TKernelFnObj m_kernelFnObj;
-        std::tuple<std::decay_t<TArgs>...> m_args;
-    };
-
-    namespace trait
-    {
-        //! The CPU threads execution task accelerator type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct AccType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = AccCpuThreads<TDim, TIdx>;
-        };
-
-        //! The CPU threads execution task device type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DevType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU threads execution task dimension getter trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DimType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU threads execution task platform type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct PlatformType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = PlatformCpu;
-        };
-
-        //! The CPU threads execution task idx type trait specialization.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct IdxType<TaskKernelCpuThreads<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TIdx;
-        };
-
-        //! \brief Specialisation of the class template FunctionAttributes
-        //! \tparam TDev The device type.
-        //! \tparam TDim The dimensionality of the accelerator device properties.
-        //! \tparam TIdx The idx type of the accelerator device properties.
-        //! \tparam TKernelFn Kernel function object type.
-        //! \tparam TArgs Kernel function object argument types as a parameter pack.
-        template<typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccCpuThreads<TDim, TIdx>, TDev, TKernelFn, TArgs...>
-        {
-            //! \param dev The device instance
-            //! \param kernelFn The kernel function object which should be executed.
-            //! \param args The kernel invocation arguments.
-            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
-            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
-            ALPAKA_FN_HOST static auto getFunctionAttributes(
-                TDev const& dev,
-                [[maybe_unused]] TKernelFn const& kernelFn,
-                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
-            {
-                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
-
-                // set function properties for maxThreadsPerBlock to device properties, since API doesn't have function
-                // properties function.
-                auto const& props = alpaka::getAccDevProps<AccCpuThreads<TDim, TIdx>>(dev);
-                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
-                kernelFunctionAttributes.maxDynamicSharedSizeBytes
-                    = static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024);
-                return kernelFunctionAttributes;
-            }
-        };
-
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp b/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
deleted file mode 100644
index 6163165..0000000
--- a/include/alpaka/kernel/TaskKernelFpgaSyclIntel.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/kernel/TaskKernelGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
-
-namespace alpaka
-{
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    using TaskKernelFpgaSyclIntel
-        = TaskKernelGenericSycl<TagFpgaSyclIntel, AccFpgaSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
deleted file mode 100644
index 11cc2ca..0000000
--- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/* Copyright 2024 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/AccGenericSycl.hpp"
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/KernelFunctionAttributes.hpp"
-#include "alpaka/kernel/SyclSubgroupSize.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/PlatformGenericSycl.hpp"
-#include "alpaka/platform/Traits.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-#include <cassert>
-#include <functional>
-#include <memory>
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#include <utility>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wunused-lambda-capture"
-#        pragma clang diagnostic ignored "-Wunused-parameter"
-#    endif
-
-#    include <sycl/sycl.hpp>
-
-#    define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)                                                    \
-        cgh.parallel_for(                                                                                             \
-            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
-            [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args](                                 \
-                sycl::nd_item<TDim::value> work_item) [[intel::reqd_sub_group_size(sub_group_size)]]                  \
-            {                                                                                                         \
-                auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor};                   \
-                std::apply(                                                                                           \
-                    [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },            \
-                    k_args);                                                                                          \
-            });
-
-#    define LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE                                                             \
-        cgh.parallel_for(                                                                                             \
-            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
-            [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args](                                 \
-                sycl::nd_item<TDim::value> work_item)                                                                 \
-            {                                                                                                         \
-                auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor};                   \
-                std::apply(                                                                                           \
-                    [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },            \
-                    k_args);                                                                                          \
-            });
-
-#    define THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL                                                                        \
-        throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));                               \
-        cgh.parallel_for(                                                                                             \
-            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
-            [item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args](                                 \
-                sycl::nd_item<TDim::value> work_item) {});
-
-namespace alpaka
-{
-    //! The SYCL accelerator execution task.
-    template<typename TTag, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelGenericSycl final : public WorkDivMembers<TDim, TIdx>
-    {
-    public:
-        static_assert(TDim::value > 0 && TDim::value <= 3, "Invalid kernel dimensionality");
-
-        template<typename TWorkDiv>
-        TaskKernelGenericSycl(TWorkDiv&& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
-            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
-            , m_kernelFnObj{kernelFnObj}
-            , m_args{std::forward<TArgs>(args)...}
-        {
-        }
-
-        auto operator()(sycl::handler& cgh) const -> void
-        {
-            auto const work_groups = WorkDivMembers<TDim, TIdx>::m_gridBlockExtent;
-            auto const group_items = WorkDivMembers<TDim, TIdx>::m_blockThreadExtent;
-            auto const item_elements = WorkDivMembers<TDim, TIdx>::m_threadElemExtent;
-
-            auto const global_size = get_global_size(work_groups, group_items);
-            auto const local_size = get_local_size(group_items);
-
-            // allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
-            auto const dyn_shared_mem_bytes = std::max(
-                1ul,
-                std::apply(
-                    [&](std::decay_t<TArgs> const&... args) {
-                        return getBlockSharedMemDynSizeBytes<TAcc>(m_kernelFnObj, group_items, item_elements, args...);
-                    },
-                    m_args));
-
-            auto dyn_shared_accessor = sycl::local_accessor<std::byte>{sycl::range<1>{dyn_shared_mem_bytes}, cgh};
-
-            // allocate static shared memory -- value comes from the build system
-            constexpr auto st_shared_mem_bytes = std::size_t{ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB * 1024};
-            auto st_shared_accessor = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
-
-            // copy-by-value so we don't access 'this' on the device
-            auto k_func = m_kernelFnObj;
-            auto k_args = m_args;
-
-            constexpr std::size_t sub_group_size = trait::warpSize<TKernelFnObj, TAcc>;
-            bool supported = false;
-
-            if constexpr(sub_group_size == 0)
-            {
-                // no explicit subgroup size requirement
-                LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE
-                supported = true;
-            }
-            else
-            {
-#    if(SYCL_SUBGROUP_SIZE == 0)
-                // no explicit SYCL target, assume JIT compilation
-                LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)
-                supported = true;
-#    else
-                // check if the kernel should be launched with a subgroup size of 4
-                if constexpr(sub_group_size == 4)
-                {
-#        if(SYCL_SUBGROUP_SIZE & 4)
-                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(4)
-                    supported = true;
-#        else
-                    // empty kernel, required to keep SYCL happy
-                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
-#        endif
-                }
-
-                // check if the kernel should be launched with a subgroup size of 8
-                if constexpr(sub_group_size == 8)
-                {
-#        if(SYCL_SUBGROUP_SIZE & 8)
-                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(8)
-                    supported = true;
-#        else
-                    // empty kernel, required to keep SYCL happy
-                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
-#        endif
-                }
-
-                // check if the kernel should be launched with a subgroup size of 16
-                if constexpr(sub_group_size == 16)
-                {
-#        if(SYCL_SUBGROUP_SIZE & 16)
-                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(16)
-                    supported = true;
-#        else
-                    // empty kernel, required to keep SYCL happy
-                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
-#        endif
-                }
-
-                // check if the kernel should be launched with a subgroup size of 32
-                if constexpr(sub_group_size == 32)
-                {
-#        if(SYCL_SUBGROUP_SIZE & 32)
-                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(32)
-                    supported = true;
-#        else
-                    // empty kernel, required to keep SYCL happy
-                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
-#        endif
-                }
-
-                // check if the kernel should be launched with a subgroup size of 64
-                if constexpr(sub_group_size == 64)
-                {
-#        if(SYCL_SUBGROUP_SIZE & 64)
-                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(64)
-                    supported = true;
-#        else
-                    // empty kernel, required to keep SYCL happy
-                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
-#        endif
-                }
-#    endif
-
-                // this subgroup size is not supported, raise an exception
-                if(not supported)
-                    throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));
-            }
-        }
-
-        static constexpr auto is_sycl_task = true;
-        // Distinguish from other tasks
-        static constexpr auto is_sycl_kernel = true;
-
-    private:
-        auto get_global_size(Vec<TDim, TIdx> const& work_groups, Vec<TDim, TIdx> const& group_items) const
-        {
-            if constexpr(TDim::value == 1)
-                return sycl::range<1>{static_cast<std::size_t>(work_groups[0] * group_items[0])};
-            else if constexpr(TDim::value == 2)
-                return sycl::range<2>{
-                    static_cast<std::size_t>(work_groups[1] * group_items[1]),
-                    static_cast<std::size_t>(work_groups[0] * group_items[0])};
-            else
-                return sycl::range<3>{
-                    static_cast<std::size_t>(work_groups[2] * group_items[2]),
-                    static_cast<std::size_t>(work_groups[1] * group_items[1]),
-                    static_cast<std::size_t>(work_groups[0] * group_items[0])};
-        }
-
-        auto get_local_size(Vec<TDim, TIdx> const& group_items) const
-        {
-            if constexpr(TDim::value == 1)
-                return sycl::range<1>{static_cast<std::size_t>(group_items[0])};
-            else if constexpr(TDim::value == 2)
-                return sycl::range<2>{
-                    static_cast<std::size_t>(group_items[1]),
-                    static_cast<std::size_t>(group_items[0])};
-            else
-                return sycl::range<3>{
-                    static_cast<std::size_t>(group_items[2]),
-                    static_cast<std::size_t>(group_items[1]),
-                    static_cast<std::size_t>(group_items[0])};
-        }
-
-    public:
-        TKernelFnObj m_kernelFnObj;
-        std::tuple<std::decay_t<TArgs>...> m_args;
-    };
-
-} // namespace alpaka
-
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
-
-namespace alpaka::trait
-{
-    //! The SYCL execution task accelerator type trait specialization.
-    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    struct AccType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-    {
-        using type = TAcc;
-    };
-
-    //! The SYCL execution task device type trait specialization.
-    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    struct DevType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-    {
-        using type = typename DevType<TAcc>::type;
-    };
-
-    //! The SYCL execution task platform type trait specialization.
-    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    struct PlatformType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-    {
-        using type = typename PlatformType<TAcc>::type;
-    };
-
-    //! The SYCL execution task dimension getter trait specialization.
-    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    struct DimType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-    {
-        using type = TDim;
-    };
-
-    //! The SYCL execution task idx type trait specialization.
-    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    struct IdxType<TaskKernelGenericSycl<TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-    {
-        using type = TIdx;
-    };
-
-    //! \brief Specialisation of the class template FunctionAttributes
-    //! \tparam TTag The SYCL device selector.
-    //! \tparam TDev The device type.
-    //! \tparam TDim The dimensionality of the accelerator device properties.
-    //! \tparam TIdx The idx type of the accelerator device properties.
-    //! \tparam TKernelFn Kernel function object type.
-    //! \tparam TArgs Kernel function object argument types as a parameter pack.
-    template<typename TTag, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-    struct FunctionAttributes<AccGenericSycl<TTag, TDim, TIdx>, TDev, TKernelFn, TArgs...>
-    {
-        //! \param dev The device instance
-        //! \param kernelFn The kernel function object which should be executed.
-        //! \param args The kernel invocation arguments.
-        //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
-        //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
-        ALPAKA_FN_HOST static auto getFunctionAttributes(
-            TDev const& dev,
-            [[maybe_unused]] TKernelFn const& kernelFn,
-            [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
-        {
-            alpaka::KernelFunctionAttributes kernelFunctionAttributes;
-
-            // set function properties for maxThreadsPerBlock to device properties
-            auto const& props = alpaka::getAccDevProps<AccGenericSycl<TTag, TDim, TIdx>>(dev);
-            kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(props.m_blockThreadCountMax);
-            return kernelFunctionAttributes;
-        }
-    };
-} // namespace alpaka::trait
-
-#    undef LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS
-
-#endif
diff --git a/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp b/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
deleted file mode 100644
index 416e893..0000000
--- a/include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiCudaRt.hpp"
-#include "alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-namespace alpaka
-{
-    template<typename TAcc, typename TDev, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    using TaskKernelGpuCudaRt
-        = TaskKernelGpuUniformCudaHipRt<ApiCudaRt, TAcc, TDev, TDim, TIdx, TKernelFnObj, TArgs...>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/kernel/TaskKernelGpuHipRt.hpp b/include/alpaka/kernel/TaskKernelGpuHipRt.hpp
deleted file mode 100644
index b4b284c..0000000
--- a/include/alpaka/kernel/TaskKernelGpuHipRt.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiHipRt.hpp"
-#include "alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-namespace alpaka
-{
-    template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    using TaskKernelGpuHipRt = TaskKernelGpuUniformCudaHipRt<ApiHipRt, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp b/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
deleted file mode 100644
index e5c5a9a..0000000
--- a/include/alpaka/kernel/TaskKernelGpuSyclIntel.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/kernel/TaskKernelGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
-
-namespace alpaka
-{
-    template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    using TaskKernelGpuSyclIntel
-        = TaskKernelGenericSycl<TagGpuSyclIntel, AccGpuSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
deleted file mode 100644
index 53bbaf6..0000000
--- a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
+++ /dev/null
@@ -1,373 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
- * Manfred Gruber, Antonio Di Pilato, Mehmet Yusufoglu
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/AccGpuUniformCudaHipRt.hpp"
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Decay.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/core/RemoveRestrict.hpp"
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/KernelFunctionAttributes.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
-#include "alpaka/workdiv/WorkDivHelpers.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-#    include <iostream>
-#endif
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        include "alpaka/core/BoostPredef.hpp"
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-namespace alpaka
-{
-    namespace detail
-    {
-#        if BOOST_COMP_CLANG
-#            pragma clang diagnostic push
-#            pragma clang diagnostic ignored "-Wunused-template"
-#        endif
-        //! The GPU CUDA/HIP kernel entry point.
-        // \NOTE: 'A __global__ function or function template cannot have a trailing return type.'
-        // We have put the function into a shallow namespace and gave it a short name, so the mangled name in the
-        // profiler (e.g. ncu) is as shorter as possible.
-        template<typename TKernelFnObj, typename TApi, typename TAcc, typename TDim, typename TIdx, typename... TArgs>
-        __global__ void gpuKernel(
-            Vec<TDim, TIdx> const threadElemExtent,
-            TKernelFnObj const kernelFnObj,
-            TArgs... args)
-        {
-            TAcc const acc(threadElemExtent);
-
-// with clang it is not possible to query std::result_of for a pure device lambda created on the host side
-#        if !(BOOST_COMP_CLANG_CUDA && BOOST_COMP_CLANG)
-            static_assert(
-                std::is_same_v<decltype(kernelFnObj(const_cast<TAcc const&>(acc), args...)), void>,
-                "The TKernelFnObj is required to return void!");
-#        endif
-            kernelFnObj(const_cast<TAcc const&>(acc), args...);
-        }
-#        if BOOST_COMP_CLANG
-#            pragma clang diagnostic pop
-#        endif
-    } // namespace detail
-
-    namespace uniform_cuda_hip
-    {
-        namespace detail
-        {
-            template<typename TDim, typename TIdx>
-            ALPAKA_FN_HOST auto checkVecOnly3Dim(Vec<TDim, TIdx> const& vec) -> void
-            {
-                if constexpr(TDim::value > 0)
-                {
-                    for(auto i = std::min(typename TDim::value_type{3}, TDim::value); i < TDim::value; ++i)
-                    {
-                        if(vec[TDim::value - 1u - i] != 1)
-                        {
-                            throw std::runtime_error(
-                                "The CUDA/HIP accelerator supports a maximum of 3 dimensions. All "
-                                "work division extents of the dimensions higher 3 have to be 1!");
-                        }
-                    }
-                }
-            }
-
-            template<typename TDim, typename TIdx>
-            ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(Vec<TDim, TIdx> const& vec) -> dim3
-            {
-                dim3 dim(1, 1, 1);
-                if constexpr(TDim::value >= 1)
-                    dim.x = static_cast<unsigned>(vec[TDim::value - 1u]);
-                if constexpr(TDim::value >= 2)
-                    dim.y = static_cast<unsigned>(vec[TDim::value - 2u]);
-                if constexpr(TDim::value >= 3)
-                    dim.z = static_cast<unsigned>(vec[TDim::value - 3u]);
-                checkVecOnly3Dim(vec);
-                return dim;
-            }
-        } // namespace detail
-    } // namespace uniform_cuda_hip
-
-    //! The GPU CUDA/HIP accelerator execution task.
-    template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    class TaskKernelGpuUniformCudaHipRt final : public WorkDivMembers<TDim, TIdx>
-    {
-    public:
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST TaskKernelGpuUniformCudaHipRt(
-            TWorkDiv&& workDiv,
-            TKernelFnObj const& kernelFnObj,
-            TArgs&&... args)
-            : WorkDivMembers<TDim, TIdx>(std::forward<TWorkDiv>(workDiv))
-            , m_kernelFnObj(kernelFnObj)
-            , m_args(std::forward<TArgs>(args)...)
-        {
-            static_assert(
-                Dim<std::decay_t<TWorkDiv>>::value == TDim::value,
-                "The work division and the execution task have to be of the same dimensionality!");
-        }
-
-        TKernelFnObj m_kernelFnObj;
-        std::tuple<remove_restrict_t<std::decay_t<TArgs>>...> m_args;
-    };
-
-    namespace trait
-    {
-        //! The GPU CUDA/HIP execution task accelerator type trait specialization.
-        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct AccType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = AccGpuUniformCudaHipRt<TApi, TDim, TIdx>;
-        };
-
-        //! The GPU CUDA/HIP execution task device type trait specialization.
-        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DevType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = DevUniformCudaHipRt<TApi>;
-        };
-
-        //! The GPU CUDA/HIP execution task dimension getter trait specialization.
-        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct DimType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TDim;
-        };
-
-        //! The CPU CUDA/HIP execution task platform type trait specialization.
-        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct PlatformType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = PlatformUniformCudaHipRt<TApi>;
-        };
-
-        //! The GPU CUDA/HIP execution task idx type trait specialization.
-        template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct IdxType<TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            using type = TIdx;
-        };
-
-        //! The CUDA/HIP kernel enqueue trait specialization.
-        template<
-            typename TApi,
-            bool TBlocking,
-            typename TAcc,
-            typename TDim,
-            typename TIdx,
-            typename TKernelFnObj,
-            typename... TArgs>
-        struct Enqueue<
-            uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>,
-            TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
-                TaskKernelGpuUniformCudaHipRt<TApi, TAcc, TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                // TODO: Check that (sizeof(TKernelFnObj) * m_3uiBlockThreadExtent.prod()) < available memory idx
-
-#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                // std::size_t printfFifoSize;
-                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
-                // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
-                // TApi::deviceSetLimit(TApi::limitPrintfFifoSize, printfFifoSize*10);
-                // TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
-                // std::cout << __func__ << " INFO: printfFifoSize: " << printfFifoSize << std::endl;
-#        endif
-                auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
-                auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
-                auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
-
-                dim3 const gridDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(gridBlockExtent);
-                dim3 const blockDim = uniform_cuda_hip::detail::convertVecToUniformCudaHipDim(blockThreadExtent);
-                uniform_cuda_hip::detail::checkVecOnly3Dim(threadElemExtent);
-
-#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__ << " gridDim: (" << gridDim.z << ", " << gridDim.y << ", " << gridDim.x << ")\n";
-                std::cout << __func__ << " blockDim: (" << blockDim.z << ", " << blockDim.y << ", " << blockDim.x
-                          << ")\n";
-#        endif
-
-#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                // This checks for a valid work division that is also compliant with the hardware maxima of the
-                // accelerator.
-                if(!isValidWorkDiv<TAcc>(task, getDev(queue)))
-                {
-                    throw std::runtime_error(
-                        "The given work division is not valid or not supported by the device of type "
-                        + getAccName<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>>() + "!");
-                }
-#        endif
-
-                // Get the size of the block shared dynamic memory.
-                auto const blockSharedMemDynSizeBytes = std::apply(
-                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args) {
-                        return getBlockSharedMemDynSizeBytes<TAcc>(
-                            task.m_kernelFnObj,
-                            blockThreadExtent,
-                            threadElemExtent,
-                            args...);
-                    },
-                    task.m_args);
-
-#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                // Log the block shared memory idx.
-                std::cout << __func__ << " BlockSharedMemDynSizeBytes: " << blockSharedMemDynSizeBytes << " B"
-                          << std::endl;
-#        endif
-
-                auto kernelName = alpaka::detail::
-                    gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;
-
-#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                // Log the function attributes.
-                typename TApi::FuncAttributes_t funcAttrs;
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::funcGetAttributes(&funcAttrs, kernelName));
-                std::cout << __func__ << " binaryVersion: " << funcAttrs.binaryVersion
-                          << " constSizeBytes: " << funcAttrs.constSizeBytes << " B"
-                          << " localSizeBytes: " << funcAttrs.localSizeBytes << " B"
-                          << " maxThreadsPerBlock: " << funcAttrs.maxThreadsPerBlock
-                          << " numRegs: " << funcAttrs.numRegs << " ptxVersion: " << funcAttrs.ptxVersion
-                          << " sharedSizeBytes: " << funcAttrs.sharedSizeBytes << " B" << std::endl;
-#        endif
-
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(queue.m_spQueueImpl->m_dev.getNativeHandle()));
-
-                // Enqueue the kernel execution.
-                // \NOTE: No const reference (const &) is allowed as the parameter type because the kernel launch
-                // language extension expects the arguments by value. This forces the type of a float argument given
-                // with std::forward to this function to be of type float instead of e.g. "float const & __ptr64"
-                // (MSVC). If not given by value, the kernel launch code does not copy the value but the pointer to the
-                // value location.
-                std::apply(
-                    [&](remove_restrict_t<std::decay_t<TArgs>> const&... args)
-                    {
-                        kernelName<<<
-                            gridDim,
-                            blockDim,
-                            static_cast<std::size_t>(blockSharedMemDynSizeBytes),
-                            queue.getNativeHandle()>>>(threadElemExtent, task.m_kernelFnObj, args...);
-                    },
-                    task.m_args);
-
-                if constexpr(TBlocking || ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
-                {
-                    // Wait for the kernel execution to finish but do not check error return of this call.
-                    // Do not use the alpaka::wait method because it checks the error itself but we want to give a
-                    // custom error message.
-                    std::ignore = TApi::streamSynchronize(queue.getNativeHandle());
-                }
-                if constexpr(ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL)
-                {
-                    auto const msg
-                        = std::string{"execution of kernel '" + core::demangled<TKernelFnObj> + "' failed with"};
-                    ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<TApi, true>(msg.c_str(), __FILE__, __LINE__);
-                }
-            }
-        };
-
-        //! \brief Specialisation of the class template FunctionAttributes
-        //! \tparam TApi The type the API of the GPU accelerator backend. Currently Cuda or Hip.
-        //! \tparam TDim The dimensionality of the accelerator device properties.
-        //! \tparam TIdx The idx type of the accelerator device properties.
-        //! \tparam TKernelFn Kernel function object type.
-        //! \tparam TArgs Kernel function object argument types as a parameter pack.
-        template<typename TApi, typename TDev, typename TDim, typename TIdx, typename TKernelFn, typename... TArgs>
-        struct FunctionAttributes<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TDev, TKernelFn, TArgs...>
-        {
-            //! \param dev The device instance
-            //! \param kernelFn The kernel function object which should be executed.
-            //! \param args The kernel invocation arguments.
-            //! \return KernelFunctionAttributes instance. The default version always returns an instance with zero
-            //! fields. For CPU, the field of max threads allowed by kernel function for the block is 1.
-            ALPAKA_FN_HOST static auto getFunctionAttributes(
-                [[maybe_unused]] TDev const& dev,
-                [[maybe_unused]] TKernelFn const& kernelFn,
-                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
-            {
-                auto kernelName = alpaka::detail::gpuKernel<
-                    TKernelFn,
-                    TApi,
-                    AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
-                    TDim,
-                    TIdx,
-                    remove_restrict_t<std::decay_t<TArgs>>...>;
-
-                typename TApi::FuncAttributes_t funcAttrs;
-#        if BOOST_COMP_GNUC
-                // Disable and enable compile warnings for gcc
-#            pragma GCC diagnostic push
-#            pragma GCC diagnostic ignored "-Wconditionally-supported"
-#        endif
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    TApi::funcGetAttributes(&funcAttrs, reinterpret_cast<void const*>(kernelName)));
-#        if BOOST_COMP_GNUC
-#            pragma GCC diagnostic pop
-#        endif
-
-                alpaka::KernelFunctionAttributes kernelFunctionAttributes;
-                kernelFunctionAttributes.constSizeBytes = funcAttrs.constSizeBytes;
-                kernelFunctionAttributes.localSizeBytes = funcAttrs.localSizeBytes;
-                kernelFunctionAttributes.sharedSizeBytes = funcAttrs.sharedSizeBytes;
-                kernelFunctionAttributes.maxDynamicSharedSizeBytes = funcAttrs.maxDynamicSharedSizeBytes;
-                kernelFunctionAttributes.numRegs = funcAttrs.numRegs;
-                kernelFunctionAttributes.asmVersion = funcAttrs.ptxVersion;
-                kernelFunctionAttributes.maxThreadsPerBlock = static_cast<int>(funcAttrs.maxThreadsPerBlock);
-
-#        if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                printf("Kernel Function Attributes: \n");
-                printf("binaryVersion: %d \n", funcAttrs.binaryVersion);
-                printf(
-                    "constSizeBytes: %lu \n localSizeBytes: %lu, sharedSizeBytes %lu  maxDynamicSharedSizeBytes: %d "
-                    "\n",
-                    funcAttrs.constSizeBytes,
-                    funcAttrs.localSizeBytes,
-                    funcAttrs.sharedSizeBytes,
-                    funcAttrs.maxDynamicSharedSizeBytes);
-
-                printf(
-                    "numRegs: %d, ptxVersion: %d \n maxThreadsPerBlock: %d .\n ",
-                    funcAttrs.numRegs,
-                    funcAttrs.ptxVersion,
-                    funcAttrs.maxThreadsPerBlock);
-#        endif
-                return kernelFunctionAttributes;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#    endif
-
-#endif
diff --git a/include/alpaka/kernel/Traits.hpp b/include/alpaka/kernel/Traits.hpp
deleted file mode 100644
index c2c0a55..0000000
--- a/include/alpaka/kernel/Traits.hpp
+++ /dev/null
@@ -1,383 +0,0 @@
-/* Copyright 2023 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber,
- *                Andrea Bocci, Aurora Perego, Mehmet Yusufoglu
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Debug.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
-#include "alpaka/core/OmpSchedule.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/kernel/KernelFunctionAttributes.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-#include "alpaka/workdiv/Traits.hpp"
-
-#include <type_traits>
-
-//! The alpaka accelerator library.
-namespace alpaka
-{
-    //! The kernel traits.
-    namespace trait
-    {
-        //! The kernel execution task creation trait.
-        template<
-            typename TAcc,
-            typename TWorkDiv,
-            typename TKernelFnObj,
-            typename... TArgs/*,
-            typename TSfinae = void*/>
-        struct CreateTaskKernel;
-
-        //! The trait for getting the size of the block shared dynamic memory of a kernel.
-        //!
-        //! \tparam TKernelFnObj The kernel function object.
-        //! \tparam TAcc The accelerator.
-        //!
-        //! The default implementation returns 0.
-        template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
-        struct BlockSharedMemDynSizeBytes
-        {
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored                                                                                  \
-        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
-#endif
-            //! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-            //! \param blockThreadExtent The block thread extent.
-            //! \param threadElemExtent The thread element extent.
-            //! \tparam TArgs The kernel invocation argument types pack.
-            //! \param args,... The kernel invocation arguments.
-            //! \return The size of the shared memory allocated for a block in bytes.
-            //! The default version always returns zero.
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<typename TDim, typename... TArgs>
-            ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
-                [[maybe_unused]] TKernelFnObj const& kernelFnObj,
-                [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
-                [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& threadElemExtent,
-                [[maybe_unused]] TArgs const&... args) -> std::size_t
-            {
-                return 0u;
-            }
-        };
-
-        //! \brief The structure template to access to the functions attributes of a kernel function object.
-        //! \tparam TAcc The accelerator type
-        //! \tparam TKernelFnObj Kernel function object type.
-        //! \tparam TArgs Kernel function object argument types as a parameter pack.
-        template<typename TAcc, typename TDev, typename TKernelFnObj, typename... TArgs>
-        struct FunctionAttributes
-        {
-            //! \param dev The device instance
-            //! \param kernelFn The kernel function object which should be executed.
-            //! \param args The kernel invocation arguments.
-            //! \return KernelFunctionAttributes data structure instance. The default version always returns the
-            //! instance with fields which are set to zero.
-            ALPAKA_FN_HOST static auto getFunctionAttributes(
-                [[maybe_unused]] TDev const& dev,
-                [[maybe_unused]] TKernelFnObj const& kernelFn,
-                [[maybe_unused]] TArgs&&... args) -> alpaka::KernelFunctionAttributes
-            {
-                std::string const str
-                    = std::string(__func__) + " function is not specialised for the given arguments.\n";
-                throw std::invalid_argument{str};
-            }
-        };
-
-        //! The trait for getting the warp size required by a kernel.
-        //!
-        //! \tparam TKernelFnObj The kernel function object.
-        //! \tparam TAcc The accelerator.
-        //!
-        //! The default implementation returns 0, which lets the accelerator compiler and runtime choose the warp size.
-        template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
-        struct WarpSize : std::integral_constant<std::uint32_t, 0>
-        {
-        };
-
-        //! This is a shortcut for the trait defined above
-        template<typename TKernelFnObj, typename TAcc>
-        inline constexpr std::uint32_t warpSize = WarpSize<TKernelFnObj, TAcc>::value;
-
-        //! The trait for getting the schedule to use when a kernel is run using the CpuOmp2Blocks accelerator.
-        //!
-        //! Has no effect on other accelerators.
-        //!
-        //! A user could either specialize this trait for their kernel, or define a public static member
-        //! ompScheduleKind of type alpaka::omp::Schedule, and additionally also int member ompScheduleChunkSize. In
-        //! the latter case, alpaka never odr-uses these members.
-        //!
-        //! In case schedule kind and chunk size are compile-time constants, setting then inside kernel may benefit
-        //! performance.
-        //!
-        //! \tparam TKernelFnObj The kernel function object.
-        //! \tparam TAcc The accelerator.
-        //!
-        //! The default implementation behaves as if the trait was not specialized.
-        template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
-        struct OmpSchedule
-        {
-        private:
-            //! Type returned when the trait is not specialized
-            struct TraitNotSpecialized
-            {
-            };
-
-        public:
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored                                                                                  \
-        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
-#endif
-            //! \param kernelFnObj The kernel object for which the schedule should be returned.
-            //! \param blockThreadExtent The block thread extent.
-            //! \param threadElemExtent The thread element extent.
-            //! \tparam TArgs The kernel invocation argument types pack.
-            //! \param args,... The kernel invocation arguments.
-            //! \return The OpenMP schedule information as an alpaka::omp::Schedule object,
-            //!         returning an object of any other type is treated as if the trait is not specialized.
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<typename TDim, typename... TArgs>
-            ALPAKA_FN_HOST static auto getOmpSchedule(
-                [[maybe_unused]] TKernelFnObj const& kernelFnObj,
-                [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
-                [[maybe_unused]] Vec<TDim, Idx<TAcc>> const& threadElemExtent,
-                [[maybe_unused]] TArgs const&... args) -> TraitNotSpecialized
-            {
-                return TraitNotSpecialized{};
-            }
-        };
-    } // namespace trait
-
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored                                                                                  \
-        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
-#endif
-//! \tparam TAcc The accelerator type.
-//! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-//! \param blockThreadExtent The block thread extent.
-//! \param threadElemExtent The thread element extent.
-//! \param args,... The kernel invocation arguments.
-//! \return The size of the shared memory allocated for a block in bytes.
-//! The default implementation always returns zero.
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAcc, typename TKernelFnObj, typename TDim, typename... TArgs>
-    ALPAKA_FN_HOST_ACC auto getBlockSharedMemDynSizeBytes(
-        TKernelFnObj const& kernelFnObj,
-        Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
-        Vec<TDim, Idx<TAcc>> const& threadElemExtent,
-        TArgs const&... args) -> std::size_t
-    {
-        return trait::BlockSharedMemDynSizeBytes<TKernelFnObj, TAcc>::getBlockSharedMemDynSizeBytes(
-            kernelFnObj,
-            blockThreadExtent,
-            threadElemExtent,
-            args...);
-    }
-
-    //! \tparam TAcc The accelerator type.
-    //! \tparam TDev The device type.
-    //! \param dev The device instance
-    //! \param kernelFnObj The kernel function object which should be executed.
-    //! \param args The kernel invocation arguments.
-    //! \return KernelFunctionAttributes instance. Instance is filled with values returned by the accelerator API
-    //! depending on the specific kernel. The default version always returns the instance with fields which are set to
-    //! zero.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TAcc, typename TDev, typename TKernelFnObj, typename... TArgs>
-    ALPAKA_FN_HOST auto getFunctionAttributes(TDev const& dev, TKernelFnObj const& kernelFnObj, TArgs&&... args)
-        -> alpaka::KernelFunctionAttributes
-    {
-        return trait::FunctionAttributes<TAcc, TDev, TKernelFnObj, TArgs...>::getFunctionAttributes(
-            dev,
-            kernelFnObj,
-            std::forward<TArgs>(args)...);
-    }
-
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored                                                                                  \
-        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
-#endif
-//! \tparam TAcc The accelerator type.
-//! \param kernelFnObj The kernel object for which the block shared memory size should be calculated.
-//! \param blockThreadExtent The block thread extent.
-//! \param threadElemExtent The thread element extent.
-//! \param args,... The kernel invocation arguments.
-//! \return The OpenMP schedule information as an alpaka::omp::Schedule object if the kernel specialized the
-//!         OmpSchedule trait, an object of another type if the kernel didn't specialize the trait.
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-    template<typename TAcc, typename TKernelFnObj, typename TDim, typename... TArgs>
-    ALPAKA_FN_HOST auto getOmpSchedule(
-        TKernelFnObj const& kernelFnObj,
-        Vec<TDim, Idx<TAcc>> const& blockThreadExtent,
-        Vec<TDim, Idx<TAcc>> const& threadElemExtent,
-        TArgs const&... args)
-    {
-        return trait::OmpSchedule<TKernelFnObj, TAcc>::getOmpSchedule(
-            kernelFnObj,
-            blockThreadExtent,
-            threadElemExtent,
-            args...);
-    }
-
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored                                                                                  \
-        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
-#endif
-
-
-    //! Check if a type used as kernel argument is trivially copyable
-    //!
-    //! \attention In case this trait is specialized for a user type the user should be sure that the result of calling
-    //! the copy constructor is equal to use memcpy to duplicate the object. An existing destructor should be free
-    //! of side effects.
-    //!
-    //! It's implementation defined whether the closure type of a lambda is trivially copyable.
-    //! Therefor the default implementation is true for trivially copyable or empty (stateless) types.
-    //!
-    //! @tparam T type to check
-    //! @{
-    template<typename T, typename = void>
-    struct IsKernelArgumentTriviallyCopyable
-        : std::bool_constant<std::is_empty_v<T> || std::is_trivially_copyable_v<T>>
-    {
-    };
-
-    template<typename T>
-    inline constexpr bool isKernelArgumentTriviallyCopyable = IsKernelArgumentTriviallyCopyable<T>::value;
-
-    //! @}
-
-    namespace detail
-    {
-        //! Check that the return of TKernelFnObj is void
-        template<typename TAcc, typename TSfinae = void>
-        struct CheckFnReturnType
-        {
-            template<typename TKernelFnObj, typename... TArgs>
-            void operator()(TKernelFnObj const&, TArgs const&...)
-            {
-                using Result = std::invoke_result_t<TKernelFnObj, TAcc const&, TArgs const&...>;
-                static_assert(std::is_same_v<Result, void>, "The TKernelFnObj is required to return void!");
-            }
-        };
-
-        // asserts that T is trivially copyable. We put this in a separate function so we can see which T would fail
-        // the test, when called from a fold expression.
-        template<typename T>
-        inline void assertKernelArgIsTriviallyCopyable()
-        {
-            static_assert(isKernelArgumentTriviallyCopyable<T>, "The kernel argument T must be trivially copyable!");
-        }
-    } // namespace detail
-
-    //! Check if the kernel type is trivially copyable
-    //!
-    //! \attention In case this trait is specialized for a user type the user should be sure that the result of calling
-    //! the copy constructor is equal to use memcpy to duplicate the object. An existing destructor should be free
-    //! of side effects.
-    //!
-    //! The default implementation is true for trivially copyable types (or for extended lambda expressions for CUDA).
-    //!
-    //! @tparam T type to check
-    //! @{
-    template<typename T, typename = void>
-    struct IsKernelTriviallyCopyable
-#if BOOST_COMP_NVCC
-        : std::bool_constant<
-              std::is_trivially_copyable_v<T> || __nv_is_extended_device_lambda_closure_type(T)
-              || __nv_is_extended_host_device_lambda_closure_type(T)>
-#else
-        : std::is_trivially_copyable<T>
-#endif
-    {
-    };
-
-    template<typename T>
-    inline constexpr bool isKernelTriviallyCopyable = IsKernelTriviallyCopyable<T>::value;
-
-//! @}
-
-//! Creates a kernel execution task.
-//!
-//! \tparam TAcc The accelerator type.
-//! \param workDiv The index domain work division.
-//! \param kernelFnObj The kernel function object which should be executed.
-//! \param args,... The kernel invocation arguments.
-//! \return The kernel execution task.
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-    template<typename TAcc, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-    ALPAKA_FN_HOST auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
-    {
-        // check for void return type
-        detail::CheckFnReturnType<TAcc>{}(kernelFnObj, args...);
-
-#if BOOST_COMP_NVCC
-        static_assert(
-            isKernelTriviallyCopyable<TKernelFnObj>,
-            "Kernels must be trivially copyable or an extended CUDA lambda expression!");
-#else
-        static_assert(isKernelTriviallyCopyable<TKernelFnObj>, "Kernels must be trivially copyable!");
-#endif
-        (detail::assertKernelArgIsTriviallyCopyable<std::decay_t<TArgs>>(), ...);
-        static_assert(
-            Dim<std::decay_t<TWorkDiv>>::value == Dim<TAcc>::value,
-            "The dimensions of TAcc and TWorkDiv have to be identical!");
-        static_assert(
-            std::is_same_v<Idx<std::decay_t<TWorkDiv>>, Idx<TAcc>>,
-            "The idx type of TAcc and the idx type of TWorkDiv have to be identical!");
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-        std::cout << __func__ << " workDiv: " << workDiv << ", kernelFnObj: " << core::demangled<decltype(kernelFnObj)>
-                  << std::endl;
-#endif
-        return trait::CreateTaskKernel<TAcc, TWorkDiv, TKernelFnObj, TArgs...>::createTaskKernel(
-            workDiv,
-            kernelFnObj,
-            std::forward<TArgs>(args)...);
-    }
-
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored                                                                                  \
-        "-Wdocumentation" // clang does not support the syntax for variadic template arguments "args,..."
-#endif
-//! Executes the given kernel in the given queue.
-//!
-//! \tparam TAcc The accelerator type.
-//! \param queue The queue to enqueue the view copy task into.
-//! \param workDiv The index domain work division.
-//! \param kernelFnObj The kernel function object which should be executed.
-//! \param args,... The kernel invocation arguments.
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-    template<typename TAcc, typename TQueue, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-    ALPAKA_FN_HOST auto exec(TQueue& queue, TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
-        -> void
-    {
-        enqueue(queue, createTaskKernel<TAcc>(workDiv, kernelFnObj, std::forward<TArgs>(args)...));
-    }
-} // namespace alpaka
diff --git a/include/alpaka/math/Complex.hpp b/include/alpaka/math/Complex.hpp
deleted file mode 100644
index f265c7b..0000000
--- a/include/alpaka/math/Complex.hpp
+++ /dev/null
@@ -1,582 +0,0 @@
-/* Copyright 2022 Sergei Bastrakov
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/math/FloatEqualExact.hpp"
-
-#include <cmath>
-#include <complex>
-#include <iostream>
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace internal
-    {
-        //! Implementation of a complex number useable on host and device.
-        //!
-        //! It follows the layout of std::complex and so array-oriented access.
-        //! The class template implements all methods and operators as std::complex<T>.
-        //! Additionally, it provides an implicit conversion to and from std::complex<T>.
-        //! All methods besides operators << and >> are host-device.
-        //! It does not provide non-member functions of std::complex besides the operators.
-        //! Those are provided the same way as alpaka math functions for real numbers.
-        //!
-        //! Note that unlike most of alpaka, this is a concrete type template, and not merely a concept.
-        //!
-        //! Naming and order of the methods match https://en.cppreference.com/w/cpp/numeric/complex in C++17.
-        //! Implementation chose to not extend it e.g. by adding constexpr to some places that would get it in C++20.
-        //! The motivation is that with internal conversion to std::complex<T> for CPU backends, it would define the
-        //! common interface for generic code anyways. So it is more clear to have alpaka's interface exactly matching
-        //! when possible, and not "improving".
-        //!
-        //! @tparam T type of the real and imaginary part: float, double, or long double.
-        template<typename T>
-        class Complex
-        {
-        public:
-            // Make sure the input type is floating-point
-            static_assert(std::is_floating_point_v<T>);
-
-            //! Type of the real and imaginary parts
-            using value_type = T;
-
-            //! Constructor from the given real and imaginary parts
-            constexpr ALPAKA_FN_HOST_ACC Complex(T const& real = T{}, T const& imag = T{}) : m_real(real), m_imag(imag)
-            {
-            }
-
-            //! Copy constructor
-            constexpr Complex(Complex const& other) = default;
-
-            //! Constructor from Complex of another type
-            template<typename U>
-            constexpr ALPAKA_FN_HOST_ACC Complex(Complex<U> const& other)
-                : m_real(static_cast<T>(other.real()))
-                , m_imag(static_cast<T>(other.imag()))
-            {
-            }
-
-            //! Constructor from std::complex
-            constexpr ALPAKA_FN_HOST_ACC Complex(std::complex<T> const& other)
-                : m_real(other.real())
-                , m_imag(other.imag())
-            {
-            }
-
-            //! Conversion to std::complex
-            constexpr ALPAKA_FN_HOST_ACC operator std::complex<T>() const
-            {
-                return std::complex<T>{m_real, m_imag};
-            }
-
-            //! Assignment
-            Complex& operator=(Complex const&) = default;
-
-            //! Get the real part
-            constexpr ALPAKA_FN_HOST_ACC T real() const
-            {
-                return m_real;
-            }
-
-            //! Set the real part
-            constexpr ALPAKA_FN_HOST_ACC void real(T value)
-            {
-                m_real = value;
-            }
-
-            //! Get the imaginary part
-            constexpr ALPAKA_FN_HOST_ACC T imag() const
-            {
-                return m_imag;
-            }
-
-            //! Set the imaginary part
-            constexpr ALPAKA_FN_HOST_ACC void imag(T value)
-            {
-                m_imag = value;
-            }
-
-            //! Addition assignment with a real number
-            ALPAKA_FN_HOST_ACC Complex& operator+=(T const& other)
-            {
-                m_real += other;
-                return *this;
-            }
-
-            //! Addition assignment with a complex number
-            template<typename U>
-            ALPAKA_FN_HOST_ACC Complex& operator+=(Complex<U> const& other)
-            {
-                m_real += static_cast<T>(other.real());
-                m_imag += static_cast<T>(other.imag());
-                return *this;
-            }
-
-            //! Subtraction assignment with a real number
-            ALPAKA_FN_HOST_ACC Complex& operator-=(T const& other)
-            {
-                m_real -= other;
-                return *this;
-            }
-
-            //! Subtraction assignment with a complex number
-            template<typename U>
-            ALPAKA_FN_HOST_ACC Complex& operator-=(Complex<U> const& other)
-            {
-                m_real -= static_cast<T>(other.real());
-                m_imag -= static_cast<T>(other.imag());
-                return *this;
-            }
-
-            //! Multiplication assignment with a real number
-            ALPAKA_FN_HOST_ACC Complex& operator*=(T const& other)
-            {
-                m_real *= other;
-                m_imag *= other;
-                return *this;
-            }
-
-            //! Multiplication assignment with a complex number
-            template<typename U>
-            ALPAKA_FN_HOST_ACC Complex& operator*=(Complex<U> const& other)
-            {
-                auto const newReal = m_real * static_cast<T>(other.real()) - m_imag * static_cast<T>(other.imag());
-                auto const newImag = m_imag * static_cast<T>(other.real()) + m_real * static_cast<T>(other.imag());
-                m_real = newReal;
-                m_imag = newImag;
-                return *this;
-            }
-
-            //! Division assignment with a real number
-            ALPAKA_FN_HOST_ACC Complex& operator/=(T const& other)
-            {
-                m_real /= other;
-                m_imag /= other;
-                return *this;
-            }
-
-            //! Division assignment with a complex number
-            template<typename U>
-            ALPAKA_FN_HOST_ACC Complex& operator/=(Complex<U> const& other)
-            {
-                return *this *= Complex{
-                           static_cast<T>(other.real() / (other.real() * other.real() + other.imag() * other.imag())),
-                           static_cast<T>(
-                               -other.imag() / (other.real() * other.real() + other.imag() * other.imag()))};
-            }
-
-        private:
-            //! Real and imaginary parts, storage enables array-oriented access
-            T m_real, m_imag;
-        };
-
-        //! Host-device arithmetic operations matching std::complex<T>.
-        //!
-        //! They take and return alpaka::Complex.
-        //!
-        //! @{
-        //!
-
-        //! Unary plus (added for compatibility with std::complex)
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& val)
-        {
-            return val;
-        }
-
-        //! Unary minus
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& val)
-        {
-            return Complex<T>{-val.real(), -val.imag()};
-        }
-
-        //! Addition of two complex numbers
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& lhs, Complex<T> const& rhs)
-        {
-            return Complex<T>{lhs.real() + rhs.real(), lhs.imag() + rhs.imag()};
-        }
-
-        //! Addition of a complex and a real number
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator+(Complex<T> const& lhs, T const& rhs)
-        {
-            return Complex<T>{lhs.real() + rhs, lhs.imag()};
-        }
-
-        //! Addition of a real and a complex number
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator+(T const& lhs, Complex<T> const& rhs)
-        {
-            return Complex<T>{lhs + rhs.real(), rhs.imag()};
-        }
-
-        //! Subtraction of two complex numbers
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& lhs, Complex<T> const& rhs)
-        {
-            return Complex<T>{lhs.real() - rhs.real(), lhs.imag() - rhs.imag()};
-        }
-
-        //! Subtraction of a complex and a real number
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator-(Complex<T> const& lhs, T const& rhs)
-        {
-            return Complex<T>{lhs.real() - rhs, lhs.imag()};
-        }
-
-        //! Subtraction of a real and a complex number
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator-(T const& lhs, Complex<T> const& rhs)
-        {
-            return Complex<T>{lhs - rhs.real(), -rhs.imag()};
-        }
-
-        //! Muptiplication of two complex numbers
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator*(Complex<T> const& lhs, Complex<T> const& rhs)
-        {
-            return Complex<T>{
-                lhs.real() * rhs.real() - lhs.imag() * rhs.imag(),
-                lhs.imag() * rhs.real() + lhs.real() * rhs.imag()};
-        }
-
-        //! Muptiplication of a complex and a real number
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator*(Complex<T> const& lhs, T const& rhs)
-        {
-            return Complex<T>{lhs.real() * rhs, lhs.imag() * rhs};
-        }
-
-        //! Muptiplication of a real and a complex number
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator*(T const& lhs, Complex<T> const& rhs)
-        {
-            return Complex<T>{lhs * rhs.real(), lhs * rhs.imag()};
-        }
-
-        //! Division of two complex numbers
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator/(Complex<T> const& lhs, Complex<T> const& rhs)
-        {
-            return Complex<T>{
-                (lhs.real() * rhs.real() + lhs.imag() * rhs.imag())
-                    / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag()),
-                (lhs.imag() * rhs.real() - lhs.real() * rhs.imag())
-                    / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag())};
-        }
-
-        //! Division of complex and a real number
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator/(Complex<T> const& lhs, T const& rhs)
-        {
-            return Complex<T>{lhs.real() / rhs, lhs.imag() / rhs};
-        }
-
-        //! Division of a real and a complex number
-        template<typename T>
-        ALPAKA_FN_HOST_ACC Complex<T> operator/(T const& lhs, Complex<T> const& rhs)
-        {
-            return Complex<T>{
-                lhs * rhs.real() / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag()),
-                -lhs * rhs.imag() / (rhs.real() * rhs.real() + rhs.imag() * rhs.imag())};
-        }
-
-        //! Equality of two complex numbers
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC bool operator==(Complex<T> const& lhs, Complex<T> const& rhs)
-        {
-            return math::floatEqualExactNoWarning(lhs.real(), rhs.real())
-                   && math::floatEqualExactNoWarning(lhs.imag(), rhs.imag());
-        }
-
-        //! Equality of a complex and a real number
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC bool operator==(Complex<T> const& lhs, T const& rhs)
-        {
-            return math::floatEqualExactNoWarning(lhs.real(), rhs)
-                   && math::floatEqualExactNoWarning(lhs.imag(), static_cast<T>(0));
-        }
-
-        //! Equality of a real and a complex number
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC bool operator==(T const& lhs, Complex<T> const& rhs)
-        {
-            return math::floatEqualExactNoWarning(lhs, rhs.real())
-                   && math::floatEqualExactNoWarning(static_cast<T>(0), rhs.imag());
-        }
-
-        //! Inequality of two complex numbers.
-        //!
-        //! @note this and other versions of operator != should be removed since C++20, as so does std::complex
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC bool operator!=(Complex<T> const& lhs, Complex<T> const& rhs)
-        {
-            return !(lhs == rhs);
-        }
-
-        //! Inequality of a complex and a real number
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC bool operator!=(Complex<T> const& lhs, T const& rhs)
-        {
-            return !math::floatEqualExactNoWarning(lhs.real(), rhs)
-                   || !math::floatEqualExactNoWarning(lhs.imag(), static_cast<T>(0));
-        }
-
-        //! Inequality of a real and a complex number
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC bool operator!=(T const& lhs, Complex<T> const& rhs)
-        {
-            return !math::floatEqualExactNoWarning(lhs, rhs.real())
-                   || !math::floatEqualExactNoWarning(static_cast<T>(0), rhs.imag());
-        }
-
-        //! @}
-
-        //! Host-only output of a complex number
-        template<typename T, typename TChar, typename TTraits>
-        std::basic_ostream<TChar, TTraits>& operator<<(std::basic_ostream<TChar, TTraits>& os, Complex<T> const& x)
-        {
-            os << x.operator std::complex<T>();
-            return os;
-        }
-
-        //! Host-only input of a complex number
-        template<typename T, typename TChar, typename TTraits>
-        std::basic_istream<TChar, TTraits>& operator>>(std::basic_istream<TChar, TTraits>& is, Complex<T> const& x)
-        {
-            std::complex<T> z;
-            is >> z;
-            x = z;
-            return is;
-        }
-
-        //! Host-only math functions matching std::complex<T>.
-        //!
-        //! Due to issue #1688, these functions are technically marked host-device and suppress related warnings.
-        //! However, they must be called for host only.
-        //!
-        //! They take and return alpaka::Complex (or a real number when appropriate).
-        //! Internally cast, fall back to std::complex implementation and cast back.
-        //! These functions can be used directly on the host side.
-        //! They are also picked up by ADL in math traits for CPU backends.
-        //!
-        //! On the device side, alpaka math traits must be used instead.
-        //! Note that the set of the traits is currently a bit smaller.
-        //!
-        //! @{
-        //!
-
-        //! Absolute value
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC T abs(Complex<T> const& x)
-        {
-            return std::abs(std::complex<T>(x));
-        }
-
-        //! Arc cosine
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> acos(Complex<T> const& x)
-        {
-            return std::acos(std::complex<T>(x));
-        }
-
-        //! Arc hyperbolic cosine
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> acosh(Complex<T> const& x)
-        {
-            return std::acosh(std::complex<T>(x));
-        }
-
-        //! Argument
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC T arg(Complex<T> const& x)
-        {
-            return std::arg(std::complex<T>(x));
-        }
-
-        //! Arc sine
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> asin(Complex<T> const& x)
-        {
-            return std::asin(std::complex<T>(x));
-        }
-
-        //! Arc hyperbolic sine
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> asinh(Complex<T> const& x)
-        {
-            return std::asinh(std::complex<T>(x));
-        }
-
-        //! Arc tangent
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> atan(Complex<T> const& x)
-        {
-            return std::atan(std::complex<T>(x));
-        }
-
-        //! Arc hyperbolic tangent
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> atanh(Complex<T> const& x)
-        {
-            return std::atanh(std::complex<T>(x));
-        }
-
-        //! Complex conjugate
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> conj(Complex<T> const& x)
-        {
-            return std::conj(std::complex<T>(x));
-        }
-
-        //! Cosine
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> cos(Complex<T> const& x)
-        {
-            return std::cos(std::complex<T>(x));
-        }
-
-        //! Hyperbolic cosine
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> cosh(Complex<T> const& x)
-        {
-            return std::cosh(std::complex<T>(x));
-        }
-
-        //! Exponential
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> exp(Complex<T> const& x)
-        {
-            return std::exp(std::complex<T>(x));
-        }
-
-        //! Natural logarithm
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> log(Complex<T> const& x)
-        {
-            return std::log(std::complex<T>(x));
-        }
-
-        //! Base 10 logarithm
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> log10(Complex<T> const& x)
-        {
-            return std::log10(std::complex<T>(x));
-        }
-
-        //! Squared magnitude
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC T norm(Complex<T> const& x)
-        {
-            return std::norm(std::complex<T>(x));
-        }
-
-        //! Get a complex number with given magnitude and phase angle
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> polar(T const& r, T const& theta = T())
-        {
-            return std::polar(r, theta);
-        }
-
-        //! Complex power of a complex number
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T, typename U>
-        constexpr ALPAKA_FN_HOST_ACC auto pow(Complex<T> const& x, Complex<U> const& y)
-        {
-            // Use same type promotion as std::pow
-            auto const result = std::pow(std::complex<T>(x), std::complex<U>(y));
-            using ValueType = typename decltype(result)::value_type;
-            return Complex<ValueType>(result);
-        }
-
-        //! Real power of a complex number
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T, typename U>
-        constexpr ALPAKA_FN_HOST_ACC auto pow(Complex<T> const& x, U const& y)
-        {
-            return pow(x, Complex<U>(y));
-        }
-
-        //! Complex power of a real number
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T, typename U>
-        constexpr ALPAKA_FN_HOST_ACC auto pow(T const& x, Complex<U> const& y)
-        {
-            return pow(Complex<T>(x), y);
-        }
-
-        //! Projection onto the Riemann sphere
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> proj(Complex<T> const& x)
-        {
-            return std::proj(std::complex<T>(x));
-        }
-
-        //! Sine
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> sin(Complex<T> const& x)
-        {
-            return std::sin(std::complex<T>(x));
-        }
-
-        //! Hyperbolic sine
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> sinh(Complex<T> const& x)
-        {
-            return std::sinh(std::complex<T>(x));
-        }
-
-        //! Square root
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> sqrt(Complex<T> const& x)
-        {
-            return std::sqrt(std::complex<T>(x));
-        }
-
-        //! Tangent
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> tan(Complex<T> const& x)
-        {
-            return std::tan(std::complex<T>(x));
-        }
-
-        //! Hyperbolic tangent
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T>
-        constexpr ALPAKA_FN_HOST_ACC Complex<T> tanh(Complex<T> const& x)
-        {
-            return std::tanh(std::complex<T>(x));
-        }
-
-        //! @}
-    } // namespace internal
-
-    using internal::Complex;
-} // namespace alpaka
diff --git a/include/alpaka/math/FloatEqualExact.hpp b/include/alpaka/math/FloatEqualExact.hpp
deleted file mode 100644
index 8c252b4..0000000
--- a/include/alpaka/math/FloatEqualExact.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2021 Jiri Vyskocil
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-    namespace math
-    {
-        /** Compare two floating point numbers for exact equivalence. Use only when necessary, and be aware of the
-         * implications. Most codes should not use this function and instead implement a correct epsilon-based
-         * comparison. If you are unfamiliar with the topic, check out
-         * https://www.geeksforgeeks.org/problem-in-comparing-floating-point-numbers-and-how-to-compare-them-correctly/
-         * or Goldberg 1991: "What every computer scientist should know about floating-point arithmetic",
-         * https://dl.acm.org/doi/10.1145/103162.103163
-         *
-         * This function calls the == operator for floating point types, but disables the warning issued by the
-         * compiler when compiling with the float equality warning checks enabled. This warning is valid an valuable in
-         * most codes and should be generally enabled, but there are specific instances where a piece of code might
-         * need to do an exact comparison (e.g. @a CudaVectorArrayWrapperTest.cpp). The verbose name for the function
-         * is intentional as it should raise a red flag if used while not absolutely needed. Users are advised to add a
-         * justification whenever they use this function.
-         *
-         * @tparam T both operands have to be the same type and conform to std::is_floating_point
-         * @param a first operand
-         * @param b second operand
-         * @return a == b
-         */
-        template<typename T>
-        ALPAKA_FN_INLINE ALPAKA_FN_HOST_ACC auto floatEqualExactNoWarning(T a, T b) -> bool
-        {
-            static_assert(std::is_floating_point_v<T>, "floatEqualExactNoWarning is for floating point values only!");
-
-            // So far only GCC and Clang check for float comparison and both accept the GCC pragmas.
-#ifdef __GNUC__
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wfloat-equal"
-#endif
-            return a == b;
-#ifdef __GNUC__
-#    pragma GCC diagnostic pop
-#endif
-        }
-    } // namespace math
-} // namespace alpaka
diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp
deleted file mode 100644
index 086c480..0000000
--- a/include/alpaka/math/MathGenericSycl.hpp
+++ /dev/null
@@ -1,751 +0,0 @@
-/* Copyright 2023 Jan Stephan, Sergei Bastrakov, René Widera, Luca Ferragina, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/math/Complex.hpp"
-#include "alpaka/math/Traits.hpp"
-
-#include <type_traits>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-//! The mathematical operation specifics.
-namespace alpaka::math
-{
-    //! The SYCL abs.
-    class AbsGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAbs, AbsGenericSycl>
-    {
-    };
-
-    //! The SYCL acos.
-    class AcosGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAcos, AcosGenericSycl>
-    {
-    };
-
-    //! The SYCL acosh.
-    class AcoshGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAcosh, AcoshGenericSycl>
-    {
-    };
-
-    //! The SYCL arg.
-    class ArgGenericSycl : public concepts::Implements<alpaka::math::ConceptMathArg, ArgGenericSycl>
-    {
-    };
-
-    //! The SYCL asin.
-    class AsinGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAsin, AsinGenericSycl>
-    {
-    };
-
-    //! The SYCL asinh.
-    class AsinhGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAsinh, AsinhGenericSycl>
-    {
-    };
-
-    //! The SYCL atan.
-    class AtanGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAtan, AtanGenericSycl>
-    {
-    };
-
-    //! The SYCL atanh.
-    class AtanhGenericSycl : public concepts::Implements<alpaka::math::ConceptMathAtanh, AtanhGenericSycl>
-    {
-    };
-
-    //! The SYCL atan2.
-    class Atan2GenericSycl : public concepts::Implements<alpaka::math::ConceptMathAtan2, Atan2GenericSycl>
-    {
-    };
-
-    //! The SYCL cbrt.
-    class CbrtGenericSycl : public concepts::Implements<alpaka::math::ConceptMathCbrt, CbrtGenericSycl>
-    {
-    };
-
-    //! The SYCL ceil.
-    class CeilGenericSycl : public concepts::Implements<alpaka::math::ConceptMathCeil, CeilGenericSycl>
-    {
-    };
-
-    //! The SYCL conj.
-    class ConjGenericSycl : public concepts::Implements<alpaka::math::ConceptMathConj, ConjGenericSycl>
-    {
-    };
-
-    //! The SYCL copysign.
-    class CopysignGenericSycl : public concepts::Implements<alpaka::math::ConceptMathCopysign, CopysignGenericSycl>
-    {
-    };
-
-    //! The SYCL cos.
-    class CosGenericSycl : public concepts::Implements<alpaka::math::ConceptMathCos, CosGenericSycl>
-    {
-    };
-
-    //! The SYCL cosh.
-    class CoshGenericSycl : public concepts::Implements<alpaka::math::ConceptMathCosh, CoshGenericSycl>
-    {
-    };
-
-    //! The SYCL erf.
-    class ErfGenericSycl : public concepts::Implements<alpaka::math::ConceptMathErf, ErfGenericSycl>
-    {
-    };
-
-    //! The SYCL exp.
-    class ExpGenericSycl : public concepts::Implements<alpaka::math::ConceptMathExp, ExpGenericSycl>
-    {
-    };
-
-    //! The SYCL floor.
-    class FloorGenericSycl : public concepts::Implements<alpaka::math::ConceptMathFloor, FloorGenericSycl>
-    {
-    };
-
-    //! The SYCL fma.
-    class FmaGenericSycl : public concepts::Implements<alpaka::math::ConceptMathFma, FmaGenericSycl>
-    {
-    };
-
-    //! The SYCL fmod.
-    class FmodGenericSycl : public concepts::Implements<alpaka::math::ConceptMathFmod, FmodGenericSycl>
-    {
-    };
-
-    //! The SYCL isfinite.
-    class IsfiniteGenericSycl : public concepts::Implements<alpaka::math::ConceptMathIsfinite, IsfiniteGenericSycl>
-    {
-    };
-
-    //! The SYCL isfinite.
-    class IsinfGenericSycl : public concepts::Implements<alpaka::math::ConceptMathIsinf, IsinfGenericSycl>
-    {
-    };
-
-    //! The SYCL isnan.
-    class IsnanGenericSycl : public concepts::Implements<alpaka::math::ConceptMathIsnan, IsnanGenericSycl>
-    {
-    };
-
-    //! The SYCL log.
-    class LogGenericSycl : public concepts::Implements<alpaka::math::ConceptMathLog, LogGenericSycl>
-    {
-    };
-
-    //! The SYCL log2.
-    class Log2GenericSycl : public concepts::Implements<alpaka::math::ConceptMathLog2, Log2GenericSycl>
-    {
-    };
-
-    //! The SYCL log10.
-    class Log10GenericSycl : public concepts::Implements<alpaka::math::ConceptMathLog10, Log10GenericSycl>
-    {
-    };
-
-    //! The SYCL max.
-    class MaxGenericSycl : public concepts::Implements<alpaka::math::ConceptMathMax, MaxGenericSycl>
-    {
-    };
-
-    //! The SYCL min.
-    class MinGenericSycl : public concepts::Implements<alpaka::math::ConceptMathMin, MinGenericSycl>
-    {
-    };
-
-    //! The SYCL pow.
-    class PowGenericSycl : public concepts::Implements<alpaka::math::ConceptMathPow, PowGenericSycl>
-    {
-    };
-
-    //! The SYCL remainder.
-    class RemainderGenericSycl : public concepts::Implements<alpaka::math::ConceptMathRemainder, RemainderGenericSycl>
-    {
-    };
-
-    //! The SYCL round.
-    class RoundGenericSycl : public concepts::Implements<alpaka::math::ConceptMathRound, RoundGenericSycl>
-    {
-    };
-
-    //! The SYCL rsqrt.
-    class RsqrtGenericSycl : public concepts::Implements<alpaka::math::ConceptMathRsqrt, RsqrtGenericSycl>
-    {
-    };
-
-    //! The SYCL sin.
-    class SinGenericSycl : public concepts::Implements<alpaka::math::ConceptMathSin, SinGenericSycl>
-    {
-    };
-
-    //! The SYCL sinh.
-    class SinhGenericSycl : public concepts::Implements<alpaka::math::ConceptMathSinh, SinhGenericSycl>
-    {
-    };
-
-    //! The SYCL sincos.
-    class SinCosGenericSycl : public concepts::Implements<alpaka::math::ConceptMathSinCos, SinCosGenericSycl>
-    {
-    };
-
-    //! The SYCL sqrt.
-    class SqrtGenericSycl : public concepts::Implements<alpaka::math::ConceptMathSqrt, SqrtGenericSycl>
-    {
-    };
-
-    //! The SYCL tan.
-    class TanGenericSycl : public concepts::Implements<alpaka::math::ConceptMathTan, TanGenericSycl>
-    {
-    };
-
-    //! The SYCL tanh.
-    class TanhGenericSycl : public concepts::Implements<alpaka::math::ConceptMathTanh, TanhGenericSycl>
-    {
-    };
-
-    //! The SYCL trunc.
-    class TruncGenericSycl : public concepts::Implements<alpaka::math::ConceptMathTrunc, TruncGenericSycl>
-    {
-    };
-
-    //! The SYCL math trait specializations.
-    class MathGenericSycl
-        : public AbsGenericSycl
-        , public AcosGenericSycl
-        , public AcoshGenericSycl
-        , public ArgGenericSycl
-        , public AsinGenericSycl
-        , public AsinhGenericSycl
-        , public AtanGenericSycl
-        , public AtanhGenericSycl
-        , public Atan2GenericSycl
-        , public CbrtGenericSycl
-        , public CeilGenericSycl
-        , public ConjGenericSycl
-        , public CopysignGenericSycl
-        , public CosGenericSycl
-        , public CoshGenericSycl
-        , public ErfGenericSycl
-        , public ExpGenericSycl
-        , public FloorGenericSycl
-        , public FmaGenericSycl
-        , public FmodGenericSycl
-        , public IsfiniteGenericSycl
-        , public IsinfGenericSycl
-        , public IsnanGenericSycl
-        , public LogGenericSycl
-        , public Log2GenericSycl
-        , public Log10GenericSycl
-        , public MaxGenericSycl
-        , public MinGenericSycl
-        , public PowGenericSycl
-        , public RemainderGenericSycl
-        , public RoundGenericSycl
-        , public RsqrtGenericSycl
-        , public SinGenericSycl
-        , public SinhGenericSycl
-        , public SinCosGenericSycl
-        , public SqrtGenericSycl
-        , public TanGenericSycl
-        , public TanhGenericSycl
-        , public TruncGenericSycl
-    {
-    };
-} // namespace alpaka::math
-
-namespace alpaka::math::trait
-{
-    //! The SYCL abs trait specialization.
-    template<typename TArg>
-    struct Abs<math::AbsGenericSycl, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
-    {
-        auto operator()(math::AbsGenericSycl const&, TArg const& arg)
-        {
-            if constexpr(std::is_integral_v<TArg>)
-                return sycl::abs(arg);
-            else if constexpr(std::is_floating_point_v<TArg>)
-                return sycl::fabs(arg);
-            else
-                static_assert(!sizeof(TArg), "Unsupported data type");
-        }
-    };
-
-    //! The SYCL acos trait specialization.
-    template<typename TArg>
-    struct Acos<math::AcosGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::AcosGenericSycl const&, TArg const& arg)
-        {
-            return sycl::acos(arg);
-        }
-    };
-
-    //! The SYCL acosh trait specialization.
-    template<typename TArg>
-    struct Acosh<math::AcoshGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::AcoshGenericSycl const&, TArg const& arg)
-        {
-            return sycl::acosh(arg);
-        }
-    };
-
-    //! The SYCL arg trait specialization.
-    template<typename TArgument>
-    struct Arg<math::ArgGenericSycl, TArgument, std::enable_if_t<std::is_arithmetic_v<TArgument>>>
-    {
-        auto operator()(math::ArgGenericSycl const&, TArgument const& argument)
-        {
-            if constexpr(std::is_integral_v<TArgument>)
-                return sycl::atan2(0.0, static_cast<double>(argument));
-            else if constexpr(std::is_floating_point_v<TArgument>)
-                return sycl::atan2(static_cast<TArgument>(0.0), argument);
-            else
-                static_assert(!sizeof(TArgument), "Unsupported data type");
-        }
-    };
-
-    //! The SYCL asin trait specialization.
-    template<typename TArg>
-    struct Asin<math::AsinGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::AsinGenericSycl const&, TArg const& arg)
-        {
-            return sycl::asin(arg);
-        }
-    };
-
-    //! The SYCL asinh trait specialization.
-    template<typename TArg>
-    struct Asinh<math::AsinhGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::AsinhGenericSycl const&, TArg const& arg)
-        {
-            return sycl::asinh(arg);
-        }
-    };
-
-    //! The SYCL atan trait specialization.
-    template<typename TArg>
-    struct Atan<math::AtanGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::AtanGenericSycl const&, TArg const& arg)
-        {
-            return sycl::atan(arg);
-        }
-    };
-
-    //! The SYCL atanh trait specialization.
-    template<typename TArg>
-    struct Atanh<math::AtanhGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::AtanhGenericSycl const&, TArg const& arg)
-        {
-            return sycl::atanh(arg);
-        }
-    };
-
-    //! The SYCL atan2 trait specialization.
-    template<typename Ty, typename Tx>
-    struct Atan2<
-        math::Atan2GenericSycl,
-        Ty,
-        Tx,
-        std::enable_if_t<std::is_floating_point_v<Ty> && std::is_floating_point_v<Tx>>>
-    {
-        using TCommon = std::common_type_t<Ty, Tx>;
-
-        auto operator()(math::Atan2GenericSycl const&, Ty const& y, Tx const& x)
-        {
-            return sycl::atan2(static_cast<TCommon>(y), static_cast<TCommon>(x));
-        }
-    };
-
-    //! The SYCL cbrt trait specialization.
-    template<typename TArg>
-    struct Cbrt<math::CbrtGenericSycl, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
-    {
-        auto operator()(math::CbrtGenericSycl const&, TArg const& arg)
-        {
-            if constexpr(std::is_integral_v<TArg>)
-                return sycl::cbrt(static_cast<double>(arg)); // Mirror CUDA back-end and use double for ints
-            else if constexpr(std::is_floating_point_v<TArg>)
-                return sycl::cbrt(arg);
-            else
-                static_assert(!sizeof(TArg), "Unsupported data type");
-        }
-    };
-
-    //! The SYCL ceil trait specialization.
-    template<typename TArg>
-    struct Ceil<math::CeilGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::CeilGenericSycl const&, TArg const& arg)
-        {
-            return sycl::ceil(arg);
-        }
-    };
-
-    //! The SYCL conj trait specialization.
-    template<typename TArg>
-    struct Conj<math::ConjGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::ConjGenericSycl const&, TArg const& arg)
-        {
-            return Complex<TArg>{arg, TArg{0.0}};
-        }
-    };
-
-    //! The SYCL copysign trait specialization.
-    template<typename TMag, typename TSgn>
-    struct Copysign<
-        math::CopysignGenericSycl,
-        TMag,
-        TSgn,
-        std::enable_if_t<std::is_floating_point_v<TMag> && std::is_floating_point_v<TSgn>>>
-    {
-        using TCommon = std::common_type_t<TMag, TSgn>;
-
-        auto operator()(math::CopysignGenericSycl const&, TMag const& y, TSgn const& x)
-        {
-            return sycl::copysign(static_cast<TCommon>(y), static_cast<TCommon>(x));
-        }
-    };
-
-    //! The SYCL cos trait specialization.
-    template<typename TArg>
-    struct Cos<math::CosGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::CosGenericSycl const&, TArg const& arg)
-        {
-            return sycl::cos(arg);
-        }
-    };
-
-    //! The SYCL cos trait specialization.
-    template<typename TArg>
-    struct Cosh<math::CoshGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::CoshGenericSycl const&, TArg const& arg)
-        {
-            return sycl::cosh(arg);
-        }
-    };
-
-    //! The SYCL erf trait specialization.
-    template<typename TArg>
-    struct Erf<math::ErfGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::ErfGenericSycl const&, TArg const& arg)
-        {
-            return sycl::erf(arg);
-        }
-    };
-
-    //! The SYCL exp trait specialization.
-    template<typename TArg>
-    struct Exp<math::ExpGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::ExpGenericSycl const&, TArg const& arg)
-        {
-            return sycl::exp(arg);
-        }
-    };
-
-    //! The SYCL floor trait specialization.
-    template<typename TArg>
-    struct Floor<math::FloorGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::FloorGenericSycl const&, TArg const& arg)
-        {
-            return sycl::floor(arg);
-        }
-    };
-
-    //! The SYCL fma trait specialization.
-    template<typename Tx, typename Ty, typename Tz>
-    struct Fma<
-        math::FmaGenericSycl,
-        Tx,
-        Ty,
-        Tz,
-        std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty> && std::is_floating_point_v<Tz>>>
-    {
-        auto operator()(math::FmaGenericSycl const&, Tx const& x, Ty const& y, Tz const& z)
-        {
-            return sycl::fma(x, y, z);
-        }
-    };
-
-    //! The SYCL fmod trait specialization.
-    template<typename Tx, typename Ty>
-    struct Fmod<
-        math::FmodGenericSycl,
-        Tx,
-        Ty,
-        std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
-    {
-        using TCommon = std::common_type_t<Tx, Ty>;
-
-        auto operator()(math::FmodGenericSycl const&, Tx const& x, Ty const& y)
-        {
-            return sycl::fmod(static_cast<TCommon>(x), static_cast<TCommon>(y));
-        }
-    };
-
-    //! The SYCL isfinite trait specialization.
-    template<typename TArg>
-    struct Isfinite<math::IsfiniteGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::IsfiniteGenericSycl const&, TArg const& arg)
-        {
-            return static_cast<bool>(sycl::isfinite(arg));
-        }
-    };
-
-    //! The SYCL isinf trait specialization.
-    template<typename TArg>
-    struct Isinf<math::IsinfGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::IsinfGenericSycl const&, TArg const& arg)
-        {
-            return static_cast<bool>(sycl::isinf(arg));
-        }
-    };
-
-    //! The SYCL isnan trait specialization.
-    template<typename TArg>
-    struct Isnan<math::IsnanGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::IsnanGenericSycl const&, TArg const& arg)
-        {
-            return static_cast<bool>(sycl::isnan(arg));
-        }
-    };
-
-    //! The SYCL log trait specialization.
-    template<typename TArg>
-    struct Log<math::LogGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::LogGenericSycl const&, TArg const& arg)
-        {
-            return sycl::log(arg);
-        }
-    };
-
-    //! The SYCL log2 trait specialization.
-    template<typename TArg>
-    struct Log2<math::Log2GenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::Log2GenericSycl const&, TArg const& arg)
-        {
-            return sycl::log2(arg);
-        }
-    };
-
-    //! The SYCL log10 trait specialization.
-    template<typename TArg>
-    struct Log10<math::Log10GenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::Log10GenericSycl const&, TArg const& arg)
-        {
-            return sycl::log10(arg);
-        }
-    };
-
-    //! The SYCL max trait specialization.
-    template<typename Tx, typename Ty>
-    struct Max<math::MaxGenericSycl, Tx, Ty, std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
-    {
-        using TCommon = std::common_type_t<Tx, Ty>;
-
-        auto operator()(math::MaxGenericSycl const&, Tx const& x, Ty const& y)
-        {
-            if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
-                return sycl::max(static_cast<TCommon>(x), static_cast<TCommon>(y));
-            else if constexpr(std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>)
-                return sycl::fmax(static_cast<TCommon>(x), static_cast<TCommon>(y));
-            else if constexpr(
-                (std::is_floating_point_v<Tx> && std::is_integral_v<Ty>)
-                || (std::is_integral_v<Tx> && std::is_floating_point_v<Ty>) )
-                return sycl::fmax(static_cast<double>(x), static_cast<double>(y)); // mirror CUDA back-end
-            else
-                static_assert(!sizeof(Tx), "Unsupported data types");
-        }
-    };
-
-    //! The SYCL min trait specialization.
-    template<typename Tx, typename Ty>
-    struct Min<math::MinGenericSycl, Tx, Ty, std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
-    {
-        auto operator()(math::MinGenericSycl const&, Tx const& x, Ty const& y)
-        {
-            if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
-                return sycl::min(x, y);
-            else if constexpr(std::is_floating_point_v<Tx> || std::is_floating_point_v<Ty>)
-                return sycl::fmin(x, y);
-            else if constexpr(
-                (std::is_floating_point_v<Tx> && std::is_integral_v<Ty>)
-                || (std::is_integral_v<Tx> && std::is_floating_point_v<Ty>) )
-                return sycl::fmin(static_cast<double>(x), static_cast<double>(y)); // mirror CUDA back-end
-            else
-                static_assert(!sizeof(Tx), "Unsupported data types");
-        }
-    };
-
-    //! The SYCL pow trait specialization.
-    template<typename TBase, typename TExp>
-    struct Pow<
-        math::PowGenericSycl,
-        TBase,
-        TExp,
-        std::enable_if_t<std::is_floating_point_v<TBase> && std::is_floating_point_v<TExp>>>
-    {
-        using TCommon = std::common_type_t<TBase, TExp>;
-
-        auto operator()(math::PowGenericSycl const&, TBase const& base, TExp const& exp)
-        {
-            return sycl::pow(static_cast<TCommon>(base), static_cast<TCommon>(exp));
-        }
-    };
-
-    //! The SYCL remainder trait specialization.
-    template<typename Tx, typename Ty>
-    struct Remainder<
-        math::RemainderGenericSycl,
-        Tx,
-        Ty,
-        std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
-    {
-        using TCommon = std::common_type_t<Tx, Ty>;
-
-        auto operator()(math::RemainderGenericSycl const&, Tx const& x, Ty const& y)
-        {
-            return sycl::remainder(static_cast<TCommon>(x), static_cast<TCommon>(y));
-        }
-    };
-
-    //! The SYCL round trait specialization.
-    template<typename TArg>
-    struct Round<math::RoundGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::RoundGenericSycl const&, TArg const& arg)
-        {
-            return sycl::round(arg);
-        }
-    };
-
-    //! The SYCL lround trait specialization.
-    template<typename TArg>
-    struct Lround<math::RoundGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::RoundGenericSycl const&, TArg const& arg)
-        {
-            return static_cast<long>(sycl::round(arg));
-        }
-    };
-
-    //! The SYCL llround trait specialization.
-    template<typename TArg>
-    struct Llround<math::RoundGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::RoundGenericSycl const&, TArg const& arg)
-        {
-            return static_cast<long long>(sycl::round(arg));
-        }
-    };
-
-    //! The SYCL rsqrt trait specialization.
-    template<typename TArg>
-    struct Rsqrt<math::RsqrtGenericSycl, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
-    {
-        auto operator()(math::RsqrtGenericSycl const&, TArg const& arg)
-        {
-            if constexpr(std::is_floating_point_v<TArg>)
-                return sycl::rsqrt(arg);
-            else if constexpr(std::is_integral_v<TArg>)
-                return sycl::rsqrt(static_cast<double>(arg)); // mirror CUDA back-end and use double for ints
-            else
-                static_assert(!sizeof(TArg), "Unsupported data type");
-        }
-    };
-
-    //! The SYCL sin trait specialization.
-    template<typename TArg>
-    struct Sin<math::SinGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::SinGenericSycl const&, TArg const& arg)
-        {
-            return sycl::sin(arg);
-        }
-    };
-
-    //! The SYCL sinh trait specialization.
-    template<typename TArg>
-    struct Sinh<math::SinhGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::SinhGenericSycl const&, TArg const& arg)
-        {
-            return sycl::sinh(arg);
-        }
-    };
-
-    //! The SYCL sincos trait specialization.
-    template<typename TArg>
-    struct SinCos<math::SinCosGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::SinCosGenericSycl const&, TArg const& arg, TArg& result_sin, TArg& result_cos) -> void
-        {
-            result_sin = sycl::sincos(arg, &result_cos);
-        }
-    };
-
-    //! The SYCL sqrt trait specialization.
-    template<typename TArg>
-    struct Sqrt<math::SqrtGenericSycl, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
-    {
-        auto operator()(math::SqrtGenericSycl const&, TArg const& arg)
-        {
-            if constexpr(std::is_floating_point_v<TArg>)
-                return sycl::sqrt(arg);
-            else if constexpr(std::is_integral_v<TArg>)
-                return sycl::sqrt(static_cast<double>(arg)); // mirror CUDA back-end and use double for ints
-        }
-    };
-
-    //! The SYCL tan trait specialization.
-    template<typename TArg>
-    struct Tan<math::TanGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::TanGenericSycl const&, TArg const& arg)
-        {
-            return sycl::tan(arg);
-        }
-    };
-
-    //! The SYCL tanh trait specialization.
-    template<typename TArg>
-    struct Tanh<math::TanhGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::TanhGenericSycl const&, TArg const& arg)
-        {
-            return sycl::tanh(arg);
-        }
-    };
-
-    //! The SYCL trunc trait specialization.
-    template<typename TArg>
-    struct Trunc<math::TruncGenericSycl, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-    {
-        auto operator()(math::TruncGenericSycl const&, TArg const& arg)
-        {
-            return sycl::trunc(arg);
-        }
-    };
-} // namespace alpaka::math::trait
-
-#endif
diff --git a/include/alpaka/math/MathStdLib.hpp b/include/alpaka/math/MathStdLib.hpp
deleted file mode 100644
index e74380f..0000000
--- a/include/alpaka/math/MathStdLib.hpp
+++ /dev/null
@@ -1,299 +0,0 @@
-/* Copyright 2023 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber,
- * Jeffrey Kelling, Sergei Bastrakov, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Decay.hpp"
-#include "alpaka/math/Traits.hpp"
-
-namespace alpaka::math
-{
-    //! The standard library abs, implementation covered by the general template.
-    class AbsStdLib : public concepts::Implements<ConceptMathAbs, AbsStdLib>
-    {
-    };
-
-    //! The standard library acos, implementation covered by the general template.
-    class AcosStdLib : public concepts::Implements<ConceptMathAcos, AcosStdLib>
-    {
-    };
-
-    //! The standard library acos, implementation covered by the general template.
-    class AcoshStdLib : public concepts::Implements<ConceptMathAcosh, AcoshStdLib>
-    {
-    };
-
-    //! The standard library arg, implementation covered by the general template.
-    class ArgStdLib : public concepts::Implements<ConceptMathArg, ArgStdLib>
-    {
-    };
-
-    //! The standard library asin, implementation covered by the general template.
-    class AsinStdLib : public concepts::Implements<ConceptMathAsin, AsinStdLib>
-    {
-    };
-
-    //! The standard library asinh, implementation covered by the general template.
-    class AsinhStdLib : public concepts::Implements<ConceptMathAsinh, AsinhStdLib>
-    {
-    };
-
-    //! The standard library atan, implementation covered by the general template.
-    class AtanStdLib : public concepts::Implements<ConceptMathAtan, AtanStdLib>
-    {
-    };
-
-    //! The standard library atanh, implementation covered by the general template.
-    class AtanhStdLib : public concepts::Implements<ConceptMathAtanh, AtanhStdLib>
-    {
-    };
-
-    //! The standard library atan2, implementation covered by the general template.
-    class Atan2StdLib : public concepts::Implements<ConceptMathAtan2, Atan2StdLib>
-    {
-    };
-
-    //! The standard library cbrt, implementation covered by the general template.
-    class CbrtStdLib : public concepts::Implements<ConceptMathCbrt, CbrtStdLib>
-    {
-    };
-
-    //! The standard library ceil, implementation covered by the general template.
-    class CeilStdLib : public concepts::Implements<ConceptMathCeil, CeilStdLib>
-    {
-    };
-
-    //! The standard library conj, implementation covered by the general template.
-    class ConjStdLib : public concepts::Implements<ConceptMathConj, ConjStdLib>
-    {
-    };
-
-    //! The standard library copysign, implementation covered by the general template.
-    class CopysignStdLib : public concepts::Implements<ConceptMathCopysign, CopysignStdLib>
-    {
-    };
-
-    //! The standard library cos, implementation covered by the general template.
-    class CosStdLib : public concepts::Implements<ConceptMathCos, CosStdLib>
-    {
-    };
-
-    //! The standard library cosh, implementation covered by the general template.
-    class CoshStdLib : public concepts::Implements<ConceptMathCosh, CoshStdLib>
-    {
-    };
-
-    //! The standard library erf, implementation covered by the general template.
-    class ErfStdLib : public concepts::Implements<ConceptMathErf, ErfStdLib>
-    {
-    };
-
-    //! The standard library exp, implementation covered by the general template.
-    class ExpStdLib : public concepts::Implements<ConceptMathExp, ExpStdLib>
-    {
-    };
-
-    //! The standard library floor, implementation covered by the general template.
-    class FloorStdLib : public concepts::Implements<ConceptMathFloor, FloorStdLib>
-    {
-    };
-
-    //! The standard library fma, implementation covered by the general template.
-    class FmaStdLib : public concepts::Implements<ConceptMathFma, FmaStdLib>
-    {
-    };
-
-    //! The standard library fmod, implementation covered by the general template.
-    class FmodStdLib : public concepts::Implements<ConceptMathFmod, FmodStdLib>
-    {
-    };
-
-    //! The standard library isfinite, implementation covered by the general template.
-    class IsfiniteStdLib : public concepts::Implements<ConceptMathIsfinite, IsfiniteStdLib>
-    {
-    };
-
-    //! The standard library isinf, implementation covered by the general template.
-    class IsinfStdLib : public concepts::Implements<ConceptMathIsinf, IsinfStdLib>
-    {
-    };
-
-    //! The standard library isnan, implementation covered by the general template.
-    class IsnanStdLib : public concepts::Implements<ConceptMathIsnan, IsnanStdLib>
-    {
-    };
-
-    //! The standard library log, implementation covered by the general template.
-    class LogStdLib : public concepts::Implements<ConceptMathLog, LogStdLib>
-    {
-    };
-
-    //! The standard library log2, implementation covered by the general template.
-    class Log2StdLib : public concepts::Implements<ConceptMathLog2, Log2StdLib>
-    {
-    };
-
-    //! The standard library log10, implementation covered by the general template.
-    class Log10StdLib : public concepts::Implements<ConceptMathLog10, Log10StdLib>
-    {
-    };
-
-    //! The standard library max.
-    class MaxStdLib : public concepts::Implements<ConceptMathMax, MaxStdLib>
-    {
-    };
-
-    //! The standard library min.
-    class MinStdLib : public concepts::Implements<ConceptMathMin, MinStdLib>
-    {
-    };
-
-    //! The standard library pow, implementation covered by the general template.
-    class PowStdLib : public concepts::Implements<ConceptMathPow, PowStdLib>
-    {
-    };
-
-    //! The standard library remainder, implementation covered by the general template.
-    class RemainderStdLib : public concepts::Implements<ConceptMathRemainder, RemainderStdLib>
-    {
-    };
-
-    //! The standard library round, implementation covered by the general template.
-    class RoundStdLib : public concepts::Implements<ConceptMathRound, RoundStdLib>
-    {
-    };
-
-    //! The standard library rsqrt, implementation covered by the general template.
-    class RsqrtStdLib : public concepts::Implements<ConceptMathRsqrt, RsqrtStdLib>
-    {
-    };
-
-    //! The standard library sin, implementation covered by the general template.
-    class SinStdLib : public concepts::Implements<ConceptMathSin, SinStdLib>
-    {
-    };
-
-    //! The standard library sinh, implementation covered by the general template.
-    class SinhStdLib : public concepts::Implements<ConceptMathSinh, SinhStdLib>
-    {
-    };
-
-    //! The standard library sincos, implementation covered by the general template.
-    class SinCosStdLib : public concepts::Implements<ConceptMathSinCos, SinCosStdLib>
-    {
-    };
-
-    //! The standard library sqrt, implementation covered by the general template.
-    class SqrtStdLib : public concepts::Implements<ConceptMathSqrt, SqrtStdLib>
-    {
-    };
-
-    //! The standard library tan, implementation covered by the general template.
-    class TanStdLib : public concepts::Implements<ConceptMathTan, TanStdLib>
-    {
-    };
-
-    //! The standard library tanh, implementation covered by the general template.
-    class TanhStdLib : public concepts::Implements<ConceptMathTanh, TanhStdLib>
-    {
-    };
-
-    //! The standard library trunc, implementation covered by the general template.
-    class TruncStdLib : public concepts::Implements<ConceptMathTrunc, TruncStdLib>
-    {
-    };
-
-    //! The standard library math trait specializations.
-    class MathStdLib
-        : public AbsStdLib
-        , public AcosStdLib
-        , public AcoshStdLib
-        , public ArgStdLib
-        , public AsinStdLib
-        , public AsinhStdLib
-        , public AtanStdLib
-        , public AtanhStdLib
-        , public Atan2StdLib
-        , public CbrtStdLib
-        , public CeilStdLib
-        , public ConjStdLib
-        , public CopysignStdLib
-        , public CosStdLib
-        , public CoshStdLib
-        , public ErfStdLib
-        , public ExpStdLib
-        , public FloorStdLib
-        , public FmaStdLib
-        , public FmodStdLib
-        , public LogStdLib
-        , public Log2StdLib
-        , public Log10StdLib
-        , public MaxStdLib
-        , public MinStdLib
-        , public PowStdLib
-        , public RemainderStdLib
-        , public RoundStdLib
-        , public RsqrtStdLib
-        , public SinStdLib
-        , public SinhStdLib
-        , public SinCosStdLib
-        , public SqrtStdLib
-        , public TanStdLib
-        , public TanhStdLib
-        , public TruncStdLib
-        , public IsnanStdLib
-        , public IsinfStdLib
-        , public IsfiniteStdLib
-    {
-    };
-
-    namespace trait
-    {
-        //! The standard library max trait specialization.
-        template<typename Tx, typename Ty>
-        struct Max<MaxStdLib, Tx, Ty, std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
-        {
-            ALPAKA_FN_HOST auto operator()(MaxStdLib const& /* max_ctx */, Tx const& x, Ty const& y)
-            {
-                using std::fmax;
-                using std::max;
-
-                if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
-                    return max(x, y);
-                else if constexpr(
-                    is_decayed_v<Tx, float> || is_decayed_v<Ty, float> || is_decayed_v<Tx, double>
-                    || is_decayed_v<Ty, double>)
-                    return fmax(x, y);
-                else
-                    static_assert(!sizeof(Tx), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(std::common_type_t<Tx, Ty>{});
-            }
-        };
-
-        //! The standard library min trait specialization.
-        template<typename Tx, typename Ty>
-        struct Min<MinStdLib, Tx, Ty, std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
-        {
-            ALPAKA_FN_HOST auto operator()(MinStdLib const& /* min_ctx */, Tx const& x, Ty const& y)
-            {
-                using std::fmin;
-                using std::min;
-
-                if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
-                    return min(x, y);
-                else if constexpr(
-                    is_decayed_v<Tx, float> || is_decayed_v<Ty, float> || is_decayed_v<Tx, double>
-                    || is_decayed_v<Ty, double>)
-                    return fmin(x, y);
-                else
-                    static_assert(!sizeof(Tx), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(std::common_type_t<Tx, Ty>{});
-            }
-        };
-    } // namespace trait
-
-} // namespace alpaka::math
diff --git a/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp b/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index ef89423..0000000
--- a/include/alpaka/math/MathUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,1373 +0,0 @@
-/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bert Wesarg, Valentin Gehrke, René Widera,
- * Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Jeffrey Kelling, Sergei Bastrakov
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/CudaHipCommon.hpp"
-#include "alpaka/core/Decay.hpp"
-#include "alpaka/core/UniformCudaHip.hpp"
-#include "alpaka/core/Unreachable.hpp"
-#include "alpaka/math/Complex.hpp"
-#include "alpaka/math/Traits.hpp"
-
-#include <type_traits>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka::math
-{
-    //! The CUDA built in abs.
-    class AbsUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAbs, AbsUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in acos.
-    class AcosUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAcos, AcosUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in acosh.
-    class AcoshUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAcosh, AcoshUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in arg.
-    class ArgUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathArg, ArgUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in asin.
-    class AsinUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAsin, AsinUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in asinh.
-    class AsinhUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAsinh, AsinhUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in atan.
-    class AtanUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAtan, AtanUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in atanh.
-    class AtanhUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAtanh, AtanhUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in atan2.
-    class Atan2UniformCudaHipBuiltIn : public concepts::Implements<ConceptMathAtan2, Atan2UniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in cbrt.
-    class CbrtUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCbrt, CbrtUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in ceil.
-    class CeilUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCeil, CeilUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in conj.
-    class ConjUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathConj, ConjUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in copysign.
-    class CopysignUniformCudaHipBuiltIn
-        : public concepts::Implements<ConceptMathCopysign, CopysignUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in cos.
-    class CosUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCos, CosUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in cosh.
-    class CoshUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathCosh, CoshUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in erf.
-    class ErfUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathErf, ErfUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in exp.
-    class ExpUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathExp, ExpUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in floor.
-    class FloorUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathFloor, FloorUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in fma.
-    class FmaUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathFma, FmaUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in fmod.
-    class FmodUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathFmod, FmodUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in isfinite.
-    class IsfiniteUniformCudaHipBuiltIn
-        : public concepts::Implements<ConceptMathIsfinite, IsfiniteUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in isinf.
-    class IsinfUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathIsinf, IsinfUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in isnan.
-    class IsnanUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathIsnan, IsnanUniformCudaHipBuiltIn>
-    {
-    };
-
-    // ! The CUDA built in log.
-    class LogUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathLog, LogUniformCudaHipBuiltIn>
-    {
-    };
-
-    // ! The CUDA built in log2.
-    class Log2UniformCudaHipBuiltIn : public concepts::Implements<ConceptMathLog2, Log2UniformCudaHipBuiltIn>
-    {
-    };
-
-    // ! The CUDA built in log10.
-    class Log10UniformCudaHipBuiltIn : public concepts::Implements<ConceptMathLog10, Log10UniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in max.
-    class MaxUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathMax, MaxUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in min.
-    class MinUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathMin, MinUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in pow.
-    class PowUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathPow, PowUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA built in remainder.
-    class RemainderUniformCudaHipBuiltIn
-        : public concepts::Implements<ConceptMathRemainder, RemainderUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA round.
-    class RoundUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathRound, RoundUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA rsqrt.
-    class RsqrtUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathRsqrt, RsqrtUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA sin.
-    class SinUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSin, SinUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA sinh.
-    class SinhUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSinh, SinhUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA sincos.
-    class SinCosUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSinCos, SinCosUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA sqrt.
-    class SqrtUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathSqrt, SqrtUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA tan.
-    class TanUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathTan, TanUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA tanh.
-    class TanhUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathTanh, TanhUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The CUDA trunc.
-    class TruncUniformCudaHipBuiltIn : public concepts::Implements<ConceptMathTrunc, TruncUniformCudaHipBuiltIn>
-    {
-    };
-
-    //! The standard library math trait specializations.
-    class MathUniformCudaHipBuiltIn
-        : public AbsUniformCudaHipBuiltIn
-        , public AcosUniformCudaHipBuiltIn
-        , public AcoshUniformCudaHipBuiltIn
-        , public ArgUniformCudaHipBuiltIn
-        , public AsinUniformCudaHipBuiltIn
-        , public AsinhUniformCudaHipBuiltIn
-        , public AtanUniformCudaHipBuiltIn
-        , public AtanhUniformCudaHipBuiltIn
-        , public Atan2UniformCudaHipBuiltIn
-        , public CbrtUniformCudaHipBuiltIn
-        , public CeilUniformCudaHipBuiltIn
-        , public ConjUniformCudaHipBuiltIn
-        , public CopysignUniformCudaHipBuiltIn
-        , public CosUniformCudaHipBuiltIn
-        , public CoshUniformCudaHipBuiltIn
-        , public ErfUniformCudaHipBuiltIn
-        , public ExpUniformCudaHipBuiltIn
-        , public FloorUniformCudaHipBuiltIn
-        , public FmaUniformCudaHipBuiltIn
-        , public FmodUniformCudaHipBuiltIn
-        , public LogUniformCudaHipBuiltIn
-        , public Log2UniformCudaHipBuiltIn
-        , public Log10UniformCudaHipBuiltIn
-        , public MaxUniformCudaHipBuiltIn
-        , public MinUniformCudaHipBuiltIn
-        , public PowUniformCudaHipBuiltIn
-        , public RemainderUniformCudaHipBuiltIn
-        , public RoundUniformCudaHipBuiltIn
-        , public RsqrtUniformCudaHipBuiltIn
-        , public SinUniformCudaHipBuiltIn
-        , public SinhUniformCudaHipBuiltIn
-        , public SinCosUniformCudaHipBuiltIn
-        , public SqrtUniformCudaHipBuiltIn
-        , public TanUniformCudaHipBuiltIn
-        , public TanhUniformCudaHipBuiltIn
-        , public TruncUniformCudaHipBuiltIn
-        , public IsnanUniformCudaHipBuiltIn
-        , public IsinfUniformCudaHipBuiltIn
-        , public IsfiniteUniformCudaHipBuiltIn
-    {
-    };
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__)
-#            include <cuda_runtime.h>
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__)
-#            include <hip/math_functions.h>
-#        endif
-
-    namespace trait
-    {
-        //! The CUDA abs trait specialization for real types.
-        template<typename TArg>
-        struct Abs<AbsUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_signed_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(AbsUniformCudaHipBuiltIn const& /* abs_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::fabsf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::fabs(arg);
-                else if constexpr(is_decayed_v<TArg, int>)
-                    return ::abs(arg);
-                else if constexpr(is_decayed_v<TArg, long int>)
-                    return ::labs(arg);
-                else if constexpr(is_decayed_v<TArg, long long int>)
-                    return ::llabs(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA abs trait specialization for complex types.
-        template<typename T>
-        struct Abs<AbsUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                return sqrt(ctx, arg.real() * arg.real() + arg.imag() * arg.imag());
-            }
-        };
-
-        //! The CUDA acos trait specialization for real types.
-        template<typename TArg>
-        struct Acos<AcosUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(AcosUniformCudaHipBuiltIn const& /* acos_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::acosf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::acos(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA acos trait specialization for complex types.
-        template<typename T>
-        struct Acos<AcosUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // This holds everywhere, including the branch cuts: acos(z) = -i * ln(z + i * sqrt(1 - z^2))
-                return Complex<T>{0.0, -1.0} * log(ctx, arg + Complex<T>{0.0, 1.0} * sqrt(ctx, T(1.0) - arg * arg));
-            }
-        };
-
-        //! The CUDA acosh trait specialization for real types.
-        template<typename TArg>
-        struct Acosh<AcoshUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(AcoshUniformCudaHipBuiltIn const& /* acosh_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::acoshf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::acosh(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA acosh trait specialization for complex types.
-        template<typename T>
-        struct Acosh<AcoshUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // acos(z) = ln(z + sqrt(z-1) * sqrt(z+1))
-                return log(ctx, arg + sqrt(ctx, arg - static_cast<T>(1.0)) * sqrt(ctx, arg + static_cast<T>(1.0)));
-            }
-        };
-
-        //! The CUDA arg trait specialization for real types.
-        template<typename TArgument>
-        struct Arg<ArgUniformCudaHipBuiltIn, TArgument, std::enable_if_t<std::is_floating_point_v<TArgument>>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, TArgument const& argument)
-            {
-                // Fall back to atan2 so that boundary cases are resolved consistently
-                return atan2(ctx, TArgument{0.0}, argument);
-            }
-        };
-
-        //! The CUDA arg Complex<T> specialization for complex types.
-        template<typename T>
-        struct Arg<ArgUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& argument)
-            {
-                return atan2(ctx, argument.imag(), argument.real());
-            }
-        };
-
-        //! The CUDA asin trait specialization for real types.
-        template<typename TArg>
-        struct Asin<AsinUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(AsinUniformCudaHipBuiltIn const& /* asin_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::asinf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::asin(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA asin trait specialization for complex types.
-        template<typename T>
-        struct Asin<AsinUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // This holds everywhere, including the branch cuts: asin(z) = i * ln(sqrt(1 - z^2) - i * z)
-                return Complex<T>{0.0, 1.0} * log(ctx, sqrt(ctx, T(1.0) - arg * arg) - Complex<T>{0.0, 1.0} * arg);
-            }
-        };
-
-        //! The CUDA asinh trait specialization for real types.
-        template<typename TArg>
-        struct Asinh<AsinhUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(AsinhUniformCudaHipBuiltIn const& /* asinh_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::asinhf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::asinh(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA asinh trait specialization for complex types.
-        template<typename T>
-        struct Asinh<AsinhUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // asinh(z) = ln(z + sqrt(z^2 + 1))
-                return log(ctx, arg + sqrt(ctx, arg * arg + static_cast<T>(1.0)));
-            }
-        };
-
-        //! The CUDA atan trait specialization for real types.
-        template<typename TArg>
-        struct Atan<AtanUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(AtanUniformCudaHipBuiltIn const& /* atan_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::atanf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::atan(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA atan trait specialization for complex types.
-        template<typename T>
-        struct Atan<AtanUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // This holds everywhere, including the branch cuts: atan(z) = -i/2 * ln((i - z) / (i + z))
-                return Complex<T>{0.0, -0.5} * log(ctx, (Complex<T>{0.0, 1.0} - arg) / (Complex<T>{0.0, 1.0} + arg));
-            }
-        };
-
-        //! The CUDA atanh trait specialization for real types.
-        template<typename TArg>
-        struct Atanh<AtanhUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(AtanhUniformCudaHipBuiltIn const& /* atanh_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::atanhf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::atanh(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA atanh trait specialization for complex types.
-        template<typename T>
-        struct Atanh<AtanhUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                //  atanh(z) = 0.5 * (ln(1 + z) - ln(1 - z))
-                return static_cast<T>(0.5)
-                       * (log(ctx, static_cast<T>(1.0) + arg) - log(ctx, static_cast<T>(1.0) - arg));
-            }
-        };
-
-        //! The CUDA atan2 trait specialization.
-        template<typename Ty, typename Tx>
-        struct Atan2<
-            Atan2UniformCudaHipBuiltIn,
-            Ty,
-            Tx,
-            std::enable_if_t<std::is_floating_point_v<Ty> && std::is_floating_point_v<Tx>>>
-        {
-            __host__ __device__ auto operator()(
-                Atan2UniformCudaHipBuiltIn const& /* atan2_ctx */,
-                Ty const& y,
-                Tx const& x)
-            {
-                if constexpr(is_decayed_v<Ty, float> && is_decayed_v<Tx, float>)
-                    return ::atan2f(y, x);
-                else if constexpr(is_decayed_v<Ty, double> || is_decayed_v<Tx, double>)
-                    return ::atan2(y, x);
-                else
-                    static_assert(!sizeof(Ty), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(Ty{});
-            }
-        };
-
-        //! The CUDA cbrt trait specialization.
-        template<typename TArg>
-        struct Cbrt<CbrtUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(CbrtUniformCudaHipBuiltIn const& /* cbrt_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::cbrtf(arg);
-                else if constexpr(is_decayed_v<TArg, double> || std::is_integral_v<TArg>)
-                    return ::cbrt(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA ceil trait specialization.
-        template<typename TArg>
-        struct Ceil<CeilUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(CeilUniformCudaHipBuiltIn const& /* ceil_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::ceilf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::ceil(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA conj trait specialization for real types.
-        template<typename TArg>
-        struct Conj<ConjUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(ConjUniformCudaHipBuiltIn const& /* conj_ctx */, TArg const& arg)
-            {
-                return Complex<TArg>{arg, TArg{0.0}};
-            }
-        };
-
-        //! The CUDA conj specialization for complex types.
-        template<typename T>
-        struct Conj<ConjUniformCudaHipBuiltIn, Complex<T>>
-        {
-            __host__ __device__ auto operator()(ConjUniformCudaHipBuiltIn const& /* conj_ctx */, Complex<T> const& arg)
-            {
-                return Complex<T>{arg.real(), -arg.imag()};
-            }
-        };
-
-        //! The CUDA copysign trait specialization for real types.
-        template<typename TMag, typename TSgn>
-        struct Copysign<
-            CopysignUniformCudaHipBuiltIn,
-            TMag,
-            TSgn,
-            std::enable_if_t<std::is_floating_point_v<TMag> && std::is_floating_point_v<TSgn>>>
-        {
-            __host__ __device__ auto operator()(
-                CopysignUniformCudaHipBuiltIn const& /* copysign_ctx */,
-                TMag const& mag,
-                TSgn const& sgn)
-            {
-                if constexpr(is_decayed_v<TMag, float> && is_decayed_v<TSgn, float>)
-                    return ::copysignf(mag, sgn);
-                else if constexpr(is_decayed_v<TMag, double> || is_decayed_v<TSgn, double>)
-                    return ::copysign(mag, sgn);
-                else
-                    static_assert(!sizeof(TMag), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TMag{});
-            }
-        };
-
-        //! The CUDA cos trait specialization for real types.
-        template<typename TArg>
-        struct Cos<CosUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(CosUniformCudaHipBuiltIn const& /* cos_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::cosf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::cos(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA cos trait specialization for complex types.
-        template<typename T>
-        struct Cos<CosUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // cos(z) = 0.5 * (exp(i * z) + exp(-i * z))
-                return T(0.5) * (exp(ctx, Complex<T>{0.0, 1.0} * arg) + exp(ctx, Complex<T>{0.0, -1.0} * arg));
-            }
-        };
-
-        //! The CUDA cosh trait specialization for real types.
-        template<typename TArg>
-        struct Cosh<CoshUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(CoshUniformCudaHipBuiltIn const& /* cos_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::coshf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::cosh(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA cosh trait specialization for complex types.
-        template<typename T>
-        struct Cosh<CoshUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // cosh(z) = 0.5 * (exp(z) + exp(-z))
-                return T(0.5) * (exp(ctx, arg) + exp(ctx, static_cast<T>(-1.0) * arg));
-            }
-        };
-
-        //! The CUDA erf trait specialization.
-        template<typename TArg>
-        struct Erf<ErfUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(ErfUniformCudaHipBuiltIn const& /* erf_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::erff(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::erf(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA exp trait specialization for real types.
-        template<typename TArg>
-        struct Exp<ExpUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(ExpUniformCudaHipBuiltIn const& /* exp_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::expf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::exp(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA exp trait specialization for complex types.
-        template<typename T>
-        struct Exp<ExpUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // exp(z) = exp(x + iy) = exp(x) * (cos(y) + i * sin(y))
-                auto re = T{}, im = T{};
-                sincos(ctx, arg.imag(), im, re);
-                return exp(ctx, arg.real()) * Complex<T>{re, im};
-            }
-        };
-
-        //! The CUDA floor trait specialization.
-        template<typename TArg>
-        struct Floor<FloorUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(FloorUniformCudaHipBuiltIn const& /* floor_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::floorf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::floor(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA fma trait specialization.
-        template<typename Tx, typename Ty, typename Tz>
-        struct Fma<
-            FmaUniformCudaHipBuiltIn,
-            Tx,
-            Ty,
-            Tz,
-            std::enable_if_t<
-                std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty> && std::is_floating_point_v<Tz>>>
-        {
-            __host__ __device__ auto operator()(
-                FmaUniformCudaHipBuiltIn const& /* fma_ctx */,
-                Tx const& x,
-                Ty const& y,
-                Tz const& z)
-            {
-                if constexpr(is_decayed_v<Tx, float> && is_decayed_v<Ty, float> && is_decayed_v<Tz, float>)
-                    return ::fmaf(x, y, z);
-                else if constexpr(is_decayed_v<Tx, double> || is_decayed_v<Ty, double> || is_decayed_v<Tz, double>)
-                    return ::fma(x, y, z);
-                else
-                    static_assert(!sizeof(Tx), "Unsupported data type");
-
-                using Ret [[maybe_unused]] = std::conditional_t<
-                    is_decayed_v<Tx, float> && is_decayed_v<Ty, float> && is_decayed_v<Tz, float>,
-                    float,
-                    double>;
-                ALPAKA_UNREACHABLE(Ret{});
-            }
-        };
-
-        //! The CUDA fmod trait specialization.
-        template<typename Tx, typename Ty>
-        struct Fmod<
-            FmodUniformCudaHipBuiltIn,
-            Tx,
-            Ty,
-            std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
-        {
-            __host__ __device__ auto operator()(
-                FmodUniformCudaHipBuiltIn const& /* fmod_ctx */,
-                Tx const& x,
-                Ty const& y)
-            {
-                if constexpr(is_decayed_v<Tx, float> && is_decayed_v<Ty, float>)
-                    return ::fmodf(x, y);
-                else if constexpr(is_decayed_v<Tx, double> || is_decayed_v<Ty, double>)
-                    return ::fmod(x, y);
-                else
-                    static_assert(!sizeof(Tx), "Unsupported data type");
-
-                using Ret [[maybe_unused]]
-                = std::conditional_t<is_decayed_v<Tx, float> && is_decayed_v<Ty, float>, float, double>;
-                ALPAKA_UNREACHABLE(Ret{});
-            }
-        };
-
-        //! The CUDA isfinite trait specialization.
-        template<typename TArg>
-        struct Isfinite<IsfiniteUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(IsfiniteUniformCudaHipBuiltIn const& /* ctx */, TArg const& arg)
-            {
-                return ::isfinite(arg);
-            }
-        };
-
-        //! The CUDA isinf trait specialization.
-        template<typename TArg>
-        struct Isinf<IsinfUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(IsinfUniformCudaHipBuiltIn const& /* ctx */, TArg const& arg)
-            {
-                return ::isinf(arg);
-            }
-        };
-
-        //! The CUDA isnan trait specialization.
-        template<typename TArg>
-        struct Isnan<IsnanUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(IsnanUniformCudaHipBuiltIn const& /* ctx */, TArg const& arg)
-            {
-                return ::isnan(arg);
-            }
-        };
-
-        //! The CUDA log trait specialization for real types.
-        template<typename TArg>
-        struct Log<LogUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(LogUniformCudaHipBuiltIn const& /* log_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::logf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::log(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA log trait specialization for complex types.
-        template<typename T>
-        struct Log<LogUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& argument)
-            {
-                // Branch cut along the negative real axis (same as for std::complex),
-                // principal value of ln(z) = ln(|z|) + i * arg(z)
-                return log(ctx, abs(ctx, argument)) + Complex<T>{0.0, 1.0} * arg(ctx, argument);
-            }
-        };
-
-        //! The CUDA log2 trait specialization for real types.
-        template<typename TArg>
-        struct Log2<Log2UniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(Log2UniformCudaHipBuiltIn const& /* log2_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::log2f(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::log2(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA log10 trait specialization for real types.
-        template<typename TArg>
-        struct Log10<Log10UniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(Log10UniformCudaHipBuiltIn const& /* log10_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::log10f(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::log10(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA log10 trait specialization for complex types.
-        template<typename T>
-        struct Log10<Log10UniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& argument)
-            {
-                return log(ctx, argument) / log(ctx, static_cast<T>(10));
-            }
-        };
-
-        //! The CUDA max trait specialization.
-        template<typename Tx, typename Ty>
-        struct Max<
-            MaxUniformCudaHipBuiltIn,
-            Tx,
-            Ty,
-            std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
-        {
-            __host__ __device__ auto operator()(
-                MaxUniformCudaHipBuiltIn const& /* max_ctx */,
-                Tx const& x,
-                Ty const& y)
-            {
-                if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
-                    return ::max(x, y);
-                else if constexpr(is_decayed_v<Tx, float> && is_decayed_v<Ty, float>)
-                    return ::fmaxf(x, y);
-                else if constexpr(
-                    is_decayed_v<Tx, double> || is_decayed_v<Ty, double>
-                    || (is_decayed_v<Tx, float> && std::is_integral_v<Ty>)
-                    || (std::is_integral_v<Tx> && is_decayed_v<Ty, float>) )
-                    return ::fmax(x, y);
-                else
-                    static_assert(!sizeof(Tx), "Unsupported data type");
-
-                using Ret [[maybe_unused]] = std::conditional_t<
-                    std::is_integral_v<Tx> && std::is_integral_v<Ty>,
-                    decltype(::max(x, y)),
-                    std::conditional_t<is_decayed_v<Tx, float> && is_decayed_v<Ty, float>, float, double>>;
-                ALPAKA_UNREACHABLE(Ret{});
-            }
-        };
-
-        //! The CUDA min trait specialization.
-        template<typename Tx, typename Ty>
-        struct Min<
-            MinUniformCudaHipBuiltIn,
-            Tx,
-            Ty,
-            std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
-        {
-            __host__ __device__ auto operator()(
-                MinUniformCudaHipBuiltIn const& /* min_ctx */,
-                Tx const& x,
-                Ty const& y)
-            {
-                if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
-                    return ::min(x, y);
-                else if constexpr(is_decayed_v<Tx, float> && is_decayed_v<Ty, float>)
-                    return ::fminf(x, y);
-                else if constexpr(
-                    is_decayed_v<Tx, double> || is_decayed_v<Ty, double>
-                    || (is_decayed_v<Tx, float> && std::is_integral_v<Ty>)
-                    || (std::is_integral_v<Tx> && is_decayed_v<Ty, float>) )
-                    return ::fmin(x, y);
-                else
-                    static_assert(!sizeof(Tx), "Unsupported data type");
-
-                using Ret [[maybe_unused]] = std::conditional_t<
-                    std::is_integral_v<Tx> && std::is_integral_v<Ty>,
-                    decltype(::min(x, y)),
-                    std::conditional_t<is_decayed_v<Tx, float> && is_decayed_v<Ty, float>, float, double>>;
-                ALPAKA_UNREACHABLE(Ret{});
-            }
-        };
-
-        //! The CUDA pow trait specialization for real types.
-        template<typename TBase, typename TExp>
-        struct Pow<
-            PowUniformCudaHipBuiltIn,
-            TBase,
-            TExp,
-            std::enable_if_t<std::is_floating_point_v<TBase> && std::is_floating_point_v<TExp>>>
-        {
-            __host__ __device__ auto operator()(
-                PowUniformCudaHipBuiltIn const& /* pow_ctx */,
-                TBase const& base,
-                TExp const& exp)
-            {
-                if constexpr(is_decayed_v<TBase, float> && is_decayed_v<TExp, float>)
-                    return ::powf(base, exp);
-                else if constexpr(is_decayed_v<TBase, double> || is_decayed_v<TExp, double>)
-                    return ::pow(static_cast<double>(base), static_cast<double>(exp));
-                else
-                    static_assert(!sizeof(TBase), "Unsupported data type");
-
-                using Ret [[maybe_unused]]
-                = std::conditional_t<is_decayed_v<TBase, float> && is_decayed_v<TExp, float>, float, double>;
-                ALPAKA_UNREACHABLE(Ret{});
-            }
-        };
-
-        //! The CUDA pow trait specialization for complex types.
-        template<typename T, typename U>
-        struct Pow<PowUniformCudaHipBuiltIn, Complex<T>, Complex<U>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& base, Complex<U> const& exponent)
-            {
-                // Type promotion matching rules of complex std::pow but simplified given our math only supports float
-                // and double, no long double.
-                using Promoted
-                    = Complex<std::conditional_t<is_decayed_v<T, float> && is_decayed_v<U, float>, float, double>>;
-                // pow(z1, z2) = e^(z2 * log(z1))
-                return exp(ctx, Promoted{exponent} * log(ctx, Promoted{base}));
-            }
-        };
-
-        //! The CUDA pow trait specialization for complex and real types.
-        template<typename T, typename U>
-        struct Pow<PowUniformCudaHipBuiltIn, Complex<T>, U>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& base, U const& exponent)
-            {
-                return pow(ctx, base, Complex<U>{exponent});
-            }
-        };
-
-        //! The CUDA pow trait specialization for real and complex types.
-        template<typename T, typename U>
-        struct Pow<PowUniformCudaHipBuiltIn, T, Complex<U>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, T const& base, Complex<U> const& exponent)
-            {
-                return pow(ctx, Complex<T>{base}, exponent);
-            }
-        };
-
-        //! The CUDA remainder trait specialization.
-        template<typename Tx, typename Ty>
-        struct Remainder<
-            RemainderUniformCudaHipBuiltIn,
-            Tx,
-            Ty,
-            std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
-        {
-            __host__ __device__ auto operator()(
-                RemainderUniformCudaHipBuiltIn const& /* remainder_ctx */,
-                Tx const& x,
-                Ty const& y)
-            {
-                if constexpr(is_decayed_v<Tx, float> && is_decayed_v<Ty, float>)
-                    return ::remainderf(x, y);
-                else if constexpr(is_decayed_v<Tx, double> || is_decayed_v<Ty, double>)
-                    return ::remainder(x, y);
-                else
-                    static_assert(!sizeof(Tx), "Unsupported data type");
-
-                using Ret [[maybe_unused]]
-                = std::conditional_t<is_decayed_v<Tx, float> && is_decayed_v<Ty, float>, float, double>;
-                ALPAKA_UNREACHABLE(Ret{});
-            }
-        };
-
-        //! The CUDA round trait specialization.
-        template<typename TArg>
-        struct Round<RoundUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(RoundUniformCudaHipBuiltIn const& /* round_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::roundf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::round(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA lround trait specialization.
-        template<typename TArg>
-        struct Lround<RoundUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(RoundUniformCudaHipBuiltIn const& /* lround_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::lroundf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::lround(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(long{});
-            }
-        };
-
-        //! The CUDA llround trait specialization.
-        template<typename TArg>
-        struct Llround<RoundUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(RoundUniformCudaHipBuiltIn const& /* llround_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::llroundf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::llround(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                // NVCC versions before 11.3 are unable to compile 'long long{}': "type name is not allowed".
-                using Ret [[maybe_unused]] = long long;
-                ALPAKA_UNREACHABLE(Ret{});
-            }
-        };
-
-        //! The CUDA rsqrt trait specialization for real types.
-        template<typename TArg>
-        struct Rsqrt<RsqrtUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(RsqrtUniformCudaHipBuiltIn const& /* rsqrt_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::rsqrtf(arg);
-                else if constexpr(is_decayed_v<TArg, double> || std::is_integral_v<TArg>)
-                    return ::rsqrt(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA rsqrt trait specialization for complex types.
-        template<typename T>
-        struct Rsqrt<RsqrtUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                return T{1.0} / sqrt(ctx, arg);
-            }
-        };
-
-        //! The CUDA sin trait specialization for real types.
-        template<typename TArg>
-        struct Sin<SinUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(SinUniformCudaHipBuiltIn const& /* sin_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::sinf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::sin(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA sin trait specialization for complex types.
-        template<typename T>
-        struct Sin<SinUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // sin(z) = (exp(i * z) - exp(-i * z)) / 2i
-                return (exp(ctx, Complex<T>{0.0, 1.0} * arg) - exp(ctx, Complex<T>{0.0, -1.0} * arg))
-                       / Complex<T>{0.0, 2.0};
-            }
-        };
-
-        //! The CUDA sinh trait specialization for real types.
-        template<typename TArg>
-        struct Sinh<SinhUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(SinhUniformCudaHipBuiltIn const& /* sinh_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::sinhf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::sinh(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA sinh trait specialization for complex types.
-        template<typename T>
-        struct Sinh<SinhUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // sinh(z) = (exp(z) - exp(-i * z)) / 2
-                return (exp(ctx, arg) - exp(ctx, static_cast<T>(-1.0) * arg)) / static_cast<T>(2.0);
-            }
-        };
-
-        //! The CUDA sincos trait specialization for real types.
-        template<typename TArg>
-        struct SinCos<SinCosUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(
-                SinCosUniformCudaHipBuiltIn const& /* sincos_ctx */,
-                TArg const& arg,
-                TArg& result_sin,
-                TArg& result_cos) -> void
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    ::sincosf(arg, &result_sin, &result_cos);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    ::sincos(arg, &result_sin, &result_cos);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-            }
-        };
-
-        //! The CUDA sincos trait specialization for complex types.
-        template<typename T>
-        struct SinCos<SinCosUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(
-                TCtx const& ctx,
-                Complex<T> const& arg,
-                Complex<T>& result_sin,
-                Complex<T>& result_cos) -> void
-            {
-                result_sin = sin(ctx, arg);
-                result_cos = cos(ctx, arg);
-            }
-        };
-
-        //! The CUDA sqrt trait specialization for real types.
-        template<typename TArg>
-        struct Sqrt<SqrtUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_arithmetic_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(SqrtUniformCudaHipBuiltIn const& /* sqrt_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::sqrtf(arg);
-                else if constexpr(is_decayed_v<TArg, double> || std::is_integral_v<TArg>)
-                    return ::sqrt(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA sqrt trait specialization for complex types.
-        template<typename T>
-        struct Sqrt<SqrtUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& argument)
-            {
-                // Branch cut along the negative real axis (same as for std::complex),
-                // principal value of sqrt(z) = sqrt(|z|) * e^(i * arg(z) / 2)
-                auto const halfArg = T(0.5) * arg(ctx, argument);
-                auto re = T{}, im = T{};
-                sincos(ctx, halfArg, im, re);
-                return sqrt(ctx, abs(ctx, argument)) * Complex<T>(re, im);
-            }
-        };
-
-        //! The CUDA tan trait specialization for real types.
-        template<typename TArg>
-        struct Tan<TanUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(TanUniformCudaHipBuiltIn const& /* tan_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::tanf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::tan(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA tan trait specialization for complex types.
-        template<typename T>
-        struct Tan<TanUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // tan(z) = i * (e^-iz - e^iz) / (e^-iz + e^iz) = i * (1 - e^2iz) / (1 + e^2iz)
-                // Warning: this straightforward implementation can easily result in NaN as 0/0 or inf/inf.
-                auto const expValue = exp(ctx, Complex<T>{0.0, 2.0} * arg);
-                return Complex<T>{0.0, 1.0} * (T{1.0} - expValue) / (T{1.0} + expValue);
-            }
-        };
-
-        //! The CUDA tanh trait specialization for real types.
-        template<typename TArg>
-        struct Tanh<TanhUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(TanhUniformCudaHipBuiltIn const& /* tanh_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::tanhf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::tanh(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-
-        //! The CUDA tanh trait specialization for complex types.
-        template<typename T>
-        struct Tanh<TanhUniformCudaHipBuiltIn, Complex<T>>
-        {
-            //! Take context as original (accelerator) type, since we call other math functions
-            template<typename TCtx>
-            __host__ __device__ auto operator()(TCtx const& ctx, Complex<T> const& arg)
-            {
-                // tanh(z) = (e^z - e^-z)/(e^z+e^-z)
-                return (exp(ctx, arg) - exp(ctx, static_cast<T>(-1.0) * arg))
-                       / (exp(ctx, arg) + exp(ctx, static_cast<T>(-1.0) * arg));
-            }
-        };
-
-        //! The CUDA trunc trait specialization.
-        template<typename TArg>
-        struct Trunc<TruncUniformCudaHipBuiltIn, TArg, std::enable_if_t<std::is_floating_point_v<TArg>>>
-        {
-            __host__ __device__ auto operator()(TruncUniformCudaHipBuiltIn const& /* trunc_ctx */, TArg const& arg)
-            {
-                if constexpr(is_decayed_v<TArg, float>)
-                    return ::truncf(arg);
-                else if constexpr(is_decayed_v<TArg, double>)
-                    return ::trunc(arg);
-                else
-                    static_assert(!sizeof(TArg), "Unsupported data type");
-
-                ALPAKA_UNREACHABLE(TArg{});
-            }
-        };
-    } // namespace trait
-#    endif
-} // namespace alpaka::math
-
-#endif
diff --git a/include/alpaka/math/Traits.hpp b/include/alpaka/math/Traits.hpp
deleted file mode 100644
index c63b662..0000000
--- a/include/alpaka/math/Traits.hpp
+++ /dev/null
@@ -1,1488 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber, Sergei Bastrakov,
- *                Andrea Bocci, René Widera
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#include <cmath>
-#include <complex>
-#if __has_include(<version>) // Not part of the C++17 standard but all major standard libraries include this
-#    include <version>
-#endif
-#ifdef __cpp_lib_math_constants
-#    include <numbers>
-#endif
-
-namespace alpaka::math
-{
-    namespace constants
-    {
-#ifdef __cpp_lib_math_constants
-        inline constexpr double e = std::numbers::e;
-        inline constexpr double log2e = std::numbers::log2e;
-        inline constexpr double log10e = std::numbers::log10e;
-        inline constexpr double pi = std::numbers::pi;
-        inline constexpr double inv_pi = std::numbers::inv_pi;
-        inline constexpr double ln2 = std::numbers::ln2;
-        inline constexpr double ln10 = std::numbers::ln10;
-        inline constexpr double sqrt2 = std::numbers::sqrt2;
-
-        template<typename T>
-        inline constexpr T e_v = std::numbers::e_v<T>;
-
-        template<typename T>
-        inline constexpr T log2e_v = std::numbers::log2e_v<T>;
-
-        template<typename T>
-        inline constexpr T log10e_v = std::numbers::log10e_v<T>;
-
-        template<typename T>
-        inline constexpr T pi_v = std::numbers::pi_v<T>;
-
-        template<typename T>
-        inline constexpr T inv_pi_v = std::numbers::inv_pi_v<T>;
-
-        template<typename T>
-        inline constexpr T ln2_v = std::numbers::ln2_v<T>;
-
-        template<typename T>
-        inline constexpr T ln10_v = std::numbers::ln10_v<T>;
-
-        template<typename T>
-        inline constexpr T sqrt2_v = std::numbers::sqrt2_v<T>;
-#else
-        inline constexpr double e = M_E;
-        inline constexpr double log2e = M_LOG2E;
-        inline constexpr double log10e = M_LOG10E;
-        inline constexpr double pi = M_PI;
-        inline constexpr double inv_pi = M_1_PI;
-        inline constexpr double ln2 = M_LN2;
-        inline constexpr double ln10 = M_LN10;
-        inline constexpr double sqrt2 = M_SQRT2;
-
-        template<typename T>
-        inline constexpr T e_v = static_cast<T>(e);
-
-        template<typename T>
-        inline constexpr T log2e_v = static_cast<T>(log2e);
-
-        template<typename T>
-        inline constexpr T log10e_v = static_cast<T>(log10e);
-
-        template<typename T>
-        inline constexpr T pi_v = static_cast<T>(pi);
-
-        template<typename T>
-        inline constexpr T inv_pi_v = static_cast<T>(inv_pi);
-
-        template<typename T>
-        inline constexpr T ln2_v = static_cast<T>(ln2);
-
-        template<typename T>
-        inline constexpr T ln10_v = static_cast<T>(ln10);
-
-        template<typename T>
-        inline constexpr T sqrt2_v = static_cast<T>(sqrt2);
-
-        // Use predefined float constants when available
-#    if defined(M_Ef)
-        template<>
-        inline constexpr float e_v<float> = M_Ef;
-#    endif
-
-#    if defined(M_LOG2Ef)
-        template<>
-        inline constexpr float log2e_v<float> = M_LOG2Ef;
-#    endif
-
-#    if defined(M_LOG10Ef)
-        template<>
-        inline constexpr float log10e_v<float> = M_LOG10Ef;
-#    endif
-
-#    if defined(M_PIf)
-        template<>
-        inline constexpr float pi_v<float> = M_PIf;
-#    endif
-
-#    if defined(M_1_PIf)
-        template<>
-        inline constexpr float inv_pi_v<float> = M_1_PIf;
-#    endif
-
-#    if defined(M_LN2f)
-        template<>
-        inline constexpr float ln2_v<float> = M_LN2f;
-#    endif
-
-#    if defined(M_LN10f)
-        template<>
-        inline constexpr float ln10_v<float> = M_LN10f;
-#    endif
-
-#    if defined(M_SQRT2f)
-        template<>
-        inline constexpr float sqrt2_v<float> = M_SQRT2f;
-#    endif
-
-#endif
-    } // namespace constants
-
-    struct ConceptMathAbs
-    {
-    };
-
-    struct ConceptMathAcos
-    {
-    };
-
-    struct ConceptMathAcosh
-    {
-    };
-
-    struct ConceptMathArg
-    {
-    };
-
-    struct ConceptMathAsin
-    {
-    };
-
-    struct ConceptMathAsinh
-    {
-    };
-
-    struct ConceptMathAtan
-    {
-    };
-
-    struct ConceptMathAtanh
-    {
-    };
-
-    struct ConceptMathAtan2
-    {
-    };
-
-    struct ConceptMathCbrt
-    {
-    };
-
-    struct ConceptMathCeil
-    {
-    };
-
-    struct ConceptMathConj
-    {
-    };
-
-    struct ConceptMathCopysign
-    {
-    };
-
-    struct ConceptMathCos
-    {
-    };
-
-    struct ConceptMathCosh
-    {
-    };
-
-    struct ConceptMathErf
-    {
-    };
-
-    struct ConceptMathExp
-    {
-    };
-
-    struct ConceptMathFloor
-    {
-    };
-
-    struct ConceptMathFma
-    {
-    };
-
-    struct ConceptMathFmod
-    {
-    };
-
-    struct ConceptMathIsfinite
-    {
-    };
-
-    struct ConceptMathIsinf
-    {
-    };
-
-    struct ConceptMathIsnan
-    {
-    };
-
-    struct ConceptMathLog
-    {
-    };
-
-    struct ConceptMathLog2
-    {
-    };
-
-    struct ConceptMathLog10
-    {
-    };
-
-    struct ConceptMathMax
-    {
-    };
-
-    struct ConceptMathMin
-    {
-    };
-
-    struct ConceptMathPow
-    {
-    };
-
-    struct ConceptMathRemainder
-    {
-    };
-
-    struct ConceptMathRound
-    {
-    };
-
-    struct ConceptMathRsqrt
-    {
-    };
-
-    struct ConceptMathSin
-    {
-    };
-
-    struct ConceptMathSinh
-    {
-    };
-
-    struct ConceptMathSinCos
-    {
-    };
-
-    struct ConceptMathSqrt
-    {
-    };
-
-    struct ConceptMathTan
-    {
-    };
-
-    struct ConceptMathTanh
-    {
-    };
-
-    struct ConceptMathTrunc
-    {
-    };
-
-    //! The math traits.
-    namespace trait
-    {
-        //! The abs trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Abs
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find abs(TArg) in the namespace of your type.
-                using std::abs;
-                return abs(arg);
-            }
-        };
-
-        //! The acos trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Acos
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find acos(TArg) in the namespace of your type.
-                using std::acos;
-                return acos(arg);
-            }
-        };
-
-        //! The acosh trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Acosh
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find acosh(TArg) in the namespace of your type.
-                using std::acosh;
-                return acosh(arg);
-            }
-        };
-
-        //! The arg trait.
-        template<typename T, typename TArgument, typename TSfinae = void>
-        struct Arg
-        {
-            // It is unclear why this is needed here and not in other math trait structs. But removing it causes
-            // warnings with calling a __host__ function from a __host__ __device__ function when building for CUDA.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArgument const& argument)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find arg(TArgument) in the namespace of your type.
-                using std::arg;
-                return arg(argument);
-            }
-        };
-
-        //! The asin trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Asin
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find asin(TArg) in the namespace of your type.
-                using std::asin;
-                return asin(arg);
-            }
-        };
-
-        //! The asin trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Asinh
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find asin(TArg) in the namespace of your type.
-                using std::asinh;
-                return asinh(arg);
-            }
-        };
-
-        //! The atan trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Atan
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find atan(TArg) in the namespace of your type.
-                using std::atan;
-                return atan(arg);
-            }
-        };
-
-        //! The atanh trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Atanh
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find atanh(TArg) in the namespace of your type.
-                using std::atanh;
-                return atanh(arg);
-            }
-        };
-
-        //! The atan2 trait.
-        template<typename T, typename Ty, typename Tx, typename TSfinae = void>
-        struct Atan2
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Ty const& y, Tx const& x)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find atan2(Tx, Ty) in the namespace of your type.
-                using std::atan2;
-                return atan2(y, x);
-            }
-        };
-
-        //! The cbrt trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Cbrt
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find cbrt(TArg) in the namespace of your type.
-                using std::cbrt;
-                return cbrt(arg);
-            } //! The erf trait.
-        };
-
-        //! The ceil trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Ceil
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find ceil(TArg) in the namespace of your type.
-                using std::ceil;
-                return ceil(arg);
-            }
-        };
-
-        //! The conj trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Conj
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find conj(TArg) in the namespace of your type.
-                using std::conj;
-                return conj(arg);
-            }
-        };
-
-        //! The copysign trait.
-        template<typename T, typename TMag, typename TSgn, typename TSfinae = void>
-        struct Copysign
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TMag const& mag, TSgn const& sgn)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find copysign(TMag, TSgn) in the namespace of your type.
-                using std::copysign;
-                return copysign(mag, sgn);
-            }
-        };
-
-        //! The cos trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Cos
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find cos(TArg) in the namespace of your type.
-                using std::cos;
-                return cos(arg);
-            }
-        };
-
-        //! The cosh trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Cosh
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find cos(TArg) in the namespace of your type.
-                using std::cosh;
-                return cosh(arg);
-            }
-        };
-
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Erf
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find erf(TArg) in the namespace of your type.
-                using std::erf;
-                return erf(arg);
-            }
-        };
-
-        //! The exp trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Exp
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find exp(TArg) in the namespace of your type.
-                using std::exp;
-                return exp(arg);
-            }
-        };
-
-        //! The floor trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Floor
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find floor(TArg) in the namespace of your type.
-                using std::floor;
-                return floor(arg);
-            }
-        };
-
-        //! The fma trait.
-        template<typename T, typename Tx, typename Ty, typename Tz, typename TSfinae = void>
-        struct Fma
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Tx const& x, Ty const& y, Tz const& z)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find fma(Tx, Ty, Tz) in the namespace of your type.
-                using std::fma;
-                return fma(x, y, z);
-            }
-        };
-
-        //! The fmod trait.
-        template<typename T, typename Tx, typename Ty, typename TSfinae = void>
-        struct Fmod
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Tx const& x, Ty const& y)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find fmod(Tx, Ty) in the namespace of your type.
-                using std::fmod;
-                return fmod(x, y);
-            }
-        };
-
-        //! The isfinite trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Isfinite
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find isfinite(TArg) in the namespace of your type.
-                using std::isfinite;
-                return isfinite(arg);
-            }
-        };
-
-        //! The isinf trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Isinf
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find isinf(TArg) in the namespace of your type.
-                using std::isinf;
-                return isinf(arg);
-            }
-        };
-
-        //! The isnan trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Isnan
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find isnan(TArg) in the namespace of your type.
-                using std::isnan;
-                return isnan(arg);
-            }
-        };
-
-        //! The log trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Log
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find log(TArg) in the namespace of your type.
-                using std::log;
-                return log(arg);
-            }
-        };
-
-        //! The bas 2 log trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Log2
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find log2(TArg) in the namespace of your type.
-                using std::log2;
-                return log2(arg);
-            }
-        };
-
-        //! The base 10 log trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Log10
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find log10(TArg) in the namespace of your type.
-                using std::log10;
-                return log10(arg);
-            }
-        };
-
-        //! The max trait.
-        template<typename T, typename Tx, typename Ty, typename TSfinae = void>
-        struct Max
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Tx const& x, Ty const& y)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find max(Tx, Ty) in the namespace of your type.
-                using std::max;
-                return max(x, y);
-            }
-        };
-
-        //! The min trait.
-        template<typename T, typename Tx, typename Ty, typename TSfinae = void>
-        struct Min
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Tx const& x, Ty const& y)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find min(Tx, Ty) in the namespace of your type.
-                using std::min;
-                return min(x, y);
-            }
-        };
-
-        //! The pow trait.
-        template<typename T, typename TBase, typename TExp, typename TSfinae = void>
-        struct Pow
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TBase const& base, TExp const& exp)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find pow(base, exp) in the namespace of your type.
-                using std::pow;
-                return pow(base, exp);
-            }
-        };
-
-        //! The remainder trait.
-        template<typename T, typename Tx, typename Ty, typename TSfinae = void>
-        struct Remainder
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, Tx const& x, Ty const& y)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find remainder(Tx, Ty) in the namespace of your type.
-                using std::remainder;
-                return remainder(x, y);
-            }
-        };
-
-        //! The round trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Round
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find round(TArg) in the namespace of your type.
-                using std::round;
-                return round(arg);
-            }
-        };
-
-        //! The round trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Lround
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find lround(TArg) in the namespace of your type.
-                using std::lround;
-                return lround(arg);
-            }
-        };
-
-        //! The round trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Llround
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find llround(TArg) in the namespace of your type.
-                using std::llround;
-                return llround(arg);
-            }
-        };
-
-        namespace detail
-        {
-            //! Fallback implementation when no better ADL match was found
-            template<typename TArg>
-            ALPAKA_FN_HOST_ACC auto rsqrt(TArg const& arg)
-            {
-                // Still use ADL to try find sqrt(arg)
-                using std::sqrt;
-                return static_cast<TArg>(1) / sqrt(arg);
-            }
-        } // namespace detail
-
-        //! The rsqrt trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Rsqrt
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find rsqrt(TArg) in the namespace of your type.
-                using detail::rsqrt;
-                return rsqrt(arg);
-            }
-        };
-
-        //! The sin trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Sin
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find sin(TArg) in the namespace of your type.
-                using std::sin;
-                return sin(arg);
-            }
-        };
-
-        //! The sin trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Sinh
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find sin(TArg) in the namespace of your type.
-                using std::sinh;
-                return sinh(arg);
-            }
-        };
-
-        namespace detail
-        {
-            //! Fallback implementation when no better ADL match was found
-            template<typename TArg>
-            ALPAKA_FN_HOST_ACC auto sincos(TArg const& arg, TArg& result_sin, TArg& result_cos)
-            {
-                // Still use ADL to try find sin(arg) and cos(arg)
-                using std::sin;
-                result_sin = sin(arg);
-                using std::cos;
-                result_cos = cos(arg);
-            }
-        } // namespace detail
-
-        //! The sincos trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct SinCos
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg, TArg& result_sin, TArg& result_cos)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find sincos(TArg, TArg&, TArg&) in the namespace of your type.
-                using detail::sincos;
-                return sincos(arg, result_sin, result_cos);
-            }
-        };
-
-        //! The sqrt trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Sqrt
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find sqrt(TArg) in the namespace of your type.
-                using std::sqrt;
-                return sqrt(arg);
-            }
-        };
-
-        //! The tan trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Tan
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find tan(TArg) in the namespace of your type.
-                using std::tan;
-                return tan(arg);
-            }
-        };
-
-        //! The tanh trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Tanh
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find tanh(TArg) in the namespace of your type.
-                using std::tanh;
-                return tanh(arg);
-            }
-        };
-
-        //! The trunc trait.
-        template<typename T, typename TArg, typename TSfinae = void>
-        struct Trunc
-        {
-            ALPAKA_FN_HOST_ACC auto operator()(T const& /* ctx */, TArg const& arg)
-            {
-                // This is an ADL call. If you get a compile error here then your type is not supported by the
-                // backend and we could not find trunc(TArg) in the namespace of your type.
-                using std::trunc;
-                return trunc(arg);
-            }
-        };
-    } // namespace trait
-
-    //! Computes the absolute value.
-    //!
-    //! \tparam T The type of the object specializing Abs.
-    //! \tparam TArg The arg type.
-    //! \param abs_ctx The object specializing Abs.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto abs(T const& abs_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathAbs, T>;
-        return trait::Abs<ImplementationBase, TArg>{}(abs_ctx, arg);
-    }
-
-    //! Computes the principal value of the arc cosine.
-    //!
-    //! The valid real argument range is [-1.0, 1.0]. For other values
-    //! the result may depend on the backend and compilation options, will
-    //! likely be NaN.
-    //!
-    //! \tparam TArg The arg type.
-    //! \param acos_ctx The object specializing Acos.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto acos(T const& acos_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathAcos, T>;
-        return trait::Acos<ImplementationBase, TArg>{}(acos_ctx, arg);
-    }
-
-    //! Computes the principal value of the hyperbolic arc cosine.
-    //!
-    //! The valid real argument range is [1.0, Inf]. For other values
-    //! the result may depend on the backend and compilation options, will
-    //! likely be NaN.
-    //!
-    //! \tparam TArg The arg type.
-    //! \param acosh_ctx The object specializing Acos.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto acosh(T const& acosh_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathAcosh, T>;
-        return trait::Acosh<ImplementationBase, TArg>{}(acosh_ctx, arg);
-    }
-
-    //! Computes the complex argument of the value.
-    //!
-    //! \tparam T The type of the object specializing Arg.
-    //! \tparam TArgument The argument type.
-    //! \param arg_ctx The object specializing Arg.
-    //! \param argument The argument.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArgument>
-    ALPAKA_FN_HOST_ACC auto arg(T const& arg_ctx, TArgument const& argument)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathArg, T>;
-        return trait::Arg<ImplementationBase, TArgument>{}(arg_ctx, argument);
-    }
-
-    //! Computes the principal value of the arc sine.
-    //!
-    //! The valid real argument range is [-1.0, 1.0]. For other values
-    //! the result may depend on the backend and compilation options, will
-    //! likely be NaN.
-    //!
-    //! \tparam TArg The arg type.
-    //! \param asin_ctx The object specializing Asin.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto asin(T const& asin_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathAsin, T>;
-        return trait::Asin<ImplementationBase, TArg>{}(asin_ctx, arg);
-    }
-
-    //! Computes the principal value of the hyperbolic arc sine.
-    //!
-    //! \tparam TArg The arg type.
-    //! \param asinh_ctx The object specializing Asin.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto asinh(T const& asinh_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathAsinh, T>;
-        return trait::Asinh<ImplementationBase, TArg>{}(asinh_ctx, arg);
-    }
-
-    //! Computes the principal value of the arc tangent.
-    //!
-    //! \tparam TArg The arg type.
-    //! \param atan_ctx The object specializing Atan.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto atan(T const& atan_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathAtan, T>;
-        return trait::Atan<ImplementationBase, TArg>{}(atan_ctx, arg);
-    }
-
-    //! Computes the principal value of the hyperbolic arc tangent.
-    //!
-    //! The valid real argument range is [-1.0, 1.0]. For other values
-    //! the result may depend on the backend and compilation options, will
-    //! likely be NaN.
-
-    //! \tparam TArg The arg type.
-    //! \param atanh_ctx The object specializing Atanh.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto atanh(T const& atanh_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathAtanh, T>;
-        return trait::Atanh<ImplementationBase, TArg>{}(atanh_ctx, arg);
-    }
-
-    //! Computes the arc tangent of y/x using the signs of arguments to determine the correct quadrant.
-    //!
-    //! \tparam T The type of the object specializing Atan2.
-    //! \tparam Ty The y arg type.
-    //! \tparam Tx The x arg type.
-    //! \param atan2_ctx The object specializing Atan2.
-    //! \param y The y arg.
-    //! \param x The x arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename Ty, typename Tx>
-    ALPAKA_FN_HOST_ACC auto atan2(T const& atan2_ctx, Ty const& y, Tx const& x)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathAtan2, T>;
-        return trait::Atan2<ImplementationBase, Ty, Tx>{}(atan2_ctx, y, x);
-    }
-
-    //! Computes the cbrt.
-    //!
-    //! \tparam T The type of the object specializing Cbrt.
-    //! \tparam TArg The arg type.
-    //! \param cbrt_ctx The object specializing Cbrt.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto cbrt(T const& cbrt_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathCbrt, T>;
-        return trait::Cbrt<ImplementationBase, TArg>{}(cbrt_ctx, arg);
-    }
-
-    //! Computes the smallest integer value not less than arg.
-    //!
-    //! \tparam T The type of the object specializing Ceil.
-    //! \tparam TArg The arg type.
-    //! \param ceil_ctx The object specializing Ceil.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto ceil(T const& ceil_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathCeil, T>;
-        return trait::Ceil<ImplementationBase, TArg>{}(ceil_ctx, arg);
-    }
-
-    //! Computes the complex conjugate of arg.
-    //!
-    //! \tparam T The type of the object specializing Conj.
-    //! \tparam TArg The arg type.
-    //! \param conj_ctx The object specializing Conj.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto conj(T const& conj_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathConj, T>;
-        return trait::Conj<ImplementationBase, TArg>{}(conj_ctx, arg);
-    }
-
-    //! Creates a value with the magnitude of mag and the sign of sgn.
-    //!
-    //! \tparam T The type of the object specializing Copysign.
-    //! \tparam TMag The mag type.
-    //! \tparam TSgn The sgn type.
-    //! \param copysign_ctx The object specializing Copysign.
-    //! \param mag The mag.
-    //! \param sgn The sgn.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TMag, typename TSgn>
-    ALPAKA_FN_HOST_ACC auto copysign(T const& copysign_ctx, TMag const& mag, TSgn const& sgn)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathCopysign, T>;
-        return trait::Copysign<ImplementationBase, TMag, TSgn>{}(copysign_ctx, mag, sgn);
-    }
-
-    //! Computes the cosine (measured in radians).
-    //!
-    //! \tparam T The type of the object specializing Cos.
-    //! \tparam TArg The arg type.
-    //! \param cos_ctx The object specializing Cos.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto cos(T const& cos_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathCos, T>;
-        return trait::Cos<ImplementationBase, TArg>{}(cos_ctx, arg);
-    }
-
-    //! Computes the hyperbolic cosine (measured in radians).
-    //!
-    //! \tparam T The type of the object specializing Cos.
-    //! \tparam TArg The arg type.
-    //! \param cosh_ctx The object specializing Cos.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto cosh(T const& cosh_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathCosh, T>;
-        return trait::Cosh<ImplementationBase, TArg>{}(cosh_ctx, arg);
-    }
-
-    //! Computes the error function of arg.
-    //!
-    //! \tparam T The type of the object specializing Erf.
-    //! \tparam TArg The arg type.
-    //! \param erf_ctx The object specializing Erf.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto erf(T const& erf_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathErf, T>;
-        return trait::Erf<ImplementationBase, TArg>{}(erf_ctx, arg);
-    }
-
-    //! Computes the e (Euler's number, 2.7182818) raised to the given power arg.
-    //!
-    //! \tparam T The type of the object specializing Exp.
-    //! \tparam TArg The arg type.
-    //! \param exp_ctx The object specializing Exp.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto exp(T const& exp_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathExp, T>;
-        return trait::Exp<ImplementationBase, TArg>{}(exp_ctx, arg);
-    }
-
-    //! Computes the largest integer value not greater than arg.
-    //!
-    //! \tparam T The type of the object specializing Floor.
-    //! \tparam TArg The arg type.
-    //! \param floor_ctx The object specializing Floor.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto floor(T const& floor_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathFloor, T>;
-        return trait::Floor<ImplementationBase, TArg>{}(floor_ctx, arg);
-    }
-
-    //! Computes x * y + z as if to infinite precision and rounded only once to fit the result type.
-    //!
-    //! \tparam T The type of the object specializing Fma.
-    //! \tparam Tx The type of the first argument.
-    //! \tparam Ty The type of the second argument.
-    //! \tparam Tz The type of the third argument.
-    //! \param fma_ctx The object specializing .
-    //! \param x The first argument.
-    //! \param y The second argument.
-    //! \param z The third argument.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename Tx, typename Ty, typename Tz>
-    ALPAKA_FN_HOST_ACC auto fma(T const& fma_ctx, Tx const& x, Ty const& y, Tz const& z)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathFma, T>;
-        return trait::Fma<ImplementationBase, Tx, Ty, Tz>{}(fma_ctx, x, y, z);
-    }
-
-    //! Computes the floating-point remainder of the division operation x/y.
-    //!
-    //! \tparam T The type of the object specializing Fmod.
-    //! \tparam Tx The type of the first argument.
-    //! \tparam Ty The type of the second argument.
-    //! \param fmod_ctx The object specializing Fmod.
-    //! \param x The first argument.
-    //! \param y The second argument.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename Tx, typename Ty>
-    ALPAKA_FN_HOST_ACC auto fmod(T const& fmod_ctx, Tx const& x, Ty const& y)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathFmod, T>;
-        return trait::Fmod<ImplementationBase, Tx, Ty>{}(fmod_ctx, x, y);
-    }
-
-    //! Checks if given value is finite.
-    //!
-    //! \tparam T The type of the object specializing Isfinite.
-    //! \tparam TArg The arg type.
-    //! \param ctx The object specializing Isfinite.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto isfinite(T const& ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathIsfinite, T>;
-        return trait::Isfinite<ImplementationBase, TArg>{}(ctx, arg);
-    }
-
-    //! Checks if given value is inf.
-    //!
-    //! \tparam T The type of the object specializing Isinf.
-    //! \tparam TArg The arg type.
-    //! \param ctx The object specializing Isinf.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto isinf(T const& ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathIsinf, T>;
-        return trait::Isinf<ImplementationBase, TArg>{}(ctx, arg);
-    }
-
-    //! Checks if given value is NaN.
-    //!
-    //! \tparam T The type of the object specializing Isnan.
-    //! \tparam TArg The arg type.
-    //! \param ctx The object specializing Isnan.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto isnan(T const& ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathIsnan, T>;
-        return trait::Isnan<ImplementationBase, TArg>{}(ctx, arg);
-    }
-
-    //! Computes the the natural (base e) logarithm of arg.
-    //!
-    //! Valid real arguments are non-negative. For other values the result
-    //! may depend on the backend and compilation options, will likely
-    //! be NaN.
-    //!
-    //! \tparam T The type of the object specializing Log.
-    //! \tparam TArg The arg type.
-    //! \param log_ctx The object specializing Log.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto log(T const& log_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathLog, T>;
-        return trait::Log<ImplementationBase, TArg>{}(log_ctx, arg);
-    }
-
-    //! Computes the the natural (base 2) logarithm of arg.
-    //!
-    //! Valid real arguments are non-negative. For other values the result
-    //! may depend on the backend and compilation options, will likely
-    //! be NaN.
-    //!
-    //! \tparam T The type of the object specializing Log2.
-    //! \tparam TArg The arg type.
-    //! \param log2_ctx The object specializing Log2.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto log2(T const& log2_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathLog2, T>;
-        return trait::Log2<ImplementationBase, TArg>{}(log2_ctx, arg);
-    }
-
-    //! Computes the the natural (base 10) logarithm of arg.
-    //!
-    //! Valid real arguments are non-negative. For other values the result
-    //! may depend on the backend and compilation options, will likely
-    //! be NaN.
-    //!
-    //! \tparam T The type of the object specializing Log10.
-    //! \tparam TArg The arg type.
-    //! \param log10_ctx The object specializing Log10.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto log10(T const& log10_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathLog10, T>;
-        return trait::Log10<ImplementationBase, TArg>{}(log10_ctx, arg);
-    }
-
-    //! Returns the larger of two arguments.
-    //! NaNs are treated as missing data (between a NaN and a numeric value, the numeric value is chosen).
-    //!
-    //! \tparam T The type of the object specializing Max.
-    //! \tparam Tx The type of the first argument.
-    //! \tparam Ty The type of the second argument.
-    //! \param max_ctx The object specializing Max.
-    //! \param x The first argument.
-    //! \param y The second argument.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename Tx, typename Ty>
-    ALPAKA_FN_HOST_ACC auto max(T const& max_ctx, Tx const& x, Ty const& y)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathMax, T>;
-        return trait::Max<ImplementationBase, Tx, Ty>{}(max_ctx, x, y);
-    }
-
-    //! Returns the smaller of two arguments.
-    //! NaNs are treated as missing data (between a NaN and a numeric value, the numeric value is chosen).
-    //!
-    //! \tparam T The type of the object specializing Min.
-    //! \tparam Tx The type of the first argument.
-    //! \tparam Ty The type of the second argument.
-    //! \param min_ctx The object specializing Min.
-    //! \param x The first argument.
-    //! \param y The second argument.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename Tx, typename Ty>
-    ALPAKA_FN_HOST_ACC auto min(T const& min_ctx, Tx const& x, Ty const& y)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathMin, T>;
-        return trait::Min<ImplementationBase, Tx, Ty>{}(min_ctx, x, y);
-    }
-
-    //! Computes the value of base raised to the power exp.
-    //!
-    //! Valid real arguments for base are non-negative. For other values
-    //! the result may depend on the backend and compilation options, will
-    //! likely be NaN.
-    //!
-    //! \tparam T The type of the object specializing Pow.
-    //! \tparam TBase The base type.
-    //! \tparam TExp The exponent type.
-    //! \param pow_ctx The object specializing Pow.
-    //! \param base The base.
-    //! \param exp The exponent.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TBase, typename TExp>
-    ALPAKA_FN_HOST_ACC auto pow(T const& pow_ctx, TBase const& base, TExp const& exp)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathPow, T>;
-        return trait::Pow<ImplementationBase, TBase, TExp>{}(pow_ctx, base, exp);
-    }
-
-    //! Computes the IEEE remainder of the floating point division operation x/y.
-    //!
-    //! \tparam T The type of the object specializing Remainder.
-    //! \tparam Tx The type of the first argument.
-    //! \tparam Ty The type of the second argument.
-    //! \param remainder_ctx The object specializing Max.
-    //! \param x The first argument.
-    //! \param y The second argument.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename Tx, typename Ty>
-    ALPAKA_FN_HOST_ACC auto remainder(T const& remainder_ctx, Tx const& x, Ty const& y)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathRemainder, T>;
-        return trait::Remainder<ImplementationBase, Tx, Ty>{}(remainder_ctx, x, y);
-    }
-
-    //! Computes the nearest integer value to arg (in floating-point format), rounding halfway cases away from
-    //! zero, regardless of the current rounding mode.
-    //!
-    //! \tparam T The type of the object specializing Round.
-    //! \tparam TArg The arg type.
-    //! \param round_ctx The object specializing Round.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto round(T const& round_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathRound, T>;
-        return trait::Round<ImplementationBase, TArg>{}(round_ctx, arg);
-    }
-
-    //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero,
-    //! regardless of the current rounding mode.
-    //!
-    //! \tparam T The type of the object specializing Round.
-    //! \tparam TArg The arg type.
-    //! \param lround_ctx The object specializing Round.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto lround(T const& lround_ctx, TArg const& arg) -> long int
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathRound, T>;
-        return trait::Lround<ImplementationBase, TArg>{}(lround_ctx, arg);
-    }
-
-    //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero,
-    //! regardless of the current rounding mode.
-    //!
-    //! \tparam T The type of the object specializing Round.
-    //! \tparam TArg The arg type.
-    //! \param llround_ctx The object specializing Round.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto llround(T const& llround_ctx, TArg const& arg) -> long long int
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathRound, T>;
-        return trait::Llround<ImplementationBase, TArg>{}(llround_ctx, arg);
-    }
-
-    //! Computes the rsqrt.
-    //!
-    //! Valid real arguments are positive. For other values the result
-    //! may depend on the backend and compilation options, will likely
-    //! be NaN.
-    //!
-    //! \tparam T The type of the object specializing Rsqrt.
-    //! \tparam TArg The arg type.
-    //! \param rsqrt_ctx The object specializing Rsqrt.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto rsqrt(T const& rsqrt_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathRsqrt, T>;
-        return trait::Rsqrt<ImplementationBase, TArg>{}(rsqrt_ctx, arg);
-    }
-
-    //! Computes the sine (measured in radians).
-    //!
-    //! \tparam T The type of the object specializing Sin.
-    //! \tparam TArg The arg type.
-    //! \param sin_ctx The object specializing Sin.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto sin(T const& sin_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathSin, T>;
-        return trait::Sin<ImplementationBase, TArg>{}(sin_ctx, arg);
-    }
-
-    //! Computes the hyperbolic sine (measured in radians).
-    //!
-    //! \tparam T The type of the object specializing Sin.
-    //! \tparam TArg The arg type.
-    //! \param sinh_ctx The object specializing Sin.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto sinh(T const& sinh_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathSinh, T>;
-        return trait::Sinh<ImplementationBase, TArg>{}(sinh_ctx, arg);
-    }
-
-    //! Computes the sine and cosine (measured in radians).
-    //!
-    //! \tparam T The type of the object specializing SinCos.
-    //! \tparam TArg The arg type.
-    //! \param sincos_ctx The object specializing SinCos.
-    //! \param arg The arg.
-    //! \param result_sin result of sine
-    //! \param result_cos result of cosine
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto sincos(T const& sincos_ctx, TArg const& arg, TArg& result_sin, TArg& result_cos) -> void
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathSinCos, T>;
-        trait::SinCos<ImplementationBase, TArg>{}(sincos_ctx, arg, result_sin, result_cos);
-    }
-
-    //! Computes the square root of arg.
-    //!
-    //! Valid real arguments are non-negative. For other values the result
-    //! may depend on the backend and compilation options, will likely
-    //! be NaN.
-    //!
-    //! \tparam T The type of the object specializing Sqrt.
-    //! \tparam TArg The arg type.
-    //! \param sqrt_ctx The object specializing Sqrt.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto sqrt(T const& sqrt_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathSqrt, T>;
-        return trait::Sqrt<ImplementationBase, TArg>{}(sqrt_ctx, arg);
-    }
-
-    //! Computes the tangent (measured in radians).
-    //!
-    //! \tparam T The type of the object specializing Tan.
-    //! \tparam TArg The arg type.
-    //! \param tan_ctx The object specializing Tan.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto tan(T const& tan_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathTan, T>;
-        return trait::Tan<ImplementationBase, TArg>{}(tan_ctx, arg);
-    }
-
-    //! Computes the hyperbolic tangent (measured in radians).
-    //!
-    //! \tparam T The type of the object specializing Tanh.
-    //! \tparam TArg The arg type.
-    //! \param tanh_ctx The object specializing Tanh.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto tanh(T const& tanh_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathTanh, T>;
-        return trait::Tanh<ImplementationBase, TArg>{}(tanh_ctx, arg);
-    }
-
-    //! Computes the nearest integer not greater in magnitude than arg.
-    //!
-    //! \tparam T The type of the object specializing Trunc.
-    //! \tparam TArg The arg type.
-    //! \param trunc_ctx The object specializing Trunc.
-    //! \param arg The arg.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T, typename TArg>
-    ALPAKA_FN_HOST_ACC auto trunc(T const& trunc_ctx, TArg const& arg)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMathTrunc, T>;
-        return trait::Trunc<ImplementationBase, TArg>{}(trunc_ctx, arg);
-    }
-} // namespace alpaka::math
diff --git a/include/alpaka/mem/alloc/AllocCpuAligned.hpp b/include/alpaka/mem/alloc/AllocCpuAligned.hpp
deleted file mode 100644
index e458d99..0000000
--- a/include/alpaka/mem/alloc/AllocCpuAligned.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/AlignedAlloc.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/dev/cpu/SysInfo.hpp"
-#include "alpaka/mem/alloc/Traits.hpp"
-
-#include <algorithm>
-
-namespace alpaka
-{
-    //! The CPU boost aligned allocator.
-    //!
-    //! \tparam TAlignment An integral constant containing the alignment.
-    template<typename TAlignment>
-    class AllocCpuAligned : public concepts::Implements<ConceptMemAlloc, AllocCpuAligned<TAlignment>>
-    {
-    };
-
-    namespace trait
-    {
-        //! The CPU boost aligned allocator memory allocation trait specialization.
-        template<typename T, typename TAlignment>
-        struct Malloc<T, AllocCpuAligned<TAlignment>>
-        {
-            ALPAKA_FN_HOST static auto malloc(
-                AllocCpuAligned<TAlignment> const& /* alloc */,
-                std::size_t const& sizeElems) -> T*
-            {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-                // For CUDA/HIP host memory must be aligned to 4 kib to pin it with `cudaHostRegister`,
-                // this was described in older programming guides but was removed later.
-                // From testing with PIConGPU and cuda-memcheck we found out that the alignment is still required.
-                //
-                // For HIP the required alignment is the size of a cache line.
-                // https://rocm-developer-tools.github.io/HIP/group__Memory.html#gab8258f051e1a1f7385f794a15300e674
-                // On most x86 systems the page size is 4KiB and on OpenPower 64KiB.
-                // Page size can be tested on the terminal with: `getconf PAGE_SIZE`
-                size_t minAlignement = std::max<size_t>(TAlignment::value, cpu::detail::getPageSize());
-#else
-                constexpr size_t minAlignement = TAlignment::value;
-#endif
-                return reinterpret_cast<T*>(core::alignedAlloc(minAlignement, sizeElems * sizeof(T)));
-            }
-        };
-
-        //! The CPU boost aligned allocator memory free trait specialization.
-        template<typename T, typename TAlignment>
-        struct Free<T, AllocCpuAligned<TAlignment>>
-        {
-            ALPAKA_FN_HOST static auto free(AllocCpuAligned<TAlignment> const& /* alloc */, T const* const ptr) -> void
-            {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-                size_t minAlignement = std::max<size_t>(TAlignment::value, cpu::detail::getPageSize());
-#else
-                constexpr size_t minAlignement = TAlignment::value;
-#endif
-                core::alignedFree(minAlignement, const_cast<void*>(reinterpret_cast<void const*>(ptr)));
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/mem/alloc/AllocCpuNew.hpp b/include/alpaka/mem/alloc/AllocCpuNew.hpp
deleted file mode 100644
index 026d46e..0000000
--- a/include/alpaka/mem/alloc/AllocCpuNew.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/mem/alloc/Traits.hpp"
-
-namespace alpaka
-{
-    //! The CPU new allocator.
-    class AllocCpuNew : public concepts::Implements<ConceptMemAlloc, AllocCpuNew>
-    {
-    };
-
-    namespace trait
-    {
-        //! The CPU new allocator memory allocation trait specialization.
-        template<typename T>
-        struct Malloc<T, AllocCpuNew>
-        {
-            ALPAKA_FN_HOST static auto malloc(AllocCpuNew const& /* alloc */, std::size_t const& sizeElems) -> T*
-            {
-                return new T[sizeElems];
-            }
-        };
-
-        //! The CPU new allocator memory free trait specialization.
-        template<typename T>
-        struct Free<T, AllocCpuNew>
-        {
-            ALPAKA_FN_HOST static auto free(AllocCpuNew const& /* alloc */, T const* const ptr) -> void
-            {
-                return delete[] ptr;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/mem/alloc/Traits.hpp b/include/alpaka/mem/alloc/Traits.hpp
deleted file mode 100644
index 4b9cfcc..0000000
--- a/include/alpaka/mem/alloc/Traits.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/extent/Traits.hpp"
-
-namespace alpaka
-{
-    struct ConceptMemAlloc
-    {
-    };
-
-    //! The allocator traits.
-    namespace trait
-    {
-        //! The memory allocation trait.
-        template<typename T, typename TAlloc, typename TSfinae = void>
-        struct Malloc;
-
-        //! The memory free trait.
-        template<typename T, typename TAlloc, typename TSfinae = void>
-        struct Free;
-    } // namespace trait
-
-    //! \return The pointer to the allocated memory.
-    template<typename T, typename TAlloc>
-    ALPAKA_FN_HOST auto malloc(TAlloc const& alloc, std::size_t const& sizeElems) -> T*
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMemAlloc, TAlloc>;
-        return trait::Malloc<T, ImplementationBase>::malloc(alloc, sizeElems);
-    }
-
-    //! Frees the memory identified by the given pointer.
-    template<typename TAlloc, typename T>
-    ALPAKA_FN_HOST auto free(TAlloc const& alloc, T const* const ptr) -> void
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMemAlloc, TAlloc>;
-        trait::Free<T, ImplementationBase>::free(alloc, ptr);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/mem/buf/BufCpu.hpp b/include/alpaka/mem/buf/BufCpu.hpp
deleted file mode 100644
index 4bfc91c..0000000
--- a/include/alpaka/mem/buf/BufCpu.hpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/* Copyright 2022 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiCudaRt.hpp"
-#include "alpaka/core/ApiHipRt.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/core/Vectorize.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/mem/alloc/AllocCpuAligned.hpp"
-#include "alpaka/mem/buf/Traits.hpp"
-#include "alpaka/mem/view/ViewAccessOps.hpp"
-#include "alpaka/meta/DependentFalseType.hpp"
-#include "alpaka/platform/PlatformCpu.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <functional>
-#include <memory>
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    namespace detail
-    {
-        //! The CPU memory buffer.
-        template<typename TElem, typename TDim, typename TIdx>
-        class BufCpuImpl final
-        {
-            static_assert(
-                !std::is_const_v<TElem>,
-                "The elem type of the buffer can not be const because the C++ Standard forbids containers of const "
-                "elements!");
-            static_assert(!std::is_const_v<TIdx>, "The idx type of the buffer can not be const!");
-
-        public:
-            template<typename TExtent>
-            ALPAKA_FN_HOST BufCpuImpl(
-                DevCpu dev,
-                TElem* pMem,
-                std::function<void(TElem*)> deleter,
-                TExtent const& extent) noexcept
-                : m_dev(std::move(dev))
-                , m_extentElements(getExtentVecEnd<TDim>(extent))
-                , m_pMem(pMem)
-                , m_deleter(std::move(deleter))
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                static_assert(
-                    TDim::value == Dim<TExtent>::value,
-                    "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
-                    "identical!");
-                static_assert(
-                    std::is_same_v<TIdx, Idx<TExtent>>,
-                    "The idx type of TExtent and the TIdx template parameter have to be identical!");
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__ << " e: " << m_extentElements << " ptr: " << static_cast<void*>(m_pMem)
-                          << std::endl;
-#endif
-            }
-
-            BufCpuImpl(BufCpuImpl&&) = delete;
-            auto operator=(BufCpuImpl&&) -> BufCpuImpl& = delete;
-
-            ALPAKA_FN_HOST ~BufCpuImpl()
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // NOTE: m_pMem is allowed to be a nullptr here.
-                m_deleter(m_pMem);
-            }
-
-        public:
-            DevCpu const m_dev;
-            Vec<TDim, TIdx> const m_extentElements;
-            TElem* const m_pMem;
-            std::function<void(TElem*)> m_deleter;
-        };
-    } // namespace detail
-
-    //! The CPU memory buffer.
-    template<typename TElem, typename TDim, typename TIdx>
-    class BufCpu : public internal::ViewAccessOps<BufCpu<TElem, TDim, TIdx>>
-    {
-    public:
-        template<typename TExtent, typename Deleter>
-        ALPAKA_FN_HOST BufCpu(DevCpu const& dev, TElem* pMem, Deleter deleter, TExtent const& extent)
-            : m_spBufCpuImpl{
-                std::make_shared<detail::BufCpuImpl<TElem, TDim, TIdx>>(dev, pMem, std::move(deleter), extent)}
-        {
-        }
-
-    public:
-        std::shared_ptr<detail::BufCpuImpl<TElem, TDim, TIdx>> m_spBufCpuImpl;
-    };
-
-    namespace trait
-    {
-        //! The BufCpu device type trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct DevType<BufCpu<TElem, TDim, TIdx>>
-        {
-            using type = DevCpu;
-        };
-
-        //! The BufCpu device get trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct GetDev<BufCpu<TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getDev(BufCpu<TElem, TDim, TIdx> const& buf) -> DevCpu
-            {
-                return buf.m_spBufCpuImpl->m_dev;
-            }
-        };
-
-        //! The BufCpu dimension getter trait.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct DimType<BufCpu<TElem, TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The BufCpu memory element type get trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct ElemType<BufCpu<TElem, TDim, TIdx>>
-        {
-            using type = TElem;
-        };
-
-        //! The BufCpu width get trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct GetExtents<BufCpu<TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(BufCpu<TElem, TDim, TIdx> const& buf)
-            {
-                return buf.m_spBufCpuImpl->m_extentElements;
-            }
-        };
-
-        //! The BufCpu native pointer get trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct GetPtrNative<BufCpu<TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getPtrNative(BufCpu<TElem, TDim, TIdx> const& buf) -> TElem const*
-            {
-                return buf.m_spBufCpuImpl->m_pMem;
-            }
-
-            ALPAKA_FN_HOST static auto getPtrNative(BufCpu<TElem, TDim, TIdx>& buf) -> TElem*
-            {
-                return buf.m_spBufCpuImpl->m_pMem;
-            }
-        };
-
-        //! The BufCpu pointer on device get trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevCpu>
-        {
-            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevCpu const& dev)
-                -> TElem const*
-            {
-                if(dev == getDev(buf))
-                {
-                    return buf.m_spBufCpuImpl->m_pMem;
-                }
-                else
-                {
-                    throw std::runtime_error("The buffer is not accessible from the given device!");
-                }
-            }
-
-            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevCpu const& dev) -> TElem*
-            {
-                if(dev == getDev(buf))
-                {
-                    return buf.m_spBufCpuImpl->m_pMem;
-                }
-                else
-                {
-                    throw std::runtime_error("The buffer is not accessible from the given device!");
-                }
-            }
-        };
-
-        //! The BufCpu memory allocation trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct BufAlloc<TElem, TDim, TIdx, DevCpu>
-        {
-            template<typename TExtent>
-            ALPAKA_FN_HOST static auto allocBuf(DevCpu const& dev, TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // If ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT is defined, positive, and a power of 2, use it as the
-                // default alignment for host memory allocations. Otherwise, the alignment is chosen to enable optimal
-                // performance dependant on the target architecture.
-#if defined(ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT)
-                static_assert(
-                    ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT > 0
-                        && ((ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT & (ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT - 1)) == 0),
-                    "If defined, ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT must be a power of 2.");
-                constexpr std::size_t alignment = static_cast<std::size_t>(ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT);
-#else
-                constexpr std::size_t alignment = core::vectorization::defaultAlignment;
-#endif
-                // alpaka::AllocCpuAligned is stateless
-                using Allocator = AllocCpuAligned<std::integral_constant<std::size_t, alignment>>;
-                static_assert(std::is_empty_v<Allocator>, "AllocCpuAligned is expected to be stateless");
-                auto* memPtr = alpaka::malloc<TElem>(Allocator{}, static_cast<std::size_t>(getExtentProduct(extent)));
-                auto deleter = [](TElem* ptr) { alpaka::free(Allocator{}, ptr); };
-
-                return BufCpu<TElem, TDim, TIdx>(dev, memPtr, std::move(deleter), extent);
-            }
-        };
-
-        //! The BufCpu stream-ordered memory allocation trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct AsyncBufAlloc<TElem, TDim, TIdx, DevCpu>
-        {
-            template<typename TQueue, typename TExtent>
-            ALPAKA_FN_HOST static auto allocAsyncBuf(TQueue queue, TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                static_assert(
-                    std::is_same_v<Dev<TQueue>, DevCpu>,
-                    "The BufCpu buffer can only be used with a queue on a DevCpu device!");
-                DevCpu const& dev = getDev(queue);
-
-                // If ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT is defined, positive, and a power of 2, use it as the
-                // default alignment for host memory allocations. Otherwise, the alignment is chosen to enable optimal
-                // performance dependant on the target architecture.
-#if defined(ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT)
-                static_assert(
-                    ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT > 0
-                        && ((ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT & (ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT - 1)) == 0),
-                    "If defined, ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT must be a power of 2.");
-                constexpr std::size_t alignment = static_cast<std::size_t>(ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT);
-#else
-                constexpr std::size_t alignment = core::vectorization::defaultAlignment;
-#endif
-                // alpaka::AllocCpuAligned is stateless
-                using Allocator = AllocCpuAligned<std::integral_constant<std::size_t, alignment>>;
-                static_assert(std::is_empty_v<Allocator>, "AllocCpuAligned is expected to be stateless");
-                auto* memPtr = alpaka::malloc<TElem>(Allocator{}, static_cast<std::size_t>(getExtentProduct(extent)));
-                auto deleter = [l_queue = std::move(queue)](TElem* ptr) mutable
-                {
-                    alpaka::enqueue(
-                        l_queue,
-                        [ptr]()
-                        {
-                            // free the memory
-                            alpaka::free(Allocator{}, ptr);
-                        });
-                };
-
-                return BufCpu<TElem, TDim, TIdx>(dev, memPtr, std::move(deleter), extent);
-            }
-        };
-
-        //! The BufCpu stream-ordered memory allocation capability trait specialization.
-        template<typename TDim>
-        struct HasAsyncBufSupport<TDim, DevCpu> : public std::true_type
-        {
-        };
-
-        //! The pinned/mapped memory allocation trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct BufAllocMapped<PlatformCpu, TElem, TDim, TIdx>
-        {
-            template<typename TExtent>
-            ALPAKA_FN_HOST static auto allocMappedBuf(
-                DevCpu const& host,
-                PlatformCpu const& /*platform*/,
-                TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
-            {
-                // Allocate standard host memory.
-                return allocBuf<TElem, TIdx>(host, extent);
-            }
-        };
-
-        //! The pinned/mapped memory allocation capability trait specialization.
-        template<>
-        struct HasMappedBufSupport<PlatformCpu> : public std::true_type
-        {
-        };
-
-        //! The BufCpu offset get trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct GetOffsets<BufCpu<TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(BufCpu<TElem, TDim, TIdx> const&) const -> Vec<TDim, TIdx>
-            {
-                return Vec<TDim, TIdx>::zeros();
-            }
-        };
-
-        //! The BufCpu idx type trait specialization.
-        template<typename TElem, typename TDim, typename TIdx>
-        struct IdxType<BufCpu<TElem, TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#include "alpaka/mem/buf/cpu/Copy.hpp"
-#include "alpaka/mem/buf/cpu/Set.hpp"
diff --git a/include/alpaka/mem/buf/BufCpuSycl.hpp b/include/alpaka/mem/buf/BufCpuSycl.hpp
deleted file mode 100644
index ab36f8b..0000000
--- a/include/alpaka/mem/buf/BufCpuSycl.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevCpuSycl.hpp"
-#include "alpaka/mem/buf/BufGenericSycl.hpp"
-#include "alpaka/platform/PlatformCpuSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
-
-namespace alpaka
-{
-    template<typename TElem, typename TDim, typename TIdx>
-    using BufCpuSycl = BufGenericSycl<TElem, TDim, TIdx, PlatformCpuSycl>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/mem/buf/BufCudaRt.hpp b/include/alpaka/mem/buf/BufCudaRt.hpp
deleted file mode 100644
index a5e0020..0000000
--- a/include/alpaka/mem/buf/BufCudaRt.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiCudaRt.hpp"
-#include "alpaka/mem/buf/BufUniformCudaHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-namespace alpaka
-{
-    template<typename TElem, typename TDim, typename TIdx>
-    using BufCudaRt = BufUniformCudaHipRt<ApiCudaRt, TElem, TDim, TIdx>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/mem/buf/BufFpgaSyclIntel.hpp b/include/alpaka/mem/buf/BufFpgaSyclIntel.hpp
deleted file mode 100644
index 562fae9..0000000
--- a/include/alpaka/mem/buf/BufFpgaSyclIntel.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevFpgaSyclIntel.hpp"
-#include "alpaka/mem/buf/BufGenericSycl.hpp"
-#include "alpaka/platform/PlatformFpgaSyclIntel.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
-
-namespace alpaka
-{
-    template<typename TElem, typename TDim, typename TIdx>
-    using BufFpgaSyclIntel = BufGenericSycl<TElem, TDim, TIdx, PlatformFpgaSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp
deleted file mode 100644
index 9beb16c..0000000
--- a/include/alpaka/mem/buf/BufGenericSycl.hpp
+++ /dev/null
@@ -1,272 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/mem/buf/BufCpu.hpp"
-#include "alpaka/mem/buf/Traits.hpp"
-#include "alpaka/mem/view/ViewAccessOps.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <memory>
-#include <type_traits>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    //! The SYCL memory buffer.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    class BufGenericSycl : public internal::ViewAccessOps<BufGenericSycl<TElem, TDim, TIdx, TTag>>
-    {
-    public:
-        static_assert(
-            !std::is_const_v<TElem>,
-            "The elem type of the buffer can not be const because the C++ Standard forbids containers of const "
-            "elements!");
-        static_assert(!std::is_const_v<TIdx>, "The idx type of the buffer can not be const!");
-
-        //! Constructor
-        template<typename TExtent, typename Deleter>
-        BufGenericSycl(DevGenericSycl<TTag> const& dev, TElem* const pMem, Deleter deleter, TExtent const& extent)
-            : m_dev{dev}
-            , m_extentElements{getExtentVecEnd<TDim>(extent)}
-            , m_spMem(pMem, std::move(deleter))
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            static_assert(
-                TDim::value == Dim<TExtent>::value,
-                "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
-                "identical!");
-
-            static_assert(
-                std::is_same_v<TIdx, Idx<TExtent>>,
-                "The idx type of TExtent and the TIdx template parameter have to be identical!");
-        }
-
-        DevGenericSycl<TTag> m_dev;
-        Vec<TDim, TIdx> m_extentElements;
-        std::shared_ptr<TElem> m_spMem;
-    };
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    //! The BufGenericSycl device type trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct DevType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
-    {
-        using type = DevGenericSycl<TTag>;
-    };
-
-    //! The BufGenericSycl device get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct GetDev<BufGenericSycl<TElem, TDim, TIdx, TTag>>
-    {
-        static auto getDev(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf)
-        {
-            return buf.m_dev;
-        }
-    };
-
-    //! The BufGenericSycl dimension getter trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct DimType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
-    {
-        using type = TDim;
-    };
-
-    //! The BufGenericSycl memory element type get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct ElemType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
-    {
-        using type = TElem;
-    };
-
-    //! The BufGenericSycl extent get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct GetExtents<BufGenericSycl<TElem, TDim, TIdx, TTag>>
-    {
-        auto operator()(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf) const
-        {
-            return buf.m_extentElements;
-        }
-    };
-
-    //! The BufGenericSycl native pointer get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct GetPtrNative<BufGenericSycl<TElem, TDim, TIdx, TTag>>
-    {
-        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf) -> TElem const*
-        {
-            return buf.m_spMem.get();
-        }
-
-        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TTag>& buf) -> TElem*
-        {
-            return buf.m_spMem.get();
-        }
-    };
-
-    //! The BufGenericSycl pointer on device get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct GetPtrDev<BufGenericSycl<TElem, TDim, TIdx, TTag>, DevGenericSycl<TTag>>
-    {
-        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TTag> const& buf, DevGenericSycl<TTag> const& dev)
-            -> TElem const*
-        {
-            if(dev == getDev(buf))
-            {
-                return buf.m_spMem.get();
-            }
-            else
-            {
-                throw std::runtime_error("The buffer is not accessible from the given device!");
-            }
-        }
-
-        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TTag>& buf, DevGenericSycl<TTag> const& dev) -> TElem*
-        {
-            if(dev == getDev(buf))
-            {
-                return buf.m_spMem.get();
-            }
-            else
-            {
-                throw std::runtime_error("The buffer is not accessible from the given device!");
-            }
-        }
-    };
-
-    //! The SYCL memory allocation trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct BufAlloc<TElem, TDim, TIdx, DevGenericSycl<TTag>>
-    {
-        template<typename TExtent>
-        static auto allocBuf(DevGenericSycl<TTag> const& dev, TExtent const& extent)
-            -> BufGenericSycl<TElem, TDim, TIdx, TTag>
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            if constexpr(TDim::value == 0)
-                std::cout << __func__ << " ewb: " << sizeof(TElem) << '\n';
-            else if constexpr(TDim::value == 1)
-            {
-                auto const width = getWidth(extent);
-
-                auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
-                std::cout << __func__ << " ew: " << width << " ewb: " << widthBytes << '\n';
-            }
-            else if constexpr(TDim::value == 2)
-            {
-                auto const width = getWidth(extent);
-                auto const height = getHeight(extent);
-
-                auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
-                std::cout << __func__ << " ew: " << width << " eh: " << height << " ewb: " << widthBytes
-                          << " pitch: " << widthBytes << '\n';
-            }
-            else if constexpr(TDim::value == 3)
-            {
-                auto const width = getWidth(extent);
-                auto const height = getHeight(extent);
-                auto const depth = getDepth(extent);
-
-                auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
-                std::cout << __func__ << " ew: " << width << " eh: " << height << " ed: " << depth
-                          << " ewb: " << widthBytes << " pitch: " << widthBytes << '\n';
-            }
-#    endif
-
-            auto const& [nativeDev, nativeContext] = dev.getNativeHandle();
-            TElem* memPtr = sycl::malloc_device<TElem>(
-                static_cast<std::size_t>(getExtentProduct(extent)),
-                nativeDev,
-                nativeContext);
-            auto deleter = [ctx = nativeContext](TElem* ptr) { sycl::free(ptr, ctx); };
-
-            return BufGenericSycl<TElem, TDim, TIdx, TTag>(dev, memPtr, std::move(deleter), extent);
-        }
-    };
-
-    //! The BufGenericSycl stream-ordered memory allocation capability trait specialization.
-    template<typename TDim, typename TTag>
-    struct HasAsyncBufSupport<TDim, DevGenericSycl<TTag>> : std::false_type
-    {
-    };
-
-    //! The BufGenericSycl offset get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct GetOffsets<BufGenericSycl<TElem, TDim, TIdx, TTag>>
-    {
-        auto operator()(BufGenericSycl<TElem, TDim, TIdx, TTag> const&) const -> Vec<TDim, TIdx>
-        {
-            return Vec<TDim, TIdx>::zeros();
-        }
-    };
-
-    //! The pinned/mapped memory allocation trait specialization for the SYCL devices.
-    template<typename TTag, typename TElem, typename TDim, typename TIdx>
-    struct BufAllocMapped<PlatformGenericSycl<TTag>, TElem, TDim, TIdx>
-    {
-        template<typename TExtent>
-        static auto allocMappedBuf(
-            DevCpu const& host,
-            PlatformGenericSycl<TTag> const& platform,
-            TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            // Allocate SYCL page-locked memory on the host, mapped into the SYCL platform's address space and
-            // accessible to all devices in the SYCL platform.
-            auto ctx = platform.syclContext();
-            TElem* memPtr = sycl::malloc_host<TElem>(static_cast<std::size_t>(getExtentProduct(extent)), ctx);
-            auto deleter = [ctx](TElem* ptr) { sycl::free(ptr, ctx); };
-
-            return BufCpu<TElem, TDim, TIdx>(host, memPtr, std::move(deleter), extent);
-        }
-    };
-
-    //! The pinned/mapped memory allocation capability trait specialization.
-    template<typename TTag>
-    struct HasMappedBufSupport<PlatformGenericSycl<TTag>> : public std::true_type
-    {
-    };
-
-    //! The BufGenericSycl idx type trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct IdxType<BufGenericSycl<TElem, TDim, TIdx, TTag>>
-    {
-        using type = TIdx;
-    };
-
-    //! The BufCpu pointer on SYCL device get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TTag>
-    struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevGenericSycl<TTag>>
-    {
-        static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevGenericSycl<TTag> const&) -> TElem const*
-        {
-            return getPtrNative(buf);
-        }
-
-        static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevGenericSycl<TTag> const&) -> TElem*
-        {
-            return getPtrNative(buf);
-        }
-    };
-} // namespace alpaka::trait
-
-#    include "alpaka/mem/buf/sycl/Copy.hpp"
-#    include "alpaka/mem/buf/sycl/Set.hpp"
-
-#endif
diff --git a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
deleted file mode 100644
index 5597f70..0000000
--- a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevGpuSyclIntel.hpp"
-#include "alpaka/mem/buf/BufGenericSycl.hpp"
-#include "alpaka/platform/PlatformGpuSyclIntel.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
-
-namespace alpaka
-{
-    template<typename TElem, typename TDim, typename TIdx>
-    using BufGpuSyclIntel = BufGenericSycl<TElem, TDim, TIdx, PlatformGpuSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/mem/buf/BufHipRt.hpp b/include/alpaka/mem/buf/BufHipRt.hpp
deleted file mode 100644
index 4a59bc4..0000000
--- a/include/alpaka/mem/buf/BufHipRt.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiHipRt.hpp"
-#include "alpaka/mem/buf/BufUniformCudaHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-namespace alpaka
-{
-    template<typename TElem, typename TDim, typename TIdx>
-    using BufHipRt = BufUniformCudaHipRt<ApiHipRt, TElem, TDim, TIdx>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp b/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
deleted file mode 100644
index 826edab..0000000
--- a/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp
+++ /dev/null
@@ -1,422 +0,0 @@
-/* Copyright 2023 Alexander Matthes, Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan,
- *                Bernhard Manfred Gruber, Antonio Di Pilato
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/mem/buf/Traits.hpp"
-#include "alpaka/mem/view/ViewAccessOps.hpp"
-#include "alpaka/meta/DependentFalseType.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <cstddef>
-#include <functional>
-#include <memory>
-#include <type_traits>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    // Forward declarations.
-    struct ApiCudaRt;
-    struct ApiHipRt;
-
-    template<typename TElem, typename TDim, typename TIdx>
-    class BufCpu;
-
-    namespace detail
-    {
-        template<typename TDim, typename SFINAE = void>
-        struct PitchHolder
-        {
-            explicit PitchHolder(std::size_t)
-            {
-            }
-        };
-
-        template<typename TDim>
-        struct PitchHolder<TDim, std::enable_if_t<TDim::value >= 2>>
-        {
-            std::size_t m_rowPitchInBytes;
-        };
-    } // namespace detail
-
-    //! The CUDA/HIP memory buffer.
-    template<typename TApi, typename TElem, typename TDim, typename TIdx>
-    struct BufUniformCudaHipRt
-        : detail::PitchHolder<TDim>
-        , internal::ViewAccessOps<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
-    {
-        static_assert(!std::is_const_v<TElem>, "The elem type of the buffer must not be const");
-        static_assert(!std::is_const_v<TIdx>, "The idx type of the buffer must not be const!");
-
-        //! Constructor
-        template<typename TExtent, typename Deleter>
-        ALPAKA_FN_HOST BufUniformCudaHipRt(
-            DevUniformCudaHipRt<TApi> const& dev,
-            TElem* const pMem,
-            Deleter deleter,
-            TExtent const& extent,
-            std::size_t pitchBytes)
-            : detail::PitchHolder<TDim>{pitchBytes}
-            , m_dev(dev)
-            , m_extentElements(getExtents(extent))
-            , m_spMem(pMem, std::move(deleter))
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            static_assert(
-                TDim::value == alpaka::Dim<TExtent>::value,
-                "The dimensionality of TExtent and the dimensionality of the TDim template parameter have to be "
-                "identical!");
-            static_assert(
-                std::is_same_v<TIdx, alpaka::Idx<TExtent>>,
-                "The idx type of TExtent and the TIdx template parameter have to be identical!");
-        }
-
-        DevUniformCudaHipRt<TApi> m_dev;
-        Vec<TDim, TIdx> m_extentElements;
-        std::shared_ptr<TElem> m_spMem;
-    };
-
-    namespace trait
-    {
-        //! The BufUniformCudaHipRt device type trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct DevType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
-        {
-            using type = DevUniformCudaHipRt<TApi>;
-        };
-
-        //! The BufUniformCudaHipRt device get trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct GetDev<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getDev(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const& buf)
-                -> DevUniformCudaHipRt<TApi>
-            {
-                return buf.m_dev;
-            }
-        };
-
-        //! The BufUniformCudaHipRt dimension getter trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct DimType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The BufUniformCudaHipRt memory element type get trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct ElemType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
-        {
-            using type = TElem;
-        };
-
-        //! The BufUniformCudaHipRt extent get trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct GetExtents<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const& buffer) const
-            {
-                return buffer.m_extentElements;
-            }
-        };
-
-        //! The BufUniformCudaHipRt native pointer get trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct GetPtrNative<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getPtrNative(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const& buf)
-                -> TElem const*
-            {
-                return buf.m_spMem.get();
-            }
-
-            ALPAKA_FN_HOST static auto getPtrNative(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>& buf) -> TElem*
-            {
-                return buf.m_spMem.get();
-            }
-        };
-
-        //! The BufUniformCudaHipRt pointer on device get trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct GetPtrDev<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>, DevUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto getPtrDev(
-                BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const& buf,
-                DevUniformCudaHipRt<TApi> const& dev) -> TElem const*
-            {
-                if(dev == getDev(buf))
-                {
-                    return buf.m_spMem.get();
-                }
-                else
-                {
-                    throw std::runtime_error("The buffer is not accessible from the given device!");
-                }
-            }
-
-            ALPAKA_FN_HOST static auto getPtrDev(
-                BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>& buf,
-                DevUniformCudaHipRt<TApi> const& dev) -> TElem*
-            {
-                if(dev == getDev(buf))
-                {
-                    return buf.m_spMem.get();
-                }
-                else
-                {
-                    throw std::runtime_error("The buffer is not accessible from the given device!");
-                }
-            }
-        };
-
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct GetPitchesInBytes<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const& buf) const
-                -> Vec<TDim, TIdx>
-            {
-                Vec<TDim, TIdx> v{};
-                if constexpr(TDim::value > 0)
-                {
-                    v.back() = sizeof(TElem);
-                    if constexpr(TDim::value > 1)
-                    {
-                        v[TDim::value - 2] = static_cast<TIdx>(buf.m_rowPitchInBytes);
-                        for(TIdx i = TDim::value - 2; i > 0; i--)
-                            v[i - 1] = buf.m_extentElements[i] * v[i];
-                    }
-                }
-                return v;
-            }
-        };
-
-        //! The CUDA/HIP memory allocation trait specialization.
-        template<typename TApi, typename TElem, typename Dim, typename TIdx>
-        struct BufAlloc<TElem, Dim, TIdx, DevUniformCudaHipRt<TApi>>
-        {
-            template<typename TExtent>
-            ALPAKA_FN_HOST static auto allocBuf(DevUniformCudaHipRt<TApi> const& dev, TExtent const& extent)
-                -> BufUniformCudaHipRt<TApi, TElem, Dim, TIdx>
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
-
-                void* memPtr = nullptr;
-                std::size_t rowPitchInBytes = 0u;
-                if(getExtentProduct(extent) != 0)
-                {
-                    if constexpr(Dim::value == 0)
-                    {
-                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::malloc(&memPtr, sizeof(TElem)));
-                    }
-                    else if constexpr(Dim::value == 1)
-                    {
-                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                            TApi::malloc(&memPtr, static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem)));
-                    }
-                    else if constexpr(Dim::value == 2)
-                    {
-                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::mallocPitch(
-                            &memPtr,
-                            &rowPitchInBytes,
-                            static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem),
-                            static_cast<std::size_t>(getHeight(extent))));
-                    }
-                    else if constexpr(Dim::value == 3)
-                    {
-                        typename TApi::Extent_t const extentVal = TApi::makeExtent(
-                            static_cast<std::size_t>(getWidth(extent)) * sizeof(TElem),
-                            static_cast<std::size_t>(getHeight(extent)),
-                            static_cast<std::size_t>(getDepth(extent)));
-                        typename TApi::PitchedPtr_t pitchedPtrVal;
-                        pitchedPtrVal.ptr = nullptr;
-                        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::malloc3D(&pitchedPtrVal, extentVal));
-                        memPtr = pitchedPtrVal.ptr;
-                        rowPitchInBytes = pitchedPtrVal.pitch;
-                    }
-                }
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__;
-                if constexpr(Dim::value >= 1)
-                    std::cout << " ew: " << getWidth(extent);
-                if constexpr(Dim::value >= 2)
-                    std::cout << " eh: " << getHeight(extent);
-                if constexpr(Dim::value >= 3)
-                    std::cout << " ed: " << getDepth(extent);
-                std::cout << " ptr: " << memPtr;
-                if constexpr(Dim::value >= 2)
-                    std::cout << " rowpitch: " << rowPitchInBytes;
-                std::cout << std::endl;
-#    endif
-                return {
-                    dev,
-                    reinterpret_cast<TElem*>(memPtr),
-                    [](TElem* ptr) { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::free(ptr)); },
-                    extent,
-                    rowPitchInBytes};
-            }
-        };
-
-        //! The CUDA/HIP stream-ordered memory allocation trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct AsyncBufAlloc<TElem, TDim, TIdx, DevUniformCudaHipRt<TApi>>
-        {
-#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-            static_assert(
-                std::is_same_v<TApi, ApiCudaRt> && TApi::version >= BOOST_VERSION_NUMBER(11, 2, 0),
-                "Support for stream-ordered memory buffers requires CUDA 11.2 or higher.");
-#    endif
-#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-            static_assert(
-                std::is_same_v<TApi, ApiHipRt> && TApi::version >= BOOST_VERSION_NUMBER(5, 3, 0),
-                "Support for stream-ordered memory buffers requires HIP/ROCm 5.3 or higher.");
-#    endif
-            static_assert(
-                TDim::value <= 1,
-                "CUDA/HIP devices support only one-dimensional stream-ordered memory buffers.");
-
-            template<typename TQueue, typename TExtent>
-            ALPAKA_FN_HOST static auto allocAsyncBuf(TQueue queue, [[maybe_unused]] TExtent const& extent)
-                -> BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                static_assert(TDim::value == Dim<TExtent>::value, "extent must have the same dimension as the buffer");
-                auto const width = getExtentProduct(extent); // handles 1D and 0D buffers
-
-                auto const& dev = getDev(queue);
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(dev.getNativeHandle()));
-                void* memPtr = nullptr;
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::mallocAsync(
-                    &memPtr,
-                    static_cast<std::size_t>(width) * sizeof(TElem),
-                    queue.getNativeHandle()));
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                std::cout << __func__ << " ew: " << width << " ptr: " << memPtr << std::endl;
-#    endif
-                return {
-                    dev,
-                    reinterpret_cast<TElem*>(memPtr),
-                    [q = std::move(queue)](TElem* ptr)
-                    { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::freeAsync(ptr, q.getNativeHandle())); },
-                    extent,
-                    static_cast<std::size_t>(width) * sizeof(TElem)};
-            }
-        };
-
-        //! The CUDA/HIP stream-ordered memory allocation capability trait specialization.
-        template<typename TApi, typename TDim>
-        struct HasAsyncBufSupport<TDim, DevUniformCudaHipRt<TApi>>
-            : std::bool_constant<
-                  TDim::value <= 1
-                  && (
-#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                      std::is_same_v<TApi, ApiCudaRt> && TApi::version >= BOOST_VERSION_NUMBER(11, 2, 0)
-#    elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-                      std::is_same_v<TApi, ApiHipRt> && TApi::version >= BOOST_VERSION_NUMBER(5, 3, 0)
-#    else
-                      false
-#    endif
-                          )>
-        {
-        };
-
-        //! The pinned/mapped memory allocation trait specialization for the CUDA/HIP devices.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct BufAllocMapped<PlatformUniformCudaHipRt<TApi>, TElem, TDim, TIdx>
-        {
-            template<typename TExtent>
-            ALPAKA_FN_HOST static auto allocMappedBuf(
-                DevCpu const& host,
-                PlatformUniformCudaHipRt<TApi> const& /*platform*/,
-                TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Allocate CUDA/HIP page-locked memory on the host, mapped into the CUDA/HIP address space and
-                // accessible to all CUDA/HIP devices.
-                TElem* memPtr = nullptr;
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostMalloc(
-                    reinterpret_cast<void**>(&memPtr),
-                    sizeof(TElem) * static_cast<std::size_t>(getExtentProduct(extent)),
-                    TApi::hostMallocMapped | TApi::hostMallocPortable));
-                auto deleter = [](TElem* ptr) { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::hostFree(ptr)); };
-
-                return BufCpu<TElem, TDim, TIdx>(host, memPtr, std::move(deleter), extent);
-            }
-        };
-
-        //! The pinned/mapped memory allocation capability trait specialization.
-        template<typename TApi>
-        struct HasMappedBufSupport<PlatformUniformCudaHipRt<TApi>> : public std::true_type
-        {
-        };
-
-        //! The BufUniformCudaHipRt offset get trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct GetOffsets<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(BufUniformCudaHipRt<TApi, TElem, TDim, TIdx> const&) const
-                -> Vec<TDim, TIdx>
-            {
-                return Vec<TDim, TIdx>::zeros();
-            }
-        };
-
-        //! The BufUniformCudaHipRt idx type trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct IdxType<BufUniformCudaHipRt<TApi, TElem, TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-
-        //! The BufCpu pointer on CUDA/HIP device get trait specialization.
-        template<typename TApi, typename TElem, typename TDim, typename TIdx>
-        struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto getPtrDev(
-                BufCpu<TElem, TDim, TIdx> const& buf,
-                DevUniformCudaHipRt<TApi> const&) -> TElem const*
-            {
-                // TODO: Check if the memory is mapped at all!
-                TElem* pDev(nullptr);
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostGetDevicePointer(
-                    &pDev,
-                    const_cast<void*>(reinterpret_cast<void const*>(getPtrNative(buf))),
-                    0));
-
-                return pDev;
-            }
-
-            ALPAKA_FN_HOST static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevUniformCudaHipRt<TApi> const&)
-                -> TElem*
-            {
-                // TODO: Check if the memory is mapped at all!
-                TElem* pDev(nullptr);
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::hostGetDevicePointer(&pDev, getPtrNative(buf), 0));
-
-                return pDev;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#    include "alpaka/mem/buf/uniformCudaHip/Copy.hpp"
-#    include "alpaka/mem/buf/uniformCudaHip/Set.hpp"
-
-#endif
diff --git a/include/alpaka/mem/buf/SetKernel.hpp b/include/alpaka/mem/buf/SetKernel.hpp
deleted file mode 100644
index 229fce9..0000000
--- a/include/alpaka/mem/buf/SetKernel.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright 2022 Jeffrey Kelling, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/idx/Accessors.hpp"
-#include "alpaka/idx/MapIdx.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/mem/buf/Traits.hpp"
-#include "alpaka/meta/Fold.hpp"
-
-namespace alpaka
-{
-    //! any device ND memory set kernel.
-    class MemSetKernel
-    {
-    public:
-        //! The kernel entry point.
-        //!
-        //! All but the last element of threadElemExtent must be one.
-        //!
-        //! \tparam TAcc The accelerator environment to be executed on.
-        //! \tparam TExtent extent type.
-        //! \param acc The accelerator to be executed on.
-        //! \param val value to set.
-        //! \param dst target mem ptr.
-        //! \param extent area to set.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TAcc, typename TExtent, typename TPitch>
-        ALPAKA_FN_ACC auto operator()(
-            TAcc const& acc,
-            std::uint8_t const val,
-            std::uint8_t* dst,
-            TExtent extent,
-            TPitch pitch) const -> void
-        {
-            using Idx = typename alpaka::trait::IdxType<TExtent>::type;
-            auto const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc));
-            auto const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc));
-            auto const idxThreadFirstElem = getIdxThreadFirstElem(acc, gridThreadIdx, threadElemExtent);
-            auto idx = mapIdxPitchBytes<1u, Dim<TAcc>::value>(idxThreadFirstElem, pitch)[0];
-            constexpr auto lastDim = Dim<TAcc>::value - 1;
-            auto const lastIdx = idx
-                                 + std::min(
-                                     threadElemExtent[lastDim],
-                                     static_cast<Idx>(extent[lastDim] - idxThreadFirstElem[lastDim]));
-
-            if((idxThreadFirstElem < extent).foldrAll(std::logical_and<bool>()))
-            {
-                for(; idx < lastIdx; ++idx)
-                {
-                    *(dst + idx) = val;
-                }
-            }
-        }
-    };
-} // namespace alpaka
diff --git a/include/alpaka/mem/buf/Traits.hpp b/include/alpaka/mem/buf/Traits.hpp
deleted file mode 100644
index e29cf5b..0000000
--- a/include/alpaka/mem/buf/Traits.hpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright 2023 Alexander Matthes, Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan,
- *                Christian Kaever
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-namespace alpaka
-{
-    //! The CPU device handle.
-    class DevCpu;
-
-    //! The buffer traits.
-    namespace trait
-    {
-        //! The memory buffer type trait.
-        template<typename TDev, typename TElem, typename TDim, typename TIdx, typename TSfinae = void>
-        struct BufType;
-
-        //! The memory allocator trait.
-        template<typename TElem, typename TDim, typename TIdx, typename TDev, typename TSfinae = void>
-        struct BufAlloc;
-
-        //! The stream-ordered memory allocator trait.
-        template<typename TElem, typename TDim, typename TIdx, typename TDev, typename TSfinae = void>
-        struct AsyncBufAlloc;
-
-        //! The stream-ordered memory allocation capability trait.
-        template<typename TDim, typename TDev>
-        struct HasAsyncBufSupport : public std::false_type
-        {
-        };
-
-        //! The pinned/mapped memory allocator trait.
-        template<typename TPlatform, typename TElem, typename TDim, typename TIdx>
-        struct BufAllocMapped;
-
-        //! The pinned/mapped memory allocation capability trait.
-        template<typename TPlatform>
-        struct HasMappedBufSupport : public std::false_type
-        {
-        };
-    } // namespace trait
-
-    //! The memory buffer type trait alias template to remove the ::type.
-    template<typename TDev, typename TElem, typename TDim, typename TIdx>
-    using Buf = typename trait::BufType<alpaka::Dev<TDev>, TElem, TDim, TIdx>::type;
-
-    //! Allocates memory on the given device.
-    //!
-    //! \tparam TElem The element type of the returned buffer.
-    //! \tparam TIdx The linear index type of the buffer.
-    //! \tparam TExtent The extent type of the buffer.
-    //! \tparam TDev The type of device the buffer is allocated on.
-    //! \param dev The device to allocate the buffer on.
-    //! \param extent The extent of the buffer.
-    //! \return The newly allocated buffer.
-    template<typename TElem, typename TIdx, typename TExtent, typename TDev>
-    ALPAKA_FN_HOST auto allocBuf(TDev const& dev, TExtent const& extent = TExtent())
-    {
-        return trait::BufAlloc<TElem, Dim<TExtent>, TIdx, TDev>::allocBuf(dev, extent);
-    }
-
-    //! Allocates stream-ordered memory on the given device.
-    //!
-    //! \tparam TElem The element type of the returned buffer.
-    //! \tparam TIdx The linear index type of the buffer.
-    //! \tparam TExtent The extent type of the buffer.
-    //! \tparam TQueue The type of queue used to order the buffer allocation.
-    //! \param queue The queue used to order the buffer allocation.
-    //! \param extent The extent of the buffer.
-    //! \return The newly allocated buffer.
-    template<typename TElem, typename TIdx, typename TExtent, typename TQueue>
-    ALPAKA_FN_HOST auto allocAsyncBuf(TQueue queue, TExtent const& extent = TExtent())
-    {
-        return trait::AsyncBufAlloc<TElem, Dim<TExtent>, TIdx, alpaka::Dev<TQueue>>::allocAsyncBuf(queue, extent);
-    }
-
-    /* TODO: Remove this pragma block once support for clang versions <= 13 is removed. These versions are unable to
-       figure out that the template parameters are attached to a C++17 inline variable. */
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wdocumentation"
-#endif
-    //! Checks if the given device can allocate a stream-ordered memory buffer of the given dimensionality.
-    //!
-    //! \tparam TDev The type of device to allocate the buffer on.
-    //! \tparam TDim The dimensionality of the buffer to allocate.
-    template<typename TDev, typename TDim>
-    inline constexpr bool hasAsyncBufSupport = trait::HasAsyncBufSupport<TDim, TDev>::value;
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-
-    //! If supported, allocates stream-ordered memory on the given queue and the associated device.
-    //! Otherwise, allocates regular memory on the device associated to the queue.
-    //! Please note that stream-ordered and regular memory have different semantics:
-    //! this function is provided for convenience in the cases where the difference is not relevant,
-    //! and the stream-ordered memory is only used as a performance optimisation.
-    //!
-    //! \tparam TElem The element type of the returned buffer.
-    //! \tparam TIdx The linear index type of the buffer.
-    //! \tparam TExtent The extent type of the buffer.
-    //! \tparam TQueue The type of queue used to order the buffer allocation.
-    //! \param queue The queue used to order the buffer allocation.
-    //! \param extent The extent of the buffer.
-    //! \return The newly allocated buffer.
-    template<typename TElem, typename TIdx, typename TExtent, typename TQueue>
-    ALPAKA_FN_HOST auto allocAsyncBufIfSupported(TQueue queue, TExtent const& extent = TExtent())
-    {
-        if constexpr(hasAsyncBufSupport<alpaka::Dev<TQueue>, Dim<TExtent>>)
-        {
-            return allocAsyncBuf<TElem, TIdx>(queue, extent);
-        }
-        else
-        {
-            return allocBuf<TElem, TIdx>(getDev(queue), extent);
-        }
-
-        ALPAKA_UNREACHABLE(allocBuf<TElem, TIdx>(getDev(queue), extent));
-    }
-
-    //! Allocates pinned/mapped host memory, accessible by all devices in the given platform.
-    //!
-    //! \tparam TElem The element type of the returned buffer.
-    //! \tparam TIdx The linear index type of the buffer.
-    //! \tparam TExtent The extent type of the buffer.
-    //! \tparam TPlatform The platform from which the buffer is accessible.
-    //! \param host The host device to allocate the buffer on.
-    //! \param extent The extent of the buffer.
-    //! \return The newly allocated buffer.
-    template<typename TElem, typename TIdx, typename TExtent, typename TPlatform>
-    ALPAKA_FN_HOST auto allocMappedBuf(
-        DevCpu const& host,
-        TPlatform const& platform,
-        TExtent const& extent = TExtent())
-    {
-        return trait::BufAllocMapped<TPlatform, TElem, Dim<TExtent>, TIdx>::allocMappedBuf(host, platform, extent);
-    }
-
-    /* TODO: Remove this pragma block once support for clang versions <= 13 is removed. These versions are unable to
-       figure out that the template parameters are attached to a C++17 inline variable. */
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wdocumentation"
-#endif
-    //! Checks if the host can allocate a pinned/mapped host memory, accessible by all devices in the given platform.
-    //!
-    //! \tparam TPlatform The platform from which the buffer is accessible.
-    template<typename TPlatform>
-    inline constexpr bool hasMappedBufSupport = trait::HasMappedBufSupport<TPlatform>::value;
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-
-    //! If supported, allocates pinned/mapped host memory, accessible by all devices in the given platform.
-    //! Otherwise, allocates regular host memory.
-    //! Please note that pinned/mapped and regular memory may have different semantics:
-    //! this function is provided for convenience in the cases where the difference is not relevant,
-    //! and the pinned/mapped memory is only used as a performance optimisation.
-    //!
-    //! \tparam TElem The element type of the returned buffer.
-    //! \tparam TIdx The linear index type of the buffer.
-    //! \tparam TExtent The extent type of the buffer.
-    //! \tparam TPlatform The platform from which the buffer is accessible.
-    //! \param host The host device to allocate the buffer on.
-    //! \param extent The extent of the buffer.
-    //! \return The newly allocated buffer.
-    template<typename TElem, typename TIdx, typename TExtent, typename TPlatform>
-    ALPAKA_FN_HOST auto allocMappedBufIfSupported(
-        DevCpu const& host,
-        TPlatform const& platform,
-        TExtent const& extent = TExtent())
-    {
-        using Platform = alpaka::Platform<TPlatform>;
-        if constexpr(hasMappedBufSupport<Platform>)
-        {
-            return allocMappedBuf<TElem, TIdx>(host, platform, extent);
-        }
-        else
-        {
-            return allocBuf<TElem, TIdx>(host, extent);
-        }
-
-        ALPAKA_UNREACHABLE(allocBuf<TElem, TIdx>(host, extent));
-    }
-} // namespace alpaka
diff --git a/include/alpaka/mem/buf/cpu/Copy.hpp b/include/alpaka/mem/buf/cpu/Copy.hpp
deleted file mode 100644
index dd707bd..0000000
--- a/include/alpaka/mem/buf/cpu/Copy.hpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan, Bernhard
- * Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/meta/Integral.hpp"
-#include "alpaka/meta/NdLoop.hpp"
-
-#include <cstring>
-
-namespace alpaka
-{
-    class DevCpu;
-} // namespace alpaka
-
-namespace alpaka
-{
-    namespace detail
-    {
-        //! The CPU device memory copy task base.
-        //!
-        //! Copies from CPU memory into CPU memory.
-        template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
-        struct TaskCopyCpuBase
-        {
-            static_assert(TDim::value > 0);
-
-            using ExtentSize = Idx<TExtent>;
-            using DstSize = Idx<TViewDst>;
-            using SrcSize = Idx<TViewSrc>;
-            using Elem = alpaka::Elem<TViewSrc>;
-
-            template<typename TViewFwd>
-            TaskCopyCpuBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
-                : m_extent(getExtents(extent))
-                , m_extentWidthBytes(m_extent.back() * static_cast<ExtentSize>(sizeof(Elem)))
-#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                , m_dstExtent(getExtents(viewDst))
-                , m_srcExtent(getExtents(viewSrc))
-#endif
-                , m_dstPitchBytes(getPitchesInBytes(viewDst))
-                , m_srcPitchBytes(getPitchesInBytes(viewSrc))
-                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(viewDst)))
-                , m_srcMemNative(reinterpret_cast<std::uint8_t const*>(getPtrNative(viewSrc)))
-            {
-                if constexpr(TDim::value > 0)
-                {
-                    ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).all());
-                    ALPAKA_ASSERT((castVec<SrcSize>(m_extent) <= m_srcExtent).all());
-                    if constexpr(TDim::value > 1)
-                    {
-                        ALPAKA_ASSERT(static_cast<DstSize>(m_extentWidthBytes) <= m_dstPitchBytes[TDim::value - 2]);
-                        ALPAKA_ASSERT(static_cast<SrcSize>(m_extentWidthBytes) <= m_srcPitchBytes[TDim::value - 2]);
-                    }
-                }
-            }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            ALPAKA_FN_HOST auto printDebug() const -> void
-            {
-                std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes
-                          << " de: " << m_dstExtent << " dptr: " << reinterpret_cast<void*>(m_dstMemNative)
-                          << " dpitchb: " << m_dstPitchBytes << " se: " << m_srcExtent
-                          << " sptr: " << reinterpret_cast<void const*>(m_srcMemNative)
-                          << " spitchb: " << m_srcPitchBytes << std::endl;
-            }
-#endif
-
-            Vec<TDim, ExtentSize> const m_extent;
-            ExtentSize const m_extentWidthBytes;
-#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            Vec<TDim, DstSize> const m_dstExtent;
-            Vec<TDim, SrcSize> const m_srcExtent;
-#endif
-            Vec<TDim, DstSize> const m_dstPitchBytes;
-            Vec<TDim, SrcSize> const m_srcPitchBytes;
-
-            std::uint8_t* const m_dstMemNative;
-            std::uint8_t const* const m_srcMemNative;
-        };
-
-        //! The CPU device ND memory copy task.
-        template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
-        struct TaskCopyCpu : public TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>
-        {
-            using DimMin1 = DimInt<TDim::value - 1u>;
-            using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::ExtentSize;
-            using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::DstSize;
-            using typename TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::SrcSize;
-
-            using TaskCopyCpuBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopyCpuBase;
-
-            ALPAKA_FN_HOST auto operator()() const -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                this->printDebug();
-#endif
-                // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one
-                // iteration.
-                Vec<DimMin1, ExtentSize> const extentWithoutInnermost = subVecBegin<DimMin1>(this->m_extent);
-                Vec<DimMin1, DstSize> const dstPitchBytesWithoutInnermost
-                    = subVecBegin<DimMin1>(this->m_dstPitchBytes);
-                Vec<DimMin1, SrcSize> const srcPitchBytesWithoutInnermost
-                    = subVecBegin<DimMin1>(this->m_srcPitchBytes);
-
-                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                {
-                    meta::ndLoopIncIdx(
-                        extentWithoutInnermost,
-                        [&](Vec<DimMin1, ExtentSize> const& idx)
-                        {
-                            std::memcpy(
-                                this->m_dstMemNative + (castVec<DstSize>(idx) * dstPitchBytesWithoutInnermost).sum(),
-                                this->m_srcMemNative + (castVec<SrcSize>(idx) * srcPitchBytesWithoutInnermost).sum(),
-                                static_cast<std::size_t>(this->m_extentWidthBytes));
-                        });
-                }
-            }
-        };
-
-        //! The CPU device 1D memory copy task.
-        template<typename TViewDst, typename TViewSrc, typename TExtent>
-        struct TaskCopyCpu<DimInt<1u>, TViewDst, TViewSrc, TExtent>
-            : TaskCopyCpuBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>
-        {
-            using TaskCopyCpuBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopyCpuBase;
-
-            ALPAKA_FN_HOST auto operator()() const -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                this->printDebug();
-#endif
-                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                {
-                    std::memcpy(
-                        reinterpret_cast<void*>(this->m_dstMemNative),
-                        reinterpret_cast<void const*>(this->m_srcMemNative),
-                        static_cast<std::size_t>(this->m_extentWidthBytes));
-                }
-            }
-        };
-
-        //! The CPU device scalar memory copy task.
-        //!
-        //! Copies from CPU memory into CPU memory.
-        template<typename TViewDst, typename TViewSrc, typename TExtent>
-        struct TaskCopyCpu<DimInt<0u>, TViewDst, TViewSrc, TExtent>
-        {
-            using Elem = alpaka::Elem<TViewSrc>;
-
-            template<typename TViewDstFwd>
-            TaskCopyCpu(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, [[maybe_unused]] TExtent const& extent)
-                : m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(viewDst)))
-                , m_srcMemNative(reinterpret_cast<std::uint8_t const*>(getPtrNative(viewSrc)))
-            {
-                // all zero-sized extents are equivalent
-                ALPAKA_ASSERT(getExtents(extent).prod() == 1u);
-                ALPAKA_ASSERT(getExtents(viewDst).prod() == 1u);
-                ALPAKA_ASSERT(getExtents(viewSrc).prod() == 1u);
-            }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            ALPAKA_FN_HOST auto printDebug() const -> void
-            {
-                using Scalar = Vec<DimInt<0u>, Idx<TExtent>>;
-                std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar()
-                          << " dptr: " << reinterpret_cast<void*>(m_dstMemNative) << " dpitchb: " << Scalar()
-                          << " se: " << Scalar() << " sptr: " << reinterpret_cast<void const*>(m_srcMemNative)
-                          << " spitchb: " << Scalar() << std::endl;
-            }
-#endif
-
-            ALPAKA_FN_HOST auto operator()() const noexcept(ALPAKA_DEBUG < ALPAKA_DEBUG_FULL) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                printDebug();
-#endif
-                std::memcpy(
-                    reinterpret_cast<void*>(m_dstMemNative),
-                    reinterpret_cast<void const*>(m_srcMemNative),
-                    sizeof(Elem));
-            }
-
-            std::uint8_t* const m_dstMemNative;
-            std::uint8_t const* const m_srcMemNative;
-        };
-    } // namespace detail
-
-    namespace trait
-    {
-        //! The CPU device memory copy trait specialization.
-        //!
-        //! Copies from CPU memory into CPU memory.
-        template<typename TDim>
-        struct CreateTaskMemcpy<TDim, DevCpu, DevCpu>
-        {
-            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-            ALPAKA_FN_HOST static auto createTaskMemcpy(
-                TViewDstFwd&& viewDst,
-                TViewSrc const& viewSrc,
-                TExtent const& extent)
-                -> alpaka::detail::TaskCopyCpu<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
-            {
-                return {std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/mem/buf/cpu/Set.hpp b/include/alpaka/mem/buf/cpu/Set.hpp
deleted file mode 100644
index 1e617e2..0000000
--- a/include/alpaka/mem/buf/cpu/Set.hpp
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/meta/Integral.hpp"
-#include "alpaka/meta/NdLoop.hpp"
-
-#include <cstring>
-
-namespace alpaka
-{
-    class DevCpu;
-
-    namespace detail
-    {
-        //! The CPU device ND memory set task base.
-        template<typename TDim, typename TView, typename TExtent>
-        struct TaskSetCpuBase
-        {
-            static_assert(TDim::value > 0);
-
-            using ExtentSize = Idx<TExtent>;
-            using DstSize = Idx<TView>;
-            using Elem = alpaka::Elem<TView>;
-
-            template<typename TViewFwd>
-            TaskSetCpuBase(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
-                : m_byte(byte)
-                , m_extent(getExtents(extent))
-                , m_extentWidthBytes(m_extent.back() * static_cast<ExtentSize>(sizeof(Elem)))
-#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                , m_dstExtent(getExtents(view))
-#endif
-                , m_dstPitchBytes(getPitchesInBytes(view))
-                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
-            {
-                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).all());
-                if constexpr(TDim::value > 1)
-                    ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 2]);
-            }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            ALPAKA_FN_HOST auto printDebug() const -> void
-            {
-                std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes
-                          << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast<void*>(this->m_dstMemNative)
-                          << " dpitchb: " << this->m_dstPitchBytes << std::endl;
-            }
-#endif
-
-            std::uint8_t const m_byte;
-            Vec<TDim, ExtentSize> const m_extent;
-            ExtentSize const m_extentWidthBytes;
-#if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            Vec<TDim, DstSize> const m_dstExtent;
-#endif
-            Vec<TDim, DstSize> const m_dstPitchBytes;
-            std::uint8_t* const m_dstMemNative;
-        };
-
-        //! The CPU device ND memory set task.
-        template<typename TDim, typename TView, typename TExtent>
-        struct TaskSetCpu : public TaskSetCpuBase<TDim, TView, TExtent>
-        {
-            using DimMin1 = DimInt<TDim::value - 1u>;
-            using typename TaskSetCpuBase<TDim, TView, TExtent>::ExtentSize;
-            using typename TaskSetCpuBase<TDim, TView, TExtent>::DstSize;
-
-            using TaskSetCpuBase<TDim, TView, TExtent>::TaskSetCpuBase;
-
-            ALPAKA_FN_HOST auto operator()() const -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                this->printDebug();
-#endif
-                // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one
-                // iteration.
-                Vec<DimMin1, ExtentSize> const extentWithoutInnermost = subVecBegin<DimMin1>(this->m_extent);
-                Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost = subVecBegin<DimMin1>(this->m_dstPitchBytes);
-
-                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                {
-                    meta::ndLoopIncIdx(
-                        extentWithoutInnermost,
-                        [&](Vec<DimMin1, ExtentSize> const& idx)
-                        {
-                            std::memset(
-                                this->m_dstMemNative + (castVec<DstSize>(idx) * dstPitchBytesWithoutOutmost).sum(),
-                                this->m_byte,
-                                static_cast<std::size_t>(this->m_extentWidthBytes));
-                        });
-                }
-            }
-        };
-
-        //! The CPU device 1D memory set task.
-        template<typename TView, typename TExtent>
-        struct TaskSetCpu<DimInt<1u>, TView, TExtent> : public TaskSetCpuBase<DimInt<1u>, TView, TExtent>
-        {
-            using TaskSetCpuBase<DimInt<1u>, TView, TExtent>::TaskSetCpuBase;
-
-            ALPAKA_FN_HOST auto operator()() const -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                this->printDebug();
-#endif
-                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                {
-                    std::memset(
-                        this->m_dstMemNative,
-                        this->m_byte,
-                        static_cast<std::size_t>(this->m_extentWidthBytes));
-                }
-            }
-        };
-
-        //! The CPU device scalar memory set task.
-        template<typename TView, typename TExtent>
-        struct TaskSetCpu<DimInt<0u>, TView, TExtent>
-        {
-            using ExtentSize = Idx<TExtent>;
-            using Scalar = Vec<DimInt<0u>, ExtentSize>;
-            using DstSize = Idx<TView>;
-            using Elem = alpaka::Elem<TView>;
-
-            template<typename TViewFwd>
-            TaskSetCpu(TViewFwd&& view, std::uint8_t const& byte, [[maybe_unused]] TExtent const& extent)
-                : m_byte(byte)
-                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
-            {
-                // all zero-sized extents are equivalent
-                ALPAKA_ASSERT(getExtents(extent).prod() == 1u);
-                ALPAKA_ASSERT(getExtents(view).prod() == 1u);
-            }
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            ALPAKA_FN_HOST auto printDebug() const -> void
-            {
-                std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar()
-                          << " dptr: " << reinterpret_cast<void*>(m_dstMemNative) << " dpitchb: " << Scalar()
-                          << std::endl;
-            }
-#endif
-
-            ALPAKA_FN_HOST auto operator()() const noexcept(ALPAKA_DEBUG < ALPAKA_DEBUG_FULL) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                printDebug();
-#endif
-                std::memset(m_dstMemNative, m_byte, sizeof(Elem));
-            }
-
-            std::uint8_t const m_byte;
-            std::uint8_t* const m_dstMemNative;
-        };
-    } // namespace detail
-
-    namespace trait
-    {
-        //! The CPU device memory set trait specialization.
-        template<typename TDim>
-        struct CreateTaskMemset<TDim, DevCpu>
-        {
-            template<typename TExtent, typename TViewFwd>
-            ALPAKA_FN_HOST static auto createTaskMemset(
-                TViewFwd&& view,
-                std::uint8_t const& byte,
-                TExtent const& extent) -> alpaka::detail::TaskSetCpu<TDim, std::remove_reference_t<TViewFwd>, TExtent>
-            {
-                return {std::forward<TViewFwd>(view), byte, extent};
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/mem/buf/sycl/Common.hpp b/include/alpaka/mem/buf/sycl/Common.hpp
deleted file mode 100644
index 498577d..0000000
--- a/include/alpaka/mem/buf/sycl/Common.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2022 Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/offset/Traits.hpp"
-
-#include <cstddef>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka::detail
-{
-    template<typename TExtent>
-    inline auto make_sycl_range(TExtent const& ext, std::size_t multiplier = 1)
-    {
-        constexpr auto dim = Dim<TExtent>::value;
-
-        if constexpr(dim == 0)
-            return sycl::range<1>{multiplier};
-        else
-        {
-            auto const width = getWidth(ext) * multiplier;
-            if constexpr(dim == 1)
-                return sycl::range<1>{width};
-            else if constexpr(dim == 2)
-                return sycl::range<2>{width, getHeight(ext)};
-            else
-                return sycl::range<3>{width, getHeight(ext), getDepth(ext)};
-        }
-    }
-
-    template<typename TView>
-    inline auto make_sycl_offset(TView const& view)
-    {
-        constexpr auto dim = Dim<TView>::value;
-
-        if constexpr(dim == 0)
-            return sycl::range<1>{1};
-        else
-        {
-            if constexpr(dim == 1)
-                return sycl::id<1>{getOffsetX(view)};
-            else if constexpr(dim == 2)
-                return sycl::id<2>{getOffsetX(view), getOffsetY(view)};
-            else
-                return sycl::id<3>{getOffsetX(view), getOffsetY(view), getOffsetZ(view)};
-        }
-    }
-} // namespace alpaka::detail
-
-#endif
diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp
deleted file mode 100644
index 44098f1..0000000
--- a/include/alpaka/mem/buf/sycl/Copy.hpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright 2024 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Debug.hpp"
-#include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/elem/Traits.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/mem/buf/sycl/Common.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/meta/NdLoop.hpp"
-#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
-#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
-
-#include <memory>
-#include <type_traits>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka::detail
-{
-    //!  The SYCL device memory copy task base.
-    template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
-    struct TaskCopySyclBase
-    {
-        static_assert(
-            std::is_same_v<std::remove_const_t<alpaka::Elem<TViewSrc>>, std::remove_const_t<alpaka::Elem<TViewDst>>>,
-            "The source and the destination view are required to have the same element type!");
-        using ExtentSize = Idx<TExtent>;
-        using DstSize = Idx<TViewDst>;
-        using SrcSize = Idx<TViewSrc>;
-        using Elem = alpaka::Elem<TViewSrc>;
-
-        template<typename TViewFwd>
-        TaskCopySyclBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
-            : m_extent(getExtents(extent))
-            , m_extentWidthBytes(m_extent.back() * static_cast<ExtentSize>(sizeof(Elem)))
-#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            , m_dstExtent(getExtents(viewDst))
-            , m_srcExtent(getExtents(viewSrc))
-#    endif
-            , m_dstPitchBytes(getPitchesInBytes(viewDst))
-            , m_srcPitchBytes(getPitchesInBytes(viewSrc))
-            , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(viewDst)))
-            , m_srcMemNative(reinterpret_cast<std::uint8_t const*>(getPtrNative(viewSrc)))
-        {
-            if constexpr(TDim::value > 0)
-            {
-                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).all());
-                ALPAKA_ASSERT((castVec<SrcSize>(m_extent) <= m_srcExtent).all());
-            }
-        }
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-        auto printDebug() const -> void
-        {
-            std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes
-                      << " de: " << m_dstExtent << " dptr: " << reinterpret_cast<void*>(m_dstMemNative)
-                      << " se: " << m_srcExtent << " sptr: " << reinterpret_cast<void const*>(m_srcMemNative)
-                      << std::endl;
-        }
-#    endif
-
-        Vec<TDim, ExtentSize> const m_extent;
-        ExtentSize const m_extentWidthBytes;
-#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-        Vec<TDim, DstSize> const m_dstExtent;
-        Vec<TDim, SrcSize> const m_srcExtent;
-#    endif
-
-        Vec<TDim, DstSize> const m_dstPitchBytes;
-        Vec<TDim, SrcSize> const m_srcPitchBytes;
-        std::uint8_t* const m_dstMemNative;
-        std::uint8_t const* const m_srcMemNative;
-        static constexpr auto is_sycl_task = true;
-    };
-
-    //! The SYCL device ND memory copy task.
-    template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
-    struct TaskCopySycl : public TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>
-    {
-        using DimMin1 = DimInt<TDim::value - 1u>;
-        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::ExtentSize;
-        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::DstSize;
-        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::SrcSize;
-
-        using TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
-
-        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            this->printDebug();
-#    endif
-            // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one
-            // iteration.
-            Vec<DimMin1, ExtentSize> const extentWithoutInnermost(subVecBegin<DimMin1>(this->m_extent));
-            Vec<DimMin1, DstSize> const dstPitchBytesWithoutInnermost(subVecBegin<DimMin1>(this->m_dstPitchBytes));
-            Vec<DimMin1, SrcSize> const srcPitchBytesWithoutInnermost(subVecBegin<DimMin1>(this->m_srcPitchBytes));
-
-            // Record an event for each memcpy call
-            std::vector<sycl::event> events;
-            events.reserve(static_cast<std::size_t>(extentWithoutInnermost.prod()));
-
-            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-            {
-                meta::ndLoopIncIdx(
-                    extentWithoutInnermost,
-                    [&](Vec<DimMin1, ExtentSize> const& idx)
-                    {
-                        events.push_back(queue.memcpy(
-                            this->m_dstMemNative + (castVec<DstSize>(idx) * dstPitchBytesWithoutInnermost).sum(),
-                            this->m_srcMemNative + (castVec<SrcSize>(idx) * srcPitchBytesWithoutInnermost).sum(),
-                            static_cast<std::size_t>(this->m_extentWidthBytes),
-                            requirements));
-                    });
-            }
-
-            // Return an event that depends on all the events assciated to the memcpy calls
-            return queue.ext_oneapi_submit_barrier(events);
-        }
-    };
-
-    //! The SYCL device 1D memory copy task.
-    template<typename TViewDst, typename TViewSrc, typename TExtent>
-    struct TaskCopySycl<DimInt<1u>, TViewDst, TViewSrc, TExtent>
-        : TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>
-    {
-        using TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
-        using Elem = alpaka::Elem<TViewSrc>;
-
-        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            this->printDebug();
-#    endif
-            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-            {
-                return queue.memcpy(
-                    this->m_dstMemNative,
-                    this->m_srcMemNative,
-                    sizeof(Elem) * static_cast<std::size_t>(this->m_extent.prod()),
-                    requirements);
-            }
-            else
-            {
-                return queue.ext_oneapi_submit_barrier();
-            }
-        }
-    };
-
-    //! The scalar SYCL memory copy trait.
-    template<typename TViewDst, typename TViewSrc, typename TExtent>
-    struct TaskCopySycl<DimInt<0u>, TViewDst, TViewSrc, TExtent>
-    {
-        static_assert(
-            std::is_same_v<std::remove_const_t<alpaka::Elem<TViewSrc>>, std::remove_const_t<alpaka::Elem<TViewDst>>>,
-            "The source and the destination view are required to have the same element type!");
-
-        using Elem = alpaka::Elem<TViewSrc>;
-
-        template<typename TViewDstFwd>
-        TaskCopySycl(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, [[maybe_unused]] TExtent const& extent)
-            : m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
-            , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
-        {
-            // all zero-sized extents are equivalent
-            ALPAKA_ASSERT(getExtents(extent).prod() == 1u);
-            ALPAKA_ASSERT(getExtents(viewDst).prod() == 1u);
-            ALPAKA_ASSERT(getExtents(viewSrc).prod() == 1u);
-        }
-
-        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
-        {
-            return queue.memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem), requirements);
-        }
-
-        void* m_dstMemNative;
-        void const* m_srcMemNative;
-        static constexpr auto is_sycl_task = true;
-    };
-} // namespace alpaka::detail
-
-// Trait specializations for CreateTaskMemcpy.
-namespace alpaka::trait
-{
-    //! The SYCL host-to-device memory copy trait specialization.
-    template<typename TTag, typename TDim>
-    struct CreateTaskMemcpy<TDim, DevGenericSycl<TTag>, DevCpu>
-    {
-        template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
-            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
-        {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-            return {std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
-        }
-    };
-
-    //! The SYCL device-to-host memory copy trait specialization.
-    template<typename TTag, typename TDim>
-    struct CreateTaskMemcpy<TDim, DevCpu, DevGenericSycl<TTag>>
-    {
-        template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
-            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
-        {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-            return {std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
-        }
-    };
-
-    //! The SYCL device-to-device memory copy trait specialization.
-    template<typename TTagDst, typename TTagSrc, typename TDim>
-    struct CreateTaskMemcpy<TDim, DevGenericSycl<TTagDst>, DevGenericSycl<TTagSrc>>
-    {
-        template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
-            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
-        {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-            return {std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
-        }
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/mem/buf/sycl/Set.hpp b/include/alpaka/mem/buf/sycl/Set.hpp
deleted file mode 100644
index 73478d3..0000000
--- a/include/alpaka/mem/buf/sycl/Set.hpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Debug.hpp"
-#include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/mem/buf/sycl/Common.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/meta/NdLoop.hpp"
-#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
-#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
-#include "alpaka/queue/Traits.hpp"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-namespace alpaka
-{
-
-    namespace detail
-    {
-        //!  The SYCL ND memory set task base.
-        template<typename TDim, typename TView, typename TExtent>
-        struct TaskSetSyclBase
-        {
-            using ExtentSize = Idx<TExtent>;
-            using DstSize = Idx<TView>;
-            using Elem = alpaka::Elem<TView>;
-
-            template<typename TViewFwd>
-            TaskSetSyclBase(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
-                : m_byte(byte)
-                , m_extent(getExtents(extent))
-                , m_extentWidthBytes(m_extent.back() * static_cast<ExtentSize>(sizeof(Elem)))
-#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-                , m_dstExtent(getExtents(view))
-#    endif
-
-                , m_dstPitchBytes(getPitchesInBytes(view))
-                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
-
-            {
-                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).all());
-                if constexpr(TDim::value > 1)
-                    ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 2]);
-            }
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            auto printDebug() const -> void
-            {
-                std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes
-                          << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast<void*>(this->m_dstMemNative)
-                          << " dpitchb: " << this->m_dstPitchBytes << std::endl;
-            }
-#    endif
-
-            std::uint8_t const m_byte;
-            Vec<TDim, ExtentSize> const m_extent;
-            ExtentSize const m_extentWidthBytes;
-#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            Vec<TDim, DstSize> const m_dstExtent;
-#    endif
-            Vec<TDim, DstSize> const m_dstPitchBytes;
-            std::uint8_t* const m_dstMemNative;
-            static constexpr auto is_sycl_task = true;
-        };
-
-        //! The SYCL device ND memory set task.
-        template<typename TDim, typename TView, typename TExtent>
-        struct TaskSetSycl : public TaskSetSyclBase<TDim, TView, TExtent>
-        {
-            using DimMin1 = DimInt<TDim::value - 1u>;
-            using typename TaskSetSyclBase<TDim, TView, TExtent>::ExtentSize;
-            using typename TaskSetSyclBase<TDim, TView, TExtent>::DstSize;
-
-            using TaskSetSyclBase<TDim, TView, TExtent>::TaskSetSyclBase;
-
-            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                this->printDebug();
-#    endif
-                // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one
-                // iteration.
-                Vec<DimMin1, ExtentSize> const extentWithoutInnermost(subVecBegin<DimMin1>(this->m_extent));
-                Vec<DimMin1, DstSize> const dstPitchBytesWithoutInnermost(subVecBegin<DimMin1>(this->m_dstPitchBytes));
-
-                // Record an event for each memcpy call
-                std::vector<sycl::event> events;
-                events.reserve(static_cast<std::size_t>(extentWithoutInnermost.prod()));
-
-                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                {
-                    meta::ndLoopIncIdx(
-                        extentWithoutInnermost,
-                        [&](Vec<DimMin1, ExtentSize> const& idx)
-                        {
-                            events.push_back(queue.memset(
-                                this->m_dstMemNative + (castVec<DstSize>(idx) * dstPitchBytesWithoutInnermost).sum(),
-                                this->m_byte,
-                                static_cast<std::size_t>(this->m_extentWidthBytes),
-                                requirements));
-                        });
-                }
-
-                // Return an event that depends on all the events assciated to the memcpy calls
-                return queue.ext_oneapi_submit_barrier(events);
-            }
-        };
-
-        //! The 1D SYCL memory set task.
-        template<typename TView, typename TExtent>
-        struct TaskSetSycl<DimInt<1u>, TView, TExtent> : public TaskSetSyclBase<DimInt<1u>, TView, TExtent>
-        {
-            using TaskSetSyclBase<DimInt<1u>, TView, TExtent>::TaskSetSyclBase;
-
-            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                this->printDebug();
-#    endif
-                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
-                {
-                    return queue.memset(
-                        reinterpret_cast<void*>(this->m_dstMemNative),
-                        this->m_byte,
-                        static_cast<std::size_t>(this->m_extentWidthBytes),
-                        requirements);
-                }
-                else
-                {
-                    return queue.ext_oneapi_submit_barrier();
-                }
-            }
-        };
-
-        //! The SYCL device scalar memory set task.
-        template<typename TView, typename TExtent>
-        struct TaskSetSycl<DimInt<0u>, TView, TExtent>
-        {
-            using ExtentSize = Idx<TExtent>;
-            using Scalar = Vec<DimInt<0u>, ExtentSize>;
-            using DstSize = Idx<TView>;
-            using Elem = alpaka::Elem<TView>;
-
-            template<typename TViewFwd>
-            TaskSetSycl(TViewFwd&& view, std::uint8_t const& byte, [[maybe_unused]] TExtent const& extent)
-                : m_byte(byte)
-                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
-            {
-                // all zero-sized extents are equivalent
-                ALPAKA_ASSERT(getExtents(extent).prod() == 1u);
-                ALPAKA_ASSERT(getExtents(view).prod() == 1u);
-            }
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            auto printDebug() const -> void
-            {
-                std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar()
-                          << " dptr: " << reinterpret_cast<void*>(m_dstMemNative) << " dpitchb: " << Scalar()
-                          << std::endl;
-            }
-#    endif
-
-            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                printDebug();
-#    endif
-                return queue.memset(reinterpret_cast<void*>(m_dstMemNative), m_byte, sizeof(Elem), requirements);
-            }
-
-            std::uint8_t const m_byte;
-            std::uint8_t* const m_dstMemNative;
-            static constexpr auto is_sycl_task = true;
-        };
-
-    } // namespace detail
-
-    namespace trait
-    {
-        //! The SYCL device memory set trait specialization.
-        template<typename TDim, typename TPlatform>
-        struct CreateTaskMemset<TDim, DevGenericSycl<TPlatform>>
-        {
-            template<typename TExtent, typename TView>
-            static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
-                -> alpaka::detail::TaskSetSycl<TDim, TView, TExtent>
-            {
-                return alpaka::detail::TaskSetSycl<TDim, TView, TExtent>(view, byte, extent);
-            }
-        };
-
-    } // namespace trait
-
-} // namespace alpaka
-#endif
diff --git a/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp b/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp
deleted file mode 100644
index 37ee6fb..0000000
--- a/include/alpaka/mem/buf/uniformCudaHip/Copy.hpp
+++ /dev/null
@@ -1,643 +0,0 @@
-/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan,
- *                Bernhard Manfred Gruber, Antonio Di Pilato
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
-
-#include <cstddef>
-#include <cstdint>
-#include <set>
-#include <tuple>
-#include <type_traits>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    namespace detail
-    {
-        //! The CUDA/HIP memory copy trait.
-        template<typename TApi, typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
-        struct TaskCopyUniformCudaHip;
-
-        //! The scalar CUDA/HIP memory copy trait.
-        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
-        struct TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>
-        {
-            using Idx = alpaka::Idx<TExtent>;
-
-            template<typename TViewDstFwd>
-            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
-                TViewDstFwd&& viewDst,
-                TViewSrc const& viewSrc,
-                [[maybe_unused]] TExtent const& extent,
-                typename TApi::MemcpyKind_t const& uniformMemCpyKind,
-                int const& iDstDevice,
-                int const& iSrcDevice)
-                : m_uniformMemCpyKind(uniformMemCpyKind)
-                , m_iDstDevice(iDstDevice)
-                , m_iSrcDevice(iSrcDevice)
-                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
-                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
-            {
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                ALPAKA_ASSERT(getExtentProduct(extent) == 1);
-#    endif
-            }
-
-            template<typename TQueue>
-            auto enqueue(TQueue& queue) const -> void
-            {
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                printDebug();
-#    endif
-                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
-                // see https://github.com/fwyzard/nvidia_bug_3446335 .
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
-                // Initiate the memory copy.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpyAsync(
-                    m_dstMemNative,
-                    m_srcMemNative,
-                    sizeof(Elem<TViewDst>),
-                    m_uniformMemCpyKind,
-                    queue.getNativeHandle()));
-            }
-
-        private:
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            ALPAKA_FN_HOST auto printDebug() const -> void
-            {
-                std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << Idx(1u)
-                          << " ewb: " << static_cast<Idx>(sizeof(Elem<TViewDst>)) << " dw: " << Idx(1u)
-                          << " dptr: " << m_dstMemNative << " sdev: " << m_iSrcDevice << " sw: " << Idx(1u)
-                          << " sptr: " << m_srcMemNative << std::endl;
-            }
-#    endif
-
-            typename TApi::MemcpyKind_t m_uniformMemCpyKind;
-            int m_iDstDevice;
-            int m_iSrcDevice;
-            void* m_dstMemNative;
-            void const* m_srcMemNative;
-        };
-
-        //! The 1D CUDA/HIP memory copy trait.
-        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
-        struct TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>
-        {
-            using Idx = alpaka::Idx<TExtent>;
-
-            template<typename TViewDstFwd>
-            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
-                TViewDstFwd&& viewDst,
-                TViewSrc const& viewSrc,
-                TExtent const& extent,
-                typename TApi::MemcpyKind_t const& uniformMemCpyKind,
-                int const& iDstDevice,
-                int const& iSrcDevice)
-                : m_uniformMemCpyKind(uniformMemCpyKind)
-                , m_iDstDevice(iDstDevice)
-                , m_iSrcDevice(iSrcDevice)
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                , m_extentWidth(getWidth(extent))
-                , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
-                , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
-#    endif
-                , m_extentWidthBytes(static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem<TViewDst>))
-                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
-                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
-            {
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-#    endif
-            }
-
-            template<typename TQueue>
-            auto enqueue(TQueue& queue) const -> void
-            {
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                printDebug();
-#    endif
-                if(m_extentWidthBytes == std::size_t{0})
-                {
-                    return;
-                }
-
-                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
-                // see https://github.com/fwyzard/nvidia_bug_3446335 .
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
-                // Initiate the memory copy.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpyAsync(
-                    m_dstMemNative,
-                    m_srcMemNative,
-                    m_extentWidthBytes,
-                    m_uniformMemCpyKind,
-                    queue.getNativeHandle()));
-            }
-
-        private:
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            ALPAKA_FN_HOST auto printDebug() const -> void
-            {
-                std::cout << __func__ << " ddev: " << m_iDstDevice << " ew: " << m_extentWidth
-                          << " ewb: " << m_extentWidthBytes << " dw: " << m_dstWidth << " dptr: " << m_dstMemNative
-                          << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sptr: " << m_srcMemNative
-                          << std::endl;
-            }
-#    endif
-
-            typename TApi::MemcpyKind_t m_uniformMemCpyKind;
-            int m_iDstDevice;
-            int m_iSrcDevice;
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            Idx m_extentWidth;
-            Idx m_dstWidth;
-            Idx m_srcWidth;
-#    endif
-            std::size_t m_extentWidthBytes;
-            void* m_dstMemNative;
-            void const* m_srcMemNative;
-        };
-
-        //! The 2D CUDA/HIP memory copy trait.
-        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
-        struct TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>
-        {
-            using Idx = alpaka::Idx<TExtent>;
-
-            template<typename TViewDstFwd>
-            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
-                TViewDstFwd&& viewDst,
-                TViewSrc const& viewSrc,
-                TExtent const& extent,
-                typename TApi::MemcpyKind_t const& uniformMemcpyKind,
-                int const& iDstDevice,
-                int const& iSrcDevice)
-                : m_uniformMemCpyKind(uniformMemcpyKind)
-                , m_iDstDevice(iDstDevice)
-                , m_iSrcDevice(iSrcDevice)
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                , m_extentWidth(getWidth(extent))
-#    endif
-                , m_extentWidthBytes(static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem<TViewDst>))
-                , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
-                , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
-                , m_extentHeight(getHeight(extent))
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                , m_dstHeight(static_cast<Idx>(getHeight(viewDst)))
-                , m_srcHeight(static_cast<Idx>(getHeight(viewSrc)))
-#    endif
-                , m_dstRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[0]))
-                , m_srcRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[0]))
-                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
-                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
-            {
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
-                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-                ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
-                ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes);
-                ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes);
-#    endif
-            }
-
-            template<typename TQueue>
-            auto enqueue(TQueue& queue) const -> void
-            {
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                printDebug();
-#    endif
-                // This is not only an optimization but also prevents a division by zero.
-                if(m_extentWidthBytes == std::size_t{0} || m_extentHeight == 0)
-                {
-                    return;
-                }
-
-                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
-                // see https://github.com/fwyzard/nvidia_bug_3446335 .
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
-                // Initiate the memory copy.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memcpy2DAsync(
-                    m_dstMemNative,
-                    m_dstRowPitchBytes,
-                    m_srcMemNative,
-                    m_srcRowPitchBytes,
-                    m_extentWidthBytes,
-                    static_cast<std::size_t>(m_extentHeight),
-                    m_uniformMemCpyKind,
-                    queue.getNativeHandle()));
-            }
-
-        private:
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            ALPAKA_FN_HOST auto printDebug() const -> void
-            {
-                std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight
-                          << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice << " dw: " << m_dstWidth
-                          << " dh: " << m_dstHeight << " dptr: " << m_dstMemNative << " dpitch: " << m_dstRowPitchBytes
-                          << " sdev: " << m_iSrcDevice << " sw: " << m_srcWidth << " sh: " << m_srcHeight
-                          << " sptr: " << m_srcMemNative << " spitch: " << m_srcRowPitchBytes << std::endl;
-            }
-#    endif
-
-            typename TApi::MemcpyKind_t m_uniformMemCpyKind;
-            int m_iDstDevice;
-            int m_iSrcDevice;
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            Idx m_extentWidth;
-#    endif
-            std::size_t m_extentWidthBytes;
-            Idx m_dstWidth;
-            Idx m_srcWidth;
-
-            Idx m_extentHeight;
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            Idx m_dstHeight;
-            Idx m_srcHeight;
-#    endif
-            std::size_t m_dstRowPitchBytes;
-            std::size_t m_srcRowPitchBytes;
-
-            void* m_dstMemNative;
-            void const* m_srcMemNative;
-        };
-
-        //! The 3D CUDA/HIP memory copy trait.
-        template<typename TApi, typename TViewDst, typename TViewSrc, typename TExtent>
-        struct TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>
-        {
-            using Idx = alpaka::Idx<TExtent>;
-
-            template<typename TViewDstFwd>
-            ALPAKA_FN_HOST TaskCopyUniformCudaHip(
-                TViewDstFwd&& viewDst,
-                TViewSrc const& viewSrc,
-                TExtent const& extent,
-                typename TApi::MemcpyKind_t const& uniformMemcpyKind,
-                int const& iDstDevice,
-                int const& iSrcDevice)
-                : m_uniformMemCpyKind(uniformMemcpyKind)
-                , m_iDstDevice(iDstDevice)
-                , m_iSrcDevice(iSrcDevice)
-                , m_extentWidth(getWidth(extent))
-                , m_extentWidthBytes(static_cast<std::size_t>(m_extentWidth) * sizeof(Elem<TViewDst>))
-                , m_dstWidth(static_cast<Idx>(getWidth(viewDst)))
-                , m_srcWidth(static_cast<Idx>(getWidth(viewSrc)))
-                , m_extentHeight(getHeight(extent))
-                , m_extentDepth(getDepth(extent))
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                , m_dstHeight(static_cast<Idx>(getHeight(viewDst)))
-                , m_srcHeight(static_cast<Idx>(getHeight(viewSrc)))
-                , m_dstDepth(static_cast<Idx>(getDepth(viewDst)))
-                , m_srcDepth(static_cast<Idx>(getDepth(viewSrc)))
-#    endif
-                , m_dstRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[1]))
-                , m_srcRowPitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[1]))
-                , m_dstSlicePitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewDst)[0]))
-                , m_srcSlicePitchBytes(static_cast<std::size_t>(getPitchesInBytes(viewSrc)[0]))
-                , m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
-                , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
-            {
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                ALPAKA_ASSERT(m_extentWidth <= m_dstWidth);
-                ALPAKA_ASSERT(m_extentHeight <= m_dstHeight);
-                ALPAKA_ASSERT(m_extentDepth <= m_dstDepth);
-                ALPAKA_ASSERT(m_extentWidth <= m_srcWidth);
-                ALPAKA_ASSERT(m_extentHeight <= m_srcHeight);
-                ALPAKA_ASSERT(m_extentDepth <= m_srcDepth);
-                ALPAKA_ASSERT(m_extentWidthBytes <= m_dstRowPitchBytes);
-                ALPAKA_ASSERT(m_extentWidthBytes <= m_srcRowPitchBytes);
-#    endif
-            }
-
-            template<typename TQueue>
-            auto enqueue(TQueue& queue) const -> void
-            {
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                printDebug();
-#    endif
-                // This is not only an optimization but also prevents a division by zero.
-                if(m_extentWidthBytes == std::size_t{0} || m_extentHeight == 0 || m_extentDepth == 0)
-                {
-                    return;
-                }
-
-                // Create the struct describing the copy.
-                typename TApi::Memcpy3DParms_t const uniformCudaHipMemCpy3DParms(buildUniformCudaHipMemcpy3DParms());
-
-                // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device,
-                // see https://github.com/fwyzard/nvidia_bug_3446335 .
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_iDstDevice));
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    TApi::memcpy3DAsync(&uniformCudaHipMemCpy3DParms, queue.getNativeHandle()));
-            }
-
-        private:
-            ALPAKA_FN_HOST auto buildUniformCudaHipMemcpy3DParms() const -> typename TApi::Memcpy3DParms_t
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                // Fill CUDA/HIP parameter structure.
-                typename TApi::Memcpy3DParms_t memCpy3DParms{}; // zero-init required per CUDA documentation
-                memCpy3DParms.srcPtr = TApi::makePitchedPtr(
-                    const_cast<void*>(m_srcMemNative),
-                    m_srcRowPitchBytes,
-                    static_cast<std::size_t>(m_srcWidth),
-                    m_srcSlicePitchBytes / m_srcRowPitchBytes);
-                memCpy3DParms.dstPtr = TApi::makePitchedPtr(
-                    m_dstMemNative,
-                    m_dstRowPitchBytes,
-                    static_cast<std::size_t>(m_dstWidth),
-                    m_dstSlicePitchBytes / m_dstRowPitchBytes);
-                memCpy3DParms.extent = TApi::makeExtent(
-                    m_extentWidthBytes,
-                    static_cast<std::size_t>(m_extentHeight),
-                    static_cast<std::size_t>(m_extentDepth));
-                memCpy3DParms.kind = m_uniformMemCpyKind;
-                return memCpy3DParms;
-            }
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            ALPAKA_FN_HOST auto printDebug() const -> void
-            {
-                std::cout << __func__ << " ew: " << m_extentWidth << " eh: " << m_extentHeight
-                          << " ed: " << m_extentDepth << " ewb: " << m_extentWidthBytes << " ddev: " << m_iDstDevice
-                          << " dw: " << m_dstWidth << " dh: " << m_dstHeight << " dd: " << m_dstDepth
-                          << " dptr: " << m_dstMemNative << " drowpitch: " << m_dstRowPitchBytes
-                          << " dslicepitch: " << m_dstSlicePitchBytes << " sdev: " << m_iSrcDevice
-                          << " sw: " << m_srcWidth << " sh: " << m_srcHeight << " sd: " << m_srcDepth
-                          << " sptr: " << m_srcMemNative << " srowpitch: " << m_srcRowPitchBytes
-                          << " sslicepitch: " << m_srcSlicePitchBytes << std::endl;
-            }
-#    endif
-            typename TApi::MemcpyKind_t m_uniformMemCpyKind;
-            int m_iDstDevice;
-            int m_iSrcDevice;
-
-            Idx m_extentWidth;
-            std::size_t m_extentWidthBytes;
-            Idx m_dstWidth;
-            Idx m_srcWidth;
-
-            Idx m_extentHeight;
-            Idx m_extentDepth;
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            Idx m_dstHeight;
-            Idx m_srcHeight;
-            Idx m_dstDepth;
-            Idx m_srcDepth;
-#    endif
-            std::size_t m_dstRowPitchBytes;
-            std::size_t m_srcRowPitchBytes;
-            std::size_t m_dstSlicePitchBytes;
-            std::size_t m_srcSlicePitchBytes;
-
-            void* m_dstMemNative;
-            void const* m_srcMemNative;
-        };
-    } // namespace detail
-
-    // Trait specializations for CreateTaskMemcpy.
-    namespace trait
-    {
-        //! The CUDA/HIP to CPU memory copy trait specialization.
-        template<typename TApi, typename TDim>
-        struct CreateTaskMemcpy<TDim, DevCpu, DevUniformCudaHipRt<TApi>>
-        {
-            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-            ALPAKA_FN_HOST static auto createTaskMemcpy(
-                TViewDstFwd&& viewDst,
-                TViewSrc const& viewSrc,
-                TExtent const& extent) -> alpaka::detail::
-                TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                auto const iDevice = getDev(viewSrc).getNativeHandle();
-
-                return {
-                    std::forward<TViewDstFwd>(viewDst),
-                    viewSrc,
-                    extent,
-                    TApi::memcpyDeviceToHost,
-                    iDevice,
-                    iDevice};
-            }
-        };
-
-        //! The CPU to CUDA/HIP memory copy trait specialization.
-        template<typename TApi, typename TDim>
-        struct CreateTaskMemcpy<TDim, DevUniformCudaHipRt<TApi>, DevCpu>
-        {
-            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-            ALPAKA_FN_HOST static auto createTaskMemcpy(
-                TViewDstFwd&& viewDst,
-                TViewSrc const& viewSrc,
-                TExtent const& extent) -> alpaka::detail::
-                TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                auto const iDevice = getDev(viewDst).getNativeHandle();
-
-                return {
-                    std::forward<TViewDstFwd>(viewDst),
-                    viewSrc,
-                    extent,
-                    TApi::memcpyHostToDevice,
-                    iDevice,
-                    iDevice};
-            }
-        };
-
-        //! The CUDA/HIP to CUDA/HIP memory copy trait specialization.
-        template<typename TApi, typename TDim>
-        struct CreateTaskMemcpy<TDim, DevUniformCudaHipRt<TApi>, DevUniformCudaHipRt<TApi>>
-        {
-            template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-            ALPAKA_FN_HOST static auto createTaskMemcpy(
-                TViewDstFwd&& viewDst,
-                TViewSrc const& viewSrc,
-                TExtent const& extent) -> alpaka::detail::
-                TaskCopyUniformCudaHip<TApi, TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                auto const iDstDevice = getDev(viewDst).getNativeHandle();
-
-                return {
-                    std::forward<TViewDstFwd>(viewDst),
-                    viewSrc,
-                    extent,
-                    TApi::memcpyDeviceToDevice,
-                    iDstDevice,
-                    getDev(viewSrc).getNativeHandle()};
-            }
-        };
-
-        //! The CUDA/HIP non-blocking device queue scalar copy enqueue trait specialization.
-        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
-        struct Enqueue<
-            QueueUniformCudaHipRtNonBlocking<TApi>,
-            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
-                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent> const& task)
-                -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                task.enqueue(queue);
-            }
-        };
-
-        //! The CUDA/HIP blocking device queue scalar copy enqueue trait specialization.
-        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
-        struct Enqueue<
-            QueueUniformCudaHipRtBlocking<TApi>,
-            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<0u>, TViewDst, TViewSrc, TExtent> const& task)
-                -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                task.enqueue(queue);
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
-            }
-        };
-
-        //! The CUDA/HIP non-blocking device queue 1D copy enqueue trait specialization.
-        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
-        struct Enqueue<
-            QueueUniformCudaHipRtNonBlocking<TApi>,
-            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
-                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent> const& task)
-                -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                task.enqueue(queue);
-            }
-        };
-
-        //! The CUDA/HIP blocking device queue 1D copy enqueue trait specialization.
-        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
-        struct Enqueue<
-            QueueUniformCudaHipRtBlocking<TApi>,
-            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<1u>, TViewDst, TViewSrc, TExtent> const& task)
-                -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                task.enqueue(queue);
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
-            }
-        };
-
-        //! The CUDA/HIP non-blocking device queue 2D copy enqueue trait specialization.
-        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
-        struct Enqueue<
-            QueueUniformCudaHipRtNonBlocking<TApi>,
-            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
-                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent> const& task)
-                -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                task.enqueue(queue);
-            }
-        };
-
-        //! The CUDA/HIP blocking device queue 2D copy enqueue trait specialization.
-        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
-        struct Enqueue<
-            QueueUniformCudaHipRtBlocking<TApi>,
-            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<2u>, TViewDst, TViewSrc, TExtent> const& task)
-                -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                task.enqueue(queue);
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
-            }
-        };
-
-        //! The CUDA/HIP non-blocking device queue 3D copy enqueue trait specialization.
-        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
-        struct Enqueue<
-            QueueUniformCudaHipRtNonBlocking<TApi>,
-            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
-                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent> const& task)
-                -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                task.enqueue(queue);
-            }
-        };
-
-        //! The CUDA/HIP blocking device queue 3D copy enqueue trait specialization.
-        template<typename TApi, typename TExtent, typename TViewSrc, typename TViewDst>
-        struct Enqueue<
-            QueueUniformCudaHipRtBlocking<TApi>,
-            alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                alpaka::detail::TaskCopyUniformCudaHip<TApi, DimInt<3u>, TViewDst, TViewSrc, TExtent> const& task)
-                -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                task.enqueue(queue);
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/mem/buf/uniformCudaHip/Set.hpp b/include/alpaka/mem/buf/uniformCudaHip/Set.hpp
deleted file mode 100644
index 3b6551c..0000000
--- a/include/alpaka/mem/buf/uniformCudaHip/Set.hpp
+++ /dev/null
@@ -1,385 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
- *                Antonio Di Pilato, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <cstddef>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    template<typename TApi>
-    class DevUniformCudaHipRt;
-
-    namespace detail
-    {
-        //! The CUDA/HIP memory set task base.
-        template<typename TApi, typename TDim, typename TView, typename TExtent>
-        struct TaskSetUniformCudaHipBase
-        {
-            TaskSetUniformCudaHipBase(TView& view, std::uint8_t const& byte, TExtent const& extent)
-                : m_view(view)
-                , m_byte(byte)
-                , m_extent(extent)
-                , m_iDevice(getDev(view).getNativeHandle())
-            {
-            }
-
-        protected:
-            TView& m_view;
-            std::uint8_t const m_byte;
-            TExtent const m_extent;
-            std::int32_t const m_iDevice;
-        };
-
-        //! The CUDA/HIP memory set task.
-        template<typename TApi, typename TDim, typename TView, typename TExtent>
-        struct TaskSetUniformCudaHip;
-
-        //! The scalar CUDA/HIP memory set task.
-        template<typename TApi, typename TView, typename TExtent>
-        struct TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent>
-            : public TaskSetUniformCudaHipBase<TApi, DimInt<0u>, TView, TExtent>
-        {
-            template<typename TViewFwd>
-            TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
-                : TaskSetUniformCudaHipBase<TApi, DimInt<0u>, TView, TExtent>(
-                    std::forward<TViewFwd>(view),
-                    byte,
-                    extent)
-            {
-            }
-
-            template<typename TQueue>
-            auto enqueue(TQueue& queue) const -> void
-            {
-                // Initiate the memory set.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memsetAsync(
-                    getPtrNative(this->m_view),
-                    static_cast<int>(this->m_byte),
-                    sizeof(Elem<TView>),
-                    queue.getNativeHandle()));
-            }
-        };
-
-        //! The 1D CUDA/HIP memory set task.
-        template<typename TApi, typename TView, typename TExtent>
-        struct TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent>
-            : public TaskSetUniformCudaHipBase<TApi, DimInt<1u>, TView, TExtent>
-        {
-            template<typename TViewFwd>
-            TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
-                : TaskSetUniformCudaHipBase<TApi, DimInt<1u>, TView, TExtent>(
-                    std::forward<TViewFwd>(view),
-                    byte,
-                    extent)
-            {
-            }
-
-            template<typename TQueue>
-            auto enqueue(TQueue& queue) const -> void
-            {
-                auto& view = this->m_view;
-                auto const& extent = this->m_extent;
-
-                auto const extentWidth = getWidth(extent);
-                ALPAKA_ASSERT(extentWidth <= getWidth(view));
-
-                if(extentWidth == 0)
-                {
-                    return;
-                }
-
-                // Initiate the memory set.
-                auto const extentWidthBytes = static_cast<std::size_t>(extentWidth) * sizeof(Elem<TView>);
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memsetAsync(
-                    getPtrNative(view),
-                    static_cast<int>(this->m_byte),
-                    extentWidthBytes,
-                    queue.getNativeHandle()));
-            }
-        };
-
-        //! The 2D CUDA/HIP memory set task.
-        template<typename TApi, typename TView, typename TExtent>
-        struct TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent>
-            : public TaskSetUniformCudaHipBase<TApi, DimInt<2u>, TView, TExtent>
-        {
-            template<typename TViewFwd>
-            TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
-                : TaskSetUniformCudaHipBase<TApi, DimInt<2u>, TView, TExtent>(
-                    std::forward<TViewFwd>(view),
-                    byte,
-                    extent)
-            {
-            }
-
-            template<typename TQueue>
-            auto enqueue(TQueue& queue) const -> void
-            {
-                auto& view = this->m_view;
-                auto const& extent = this->m_extent;
-
-                auto const extentWidth = getWidth(extent);
-                auto const extentHeight = getHeight(extent);
-
-                if(extentWidth == 0 || extentHeight == 0)
-                {
-                    return;
-                }
-
-                auto const extentWidthBytes = static_cast<std::size_t>(extentWidth) * sizeof(Elem<TView>);
-
-#    if !defined(NDEBUG)
-                auto const dstWidth = getWidth(view);
-                auto const dstHeight = getHeight(view);
-#    endif
-                auto const dstRowPitchBytes = static_cast<std::size_t>(getPitchesInBytes(view)[0]);
-                auto const dstNativePtr = reinterpret_cast<void*>(getPtrNative(view));
-                ALPAKA_ASSERT(extentWidth <= dstWidth);
-                ALPAKA_ASSERT(extentHeight <= dstHeight);
-
-                // Initiate the memory set.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memset2DAsync(
-                    dstNativePtr,
-                    dstRowPitchBytes,
-                    static_cast<int>(this->m_byte),
-                    extentWidthBytes,
-                    static_cast<std::size_t>(extentHeight),
-                    queue.getNativeHandle()));
-            }
-        };
-
-        //! The 3D CUDA/HIP memory set task.
-        template<typename TApi, typename TView, typename TExtent>
-        struct TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent>
-            : public TaskSetUniformCudaHipBase<TApi, DimInt<3u>, TView, TExtent>
-        {
-            template<typename TViewFwd>
-            TaskSetUniformCudaHip(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
-                : TaskSetUniformCudaHipBase<TApi, DimInt<3u>, TView, TExtent>(
-                    std::forward<TViewFwd>(view),
-                    byte,
-                    extent)
-            {
-            }
-
-            template<typename TQueue>
-            auto enqueue(TQueue& queue) const -> void
-            {
-                using Elem = alpaka::Elem<TView>;
-
-                auto& view = this->m_view;
-                auto const& extent = this->m_extent;
-
-                auto const extentWidth = getWidth(extent);
-                auto const extentHeight = getHeight(extent);
-                auto const extentDepth = getDepth(extent);
-
-                // This is not only an optimization but also prevents a division by zero.
-                if(extentWidth == 0 || extentHeight == 0 || extentDepth == 0)
-                {
-                    return;
-                }
-
-                auto const dstWidth = getWidth(view);
-#    if !defined(NDEBUG)
-                auto const dstHeight = getHeight(view);
-                auto const dstDepth = getDepth(view);
-#    endif
-                auto const [dstSlicePitchBytes, dstRowPitchBytes, _] = getPitchesInBytes(view);
-                auto const dstNativePtr = reinterpret_cast<void*>(getPtrNative(view));
-                ALPAKA_ASSERT(extentWidth <= dstWidth);
-                ALPAKA_ASSERT(extentHeight <= dstHeight);
-                ALPAKA_ASSERT(extentDepth <= dstDepth);
-
-                // Fill CUDA parameter structures.
-                typename TApi::PitchedPtr_t const pitchedPtrVal = TApi::makePitchedPtr(
-                    dstNativePtr,
-                    static_cast<std::size_t>(dstRowPitchBytes),
-                    static_cast<std::size_t>(dstWidth) * sizeof(Elem),
-                    static_cast<std::size_t>(dstSlicePitchBytes / dstRowPitchBytes));
-
-                typename TApi::Extent_t const extentVal = TApi::makeExtent(
-                    static_cast<std::size_t>(extentWidth) * sizeof(Elem),
-                    static_cast<std::size_t>(extentHeight),
-                    static_cast<std::size_t>(extentDepth));
-
-                // Initiate the memory set.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::memset3DAsync(
-                    pitchedPtrVal,
-                    static_cast<int>(this->m_byte),
-                    extentVal,
-                    queue.getNativeHandle()));
-            }
-        };
-    } // namespace detail
-
-    namespace trait
-    {
-        //! The CUDA device memory set trait specialization.
-        template<typename TApi, typename TDim>
-        struct CreateTaskMemset<TDim, DevUniformCudaHipRt<TApi>>
-        {
-            template<typename TExtent, typename TView>
-            ALPAKA_FN_HOST static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
-                -> alpaka::detail::TaskSetUniformCudaHip<TApi, TDim, TView, TExtent>
-            {
-                return alpaka::detail::TaskSetUniformCudaHip<TApi, TDim, TView, TExtent>(view, byte, extent);
-            }
-        };
-
-        //! The CUDA non-blocking device queue scalar set enqueue trait specialization.
-        template<typename TApi, typename TView, typename TExtent>
-        struct Enqueue<
-            QueueUniformCudaHipRtNonBlocking<TApi>,
-            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
-                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent> const& task) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                task.enqueue(queue);
-            }
-        };
-
-        //! The CUDA blocking device queue scalar set enqueue trait specialization.
-        template<typename TApi, typename TView, typename TExtent>
-        struct Enqueue<
-            QueueUniformCudaHipRtBlocking<TApi>,
-            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<0u>, TView, TExtent> const& task) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                task.enqueue(queue);
-
-                wait(queue);
-            }
-        };
-
-        //! The CUDA non-blocking device queue 1D set enqueue trait specialization.
-        template<typename TApi, typename TView, typename TExtent>
-        struct Enqueue<
-            QueueUniformCudaHipRtNonBlocking<TApi>,
-            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
-                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent> const& task) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                task.enqueue(queue);
-            }
-        };
-
-        //! The CUDA blocking device queue 1D set enqueue trait specialization.
-        template<typename TApi, typename TView, typename TExtent>
-        struct Enqueue<
-            QueueUniformCudaHipRtBlocking<TApi>,
-            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<1u>, TView, TExtent> const& task) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                task.enqueue(queue);
-
-                wait(queue);
-            }
-        };
-
-        //! The CUDA non-blocking device queue 2D set enqueue trait specialization.
-        template<typename TApi, typename TView, typename TExtent>
-        struct Enqueue<
-            QueueUniformCudaHipRtNonBlocking<TApi>,
-            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
-                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent> const& task) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                task.enqueue(queue);
-            }
-        };
-
-        //! The CUDA blocking device queue 2D set enqueue trait specialization.
-        template<typename TApi, typename TView, typename TExtent>
-        struct Enqueue<
-            QueueUniformCudaHipRtBlocking<TApi>,
-            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<2u>, TView, TExtent> const& task) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                task.enqueue(queue);
-
-                wait(queue);
-            }
-        };
-
-        //! The CUDA non-blocking device queue 3D set enqueue trait specialization.
-        template<typename TApi, typename TView, typename TExtent>
-        struct Enqueue<
-            QueueUniformCudaHipRtNonBlocking<TApi>,
-            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtNonBlocking<TApi>& queue,
-                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent> const& task) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                task.enqueue(queue);
-            }
-        };
-
-        //! The CUDA blocking device queue 3D set enqueue trait specialization.
-        template<typename TApi, typename TView, typename TExtent>
-        struct Enqueue<
-            QueueUniformCudaHipRtBlocking<TApi>,
-            alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueUniformCudaHipRtBlocking<TApi>& queue,
-                alpaka::detail::TaskSetUniformCudaHip<TApi, DimInt<3u>, TView, TExtent> const& task) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                task.enqueue(queue);
-
-                wait(queue);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/mem/fence/MemFenceCpu.hpp b/include/alpaka/mem/fence/MemFenceCpu.hpp
deleted file mode 100644
index 43b8cd9..0000000
--- a/include/alpaka/mem/fence/MemFenceCpu.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright 2022 Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/mem/fence/Traits.hpp"
-
-#include <atomic>
-
-namespace alpaka
-{
-    //! The default CPU memory fence.
-    class MemFenceCpu : public concepts::Implements<ConceptMemFence, MemFenceCpu>
-    {
-    };
-
-    namespace trait
-    {
-        template<typename TMemScope>
-        struct MemFence<MemFenceCpu, TMemScope>
-        {
-            static auto mem_fence(MemFenceCpu const&, TMemScope const&)
-            {
-                /*
-                 * Intuitively, std::atomic_thread_fence creates a fence on the block level.
-                 *
-                 * Creating a block fence is enough for the whole device because the blocks are executed serially. By
-                 * definition of fences, preceding blocks don't have a guarantee to see the results of this block's
-                 * STORE operations (only that they will be ordered correctly); the following blocks see the results
-                 * once they start. Consider the following code:
-                 *
-                 * int x = 1;
-                 * int y = 2;
-                 *
-                 * void foo()
-                 * {
-                 *     x = 10;
-                 *     alpaka::mem_fence(acc, memory_scope::device);
-                 *     y = 20;
-                 * }
-                 *
-                 * void bar()
-                 * {
-                 *     auto b = y;
-                 *     alpaka::mem_fence(acc, memory_scope::device);
-                 *     auto a = x;
-                 * }
-                 *
-                 * The following are all valid outcomes:
-                 *   a == 1 && b == 2
-                 *   a == 10 && b == 2
-                 *   a == 10 && b == 20
-                 */
-
-                std::atomic_thread_fence(std::memory_order_acq_rel);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/mem/fence/MemFenceCpuSerial.hpp b/include/alpaka/mem/fence/MemFenceCpuSerial.hpp
deleted file mode 100644
index df981f1..0000000
--- a/include/alpaka/mem/fence/MemFenceCpuSerial.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2022 Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/mem/fence/Traits.hpp"
-
-#include <atomic>
-
-namespace alpaka
-{
-    //! The serial CPU memory fence.
-    class MemFenceCpuSerial : public concepts::Implements<ConceptMemFence, MemFenceCpuSerial>
-    {
-    };
-
-    namespace trait
-    {
-        template<>
-        struct MemFence<MemFenceCpuSerial, memory_scope::Block>
-        {
-            static auto mem_fence(MemFenceCpuSerial const&, memory_scope::Block const&)
-            {
-                /* Nothing to be done on the block level for the serial case. */
-            }
-        };
-
-        template<>
-        struct MemFence<MemFenceCpuSerial, memory_scope::Grid>
-        {
-            static auto mem_fence(MemFenceCpuSerial const&, memory_scope::Grid const&)
-            {
-                /* Nothing to be done on the grid level for the serial case. */
-            }
-        };
-
-        template<typename TMemScope>
-        struct MemFence<MemFenceCpuSerial, TMemScope>
-        {
-            static auto mem_fence(MemFenceCpuSerial const&, TMemScope const&)
-            {
-                /* Enable device fences because we may want to synchronize with other (serial) kernels. */
-                std::atomic_thread_fence(std::memory_order_acq_rel);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
deleted file mode 100644
index 2c2cd9e..0000000
--- a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/mem/fence/Traits.hpp"
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    namespace detail
-    {
-        template<typename TAlpakaMemScope>
-        struct SyclFenceProps
-        {
-        };
-
-        template<>
-        struct SyclFenceProps<alpaka::memory_scope::Block>
-        {
-            static constexpr auto scope = sycl::memory_scope::work_group;
-        };
-
-        template<>
-        struct SyclFenceProps<alpaka::memory_scope::Device>
-        {
-            static constexpr auto scope = sycl::memory_scope::device;
-        };
-
-        template<>
-        struct SyclFenceProps<alpaka::memory_scope::Grid>
-        {
-            static constexpr auto scope = sycl::memory_scope::device;
-        };
-    } // namespace detail
-
-    //! The SYCL memory fence.
-    class MemFenceGenericSycl : public concepts::Implements<ConceptMemFence, MemFenceGenericSycl>
-    {
-    };
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    template<typename TMemScope>
-    struct MemFence<MemFenceGenericSycl, TMemScope>
-    {
-        static auto mem_fence(MemFenceGenericSycl const&, TMemScope const&)
-        {
-            static constexpr auto scope = alpaka::detail::SyclFenceProps<TMemScope>::scope;
-            sycl::atomic_fence(sycl::memory_order::acq_rel, scope);
-        }
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/mem/fence/MemFenceOmp2Blocks.hpp b/include/alpaka/mem/fence/MemFenceOmp2Blocks.hpp
deleted file mode 100644
index 09f7811..0000000
--- a/include/alpaka/mem/fence/MemFenceOmp2Blocks.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright 2022 Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/mem/fence/Traits.hpp"
-
-#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-
-#    if _OPENMP < 200203
-#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#    endif
-
-namespace alpaka
-{
-    //! The CPU OpenMP 2.0 block memory fence.
-    class MemFenceOmp2Blocks : public concepts::Implements<ConceptMemFence, MemFenceOmp2Blocks>
-    {
-    };
-
-    namespace trait
-    {
-        template<>
-        struct MemFence<MemFenceOmp2Blocks, memory_scope::Block>
-        {
-            static auto mem_fence(MemFenceOmp2Blocks const&, memory_scope::Block const&)
-            {
-                // Only one thread per block allowed -> no memory fence required on block level
-            }
-        };
-
-        template<>
-        struct MemFence<MemFenceOmp2Blocks, memory_scope::Grid>
-        {
-            static auto mem_fence(MemFenceOmp2Blocks const&, memory_scope::Grid const&)
-            {
-#    pragma omp flush
-            }
-        };
-
-        template<>
-        struct MemFence<MemFenceOmp2Blocks, memory_scope::Device>
-        {
-            static auto mem_fence(MemFenceOmp2Blocks const&, memory_scope::Device const&)
-            {
-#    pragma omp flush
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/mem/fence/MemFenceOmp2Threads.hpp b/include/alpaka/mem/fence/MemFenceOmp2Threads.hpp
deleted file mode 100644
index 45ba0d5..0000000
--- a/include/alpaka/mem/fence/MemFenceOmp2Threads.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright 2022 Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/mem/fence/Traits.hpp"
-
-#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-
-#    if _OPENMP < 200203
-#        error If ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#    endif
-
-namespace alpaka
-{
-    //! The CPU OpenMP 2.0 block memory fence.
-    class MemFenceOmp2Threads : public concepts::Implements<ConceptMemFence, MemFenceOmp2Threads>
-    {
-    };
-
-    namespace trait
-    {
-        template<typename TMemScope>
-        struct MemFence<MemFenceOmp2Threads, TMemScope>
-        {
-            static auto mem_fence(MemFenceOmp2Threads const&, TMemScope const&)
-            {
-                /*
-                 * Intuitively, this pragma creates a fence on the block level.
-                 *
-                 * Creating a block fence is enough for the whole device because the blocks are executed serially. By
-                 * definition of fences, preceding blocks don't have a guarantee to see the results of this block's
-                 * STORE operations (only that they will be ordered correctly); the following blocks see the results
-                 * once they start. Consider the following code:
-                 *
-                 * int x = 1;
-                 * int y = 2;
-                 *
-                 * void foo()
-                 * {
-                 *     x = 10;
-                 *     alpaka::mem_fence(acc, memory_scope::device);
-                 *     y = 20;
-                 * }
-                 *
-                 * void bar()
-                 * {
-                 *     auto b = y;
-                 *     alpaka::mem_fence(acc, memory_scope::device);
-                 *     auto a = x;
-                 * }
-                 *
-                 * The following are all valid outcomes:
-                 *   a == 1 && b == 2
-                 *   a == 10 && b == 2
-                 *   a == 10 && b == 20
-                 */
-#    pragma omp flush
-#    ifdef _MSC_VER
-                ; // MSVC needs an empty statement here or it diagnoses a syntax error
-#    endif
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-#endif
diff --git a/include/alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp b/include/alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index d94b1bc..0000000
--- a/include/alpaka/mem/fence/MemFenceUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2022 Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/mem/fence/Traits.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    //! The GPU CUDA/HIP memory fence.
-    class MemFenceUniformCudaHipBuiltIn : public concepts::Implements<ConceptMemFence, MemFenceUniformCudaHipBuiltIn>
-    {
-    };
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-    namespace trait
-    {
-        template<>
-        struct MemFence<MemFenceUniformCudaHipBuiltIn, memory_scope::Block>
-        {
-            __device__ static auto mem_fence(MemFenceUniformCudaHipBuiltIn const&, memory_scope::Block const&)
-            {
-                __threadfence_block();
-            }
-        };
-
-        template<>
-        struct MemFence<MemFenceUniformCudaHipBuiltIn, memory_scope::Grid>
-        {
-            __device__ static auto mem_fence(MemFenceUniformCudaHipBuiltIn const&, memory_scope::Grid const&)
-            {
-                // CUDA and HIP do not have a per-grid memory fence, so a device-level fence is used
-                __threadfence();
-            }
-        };
-
-        template<>
-        struct MemFence<MemFenceUniformCudaHipBuiltIn, memory_scope::Device>
-        {
-            __device__ static auto mem_fence(MemFenceUniformCudaHipBuiltIn const&, memory_scope::Device const&)
-            {
-                __threadfence();
-            }
-        };
-    } // namespace trait
-
-#    endif
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/mem/fence/Traits.hpp b/include/alpaka/mem/fence/Traits.hpp
deleted file mode 100644
index da02ff3..0000000
--- a/include/alpaka/mem/fence/Traits.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright 2022 Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-namespace alpaka
-{
-    struct ConceptMemFence
-    {
-    };
-
-    namespace memory_scope
-    {
-        //! Memory fences are observed by all threads in the same block.
-        struct Block
-        {
-        };
-
-        //! Memory fences are observed by all threads in the same grid.
-        struct Grid
-        {
-        };
-
-        //! Memory fences are observed by all threads on the device.
-        struct Device
-        {
-        };
-    } // namespace memory_scope
-
-    //! The memory fence trait.
-    namespace trait
-    {
-        //! The mem_fence trait.
-        template<typename TMemFence, typename TMemScope, typename TSfinae = void>
-        struct MemFence;
-    } // namespace trait
-
-    //! Issues memory fence instructions.
-    //
-    // Issues a memory fence instruction for a given memory scope (\a memory_scope::Block or \a memory_scope::Device).
-    // This guarantees the following:
-    // * All \a LOAD instructions preceeding the fence will always occur before the LOAD instructions following the
-    //   fence (\a LoadLoad coherence)
-    // * All \a STORE instructions preceeding the fence will always occur before the STORE instructions following the
-    //   fence (\a LoadStore coherence). The pre-fence STORE results will be propagated to the other threads in the
-    //   scope at an unknown point in time.
-    //
-    // Note that there are no further guarantees, especially with regard to \a LoadStore ordering. Users should not
-    // mistake this as a synchronization function between threads (please use syncBlockThreads() instead).
-    //
-    //! \tparam TMemFence The memory fence implementation type.
-    //! \tparam TMemScope The memory scope type.
-    //! \param fence The memory fence implementation.
-    //! \param scope The memory scope.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TMemFence, typename TMemScope>
-    ALPAKA_FN_ACC auto mem_fence(TMemFence const& fence, TMemScope const& scope) -> void
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptMemFence, TMemFence>;
-        trait::MemFence<ImplementationBase, TMemScope>::mem_fence(fence, scope);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/mem/global/DeviceGlobalCpu.hpp b/include/alpaka/mem/global/DeviceGlobalCpu.hpp
deleted file mode 100644
index aafcb06..0000000
--- a/include/alpaka/mem/global/DeviceGlobalCpu.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright 2024 Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/mem/buf/cpu/Copy.hpp"
-#include "alpaka/mem/global/Traits.hpp"
-#include "alpaka/mem/view/ViewPlainPtr.hpp"
-
-#include <type_traits>
-
-// memcpy specialization for device global variables
-namespace alpaka
-{
-
-    namespace detail
-    {
-        template<typename T>
-        struct DevGlobalTrait<TagCpuOmp2Blocks, T>
-        {
-            using Type = detail::DevGlobalImplGeneric<TagCpuOmp2Blocks, T>;
-        };
-
-        template<typename T>
-        struct DevGlobalTrait<TagCpuOmp2Threads, T>
-        {
-            using Type = detail::DevGlobalImplGeneric<TagCpuOmp2Threads, T>;
-        };
-
-        template<typename T>
-        struct DevGlobalTrait<TagCpuSerial, T>
-        {
-            using Type = detail::DevGlobalImplGeneric<TagCpuSerial, T>;
-        };
-
-        template<typename T>
-        struct DevGlobalTrait<TagCpuTbbBlocks, T>
-        {
-            using Type = detail::DevGlobalImplGeneric<TagCpuTbbBlocks, T>;
-        };
-
-        template<typename T>
-        struct DevGlobalTrait<TagCpuThreads, T>
-        {
-            using Type = detail::DevGlobalImplGeneric<TagCpuThreads, T>;
-        };
-    } // namespace detail
-
-    template<
-        typename TTag,
-        typename TViewSrc,
-        typename TTypeDst,
-        typename TQueue,
-        typename std::enable_if_t<
-            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
-                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
-                || std::is_same_v<TTag, TagCpuThreads>,
-            int>
-        = 0>
-    ALPAKA_FN_HOST auto memcpy(
-        TQueue& queue,
-        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
-        TViewSrc const& viewSrc) -> void
-    {
-        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
-        auto extent = getExtents(viewSrc);
-        auto view = alpaka::ViewPlainPtr<DevCpu, Type, alpaka::Dim<decltype(extent)>, alpaka::Idx<decltype(extent)>>(
-            reinterpret_cast<Type*>(const_cast<std::remove_const_t<TTypeDst>*>(&viewDst)),
-            alpaka::getDev(queue),
-            extent);
-        enqueue(queue, createTaskMemcpy(std::forward<decltype(view)>(view), viewSrc, extent));
-    }
-
-    template<
-        typename TTag,
-        typename TTypeSrc,
-        typename TViewDstFwd,
-        typename TQueue,
-        typename std::enable_if_t<
-            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
-                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
-                || std::is_same_v<TTag, TagCpuThreads>,
-            int>
-        = 0>
-    ALPAKA_FN_HOST auto memcpy(
-        TQueue& queue,
-        TViewDstFwd&& viewDst,
-        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc) -> void
-    {
-        using Type = std::remove_all_extents_t<TTypeSrc>;
-        auto extent = getExtents(viewDst);
-        auto view = alpaka::ViewPlainPtr<DevCpu, Type, alpaka::Dim<decltype(extent)>, alpaka::Idx<decltype(extent)>>(
-            reinterpret_cast<Type*>(&viewSrc),
-            alpaka::getDev(queue),
-            extent);
-        enqueue(queue, createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), view, extent));
-    }
-
-    template<
-        typename TTag,
-        typename TExtent,
-        typename TViewSrc,
-        typename TTypeDst,
-        typename TQueue,
-        typename std::enable_if_t<
-            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
-                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
-                || std::is_same_v<TTag, TagCpuThreads>,
-            int>
-        = 0>
-    ALPAKA_FN_HOST auto memcpy(
-        TQueue& queue,
-        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
-        TViewSrc const& viewSrc,
-        TExtent const& extent) -> void
-    {
-        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
-        auto view = alpaka::ViewPlainPtr<DevCpu, Type, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
-            reinterpret_cast<Type*>(const_cast<std::remove_const_t<TTypeDst>*>(&viewDst)),
-            alpaka::getDev(queue),
-            extent);
-        enqueue(queue, createTaskMemcpy(std::forward<decltype(view)>(view), viewSrc, extent));
-    }
-
-    template<
-        typename TTag,
-        typename TExtent,
-        typename TTypeSrc,
-        typename TViewDstFwd,
-        typename TQueue,
-        typename std::enable_if_t<
-            std::is_same_v<TTag, TagCpuOmp2Blocks> || std::is_same_v<TTag, TagCpuOmp2Threads>
-                || std::is_same_v<TTag, TagCpuSerial> || std::is_same_v<TTag, TagCpuTbbBlocks>
-                || std::is_same_v<TTag, TagCpuThreads>,
-            int>
-        = 0>
-    ALPAKA_FN_HOST auto memcpy(
-        TQueue& queue,
-        TViewDstFwd&& viewDst,
-        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc,
-        TExtent const& extent) -> void
-    {
-        using Type = std::remove_all_extents_t<TTypeSrc>;
-        auto view = alpaka::ViewPlainPtr<DevCpu, Type, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
-            reinterpret_cast<Type*>(&viewSrc),
-            alpaka::getDev(queue),
-            extent);
-        enqueue(queue, createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), view, extent));
-    }
-} // namespace alpaka
diff --git a/include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp b/include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp
deleted file mode 100644
index 56ee98c..0000000
--- a/include/alpaka/mem/global/DeviceGlobalGenericSycl.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright 2024 Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/elem/Traits.hpp"
-#include "alpaka/mem/global/Traits.hpp"
-#include "alpaka/queue/sycl/QueueGenericSyclBase.hpp"
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    namespace detail
-    {
-        template<typename T>
-        struct DevGlobalTrait<TagCpuSycl, T>
-        {
-            // SYCL CPU implementation
-            using Type = sycl::ext::oneapi::experimental::device_global<T>;
-        };
-
-        template<typename T>
-        struct DevGlobalTrait<TagGpuSyclIntel, T>
-        {
-            // SYCL GPU implementation
-            using Type = sycl::ext::oneapi::experimental::device_global<T>;
-        };
-
-        template<typename T>
-        struct DevGlobalTrait<TagFpgaSyclIntel, T>
-        {
-            // SYCL FPGA implementation
-            using Type = sycl::ext::oneapi::experimental::device_global<T>;
-        };
-
-        template<typename T>
-        struct DevGlobalTrait<TagGenericSycl, T>
-        {
-            // generic SYCL implementation
-            using Type = sycl::ext::oneapi::experimental::device_global<T>;
-        };
-    } // namespace detail
-
-    // from device to host
-    template<typename TDev, bool TBlocking, typename TViewDst, typename TTypeSrc>
-    ALPAKA_FN_HOST auto memcpy(
-        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
-        TViewDst&& viewDst,
-        sycl::ext::oneapi::experimental::device_global<TTypeSrc> const& viewSrc)
-    {
-        queue.getNativeHandle().memcpy(reinterpret_cast<void*>(getPtrNative(viewDst)), viewSrc);
-    }
-
-    // from host to device
-    template<typename TDev, bool TBlocking, typename TTypeDst, typename TViewSrc>
-    ALPAKA_FN_HOST auto memcpy(
-        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
-        sycl::ext::oneapi::experimental::device_global<TTypeDst>& viewDst,
-        TViewSrc const& viewSrc)
-    {
-        queue.getNativeHandle().memcpy(viewDst, reinterpret_cast<void const*>(getPtrNative(viewSrc)));
-    }
-
-    // from device to host
-    template<typename TDev, bool TBlocking, typename TViewDst, typename TTypeSrc, typename TExtent>
-    ALPAKA_FN_HOST auto memcpy(
-        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
-        TViewDst&& viewDst,
-        sycl::ext::oneapi::experimental::device_global<TTypeSrc> const& viewSrc,
-        TExtent extent)
-    {
-        using Elem = alpaka::Elem<std::remove_reference_t<TViewDst>>;
-        auto size = static_cast<std::size_t>(getHeight(extent)) * static_cast<std::size_t>(getDepth(extent))
-                    * static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem);
-        queue.getNativeHandle().memcpy(reinterpret_cast<void*>(getPtrNative(viewDst)), viewSrc, size);
-    }
-
-    // from host to device
-    template<typename TDev, bool TBlocking, typename TTypeDst, typename TViewSrc, typename TExtent>
-    ALPAKA_FN_HOST auto memcpy(
-        detail::QueueGenericSyclBase<TDev, TBlocking>& queue,
-        sycl::ext::oneapi::experimental::device_global<TTypeDst>& viewDst,
-        TViewSrc const& viewSrc,
-        TExtent extent)
-    {
-        using Elem = alpaka::Elem<TViewSrc>;
-        auto size = static_cast<std::size_t>(getHeight(extent)) * static_cast<std::size_t>(getDepth(extent))
-                    * static_cast<std::size_t>(getWidth(extent)) * sizeof(Elem);
-        queue.getNativeHandle().memcpy(viewDst, reinterpret_cast<void const*>(getPtrNative(viewSrc)), size);
-    }
-} // namespace alpaka
-#endif
diff --git a/include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp b/include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index 6b802fc..0000000
--- a/include/alpaka/mem/global/DeviceGlobalUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright 2024 Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-#include "alpaka/mem/global/Traits.hpp"
-#include "alpaka/mem/view/ViewPlainPtr.hpp"
-#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-#        include "alpaka/core/ApiCudaRt.hpp"
-#    endif
-
-#    if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-#        include "alpaka/core/ApiHipRt.hpp"
-#    endif
-
-namespace alpaka
-{
-
-    namespace detail
-    {
-        template<typename T>
-        struct DevGlobalTrait<TagGpuCudaRt, T>
-        {
-            // CUDA implementation
-            using Type = detail::DevGlobalImplGeneric<TagGpuCudaRt, T>;
-        };
-
-        template<typename T>
-        struct DevGlobalTrait<TagGpuHipRt, T>
-        {
-            // HIP/ROCm implementation
-            using Type = detail::DevGlobalImplGeneric<TagGpuHipRt, T>;
-        };
-    } // namespace detail
-
-    // from device to host
-    template<
-        typename TTag,
-        typename TApi,
-        bool TBlocking,
-        typename TViewDst,
-        typename TTypeSrc,
-        typename std::enable_if_t<
-#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
-#    else
-            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
-#    endif
-                ,
-            int>
-        = 0>
-    ALPAKA_FN_HOST auto memcpy(
-        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
-        TViewDst& viewDst,
-        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc)
-    {
-        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeSrc>>;
-        using TypeExt = std::remove_const_t<TTypeSrc>;
-        auto extent = getExtents(viewDst);
-        TypeExt* pMemAcc(nullptr);
-        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-            TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *(const_cast<TypeExt*>(&viewSrc))));
-
-        auto view = alpaka::ViewPlainPtr<
-            DevUniformCudaHipRt<TApi>,
-            Type,
-            alpaka::Dim<decltype(extent)>,
-            alpaka::Idx<decltype(extent)>>(reinterpret_cast<Type*>(pMemAcc), alpaka::getDev(queue), extent);
-        enqueue(queue, createTaskMemcpy(std::forward<TViewDst>(viewDst), view, extent));
-    }
-
-    // from host to device
-    template<
-        typename TTag,
-        typename TApi,
-        bool TBlocking,
-        typename TTypeDst,
-        typename TViewSrc,
-        typename std::enable_if_t<
-#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
-#    else
-            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
-#    endif
-                ,
-            int>
-        = 0>
-    ALPAKA_FN_HOST auto memcpy(
-        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
-        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
-        TViewSrc const& viewSrc)
-    {
-        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
-        using TypeExt = std::remove_const_t<TTypeDst>;
-        auto extent = getExtents(viewSrc);
-        Type* pMemAcc(nullptr);
-        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-            TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *(const_cast<TypeExt*>(&viewDst))));
-
-        auto view = alpaka::ViewPlainPtr<
-            DevUniformCudaHipRt<TApi>,
-            Type,
-            alpaka::Dim<decltype(extent)>,
-            alpaka::Idx<decltype(extent)>>(reinterpret_cast<Type*>(pMemAcc), alpaka::getDev(queue), extent);
-        enqueue(queue, createTaskMemcpy(std::forward<decltype(view)>(view), viewSrc, extent));
-    }
-
-    // from device to host
-    template<
-        typename TTag,
-        typename TApi,
-        bool TBlocking,
-        typename TViewDst,
-        typename TTypeSrc,
-        typename TExtent,
-        typename std::enable_if_t<
-#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
-#    else
-            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
-#    endif
-                ,
-            int>
-        = 0>
-    ALPAKA_FN_HOST auto memcpy(
-        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
-        TViewDst& viewDst,
-        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeSrc>& viewSrc,
-        TExtent extent)
-    {
-        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeSrc>>;
-        using TypeExt = std::remove_const_t<TTypeSrc>;
-        Type* pMemAcc(nullptr);
-        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-            TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *(const_cast<TypeExt*>(&viewSrc))));
-
-        auto view = alpaka::ViewPlainPtr<DevUniformCudaHipRt<TApi>, Type, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
-            reinterpret_cast<Type*>(pMemAcc),
-            alpaka::getDev(queue),
-            extent);
-        enqueue(queue, createTaskMemcpy(std::forward<TViewDst>(viewDst), view, extent));
-    }
-
-    // from host to device
-    template<
-        typename TTag,
-        typename TApi,
-        bool TBlocking,
-        typename TTypeDst,
-        typename TViewSrc,
-        typename TExtent,
-        typename std::enable_if_t<
-#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-            (std::is_same_v<TTag, TagGpuCudaRt> && std::is_same_v<TApi, ApiCudaRt>)
-#    else
-            (std::is_same_v<TTag, TagGpuHipRt> && std::is_same_v<TApi, ApiHipRt>)
-#    endif
-                ,
-            int>
-        = 0>
-    ALPAKA_FN_HOST auto memcpy(
-        uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
-        alpaka::detail::DevGlobalImplGeneric<TTag, TTypeDst>& viewDst,
-        TViewSrc const& viewSrc,
-        TExtent extent)
-    {
-        using Type = std::remove_const_t<std::remove_all_extents_t<TTypeDst>>;
-        using TypeExt = std::remove_const_t<TTypeDst>;
-        Type* pMemAcc(nullptr);
-        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-            TApi::getSymbolAddress(reinterpret_cast<void**>(&pMemAcc), *(const_cast<TypeExt*>(&viewDst))));
-
-        auto view = alpaka::ViewPlainPtr<DevUniformCudaHipRt<TApi>, Type, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
-            reinterpret_cast<Type*>(pMemAcc),
-            alpaka::getDev(queue),
-            extent);
-        enqueue(queue, createTaskMemcpy(std::forward<decltype(view)>(view), viewSrc, extent));
-    }
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/mem/global/Traits.hpp b/include/alpaka/mem/global/Traits.hpp
deleted file mode 100644
index 7b3c3d1..0000000
--- a/include/alpaka/mem/global/Traits.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2024 Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/meta/DependentFalseType.hpp"
-
-namespace alpaka
-{
-
-    namespace detail
-    {
-        template<typename TTag, typename T>
-        struct DevGlobalImplGeneric
-        {
-            // does not make use of TTag
-            using Type = std::remove_const_t<T>;
-            Type value; // backend specific value
-
-            ALPAKA_FN_HOST_ACC T* operator&()
-            {
-                return &value;
-            }
-
-            ALPAKA_FN_HOST_ACC T& get()
-            {
-                return value;
-            }
-        };
-
-        template<typename TTag, typename T>
-        struct DevGlobalTrait
-        {
-            static constexpr bool const IsImplementedFor = alpaka::meta::DependentFalseType<TTag>::value;
-
-            static_assert(IsImplementedFor, "Error: device global variables are not implemented for the given Tag");
-        };
-    } // namespace detail
-
-    template<typename TAcc, typename T>
-    using DevGlobal = typename detail::DevGlobalTrait<typename alpaka::trait::AccToTag<TAcc>::type, T>::Type;
-} // namespace alpaka
diff --git a/include/alpaka/mem/view/Traits.hpp b/include/alpaka/mem/view/Traits.hpp
deleted file mode 100644
index 5a9db5b..0000000
--- a/include/alpaka/mem/view/Traits.hpp
+++ /dev/null
@@ -1,614 +0,0 @@
-/* Copyright 2024 Axel Hübl, Benjamin Worpitz, Matthias Werner, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber,
- *                Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Unreachable.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/elem/Traits.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/meta/Fold.hpp"
-#include "alpaka/meta/Integral.hpp"
-#include "alpaka/offset/Traits.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/vec/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <array>
-#include <cstddef>
-#include <iosfwd>
-#include <type_traits>
-#include <vector>
-#ifdef ALPAKA_USE_MDSPAN
-#    include <experimental/mdspan>
-#endif
-
-namespace alpaka
-{
-    namespace detail
-    {
-        //! Calculate the pitches purely from the extents.
-        template<typename TElem, typename TDim, typename TIdx>
-        ALPAKA_FN_HOST_ACC inline constexpr auto calculatePitchesFromExtents(Vec<TDim, TIdx> const& extent)
-        {
-            Vec<TDim, TIdx> pitchBytes{};
-            constexpr auto dim = TIdx{TDim::value};
-            if constexpr(dim > 0)
-                pitchBytes.back() = static_cast<TIdx>(sizeof(TElem));
-            if constexpr(dim > 1)
-                for(TIdx i = TDim::value - 1; i > 0; i--)
-                    pitchBytes[i - 1] = extent[i] * pitchBytes[i];
-            return pitchBytes;
-        }
-    } // namespace detail
-
-    //! The view traits.
-    namespace trait
-    {
-        //! The native pointer get trait.
-        template<typename TView, typename TSfinae = void>
-        struct GetPtrNative;
-
-        //! The pointer on device get trait.
-        template<typename TView, typename TDev, typename TSfinae = void>
-        struct GetPtrDev;
-
-        //! The pitch in bytes.
-        //! This is the distance in bytes in the linear memory between two consecutive elements in the next higher
-        //! dimension (TIdx-1).
-        //!
-        //! The default implementation uses the extent to calculate the pitch.
-        template<typename TIdx, typename TView, typename TSfinae = void>
-        struct [[deprecated("Use GetPitchesInBytes instead")]] GetPitchBytes
-        {
-            using ViewIdx = Idx<TView>;
-
-            ALPAKA_FN_HOST static auto getPitchBytes(TView const& view) -> ViewIdx
-            {
-                return getPitchBytesDefault(view);
-            }
-
-        private:
-            static auto getPitchBytesDefault(TView const& view) -> ViewIdx
-            {
-                constexpr auto idx = TIdx::value;
-                constexpr auto viewDim = Dim<TView>::value;
-                if constexpr(idx < viewDim - 1)
-                {
-#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-                    return getExtents(view)[idx] * GetPitchBytes<DimInt<idx + 1>, TView>::getPitchBytes(view);
-#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-                }
-                else if constexpr(idx == viewDim - 1)
-                    return getExtents(view)[viewDim - 1] * static_cast<ViewIdx>(sizeof(Elem<TView>));
-                else
-                    return static_cast<ViewIdx>(sizeof(Elem<TView>));
-                ALPAKA_UNREACHABLE({});
-            }
-        };
-
-        //! Customization point for \ref getPitchesInBytes.
-        //! The default implementation uses the extent to calculate the pitches.
-        template<typename TView, typename TSfinae = void>
-        struct GetPitchesInBytes
-        {
-            ALPAKA_FN_HOST_ACC constexpr auto operator()(TView const& view) const
-            {
-                return alpaka::detail::calculatePitchesFromExtents<Elem<TView>>(getExtents(view));
-            }
-        };
-
-        //! The memory set task trait.
-        //!
-        //! Fills the view with data.
-        template<typename TDim, typename TDev, typename TSfinae = void>
-        struct CreateTaskMemset;
-
-        //! The memory copy task trait.
-        //!
-        //! Copies memory from one view into another view possibly on a different device.
-        template<typename TDim, typename TDevDst, typename TDevSrc, typename TSfinae = void>
-        struct CreateTaskMemcpy;
-
-        //! The device memory view creation trait.
-        template<typename TDev, typename TSfinae = void>
-        struct CreateViewPlainPtr;
-
-        //! The sub view creation trait.
-        template<typename TDev, typename TSfinae = void>
-        struct CreateSubView;
-    } // namespace trait
-
-    //! Gets the native pointer of the memory view.
-    //!
-    //! \param view The memory view.
-    //! \return The native pointer.
-    template<typename TView>
-    ALPAKA_FN_HOST auto getPtrNative(TView const& view) -> Elem<TView> const*
-    {
-        return trait::GetPtrNative<TView>::getPtrNative(view);
-    }
-
-    //! Gets the native pointer of the memory view.
-    //!
-    //! \param view The memory view.
-    //! \return The native pointer.
-    template<typename TView>
-    ALPAKA_FN_HOST auto getPtrNative(TView& view) -> Elem<TView>*
-    {
-        return trait::GetPtrNative<TView>::getPtrNative(view);
-    }
-
-    //! Gets the pointer to the view on the given device.
-    //!
-    //! \param view The memory view.
-    //! \param dev The device.
-    //! \return The pointer on the device.
-    template<typename TView, typename TDev>
-    ALPAKA_FN_HOST auto getPtrDev(TView const& view, TDev const& dev) -> Elem<TView> const*
-    {
-        return trait::GetPtrDev<TView, TDev>::getPtrDev(view, dev);
-    }
-
-    //! Gets the pointer to the view on the given device.
-    //!
-    //! \param view The memory view.
-    //! \param dev The device.
-    //! \return The pointer on the device.
-    template<typename TView, typename TDev>
-    ALPAKA_FN_HOST auto getPtrDev(TView& view, TDev const& dev) -> Elem<TView>*
-    {
-        return trait::GetPtrDev<TView, TDev>::getPtrDev(view, dev);
-    }
-
-    //! \return The pitch in bytes. This is the distance in bytes between two consecutive elements in the given
-    //! dimension.
-    template<std::size_t Tidx, typename TView>
-    [[deprecated("Use getPitchesInBytes instead")]] ALPAKA_FN_HOST auto getPitchBytes(TView const& view) -> Idx<TView>
-    {
-#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-        return trait::GetPitchBytes<DimInt<Tidx>, TView>::getPitchBytes(view);
-#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-    }
-
-    //! \return The pitches in bytes as an alpaka::Vec. This is the distance in bytes between two consecutive elements
-    //! in the given dimension.
-    //! E.g. for a 3D view without padding, the 0-dim pitch is the distance in bytes to jump from one element to the
-    //! next within the same row, the 1-dim pitch (aka. the row pitch) is the distance in bytes to jump from one
-    //! element to the neighboring element on the next row. The 2-dim pitch (aka. the slice pitch) is the distance in
-    //! bytes to jump from one element to the neighboring element on the next slice.
-    //! E.g. a 3D view of floats without padding and the extents {42, 10, 2}, would have a pitch vector of {80, 8, 4}.
-    template<typename TView>
-    ALPAKA_FN_HOST auto getPitchesInBytes(TView const& view) -> Vec<Dim<TView>, Idx<TView>>
-    {
-        return trait::GetPitchesInBytes<TView>{}(view);
-    }
-
-    //! Create a memory set task.
-    //!
-    //! \param view The memory view to fill.
-    //! \param byte Value to set for each element of the specified view.
-    //! \param extent The extent of the view to fill.
-    template<typename TExtent, typename TViewFwd>
-    ALPAKA_FN_HOST auto createTaskMemset(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
-    {
-        using TView = std::remove_reference_t<TViewFwd>;
-        static_assert(!std::is_const_v<TView>, "The view must not be const!");
-        static_assert(
-            Dim<TView>::value == Dim<TExtent>::value,
-            "The view and the extent are required to have the same dimensionality!");
-        static_assert(
-            meta::IsIntegralSuperset<Idx<TView>, Idx<TExtent>>::value,
-            "The view and the extent must have compatible index types!");
-
-        return trait::CreateTaskMemset<Dim<TView>, Dev<TView>>::createTaskMemset(
-            std::forward<TViewFwd>(view),
-            byte,
-            extent);
-    }
-
-    //! Sets the bytes of the memory of view, described by extent, to the given value.
-    //!
-    //! \param queue The queue to enqueue the view fill task into.
-    //! \param[in,out] view The memory view to fill. May be a temporary object.
-    //! \param byte Value to set for each element of the specified view.
-    //! \param extent The extent of the view to fill.
-    template<typename TExtent, typename TViewFwd, typename TQueue>
-    ALPAKA_FN_HOST auto memset(TQueue& queue, TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent) -> void
-    {
-        enqueue(queue, createTaskMemset(std::forward<TViewFwd>(view), byte, extent));
-    }
-
-    //! Sets each byte of the memory of the entire view to the given value.
-    //!
-    //! \param queue The queue to enqueue the view fill task into.
-    //! \param[in,out] view The memory view to fill. May be a temporary object.
-    //! \param byte Value to set for each element of the specified view.
-    template<typename TViewFwd, typename TQueue>
-    ALPAKA_FN_HOST auto memset(TQueue& queue, TViewFwd&& view, std::uint8_t const& byte) -> void
-    {
-        enqueue(queue, createTaskMemset(std::forward<TViewFwd>(view), byte, getExtents(view)));
-    }
-
-    //! Creates a memory copy task.
-    //!
-    //! \param viewDst The destination memory view.
-    //! \param viewSrc The source memory view.
-    //! \param extent The extent of the view to copy.
-    template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-    ALPAKA_FN_HOST auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
-    {
-        using TViewDst = std::remove_reference_t<TViewDstFwd>;
-        using SrcElem = Elem<TViewSrc>;
-        using DstElem = Elem<TViewDst>;
-        using ExtentIdx = Idx<TExtent>;
-        using DstIdx = Idx<TViewDst>;
-        using SrcIdx = Idx<TViewSrc>;
-
-        static_assert(!std::is_const_v<TViewDst>, "The destination view must not be const!");
-        static_assert(!std::is_const_v<DstElem>, "The destination view's element type must not be const!");
-        static_assert(
-            Dim<TViewDst>::value == Dim<TViewSrc>::value,
-            "The source and the destination view must have the same dimensionality!");
-        static_assert(
-            Dim<TViewDst>::value == Dim<TExtent>::value,
-            "The destination view and the extent must have the same dimensionality!");
-        static_assert(
-            std::is_same_v<DstElem, std::remove_const_t<SrcElem>>,
-            "The source and destination view must have the same element type!");
-        static_assert(
-            meta::IsIntegralSuperset<DstIdx, ExtentIdx>::value,
-            "The destination view and the extent are required to have compatible index types!");
-        static_assert(
-            meta::IsIntegralSuperset<SrcIdx, ExtentIdx>::value,
-            "The source view and the extent are required to have compatible index types!");
-
-        return trait::CreateTaskMemcpy<Dim<TViewDst>, Dev<TViewDst>, Dev<TViewSrc>>::createTaskMemcpy(
-            std::forward<TViewDstFwd>(viewDst),
-            viewSrc,
-            extent);
-    }
-
-    //! Copies memory from a part of viewSrc to viewDst, described by extent. Possibly copies between different memory
-    //! spaces.
-    //!
-    //! \param queue The queue to enqueue the view copy task into.
-    //! \param[in,out] viewDst The destination memory view. May be a temporary object.
-    //! \param viewSrc The source memory view. May be a temporary object.
-    //! \param extent The extent of the view to copy.
-    template<typename TExtent, typename TViewSrc, typename TViewDstFwd, typename TQueue>
-    ALPAKA_FN_HOST auto memcpy(TQueue& queue, TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
-        -> void
-    {
-        enqueue(queue, createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), viewSrc, extent));
-    }
-
-    //! Copies the entire memory of viewSrc to viewDst. Possibly copies between different memory
-    //! spaces.
-    //!
-    //! \param queue The queue to enqueue the view copy task into.
-    //! \param[in,out] viewDst The destination memory view. May be a temporary object.
-    //! \param viewSrc The source memory view. May be a temporary object.
-    template<typename TViewSrc, typename TViewDstFwd, typename TQueue>
-    ALPAKA_FN_HOST auto memcpy(TQueue& queue, TViewDstFwd&& viewDst, TViewSrc const& viewSrc) -> void
-    {
-        enqueue(queue, createTaskMemcpy(std::forward<TViewDstFwd>(viewDst), viewSrc, getExtents(viewSrc)));
-    }
-
-    namespace detail
-    {
-        template<typename TDim, typename TView>
-        struct Print
-        {
-            ALPAKA_FN_HOST static auto print(
-                TView const& view,
-                Elem<TView> const* const ptr,
-                Vec<Dim<TView>, Idx<TView>> const& extent,
-                std::ostream& os,
-                std::string const& elementSeparator,
-                std::string const& rowSeparator,
-                std::string const& rowPrefix,
-                std::string const& rowSuffix) -> void
-            {
-                os << rowPrefix;
-
-                auto const pitch = getPitchesInBytes(view)[TDim::value + 1];
-                auto const lastIdx(extent[TDim::value] - 1u);
-                for(auto i(decltype(lastIdx)(0)); i <= lastIdx; ++i)
-                {
-                    Print<DimInt<TDim::value + 1u>, TView>::print(
-                        view,
-                        reinterpret_cast<Elem<TView> const*>(reinterpret_cast<std::uint8_t const*>(ptr) + i * pitch),
-                        extent,
-                        os,
-                        elementSeparator,
-                        rowSeparator,
-                        rowPrefix,
-                        rowSuffix);
-
-                    // While we are not at the end of a row, add the row separator.
-                    if(i != lastIdx)
-                    {
-                        os << rowSeparator;
-                    }
-                }
-
-                os << rowSuffix;
-            }
-        };
-
-        template<typename TView>
-        struct Print<DimInt<Dim<TView>::value - 1u>, TView>
-        {
-            ALPAKA_FN_HOST static auto print(
-                TView const& /* view */,
-                Elem<TView> const* const ptr,
-                Vec<Dim<TView>, Idx<TView>> const& extent,
-                std::ostream& os,
-                std::string const& elementSeparator,
-                std::string const& /* rowSeparator */,
-                std::string const& rowPrefix,
-                std::string const& rowSuffix) -> void
-            {
-                os << rowPrefix;
-
-                auto const lastIdx(extent[Dim<TView>::value - 1u] - 1u);
-                for(auto i(decltype(lastIdx)(0)); i <= lastIdx; ++i)
-                {
-                    // Add the current element.
-                    os << *(ptr + i);
-
-                    // While we are not at the end of a line, add the element separator.
-                    if(i != lastIdx)
-                    {
-                        os << elementSeparator;
-                    }
-                }
-
-                os << rowSuffix;
-            }
-        };
-    } // namespace detail
-
-    //! Prints the content of the view to the given queue.
-    // \TODO: Add precision flag.
-    // \TODO: Add column alignment flag.
-    template<typename TView>
-    ALPAKA_FN_HOST auto print(
-        TView const& view,
-        std::ostream& os,
-        std::string const& elementSeparator = ", ",
-        std::string const& rowSeparator = "\n",
-        std::string const& rowPrefix = "[",
-        std::string const& rowSuffix = "]") -> void
-    {
-        detail::Print<DimInt<0u>, TView>::print(
-            view,
-            getPtrNative(view),
-            getExtents(view),
-            os,
-            elementSeparator,
-            rowSeparator,
-            rowPrefix,
-            rowSuffix);
-    }
-
-    //! \return The pitch vector.
-    template<typename TView>
-    [[deprecated("Use getPitchesInBytes instead")]] auto getPitchBytesVec(TView const& view)
-        -> Vec<Dim<TView>, Idx<TView>>
-    {
-        return getPitchesInBytes(view);
-    }
-
-    //! \return The pitch but only the last N elements.
-    template<typename TDim, typename TView>
-    ALPAKA_FN_HOST auto getPitchBytesVecEnd(TView const& view = TView()) -> Vec<TDim, Idx<TView>>
-    {
-        return subVecEnd<TDim>(getPitchesInBytes(view));
-    }
-
-    //! Creates a view to a device pointer
-    //!
-    //! \param dev Device from where pMem can be accessed.
-    //! \param pMem Pointer to memory. The pointer must be accessible from the given device.
-    //! \param extent Number of elements represented by the pMem.
-    //!               Using a multi dimensional extent will result in a multi dimension view to the memory represented
-    //!               by pMem.
-    //! \return A view to device memory.
-    template<typename TDev, typename TElem, typename TExtent>
-    auto createView(TDev const& dev, TElem* pMem, TExtent const& extent)
-    {
-        using Dim = alpaka::Dim<TExtent>;
-        using Idx = alpaka::Idx<TExtent>;
-        auto const extentVec = Vec<Dim, Idx>(extent);
-        return trait::CreateViewPlainPtr<TDev>::createViewPlainPtr(
-            dev,
-            pMem,
-            extentVec,
-            detail::calculatePitchesFromExtents<TElem>(extentVec));
-    }
-
-    //! Creates a view to a device pointer
-    //!
-    //! \param dev Device from where pMem can be accessed.
-    //! \param pMem Pointer to memory. The pointer must be accessible from the given device.
-    //! \param extent Number of elements represented by the pMem.
-    //!               Using a multi dimensional extent will result in a multi dimension view to the memory represented
-    //!               by pMem.
-    //! \param pitch Pitch in bytes for each dimension. Dimensionality must be equal to extent.
-    //! \return A view to device memory.
-    template<typename TDev, typename TElem, typename TExtent, typename TPitch>
-    auto createView(TDev const& dev, TElem* pMem, TExtent const& extent, TPitch pitch)
-    {
-        return trait::CreateViewPlainPtr<TDev>::createViewPlainPtr(dev, pMem, extent, pitch);
-    }
-
-    //! Creates a view to a contiguous container of device-accessible memory.
-    //!
-    //! \param dev Device from which the container can be accessed.
-    //! \param con Contiguous container. The container must provide a `data()` method. The data held by the container
-    //!            must be accessible from the given device. The `GetExtent` trait must be defined for the container.
-    //! \return A view to device memory.
-    template<typename TDev, typename TContainer>
-    auto createView(TDev const& dev, TContainer& con)
-    {
-        return createView(dev, std::data(con), getExtents(con));
-    }
-
-    //! Creates a view to a contiguous container of device-accessible memory.
-    //!
-    //! \param dev Device from which the container can be accessed.
-    //! \param con Contiguous container. The container must provide a `data()` method. The data held by the container
-    //!            must be accessible from the given device. The `GetExtent` trait must be defined for the container.
-    //! \param extent Number of elements held by the container. Using a multi-dimensional extent will result in a
-    //!               multi-dimensional view to the memory represented by the container.
-    //! \return A view to device memory.
-    template<typename TDev, typename TContainer, typename TExtent>
-    auto createView(TDev const& dev, TContainer& con, TExtent const& extent)
-    {
-        return createView(dev, std::data(con), extent);
-    }
-
-    //! Creates a sub view to an existing view.
-    //!
-    //! \param view The view this view is a sub-view of.
-    //! \param extent Number of elements the resulting view holds.
-    //! \param offset Number of elements skipped in view for the new origin of the resulting view.
-    //! \return A sub view to a existing view.
-    template<typename TView, typename TExtent, typename TOffsets>
-    auto createSubView(TView& view, TExtent const& extent, TOffsets const& offset = TExtent())
-    {
-        return trait::CreateSubView<typename trait::DevType<TView>::type>::createSubView(view, extent, offset);
-    }
-
-#ifdef ALPAKA_USE_MDSPAN
-    namespace experimental
-    {
-        // import mdspan into alpaka::experimental namespace. see: https://eel.is/c++draft/mdspan.syn
-        using std::experimental::default_accessor;
-        using std::experimental::dextents;
-        using std::experimental::extents;
-        using std::experimental::layout_left;
-        using std::experimental::layout_right;
-        using std::experimental::layout_stride;
-        using std::experimental::mdspan;
-        // import submdspan as well, which is not standardized yet
-        using std::experimental::full_extent;
-        using std::experimental::submdspan;
-
-        namespace traits
-        {
-            namespace detail
-            {
-                template<typename ElementType>
-                struct ByteIndexedAccessor
-                {
-                    using offset_policy = ByteIndexedAccessor;
-                    using element_type = ElementType;
-                    using reference = ElementType&;
-
-                    using data_handle_type
-                        = std::conditional_t<std::is_const_v<ElementType>, std::byte const*, std::byte*>;
-
-                    constexpr ByteIndexedAccessor() noexcept = default;
-
-                    ALPAKA_FN_HOST_ACC constexpr data_handle_type offset(data_handle_type p, size_t i) const noexcept
-                    {
-                        return p + i;
-                    }
-
-                    ALPAKA_FN_HOST_ACC constexpr reference access(data_handle_type p, size_t i) const noexcept
-                    {
-                        assert(i % alignof(ElementType) == 0);
-#    if BOOST_COMP_GNUC
-#        pragma GCC diagnostic push
-#        pragma GCC diagnostic ignored "-Wcast-align"
-#    endif
-                        return *reinterpret_cast<ElementType*>(p + i);
-#    if BOOST_COMP_GNUC
-#        pragma GCC diagnostic pop
-#    endif
-                    }
-                };
-
-                template<typename TView, std::size_t... Is>
-                ALPAKA_FN_HOST auto makeExtents(TView const& view, std::index_sequence<Is...>)
-                {
-                    auto const ex = getExtents(view);
-                    return std::experimental::dextents<Idx<TView>, Dim<TView>::value>{ex[Is]...};
-                }
-            } // namespace detail
-
-            //! Customization point for getting an mdspan from a view.
-            template<typename TView, typename TSfinae = void>
-            struct GetMdSpan
-            {
-                ALPAKA_FN_HOST static auto getMdSpan(TView& view)
-                {
-                    constexpr auto dim = Dim<TView>::value;
-                    using Element = Elem<TView>;
-                    auto extents = detail::makeExtents(view, std::make_index_sequence<dim>{});
-                    auto* ptr = reinterpret_cast<std::byte*>(getPtrNative(view));
-                    auto const strides = toArray(getPitchesInBytes(view));
-                    layout_stride::mapping<decltype(extents)> m{extents, strides};
-                    return mdspan<Element, decltype(extents), layout_stride, detail::ByteIndexedAccessor<Element>>{
-                        ptr,
-                        m};
-                }
-
-                ALPAKA_FN_HOST static auto getMdSpanTransposed(TView& view)
-                {
-                    constexpr auto dim = Dim<TView>::value;
-                    using Element = Elem<TView>;
-                    auto extents = detail::makeExtents(view, std::make_index_sequence<dim>{});
-                    auto* ptr = reinterpret_cast<std::byte*>(getPtrNative(view));
-                    auto strides = toArray(getPitchesInBytes(view));
-                    std::reverse(begin(strides), end(strides));
-                    layout_stride::mapping<decltype(extents)> m{extents, strides};
-                    return mdspan<Element, decltype(extents), layout_stride, detail::ByteIndexedAccessor<Element>>{
-                        ptr,
-                        m};
-                }
-            };
-        } // namespace traits
-
-        //! Gets a std::mdspan from the given view. The memory layout is determined by the pitches of the view.
-        template<typename TView>
-        ALPAKA_FN_HOST auto getMdSpan(TView& view)
-        {
-            return traits::GetMdSpan<TView>::getMdSpan(view);
-        }
-
-        //! Gets a std::mdspan from the given view. The memory layout is determined by the reversed pitches of the
-        //! view. This effectively also reverses the extents of the view. In order words, if you create a transposed
-        //! mdspan on a 10x5 element view, the mdspan will have an iteration space of 5x10.
-        template<typename TView>
-        ALPAKA_FN_HOST auto getMdSpanTransposed(TView& view)
-        {
-            return traits::GetMdSpan<TView>::getMdSpanTransposed(view);
-        }
-
-        template<typename TElem, typename TIdx, typename TDim>
-        using MdSpan = alpaka::experimental::mdspan<
-            TElem,
-            alpaka::experimental::dextents<TIdx, TDim::value>,
-            alpaka::experimental::layout_stride,
-            alpaka::experimental::traits::detail::ByteIndexedAccessor<TElem>>;
-    } // namespace experimental
-#endif
-} // namespace alpaka
diff --git a/include/alpaka/mem/view/ViewAccessOps.hpp b/include/alpaka/mem/view/ViewAccessOps.hpp
deleted file mode 100644
index 2705667..0000000
--- a/include/alpaka/mem/view/ViewAccessOps.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright 2023 Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-
-#include <cstdint>
-#include <sstream>
-#include <stdexcept>
-#include <type_traits>
-#include <utility>
-
-namespace alpaka::internal
-{
-    template<typename T, typename SFINAE = void>
-    inline constexpr bool isView = false;
-
-    // TODO(bgruber): replace this by a concept in C++20
-    template<typename TView>
-    inline constexpr bool isView<
-        TView,
-        std::void_t<
-            Idx<TView>,
-            Dim<TView>,
-            decltype(getPtrNative(std::declval<TView>())),
-            decltype(getPitchesInBytes(std::declval<TView>())),
-            decltype(getExtents(std::declval<TView>()))>>
-        = true;
-
-    template<typename TView>
-    struct ViewAccessOps
-    {
-        static_assert(isView<TView>);
-
-    private:
-        using value_type = Elem<TView>;
-        using pointer = value_type*;
-        using const_pointer = value_type const*;
-        using reference = value_type&;
-        using const_reference = value_type const&;
-        using Idx = alpaka::Idx<TView>;
-        using Dim = alpaka::Dim<TView>;
-
-    public:
-        ALPAKA_FN_HOST auto data() -> pointer
-        {
-            return getPtrNative(*static_cast<TView*>(this));
-        }
-
-        [[nodiscard]] ALPAKA_FN_HOST auto data() const -> const_pointer
-        {
-            return getPtrNative(*static_cast<TView const*>(this));
-        }
-
-        ALPAKA_FN_HOST auto operator*() -> reference
-        {
-            static_assert(Dim::value == 0, "operator* is only valid for Buffers and Views of dimension 0");
-            return *data();
-        }
-
-        ALPAKA_FN_HOST auto operator*() const -> const_reference
-        {
-            static_assert(Dim::value == 0, "operator* is only valid for Buffers and Views of dimension 0");
-            return *data();
-        }
-
-        ALPAKA_FN_HOST auto operator->() -> pointer
-        {
-            static_assert(Dim::value == 0, "operator-> is only valid for Buffers and Views of dimension 0");
-            return data();
-        }
-
-        ALPAKA_FN_HOST auto operator->() const -> const_pointer
-        {
-            static_assert(Dim::value == 0, "operator-> is only valid for Buffers and Views of dimension 0");
-            return data();
-        }
-
-        ALPAKA_FN_HOST auto operator[](Idx i) -> reference
-        {
-            static_assert(Dim::value == 1, "operator[i] is only valid for Buffers and Views of dimension 1");
-            return data()[i];
-        }
-
-        ALPAKA_FN_HOST auto operator[](Idx i) const -> const_reference
-        {
-            static_assert(Dim::value == 1, "operator[i] is only valid for Buffers and Views of dimension 1");
-            return data()[i];
-        }
-
-    private:
-        template<typename TIdx>
-        [[nodiscard]] ALPAKA_FN_HOST auto ptr_at([[maybe_unused]] Vec<Dim, TIdx> index) const -> const_pointer
-        {
-            static_assert(
-                std::is_convertible_v<TIdx, Idx>,
-                "the index type must be convertible to the index of the Buffer or View");
-
-            auto ptr = reinterpret_cast<std::uintptr_t>(data());
-            if constexpr(Dim::value > 0)
-            {
-                ptr += static_cast<std::uintptr_t>(
-                    (getPitchesInBytes(*static_cast<TView const*>(this)) * castVec<Idx>(index)).sum());
-            }
-            return reinterpret_cast<const_pointer>(ptr);
-        }
-
-    public:
-        template<typename TIdx>
-        ALPAKA_FN_HOST auto operator[](Vec<Dim, TIdx> index) -> reference
-        {
-            return *const_cast<pointer>(ptr_at(index));
-        }
-
-        template<typename TIdx>
-        ALPAKA_FN_HOST auto operator[](Vec<Dim, TIdx> index) const -> const_reference
-        {
-            return *ptr_at(index);
-        }
-
-        template<typename TIdx>
-        ALPAKA_FN_HOST auto at(Vec<Dim, TIdx> index) -> reference
-        {
-            auto extent = getExtents(*static_cast<TView*>(this));
-            if(!(index < extent).all())
-            {
-                std::stringstream msg;
-                msg << "index " << index << " is outside of the Buffer or View extent " << extent;
-                throw std::out_of_range(msg.str());
-            }
-            return *const_cast<pointer>(ptr_at(index));
-        }
-
-        template<typename TIdx>
-        [[nodiscard]] ALPAKA_FN_HOST auto at(Vec<Dim, TIdx> index) const -> const_reference
-        {
-            auto extent = getExtents(*static_cast<TView const*>(this));
-            if(!(index < extent).all())
-            {
-                std::stringstream msg;
-                msg << "index " << index << " is outside of the Buffer or View extent " << extent;
-                throw std::out_of_range(msg.str());
-            }
-            return *ptr_at(index);
-        }
-    };
-} // namespace alpaka::internal
diff --git a/include/alpaka/mem/view/ViewConst.hpp b/include/alpaka/mem/view/ViewConst.hpp
deleted file mode 100644
index a4cd5db..0000000
--- a/include/alpaka/mem/view/ViewConst.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright 2022 Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/mem/view/ViewAccessOps.hpp"
-#include "alpaka/offset/Traits.hpp"
-
-namespace alpaka
-{
-    //! A non-modifiable wrapper around a view. This view acts as the wrapped view, but the underlying data is only
-    //! exposed const-qualified.
-    template<typename TView>
-    struct ViewConst : internal::ViewAccessOps<ViewConst<TView>>
-    {
-        static_assert(!std::is_const_v<TView>, "ViewConst must be instantiated with a non-const type");
-        static_assert(
-            !std::is_reference_v<TView>,
-            "This is not implemented"); // It might even be dangerous for ViewConst to store a reference to the wrapped
-                                        // view, as this decouples the wrapped view's lifetime.
-
-        ALPAKA_FN_HOST ViewConst(TView const& view) : m_view(view)
-        {
-        }
-
-        ALPAKA_FN_HOST ViewConst(TView&& view) : m_view(std::move(view))
-        {
-        }
-
-        TView m_view;
-    };
-
-    template<typename TView>
-    ViewConst(TView) -> ViewConst<std::decay_t<TView>>;
-
-    namespace trait
-    {
-        template<typename TView>
-        struct DevType<ViewConst<TView>> : DevType<TView>
-        {
-        };
-
-        template<typename TView>
-        struct GetDev<ViewConst<TView>>
-        {
-            ALPAKA_FN_HOST static auto getDev(ViewConst<TView> const& view)
-            {
-                return alpaka::getDev(view.m_view);
-            }
-        };
-
-        template<typename TView>
-        struct DimType<ViewConst<TView>> : DimType<TView>
-        {
-        };
-
-        template<typename TView>
-        struct ElemType<ViewConst<TView>>
-        {
-            // const qualify the element type of the inner view
-            using type = typename ElemType<TView>::type const;
-        };
-
-        template<typename TView>
-        struct GetExtents<ViewConst<TView>>
-        {
-            ALPAKA_FN_HOST auto operator()(ViewConst<TView> const& view) const
-            {
-                return getExtents(view.m_view);
-            }
-        };
-
-        template<typename TView>
-        struct GetPtrNative<ViewConst<TView>>
-        {
-            using TElem = typename ElemType<TView>::type;
-
-            // const qualify the element type of the inner view
-            ALPAKA_FN_HOST static auto getPtrNative(ViewConst<TView> const& view) -> TElem const*
-            {
-                return alpaka::getPtrNative(view.m_view);
-            }
-        };
-
-        template<typename TView>
-        struct GetPitchesInBytes<ViewConst<TView>>
-        {
-            ALPAKA_FN_HOST auto operator()(ViewConst<TView> const& view) const
-            {
-                return alpaka::getPitchesInBytes(view.m_view);
-            }
-        };
-
-        template<typename TView>
-        struct GetOffsets<ViewConst<TView>>
-        {
-            ALPAKA_FN_HOST auto operator()(ViewConst<TView> const& view) const
-            {
-                return alpaka::getOffsets(view.m_view);
-            }
-        };
-
-        template<typename TView>
-        struct IdxType<ViewConst<TView>> : IdxType<TView>
-        {
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp
deleted file mode 100644
index dda4a17..0000000
--- a/include/alpaka/mem/view/ViewPlainPtr.hpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber,
- *                Jan Stephan, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/mem/view/ViewAccessOps.hpp"
-#include "alpaka/meta/DependentFalseType.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    //! The memory view to wrap plain pointers.
-    template<typename TDev, typename TElem, typename TDim, typename TIdx>
-    struct ViewPlainPtr final : internal::ViewAccessOps<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-    {
-        static_assert(!std::is_const_v<TIdx>, "The idx type of the view can not be const!");
-
-        template<typename TExtent>
-        ALPAKA_FN_HOST ViewPlainPtr(TElem* pMem, TDev dev, TExtent const& extent = TExtent())
-            : ViewPlainPtr(pMem, std::move(dev), extent, detail::calculatePitchesFromExtents<TElem>(extent))
-        {
-        }
-
-        template<typename TExtent, typename TPitch>
-        ALPAKA_FN_HOST ViewPlainPtr(TElem* pMem, TDev dev, TExtent const& extent, TPitch pitchBytes)
-            : m_pMem(pMem)
-            , m_dev(std::move(dev))
-            , m_extentElements(extent)
-            , m_pitchBytes(static_cast<Vec<TDim, TIdx>>(pitchBytes))
-        {
-        }
-
-        TElem* m_pMem;
-        TDev m_dev;
-        Vec<TDim, TIdx> m_extentElements;
-        Vec<TDim, TIdx> m_pitchBytes;
-    };
-
-    // Trait specializations for ViewPlainPtr.
-    namespace trait
-    {
-        //! The ViewPlainPtr device type trait specialization.
-        template<typename TDev, typename TElem, typename TDim, typename TIdx>
-        struct DevType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-        {
-            using type = alpaka::Dev<TDev>;
-        };
-
-        //! The ViewPlainPtr device get trait specialization.
-        template<typename TDev, typename TElem, typename TDim, typename TIdx>
-        struct GetDev<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-        {
-            static auto getDev(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) -> alpaka::Dev<TDev>
-            {
-                return view.m_dev;
-            }
-        };
-
-        //! The ViewPlainPtr dimension getter trait.
-        template<typename TDev, typename TElem, typename TDim, typename TIdx>
-        struct DimType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The ViewPlainPtr memory element type get trait specialization.
-        template<typename TDev, typename TElem, typename TDim, typename TIdx>
-        struct ElemType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-        {
-            using type = TElem;
-        };
-    } // namespace trait
-
-    namespace trait
-    {
-        template<typename TDev, typename TElem, typename TDim, typename TIdx>
-        struct GetExtents<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) const
-            {
-                return view.m_extentElements;
-            }
-        };
-
-        //! The ViewPlainPtr native pointer get trait specialization.
-        template<typename TDev, typename TElem, typename TDim, typename TIdx>
-        struct GetPtrNative<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-        {
-            static auto getPtrNative(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) -> TElem const*
-            {
-                return view.m_pMem;
-            }
-
-            static auto getPtrNative(ViewPlainPtr<TDev, TElem, TDim, TIdx>& view) -> TElem*
-            {
-                return view.m_pMem;
-            }
-        };
-
-        template<typename TDev, typename TElem, typename TDim, typename TIdx>
-        struct GetPitchesInBytes<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(ViewPlainPtr<TDev, TElem, TDim, TIdx> const& view) const
-            {
-                return view.m_pitchBytes;
-            }
-        };
-
-        //! The CPU device CreateViewPlainPtr trait specialization.
-        template<>
-        struct CreateViewPlainPtr<DevCpu>
-        {
-            template<typename TElem, typename TExtent, typename TPitch>
-            static auto createViewPlainPtr(DevCpu const& dev, TElem* pMem, TExtent const& extent, TPitch pitch)
-            {
-                return alpaka::ViewPlainPtr<DevCpu, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
-                    pMem,
-                    dev,
-                    extent,
-                    pitch);
-            }
-        };
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-        //! The CUDA/HIP RT device CreateViewPlainPtr trait specialization.
-        template<typename TApi>
-        struct CreateViewPlainPtr<DevUniformCudaHipRt<TApi>>
-        {
-            template<typename TElem, typename TExtent, typename TPitch>
-            static auto createViewPlainPtr(
-                DevUniformCudaHipRt<TApi> const& dev,
-                TElem* pMem,
-                TExtent const& extent,
-                TPitch pitch)
-            {
-                return alpaka::
-                    ViewPlainPtr<DevUniformCudaHipRt<TApi>, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
-                        pMem,
-                        dev,
-                        extent,
-                        pitch);
-            }
-        };
-#endif
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED)
-        //! The SYCL device CreateViewPlainPtr trait specialization.
-        template<typename TTag>
-        struct CreateViewPlainPtr<DevGenericSycl<TTag>>
-        {
-            template<typename TElem, typename TExtent, typename TPitch>
-            static auto createViewPlainPtr(
-                DevGenericSycl<TTag> const& dev,
-                TElem* pMem,
-                TExtent const& extent,
-                TPitch pitch)
-            {
-                return alpaka::ViewPlainPtr<DevGenericSycl<TTag>, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
-                    pMem,
-                    dev,
-                    extent,
-                    pitch);
-            }
-        };
-#endif
-        //! The ViewPlainPtr offset get trait specialization.
-        template<typename TDev, typename TElem, typename TDim, typename TIdx>
-        struct GetOffsets<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(ViewPlainPtr<TDev, TElem, TDim, TIdx> const&) const -> Vec<TDim, TIdx>
-            {
-                return Vec<TDim, TIdx>::zeros();
-            }
-        };
-
-        //! The ViewPlainPtr idx type trait specialization.
-        template<typename TDev, typename TElem, typename TDim, typename TIdx>
-        struct IdxType<ViewPlainPtr<TDev, TElem, TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/mem/view/ViewStdArray.hpp b/include/alpaka/mem/view/ViewStdArray.hpp
deleted file mode 100644
index de01ec8..0000000
--- a/include/alpaka/mem/view/ViewStdArray.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-/* TODO: Once C++20 is available remove this file and replace with a generic ContiguousContainer solution based on
- * concepts. It should be sufficient to check for the existence of Container.size() and Container.data() */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/platform/PlatformCpu.hpp"
-
-#include <array>
-
-namespace alpaka::trait
-{
-    //! The std::array device type trait specialization.
-    template<typename TElem, std::size_t Tsize>
-    struct DevType<std::array<TElem, Tsize>>
-    {
-        using type = DevCpu;
-    };
-
-    //! The std::array device get trait specialization.
-    template<typename TElem, std::size_t Tsize>
-    struct GetDev<std::array<TElem, Tsize>>
-    {
-        ALPAKA_FN_HOST static auto getDev(std::array<TElem, Tsize> const& /* view */) -> DevCpu
-        {
-            // Instantiating the CPU platform here is a hack we can do internally, because we know that the CPU
-            // platform does not contain any data. But it generally does not apply.
-            return getDevByIdx(PlatformCpu{}, 0u);
-        }
-    };
-
-    //! The std::array dimension getter trait specialization.
-    template<typename TElem, std::size_t Tsize>
-    struct DimType<std::array<TElem, Tsize>>
-    {
-        using type = DimInt<1u>;
-    };
-
-    //! The std::array memory element type get trait specialization.
-    template<typename TElem, std::size_t Tsize>
-    struct ElemType<std::array<TElem, Tsize>>
-    {
-        using type = TElem;
-    };
-
-    template<typename TElem, std::size_t Tsize>
-    struct GetExtents<std::array<TElem, Tsize>>
-    {
-        ALPAKA_FN_HOST constexpr auto operator()(std::array<TElem, Tsize> const& a)
-            -> Vec<DimInt<1>, Idx<std::array<TElem, Tsize>>>
-        {
-            return {std::size(a)};
-        }
-    };
-
-    //! The std::array native pointer get trait specialization.
-    template<typename TElem, std::size_t Tsize>
-    struct GetPtrNative<std::array<TElem, Tsize>>
-    {
-        ALPAKA_FN_HOST static auto getPtrNative(std::array<TElem, Tsize> const& view) -> TElem const*
-        {
-            return std::data(view);
-        }
-
-        ALPAKA_FN_HOST static auto getPtrNative(std::array<TElem, Tsize>& view) -> TElem*
-        {
-            return std::data(view);
-        }
-    };
-
-    //! The std::array offset get trait specialization.
-    template<typename TElem, std::size_t Tsize>
-    struct GetOffsets<std::array<TElem, Tsize>>
-    {
-        ALPAKA_FN_HOST auto operator()(std::array<TElem, Tsize> const&)
-            -> Vec<DimInt<1>, Idx<std::array<TElem, Tsize>>>
-        {
-            return {0};
-        }
-    };
-
-    //! The std::vector idx type trait specialization.
-    template<typename TElem, std::size_t Tsize>
-    struct IdxType<std::array<TElem, Tsize>>
-    {
-        using type = std::size_t;
-    };
-} // namespace alpaka::trait
diff --git a/include/alpaka/mem/view/ViewStdVector.hpp b/include/alpaka/mem/view/ViewStdVector.hpp
deleted file mode 100644
index e09b370..0000000
--- a/include/alpaka/mem/view/ViewStdVector.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-/* TODO: Once C++20 is available remove this file and replace with a generic ContiguousContainer solution based on
- * concepts. It should be sufficient to check for the existence of Container.size() and Container.data() */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/platform/PlatformCpu.hpp"
-
-#include <vector>
-
-namespace alpaka::trait
-{
-    //! The std::vector device type trait specialization.
-    template<typename TElem, typename TAllocator>
-    struct DevType<std::vector<TElem, TAllocator>>
-    {
-        using type = DevCpu;
-    };
-
-    //! The std::vector device get trait specialization.
-    template<typename TElem, typename TAllocator>
-    struct GetDev<std::vector<TElem, TAllocator>>
-    {
-        ALPAKA_FN_HOST static auto getDev(std::vector<TElem, TAllocator> const& /* view */) -> DevCpu
-        {
-            return getDevByIdx(PlatformCpu{}, 0u);
-        }
-    };
-
-    //! The std::vector dimension getter trait specialization.
-    template<typename TElem, typename TAllocator>
-    struct DimType<std::vector<TElem, TAllocator>>
-    {
-        using type = DimInt<1u>;
-    };
-
-    //! The std::vector memory element type get trait specialization.
-    template<typename TElem, typename TAllocator>
-    struct ElemType<std::vector<TElem, TAllocator>>
-    {
-        using type = TElem;
-    };
-
-    template<typename TElem, typename TAllocator>
-    struct GetExtents<std::vector<TElem, TAllocator>>
-    {
-        ALPAKA_FN_HOST constexpr auto operator()(std::vector<TElem, TAllocator> const& a)
-            -> Vec<DimInt<1>, Idx<std::vector<TElem, TAllocator>>>
-        {
-            return {std::size(a)};
-        }
-    };
-
-    //! The std::vector native pointer get trait specialization.
-    template<typename TElem, typename TAllocator>
-    struct GetPtrNative<std::vector<TElem, TAllocator>>
-    {
-        ALPAKA_FN_HOST static auto getPtrNative(std::vector<TElem, TAllocator> const& view) -> TElem const*
-        {
-            return std::data(view);
-        }
-
-        ALPAKA_FN_HOST static auto getPtrNative(std::vector<TElem, TAllocator>& view) -> TElem*
-        {
-            return std::data(view);
-        }
-    };
-
-    //! The std::vector offset get trait specialization.
-    template<typename TElem, typename TAllocator>
-    struct GetOffsets<std::vector<TElem, TAllocator>>
-    {
-        ALPAKA_FN_HOST auto operator()(std::vector<TElem, TAllocator> const&) const
-            -> Vec<DimInt<1>, Idx<std::vector<TElem, TAllocator>>>
-        {
-            return {0};
-        }
-    };
-
-    //! The std::vector idx type trait specialization.
-    template<typename TElem, typename TAllocator>
-    struct IdxType<std::vector<TElem, TAllocator>>
-    {
-        using type = std::size_t;
-    };
-} // namespace alpaka::trait
diff --git a/include/alpaka/mem/view/ViewSubView.hpp b/include/alpaka/mem/view/ViewSubView.hpp
deleted file mode 100644
index a35fa22..0000000
--- a/include/alpaka/mem/view/ViewSubView.hpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/mem/view/Traits.hpp"
-#include "alpaka/mem/view/ViewAccessOps.hpp"
-#include "alpaka/mem/view/ViewPlainPtr.hpp"
-#include "alpaka/offset/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    //! A sub-view to a view.
-    template<typename TDev, typename TElem, typename TDim, typename TIdx>
-    class ViewSubView : public internal::ViewAccessOps<ViewSubView<TDev, TElem, TDim, TIdx>>
-    {
-        static_assert(!std::is_const_v<TIdx>, "The idx type of the view can not be const!");
-
-        using Dev = alpaka::Dev<TDev>;
-
-    public:
-        //! Constructor.
-        //! \param view The view this view is a sub-view of.
-        //! \param extentElements The extent in elements.
-        //! \param relativeOffsetsElements The offsets in elements.
-        template<typename TQualifiedView, typename TOffsets, typename TExtent>
-        ViewSubView(
-            TQualifiedView& view,
-            TExtent const& extentElements,
-            TOffsets const& relativeOffsetsElements = TOffsets())
-            : m_viewParentView(getPtrNative(view), getDev(view), getExtents(view), getPitchesInBytes(view))
-            , m_extentElements(getExtents(extentElements))
-            , m_offsetsElements(getOffsets(relativeOffsetsElements))
-            , m_nativePtr(computeNativePtr())
-        {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-            using View = std::remove_cv_t<TQualifiedView>;
-
-            static_assert(
-                std::is_same_v<Dev, alpaka::Dev<View>>,
-                "The dev type of TView and the Dev template parameter have to be identical!");
-
-            static_assert(
-                std::is_same_v<TIdx, alpaka::Idx<View>>,
-                "The idx type of TView and the TIdx template parameter have to be identical!");
-            static_assert(
-                std::is_same_v<TIdx, alpaka::Idx<TExtent>>,
-                "The idx type of TExtent and the TIdx template parameter have to be identical!");
-            static_assert(
-                std::is_same_v<TIdx, alpaka::Idx<TOffsets>>,
-                "The idx type of TOffsets and the TIdx template parameter have to be identical!");
-
-            static_assert(
-                std::is_same_v<TDim, alpaka::Dim<View>>,
-                "The dim type of TView and the TDim template parameter have to be identical!");
-            static_assert(
-                std::is_same_v<TDim, alpaka::Dim<TExtent>>,
-                "The dim type of TExtent and the TDim template parameter have to be identical!");
-            static_assert(
-                std::is_same_v<TDim, alpaka::Dim<TOffsets>>,
-                "The dim type of TOffsets and the TDim template parameter have to be identical!");
-
-            ALPAKA_ASSERT(((m_offsetsElements + m_extentElements) <= getExtents(view)).all());
-        }
-
-        //! \param view The view this view is a sub-view of.
-        template<typename TView>
-        explicit ViewSubView(TView const& view) : ViewSubView(view, getExtents(view), Vec<TDim, TIdx>::zeros())
-        {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-        }
-
-        //! \param view The view this view is a sub-view of.
-        template<typename TView>
-        explicit ViewSubView(TView& view) : ViewSubView(view, getExtents(view), Vec<TDim, TIdx>::zeros())
-        {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-        }
-
-    public:
-        ALPAKA_FN_HOST auto computeNativePtr()
-        {
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-            // "cast from 'std::uint8_t*' to 'TElem*' increases required alignment of target type"
-#    pragma GCC diagnostic ignored "-Wcast-align"
-#endif
-            return reinterpret_cast<TElem*>(
-                reinterpret_cast<std::uint8_t*>(alpaka::getPtrNative(m_viewParentView))
-                + (m_offsetsElements * getPitchesInBytes(m_viewParentView)).sum());
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-        }
-
-        ViewPlainPtr<Dev, TElem, TDim, TIdx> m_viewParentView; // This wraps the parent view.
-        Vec<TDim, TIdx> m_extentElements; // The extent of this view.
-        Vec<TDim, TIdx> m_offsetsElements; // The offset relative to the parent view.
-        TElem* m_nativePtr;
-    };
-
-    // Trait specializations for ViewSubView.
-    namespace trait
-    {
-        //! The ViewSubView device type trait specialization.
-        template<typename TElem, typename TDim, typename TDev, typename TIdx>
-        struct DevType<ViewSubView<TDev, TElem, TDim, TIdx>>
-        {
-            using type = alpaka::Dev<TDev>;
-        };
-
-        //! The ViewSubView device get trait specialization.
-        template<typename TElem, typename TDim, typename TDev, typename TIdx>
-        struct GetDev<ViewSubView<TDev, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getDev(ViewSubView<TDev, TElem, TDim, TIdx> const& view) -> alpaka::Dev<TDev>
-            {
-                return alpaka::getDev(view.m_viewParentView);
-            }
-        };
-
-        //! The ViewSubView dimension getter trait specialization.
-        template<typename TElem, typename TDim, typename TDev, typename TIdx>
-        struct DimType<ViewSubView<TDev, TElem, TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The ViewSubView memory element type get trait specialization.
-        template<typename TElem, typename TDim, typename TDev, typename TIdx>
-        struct ElemType<ViewSubView<TDev, TElem, TDim, TIdx>>
-        {
-            using type = TElem;
-        };
-
-        //! The ViewSubView width get trait specialization.
-        template<typename TElem, typename TDim, typename TDev, typename TIdx>
-        struct GetExtents<ViewSubView<TDev, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(ViewSubView<TDev, TElem, TDim, TIdx> const& view) const
-            {
-                return view.m_extentElements;
-            }
-        };
-
-        //! The ViewSubView native pointer get trait specialization.
-        template<typename TElem, typename TDim, typename TDev, typename TIdx>
-        struct GetPtrNative<ViewSubView<TDev, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST static auto getPtrNative(ViewSubView<TDev, TElem, TDim, TIdx> const& view) -> TElem const*
-            {
-                return view.m_nativePtr;
-            }
-
-            ALPAKA_FN_HOST static auto getPtrNative(ViewSubView<TDev, TElem, TDim, TIdx>& view) -> TElem*
-            {
-                return view.m_nativePtr;
-            }
-        };
-
-        //! The ViewSubView pitch get trait specialization.
-        template<typename TDev, typename TElem, typename TDim, typename TIdx>
-        struct GetPitchesInBytes<ViewSubView<TDev, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(ViewSubView<TDev, TElem, TDim, TIdx> const& view) const
-            {
-                return getPitchesInBytes(view.m_viewParentView);
-            }
-        };
-
-        //! The ViewSubView x offset get trait specialization.
-        template<typename TElem, typename TDim, typename TDev, typename TIdx>
-        struct GetOffsets<ViewSubView<TDev, TElem, TDim, TIdx>>
-        {
-            ALPAKA_FN_HOST auto operator()(ViewSubView<TDev, TElem, TDim, TIdx> const& offset)
-            {
-                return offset.m_offsetsElements;
-            }
-        };
-
-        //! The ViewSubView idx type trait specialization.
-        template<typename TElem, typename TDim, typename TDev, typename TIdx>
-        struct IdxType<ViewSubView<TDev, TElem, TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-
-        //! The CPU device CreateSubView trait default implementation
-        template<typename TDev, typename TSfinae>
-        struct CreateSubView
-        {
-            template<typename TView, typename TExtent, typename TOffsets>
-            static auto createSubView(
-                TView& view,
-                TExtent const& extentElements,
-                TOffsets const& relativeOffsetsElements)
-            {
-                using Dim = alpaka::Dim<TExtent>;
-                using Idx = alpaka::Idx<TExtent>;
-                using Elem = typename trait::ElemType<TView>::type;
-                return ViewSubView<TDev, Elem, Dim, Idx>(view, extentElements, relativeOffsetsElements);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/meta/Apply.hpp b/include/alpaka/meta/Apply.hpp
deleted file mode 100644
index bcffe8c..0000000
--- a/include/alpaka/meta/Apply.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        template<typename TList, template<typename...> class TApplicant>
-        struct ApplyImpl;
-
-        template<template<typename...> class TList, template<typename...> class TApplicant, typename... T>
-        struct ApplyImpl<TList<T...>, TApplicant>
-        {
-            using type = TApplicant<T...>;
-        };
-    } // namespace detail
-    template<typename TList, template<typename...> class TApplicant>
-    using Apply = typename detail::ApplyImpl<TList, TApplicant>::type;
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/CartesianProduct.hpp b/include/alpaka/meta/CartesianProduct.hpp
deleted file mode 100644
index dc1a1d6..0000000
--- a/include/alpaka/meta/CartesianProduct.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/meta/Concatenate.hpp"
-
-namespace alpaka::meta
-{
-    // This is based on code by Patrick Fromberg.
-    // See
-    // http://stackoverflow.com/questions/9122028/how-to-create-the-cartesian-product-of-a-type-list/19611856#19611856
-    namespace detail
-    {
-        template<typename... Ts>
-        struct CartesianProductImplHelper;
-
-        // Stop condition.
-        template<template<typename...> class TList, typename... Ts>
-        struct CartesianProductImplHelper<TList<Ts...>>
-        {
-            using type = TList<Ts...>;
-        };
-
-        // Catches first empty tuple.
-        template<template<typename...> class TList, typename... Ts>
-        struct CartesianProductImplHelper<TList<TList<>>, Ts...>
-        {
-            using type = TList<>;
-        };
-
-        // Catches any empty tuple except first.
-        template<template<typename...> class TList, typename... Ts, typename... Rests>
-        struct CartesianProductImplHelper<TList<Ts...>, TList<>, Rests...>
-        {
-            using type = TList<>;
-        };
-
-        template<template<typename...> class TList, typename... X, typename H, typename... Rests>
-        struct CartesianProductImplHelper<TList<X...>, TList<H>, Rests...>
-        {
-            using type1 = TList<Concatenate<X, TList<H>>...>;
-            using type = typename CartesianProductImplHelper<type1, Rests...>::type;
-        };
-
-        template<
-            template<typename...>
-            class TList,
-            typename... X,
-            template<typename...>
-            class Head,
-            typename T,
-            typename... Ts,
-            typename... Rests>
-        struct CartesianProductImplHelper<TList<X...>, Head<T, Ts...>, Rests...>
-        {
-            using type1 = TList<Concatenate<X, TList<T>>...>;
-            using type2 = typename CartesianProductImplHelper<TList<X...>, TList<Ts...>>::type;
-            using type3 = Concatenate<type1, type2>;
-            using type = typename CartesianProductImplHelper<type3, Rests...>::type;
-        };
-
-        template<template<typename...> class TList, typename... Ts>
-        struct CartesianProductImpl;
-
-        // The base case for no input returns an empty sequence.
-        template<template<typename...> class TList>
-        struct CartesianProductImpl<TList>
-        {
-            using type = TList<>;
-        };
-
-        // R is the return type, Head<A...> is the first input list
-        template<template<typename...> class TList, template<typename...> class Head, typename... Ts, typename... Tail>
-        struct CartesianProductImpl<TList, Head<Ts...>, Tail...>
-        {
-            using type = typename detail::CartesianProductImplHelper<TList<TList<Ts>...>, Tail...>::type;
-        };
-    } // namespace detail
-
-    template<template<typename...> class TList, typename... Ts>
-    using CartesianProduct = typename detail::CartesianProductImpl<TList, Ts...>::type;
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Concatenate.hpp b/include/alpaka/meta/Concatenate.hpp
deleted file mode 100644
index 9133eb6..0000000
--- a/include/alpaka/meta/Concatenate.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        template<typename... T>
-        struct ConcatenateImpl;
-
-        template<typename T>
-        struct ConcatenateImpl<T>
-        {
-            using type = T;
-        };
-
-        template<template<typename...> class TList, typename... As, typename... Bs, typename... TRest>
-        struct ConcatenateImpl<TList<As...>, TList<Bs...>, TRest...>
-        {
-            using type = typename ConcatenateImpl<TList<As..., Bs...>, TRest...>::type;
-        };
-    } // namespace detail
-
-    template<typename... T>
-    using Concatenate = typename detail::ConcatenateImpl<T...>::type;
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/DependentFalseType.hpp b/include/alpaka/meta/DependentFalseType.hpp
deleted file mode 100644
index a0f2855..0000000
--- a/include/alpaka/meta/DependentFalseType.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka::meta
-{
-    //! A false_type being dependent on a ignored template parameter.
-    //! This allows to use static_assert in uninstantiated template specializations without triggering.
-    template<typename T>
-    struct DependentFalseType : std::false_type
-    {
-    };
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Filter.hpp b/include/alpaka/meta/Filter.hpp
deleted file mode 100644
index 52e93dc..0000000
--- a/include/alpaka/meta/Filter.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/meta/Concatenate.hpp"
-
-#include <type_traits>
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        template<template<typename...> class TList, template<typename...> class TPred, typename... Ts>
-        struct FilterImplHelper;
-
-        template<template<typename...> class TList, template<typename...> class TPred>
-        struct FilterImplHelper<TList, TPred>
-        {
-            using type = TList<>;
-        };
-
-        template<template<typename...> class TList, template<typename...> class TPred, typename T, typename... Ts>
-        struct FilterImplHelper<TList, TPred, T, Ts...>
-        {
-            using type = std::conditional_t<
-                TPred<T>::value,
-                Concatenate<TList<T>, typename FilterImplHelper<TList, TPred, Ts...>::type>,
-                typename FilterImplHelper<TList, TPred, Ts...>::type>;
-        };
-
-        template<typename TList, template<typename...> class TPred>
-        struct FilterImpl;
-
-        template<template<typename...> class TList, template<typename...> class TPred, typename... Ts>
-        struct FilterImpl<TList<Ts...>, TPred>
-        {
-            using type = typename detail::FilterImplHelper<TList, TPred, Ts...>::type;
-        };
-    } // namespace detail
-
-    /// \tparam TPred Only the first parameter is used, all other must be set by TPred to some default.
-    ///               Using '...' instead of a single type is a workaround for CrayClang.
-    template<typename TList, template<typename...> class TPred>
-    using Filter = typename detail::FilterImpl<TList, TPred>::type;
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Fold.hpp b/include/alpaka/meta/Fold.hpp
deleted file mode 100644
index 1a258f4..0000000
--- a/include/alpaka/meta/Fold.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-namespace alpaka::meta
-{
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TFnObj, typename T>
-    ALPAKA_FN_HOST_ACC constexpr auto foldr(TFnObj const& /* f */, T const& t) -> T
-    {
-        return t;
-    }
-
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TFnObj, typename T0, typename T1, typename... Ts>
-    ALPAKA_FN_HOST_ACC constexpr auto foldr(TFnObj const& f, T0 const& t0, T1 const& t1, Ts const&... ts)
-    {
-        return f(t0, foldr(f, t1, ts...));
-    }
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/ForEachType.hpp b/include/alpaka/meta/ForEachType.hpp
deleted file mode 100644
index 030851f..0000000
--- a/include/alpaka/meta/ForEachType.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-#include <utility>
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        template<typename TList>
-        struct ForEachTypeHelper;
-
-        template<template<typename...> class TList>
-        struct ForEachTypeHelper<TList<>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<typename TFnObj, typename... TArgs>
-            ALPAKA_FN_HOST_ACC static auto forEachTypeHelper(TFnObj&& /* f */, TArgs&&... /* args */) -> void
-            {
-            }
-        };
-
-        template<template<typename...> class TList, typename T, typename... Ts>
-        struct ForEachTypeHelper<TList<T, Ts...>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            template<typename TFnObj, typename... TArgs>
-            ALPAKA_FN_HOST_ACC static auto forEachTypeHelper(TFnObj&& f, TArgs&&... args) -> void
-            {
-                f.template operator()<T>(std::forward<TArgs>(args)...);
-                ForEachTypeHelper<TList<Ts...>>::forEachTypeHelper(
-                    std::forward<TFnObj>(f),
-                    std::forward<TArgs>(args)...);
-            }
-        };
-    } // namespace detail
-
-    //! Equivalent to boost::mpl::for_each but does not require the types of the sequence to be default
-    //! constructible. This function does not create instances of the types instead it passes the types as template
-    //! parameter.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TList, typename TFnObj, typename... TArgs>
-    ALPAKA_FN_HOST_ACC auto forEachType(TFnObj&& f, TArgs&&... args) -> void
-    {
-        detail::ForEachTypeHelper<TList>::forEachTypeHelper(std::forward<TFnObj>(f), std::forward<TArgs>(args)...);
-    }
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Functional.hpp b/include/alpaka/meta/Functional.hpp
deleted file mode 100644
index 0a5d848..0000000
--- a/include/alpaka/meta/Functional.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-namespace alpaka::meta
-{
-    template<typename T>
-    struct min
-    {
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC constexpr auto operator()(T const& lhs, T const& rhs) const
-        {
-            return (lhs < rhs) ? lhs : rhs;
-        }
-    };
-
-    template<typename T>
-    struct max
-    {
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC constexpr auto operator()(T const& lhs, T const& rhs) const
-        {
-            return (lhs > rhs) ? lhs : rhs;
-        }
-    };
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/InheritFromList.hpp b/include/alpaka/meta/InheritFromList.hpp
deleted file mode 100644
index e0a8fac..0000000
--- a/include/alpaka/meta/InheritFromList.hpp
+++ /dev/null
@@ -1,16 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-namespace alpaka::meta
-{
-    template<typename TBaseList>
-    class InheritFromList;
-
-    template<template<typename...> class TList, typename... TBases>
-    class InheritFromList<TList<TBases...>> : public TBases...
-    {
-    };
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/IntegerSequence.hpp b/include/alpaka/meta/IntegerSequence.hpp
deleted file mode 100644
index bc8bfac..0000000
--- a/include/alpaka/meta/IntegerSequence.hpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/meta/Set.hpp"
-
-#include <cstddef>
-#include <type_traits>
-#include <utility>
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        template<typename TDstType, typename TIntegerSequence>
-        struct ConvertIntegerSequence;
-
-        template<typename TDstType, typename T, T... Tvals>
-        struct ConvertIntegerSequence<TDstType, std::integer_sequence<T, Tvals...>>
-        {
-            using type = std::integer_sequence<TDstType, static_cast<TDstType>(Tvals)...>;
-        };
-    } // namespace detail
-
-    template<typename TDstType, typename TIntegerSequence>
-    using ConvertIntegerSequence = typename detail::ConvertIntegerSequence<TDstType, TIntegerSequence>::type;
-
-    namespace detail
-    {
-        template<bool TisSizeNegative, bool TbIsBegin, typename T, T Tbegin, typename TIntCon, typename TIntSeq>
-        struct MakeIntegerSequenceHelper
-        {
-            static_assert(!TisSizeNegative, "MakeIntegerSequence<T, N> requires N to be non-negative.");
-        };
-
-        template<typename T, T Tbegin, T... Tvals>
-        struct MakeIntegerSequenceHelper<
-            false,
-            true,
-            T,
-            Tbegin,
-            std::integral_constant<T, Tbegin>,
-            std::integer_sequence<T, Tvals...>>
-        {
-            using type = std::integer_sequence<T, Tvals...>;
-        };
-
-        template<typename T, T Tbegin, T TIdx, T... Tvals>
-        struct MakeIntegerSequenceHelper<
-            false,
-            false,
-            T,
-            Tbegin,
-            std::integral_constant<T, TIdx>,
-            std::integer_sequence<T, Tvals...>>
-        {
-            using type = typename MakeIntegerSequenceHelper<
-                false,
-                TIdx == (Tbegin + 1),
-                T,
-                Tbegin,
-                std::integral_constant<T, TIdx - 1>,
-                std::integer_sequence<T, TIdx - 1, Tvals...>>::type;
-        };
-    } // namespace detail
-
-    template<typename T, T Tbegin, T Tsize>
-    using MakeIntegerSequenceOffset = typename detail::MakeIntegerSequenceHelper<
-        (Tsize < 0),
-        (Tsize == 0),
-        T,
-        Tbegin,
-        std::integral_constant<T, Tbegin + Tsize>,
-        std::integer_sequence<T>>::type;
-
-    //! Checks if the integral values are unique.
-    template<typename T, T... Tvals>
-    struct IntegralValuesUnique
-    {
-        static constexpr bool value = meta::IsParameterPackSet<std::integral_constant<T, Tvals>...>::value;
-    };
-
-    //! Checks if the values in the index sequence are unique.
-    template<typename TIntegerSequence>
-    struct IntegerSequenceValuesUnique;
-
-    //! Checks if the values in the index sequence are unique.
-    template<typename T, T... Tvals>
-    struct IntegerSequenceValuesUnique<std::integer_sequence<T, Tvals...>>
-    {
-        static constexpr bool value = IntegralValuesUnique<T, Tvals...>::value;
-    };
-
-    //! Checks if the integral values are within the given range.
-    template<typename T, T Tmin, T Tmax, T... Tvals>
-    struct IntegralValuesInRange;
-
-    //! Checks if the integral values are within the given range.
-    template<typename T, T Tmin, T Tmax>
-    struct IntegralValuesInRange<T, Tmin, Tmax>
-    {
-        static constexpr bool value = true;
-    };
-
-    //! Checks if the integral values are within the given range.
-    template<typename T, T Tmin, T Tmax, T I, T... Tvals>
-    struct IntegralValuesInRange<T, Tmin, Tmax, I, Tvals...>
-    {
-        static constexpr bool value
-            = (I >= Tmin) && (I <= Tmax) && IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
-    };
-
-    //! Checks if the values in the index sequence are within the given range.
-    template<typename TIntegerSequence, typename T, T Tmin, T Tmax>
-    struct IntegerSequenceValuesInRange;
-
-    //! Checks if the values in the index sequence are within the given range.
-    template<typename T, T... Tvals, T Tmin, T Tmax>
-    struct IntegerSequenceValuesInRange<std::integer_sequence<T, Tvals...>, T, Tmin, Tmax>
-    {
-        static constexpr bool value = IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
-    };
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Integral.hpp b/include/alpaka/meta/Integral.hpp
deleted file mode 100644
index 48f4867..0000000
--- a/include/alpaka/meta/Integral.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka::meta
-{
-    //! The trait is true if all values of TSubset are contained in TSuperset.
-    template<typename TSuperset, typename TSubset>
-    using IsIntegralSuperset = std::integral_constant<
-        bool,
-        std::is_integral_v<TSuperset> && std::is_integral_v<TSubset>
-            && (
-                // If the signdness is equal, the sizes have to be greater or equal to be a superset.
-                ((std::is_unsigned_v<TSuperset>
-                  == std::is_unsigned_v<TSubset>) &&(sizeof(TSuperset) >= sizeof(TSubset)))
-                // If the signdness is non-equal, the superset has to have at least one bit more.
-                || ((std::is_unsigned_v<TSuperset> != std::is_unsigned_v<TSubset>) &&(
-                    sizeof(TSuperset) > sizeof(TSubset))))>;
-
-    //! The type that has the higher max value.
-    template<typename T0, typename T1>
-    using HigherMax = std::conditional_t<
-        (sizeof(T0) > sizeof(T1)),
-        T0,
-        std::conditional_t<((sizeof(T0) == sizeof(T1)) && std::is_unsigned_v<T0> && std::is_signed_v<T1>), T0, T1>>;
-
-    //! The type that has the lower max value.
-    template<typename T0, typename T1>
-    using LowerMax = std::conditional_t<
-        (sizeof(T0) < sizeof(T1)),
-        T0,
-        std::conditional_t<((sizeof(T0) == sizeof(T1)) && std::is_signed_v<T0> && std::is_unsigned_v<T1>), T0, T1>>;
-
-    //! The type that has the higher min value. If both types have the same min value, the type with the wider
-    //! range is chosen.
-    template<typename T0, typename T1>
-    using HigherMin = std::conditional_t<
-        (std::is_unsigned_v<T0> == std::is_unsigned_v<T1>),
-        std::conditional_t<
-            std::is_unsigned_v<T0>,
-            std::conditional_t<(sizeof(T0) < sizeof(T1)), T1, T0>,
-            std::conditional_t<(sizeof(T0) < sizeof(T1)), T0, T1>>,
-        std::conditional_t<std::is_unsigned_v<T0>, T0, T1>>;
-
-    //! The type that has the lower min value. If both types have the same min value, the type with the wider range
-    //! is chosen.
-    template<typename T0, typename T1>
-    using LowerMin = std::conditional_t<
-        (std::is_unsigned_v<T0> == std::is_unsigned_v<T1>),
-        std::conditional_t<(sizeof(T0) > sizeof(T1)), T0, T1>,
-        std::conditional_t<std::is_signed_v<T0>, T0, T1>>;
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/IsArrayOrVector.hpp b/include/alpaka/meta/IsArrayOrVector.hpp
deleted file mode 100644
index f755916..0000000
--- a/include/alpaka/meta/IsArrayOrVector.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/vec/Vec.hpp"
-
-#include <functional>
-#include <numeric>
-#include <type_traits>
-#include <vector>
-
-namespace alpaka::meta
-{
-    /** Checks whether T is an array or a vector type
-     *
-     * @tparam T a type to check
-     */
-    template<typename T>
-    struct IsArrayOrVector : std::false_type
-    {
-    };
-
-    /** Specialization of \a IsArrayOrVector for vector types
-     *
-     * @tparam T inner type held in the vector
-     * @tparam A vector allocator
-     */
-    template<typename T, typename A>
-    struct IsArrayOrVector<std::vector<T, A>> : std::true_type
-    {
-    };
-
-    /** Specialization of \a IsArrayOrVector for plain arrays
-     *
-     * @tparam T inner type held in the array
-     * @tparam N size of the array
-     */
-    template<typename T, std::size_t N>
-    struct IsArrayOrVector<T[N]> : std::true_type
-    {
-    };
-
-    /** Specialization of \a IsArrayOrVector for std::array
-     *
-     * @tparam T inner type held in the array
-     * @tparam N size of the array
-     */
-    template<typename T, std::size_t N>
-    struct IsArrayOrVector<std::array<T, N>> : std::true_type
-    {
-    };
-
-    /** Specialization of \a IsArrayOrVector for alpaka::Vec
-     *
-     * @tparam T inner type held in the array
-     * @tparam N size of the array
-     */
-    template<typename T, typename N>
-    struct IsArrayOrVector<alpaka::Vec<N, T>> : std::true_type
-    {
-    };
-
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/IsStrictBase.hpp b/include/alpaka/meta/IsStrictBase.hpp
deleted file mode 100644
index 80ece93..0000000
--- a/include/alpaka/meta/IsStrictBase.hpp
+++ /dev/null
@@ -1,15 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka::meta
-{
-    //! The trait is true if TDerived is derived from TBase but is not TBase itself.
-    template<typename TBase, typename TDerived>
-    using IsStrictBase = std::
-        integral_constant<bool, std::is_base_of_v<TBase, TDerived> && !std::is_same_v<TBase, std::decay_t<TDerived>>>;
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/NdLoop.hpp b/include/alpaka/meta/NdLoop.hpp
deleted file mode 100644
index a9a3267..0000000
--- a/include/alpaka/meta/NdLoop.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <utility>
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TIndex, typename TExtentVec, typename TFnObj>
-        ALPAKA_FN_HOST_ACC constexpr void ndLoopImpl(
-            std::index_sequence<>,
-            TIndex& idx,
-            TExtentVec const&,
-            TFnObj const& f)
-        {
-            f(idx);
-        }
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<std::size_t Tdim0, std::size_t... Tdims, typename TIndex, typename TExtentVec, typename TFnObj>
-        ALPAKA_FN_HOST_ACC constexpr void ndLoopImpl(
-            std::index_sequence<Tdim0, Tdims...>,
-            TIndex& idx,
-            TExtentVec const& extent,
-            TFnObj const& f)
-        {
-            static_assert(Dim<TIndex>::value > 0u, "The dimension given to ndLoop has to be larger than zero!");
-            static_assert(
-                Dim<TIndex>::value == Dim<TExtentVec>::value,
-                "The dimensions of the iteration vector and the extent vector have to be identical!");
-            static_assert(Dim<TIndex>::value > Tdim0, "The current dimension has to be in the range [0,dim-1]!");
-
-            for(idx[Tdim0] = 0u; idx[Tdim0] < extent[Tdim0]; ++idx[Tdim0])
-            {
-                ndLoopImpl(std::index_sequence<Tdims...>{}, idx, extent, f);
-            }
-        }
-    } // namespace detail
-
-    //! Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration.
-    //! The loops are nested in the order given by the index_sequence with the first element being the outermost
-    //! and the last index the innermost loop.
-    //!
-    //! \param indexSequence A sequence of indices being a permutation of the values [0, dim-1].
-    //! \param extent N-dimensional loop extent.
-    //! \param f The function called at each iteration.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TExtentVec, typename TFnObj, std::size_t... Tdims>
-    ALPAKA_FN_HOST_ACC auto ndLoop(
-        [[maybe_unused]] std::index_sequence<Tdims...> indexSequence,
-        TExtentVec const& extent,
-        TFnObj const& f) -> void
-    {
-        static_assert(
-            IntegerSequenceValuesInRange<std::index_sequence<Tdims...>, std::size_t, 0, Dim<TExtentVec>::value>::value,
-            "The values in the index_sequence have to be in the range [0,dim-1]!");
-        static_assert(
-            IntegerSequenceValuesUnique<std::index_sequence<Tdims...>>::value,
-            "The values in the index_sequence have to be unique!");
-
-        auto idx = Vec<Dim<TExtentVec>, Idx<TExtentVec>>::zeros();
-        detail::ndLoopImpl(std::index_sequence<Tdims...>{}, idx, extent, f);
-    }
-
-    //! Loops over an n-dimensional iteration index variable calling f(idx, args...) for each iteration.
-    //! The loops are nested from index zero outmost to index (dim-1) innermost.
-    //!
-    //! \param extent N-dimensional loop extent.
-    //! \param f The function called at each iteration.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TExtentVec, typename TFnObj>
-    ALPAKA_FN_HOST_ACC auto ndLoopIncIdx(TExtentVec const& extent, TFnObj const& f) -> void
-    {
-        ndLoop(std::make_index_sequence<Dim<TExtentVec>::value>(), extent, f);
-    }
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/NonZero.hpp b/include/alpaka/meta/NonZero.hpp
deleted file mode 100644
index 49d9bf9..0000000
--- a/include/alpaka/meta/NonZero.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2023 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        template<typename T>
-        struct NonZeroImpl : std::false_type
-        {
-        };
-
-        template<typename T, T TValue>
-        struct NonZeroImpl<std::integral_constant<T, TValue>> : std::bool_constant<TValue != static_cast<T>(0)>
-        {
-        };
-    } // namespace detail
-
-    template<typename T>
-    using NonZero = typename detail::NonZeroImpl<T>;
-
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Set.hpp b/include/alpaka/meta/Set.hpp
deleted file mode 100644
index a4e387c..0000000
--- a/include/alpaka/meta/Set.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <utility>
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        //! Empty dependent type.
-        template<typename T>
-        struct Empty
-        {
-        };
-
-        template<typename... Ts>
-        struct IsParameterPackSetImpl;
-
-        template<>
-        struct IsParameterPackSetImpl<>
-        {
-            static constexpr bool value = true;
-        };
-
-        // Based on code by Roland Bock: https://gist.github.com/rbock/ad8eedde80c060132a18
-        // Linearly inherits from empty<T> and checks if it has already inherited from this type.
-        template<typename T, typename... Ts>
-        struct IsParameterPackSetImpl<T, Ts...>
-            : public IsParameterPackSetImpl<Ts...>
-            , public virtual Empty<T>
-        {
-            using Base = IsParameterPackSetImpl<Ts...>;
-
-            static constexpr bool value = Base::value && !std::is_base_of_v<Empty<T>, Base>;
-        };
-    } // namespace detail
-
-    //! Trait that tells if the parameter pack contains only unique (no equal) types.
-    template<typename... Ts>
-    using IsParameterPackSet = detail::IsParameterPackSetImpl<Ts...>;
-
-    namespace detail
-    {
-        template<typename TList>
-        struct IsSetImpl;
-
-        template<template<typename...> class TList, typename... Ts>
-        struct IsSetImpl<TList<Ts...>>
-        {
-            static constexpr bool value = IsParameterPackSet<Ts...>::value;
-        };
-    } // namespace detail
-
-    //! Trait that tells if the template contains only unique (no equal) types.
-    template<typename TList>
-    using IsSet = detail::IsSetImpl<TList>;
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Transform.hpp b/include/alpaka/meta/Transform.hpp
deleted file mode 100644
index d7d079a..0000000
--- a/include/alpaka/meta/Transform.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        template<typename Ts, template<typename...> class TOp>
-        struct TransformImpl;
-
-        template<template<typename...> class TList, typename... Ts, template<typename...> class TOp>
-        struct TransformImpl<TList<Ts...>, TOp>
-        {
-            using type = TList<TOp<Ts>...>;
-        };
-    } // namespace detail
-    template<typename Ts, template<typename...> class TOp>
-    using Transform = typename detail::TransformImpl<Ts, TOp>::type;
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/TypeListOps.hpp b/include/alpaka/meta/TypeListOps.hpp
deleted file mode 100644
index c63b656..0000000
--- a/include/alpaka/meta/TypeListOps.hpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright 2022 Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <tuple>
-#include <type_traits>
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        template<typename List>
-        struct Front
-        {
-        };
-
-        template<template<typename...> class List, typename Head, typename... Tail>
-        struct Front<List<Head, Tail...>>
-        {
-            using type = Head;
-        };
-    } // namespace detail
-
-    template<typename List>
-    using Front = typename detail::Front<List>::type;
-
-    template<typename List, typename Value>
-    struct Contains : std::false_type
-    {
-    };
-
-    template<template<typename...> class List, typename Head, typename... Tail, typename Value>
-    struct Contains<List<Head, Tail...>, Value>
-    {
-        static constexpr bool value = std::is_same_v<Head, Value> || Contains<List<Tail...>, Value>::value;
-    };
-
-    // copied from https://stackoverflow.com/a/51073558/22035743
-    template<typename T>
-    struct IsList : std::false_type
-    {
-    };
-
-    template<template<typename...> class TList, typename... TTypes>
-    struct IsList<TList<TTypes...>> : std::true_type
-    {
-    };
-
-    //! \brief Checks whether the specified type is a list. List is a type with a variadic number of template types.
-    template<typename T>
-    constexpr bool isList = IsList<std::decay_t<T>>::value;
-
-    namespace detail
-    {
-        template<template<typename...> class TListType, typename TType, typename = void>
-        struct ToListImpl
-        {
-            using type = TListType<TType>;
-        };
-
-        template<template<typename...> class TListType, typename TList>
-        struct ToListImpl<TListType, TList, std::enable_if_t<alpaka::meta::isList<TList>>>
-        {
-            using type = TList;
-        };
-    } // namespace detail
-
-    //! \brief Takes an arbitrary number of types (T) and creates a type list of type TListType with the types (T). If
-    //! T is a single template parameter and it satisfies alpaka::meta::isList, the type of the structure is T (no type
-    //! change). For example std::tuple can be used as TListType.
-    //! \tparam TListType type of the created list
-    //! \tparam T possible list types or type list
-    template<template<typename...> class TListType, typename... T>
-    struct ToList;
-
-    template<template<typename...> class TListType, typename T>
-    struct ToList<TListType, T> : detail::ToListImpl<TListType, T>
-    {
-    };
-
-    template<template<typename...> class TListType, typename T, typename... Ts>
-    struct ToList<TListType, T, Ts...>
-    {
-        using type = TListType<T, Ts...>;
-    };
-
-    //! \brief If T is a single argument and a type list (fullfil alpaka::meta::isList), the return type is T.
-    //! Otherwise, std::tuple is returned with T types as template parameters.
-    template<typename... T>
-    using ToTuple = typename ToList<std::tuple, T...>::type;
-
-
-} // namespace alpaka::meta
diff --git a/include/alpaka/meta/Unique.hpp b/include/alpaka/meta/Unique.hpp
deleted file mode 100644
index ea20ff2..0000000
--- a/include/alpaka/meta/Unique.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace alpaka::meta
-{
-    namespace detail
-    {
-        template<typename T, typename... Ts>
-        struct UniqueHelper
-        {
-            using type = T;
-        };
-
-        template<template<typename...> class TList, typename... Ts, typename U, typename... Us>
-        struct UniqueHelper<TList<Ts...>, U, Us...>
-            : std::conditional_t<
-                  (std::is_same_v<U, Ts> || ...),
-                  UniqueHelper<TList<Ts...>, Us...>,
-                  UniqueHelper<TList<Ts..., U>, Us...>>
-        {
-        };
-
-        template<typename T>
-        struct UniqueImpl;
-
-        template<template<typename...> class TList, typename... Ts>
-        struct UniqueImpl<TList<Ts...>>
-        {
-            using type = typename UniqueHelper<TList<>, Ts...>::type;
-        };
-    } // namespace detail
-
-    //! Trait that returns a list with only unique (no equal) types (a set). Duplicates will be filtered out.
-    template<typename TList>
-    using Unique = typename detail::UniqueImpl<TList>::type;
-} // namespace alpaka::meta
diff --git a/include/alpaka/offset/Traits.hpp b/include/alpaka/offset/Traits.hpp
deleted file mode 100644
index c2edb3b..0000000
--- a/include/alpaka/offset/Traits.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <type_traits>
-
-namespace alpaka
-{
-    //! The offset traits.
-    namespace trait
-    {
-        //! The x offset get trait.
-        //!
-        //! If not specialized explicitly it returns 0.
-        template<typename TIdx, typename TOffsets, typename TSfinae = void>
-        struct [[deprecated("Specialize GetOffsets instead")]] GetOffset
-        {
-            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static auto getOffset(TOffsets const&) -> Idx<TOffsets>
-            {
-                return static_cast<Idx<TOffsets>>(0);
-            } // namespace trait
-        }; // namespace alpaka
-
-        //! The GetOffsets trait for getting the offsets of an object as an alpaka::Vec.
-        template<typename TExtent, typename TSfinae = void>
-        struct GetOffsets;
-    } // namespace trait
-
-    //! \return The offset in the given dimension.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<std::size_t Tidx, typename TOffsets>
-    [[deprecated("use getOffsets(offsets)[Tidx] instead")]] ALPAKA_FN_HOST_ACC auto getOffset(TOffsets const& offsets)
-        -> Idx<TOffsets>
-    {
-#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-        return trait::GetOffset<DimInt<Tidx>, TOffsets>::getOffset(offsets);
-#if BOOST_COMP_CLANG || BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-    }
-
-    //! \return The extents of the given object.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    ALPAKA_FN_HOST_ACC auto getOffsets(T const& object) -> Vec<Dim<T>, Idx<T>>
-    {
-        return trait::GetOffsets<T>{}(object);
-    }
-
-    //! \tparam T has to specialize GetOffsets.
-    //! \return The offset vector.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T>
-    ALPAKA_FN_HOST_ACC constexpr auto getOffsetVec(T const& object = {}) -> Vec<Dim<T>, Idx<T>>
-    {
-        return getOffsets(object);
-    }
-
-    //! \tparam T has to specialize GetOffsets.
-    //! \return The offset vector but only the last TDim elements.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TDim, typename T>
-    ALPAKA_FN_HOST_ACC constexpr auto getOffsetVecEnd(T const& object = {}) -> Vec<TDim, Idx<T>>
-    {
-        static_assert(TDim::value <= Dim<T>::value, "Cannot get more items than the offsets hold");
-
-        auto const o = getOffsets(object);
-        Vec<TDim, Idx<T>> v;
-        for(unsigned i = 0; i < TDim::value; i++)
-            v[i] = o[(Dim<T>::value - TDim::value) + i];
-        return v;
-    }
-
-    //! \return The offset in x dimension.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TOffsets>
-    ALPAKA_FN_HOST_ACC auto getOffsetX(TOffsets const& offsets = TOffsets()) -> Idx<TOffsets>
-    {
-        return getOffsets(offsets)[Dim<TOffsets>::value - 1u];
-    }
-
-    //! \return The offset in y dimension.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TOffsets>
-    ALPAKA_FN_HOST_ACC auto getOffsetY(TOffsets const& offsets = TOffsets()) -> Idx<TOffsets>
-    {
-        return getOffsets(offsets)[Dim<TOffsets>::value - 2u];
-    }
-
-    //! \return The offset in z dimension.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TOffsets>
-    ALPAKA_FN_HOST_ACC auto getOffsetZ(TOffsets const& offsets = TOffsets()) -> Idx<TOffsets>
-    {
-        return getOffsets(offsets)[Dim<TOffsets>::value - 3u];
-    }
-
-    namespace trait
-    {
-        //! The Vec offset get trait specialization.
-        template<typename TDim, typename TVal>
-        struct GetOffsets<Vec<TDim, TVal>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC constexpr auto operator()(Vec<TDim, TVal> const& offsets) const -> Vec<TDim, TVal>
-            {
-                return offsets;
-            }
-        };
-
-        //! The unsigned integral x offset get trait specialization.
-        template<typename TIntegral>
-        struct GetOffsets<TIntegral, std::enable_if_t<std::is_integral_v<TIntegral>>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC constexpr auto operator()(TIntegral const& i) const
-            {
-                return Vec{i};
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/platform/PlatformCpu.hpp b/include/alpaka/platform/PlatformCpu.hpp
deleted file mode 100644
index c431fd4..0000000
--- a/include/alpaka/platform/PlatformCpu.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-#include <sstream>
-#include <vector>
-
-namespace alpaka
-{
-    //! The CPU device platform.
-    struct PlatformCpu : concepts::Implements<ConceptPlatform, PlatformCpu>
-    {
-#if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0)                                     \
-    && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0)
-        // This is a workaround for g++-11 bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96295
-        // g++-11 complains in *all* places where a PlatformCpu is used, that it "may be used uninitialized"
-        char c = {};
-#endif
-    };
-
-    namespace trait
-    {
-        //! The CPU device device type trait specialization.
-        template<>
-        struct DevType<PlatformCpu>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU platform device count get trait specialization.
-        template<>
-        struct GetDevCount<PlatformCpu>
-        {
-            ALPAKA_FN_HOST static auto getDevCount(PlatformCpu const&) -> std::size_t
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                return 1;
-            }
-        };
-
-        //! The CPU platform device get trait specialization.
-        template<>
-        struct GetDevByIdx<PlatformCpu>
-        {
-            ALPAKA_FN_HOST static auto getDevByIdx(PlatformCpu const& platform, std::size_t const& devIdx) -> DevCpu
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                std::size_t const devCount = getDevCount(platform);
-                if(devIdx >= devCount)
-                {
-                    std::stringstream ssErr;
-                    ssErr << "Unable to return device handle for CPU device with index " << devIdx
-                          << " because there are only " << devCount << " devices!";
-                    throw std::runtime_error(ssErr.str());
-                }
-
-                return {};
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/platform/PlatformCpuSycl.hpp b/include/alpaka/platform/PlatformCpuSycl.hpp
deleted file mode 100644
index 4fdda8d..0000000
--- a/include/alpaka/platform/PlatformCpuSycl.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/platform/PlatformGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    namespace detail
-    {
-        template<>
-        struct SYCLDeviceSelector<TagCpuSycl>
-        {
-            auto operator()(sycl::device const& dev) const -> int
-            {
-                return dev.is_cpu() ? 1 : -1;
-            }
-        };
-    } // namespace detail
-
-    //! The SYCL device manager.
-    using PlatformCpuSycl = PlatformGenericSycl<TagCpuSycl>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/platform/PlatformCudaRt.hpp b/include/alpaka/platform/PlatformCudaRt.hpp
deleted file mode 100644
index 9bf76fa..0000000
--- a/include/alpaka/platform/PlatformCudaRt.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiCudaRt.hpp"
-#include "alpaka/platform/PlatformUniformCudaHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-namespace alpaka
-{
-    //! The CUDA RT platform.
-    using PlatformCudaRt = PlatformUniformCudaHipRt<ApiCudaRt>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/platform/PlatformFpgaSyclIntel.hpp b/include/alpaka/platform/PlatformFpgaSyclIntel.hpp
deleted file mode 100644
index a3a7342..0000000
--- a/include/alpaka/platform/PlatformFpgaSyclIntel.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/platform/PlatformGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    namespace detail
-    {
-        // Prevent clang from annoying us with warnings about emitting too many vtables. These are discarded by the
-        // linker anyway.
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wweak-vtables"
-#    endif
-        template<>
-        struct SYCLDeviceSelector<TagFpgaSyclIntel>
-        {
-#    ifdef ALPAKA_FPGA_EMULATION
-            static constexpr auto platform_name = "Intel(R) FPGA Emulation Platform for OpenCL(TM)";
-#    else
-            static constexpr auto platform_name = "Intel(R) FPGA SDK for OpenCL(TM)";
-#    endif
-
-            auto operator()(sycl::device const& dev) const -> int
-            {
-                auto const& platform = dev.get_platform().get_info<sycl::info::platform::name>();
-                auto const is_intel_fpga = dev.is_accelerator() && (platform == platform_name);
-
-                return is_intel_fpga ? 1 : -1;
-            }
-        };
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
-    } // namespace detail
-
-    //! The SYCL device manager.
-    using PlatformFpgaSyclIntel = PlatformGenericSycl<TagFpgaSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/platform/PlatformGenericSycl.hpp b/include/alpaka/platform/PlatformGenericSycl.hpp
deleted file mode 100644
index 12e00fc..0000000
--- a/include/alpaka/platform/PlatformGenericSycl.hpp
+++ /dev/null
@@ -1,746 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/platform/Traits.hpp"
-
-#include <cstddef>
-#include <exception>
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-#    include <iostream>
-#endif
-#include <sstream>
-#include <stdexcept>
-#include <vector>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wswitch-default"
-#    endif
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    namespace detail
-    {
-        template<typename TTag>
-        struct SYCLDeviceSelector;
-    } // namespace detail
-
-    //! The SYCL device manager.
-    template<typename TTag>
-    struct PlatformGenericSycl : concepts::Implements<ConceptPlatform, PlatformGenericSycl<TTag>>
-    {
-        PlatformGenericSycl()
-            : platform{detail::SYCLDeviceSelector<TTag>{}}
-            , devices(platform.get_devices())
-            , context{sycl::context{
-                  devices,
-                  [](sycl::exception_list exceptions)
-                  {
-                      auto ss_err = std::stringstream{};
-                      ss_err << "Caught asynchronous SYCL exception(s):\n";
-                      for(std::exception_ptr e : exceptions)
-                      {
-                          try
-                          {
-                              std::rethrow_exception(e);
-                          }
-                          catch(sycl::exception const& err)
-                          {
-                              ss_err << err.what() << " (" << err.code() << ")\n";
-                          }
-                      }
-                      throw std::runtime_error(ss_err.str());
-                  }}}
-        {
-        }
-
-        [[nodiscard]] auto syclPlatform() -> sycl::platform&
-        {
-            return platform;
-        }
-
-        [[nodiscard]] auto syclPlatform() const -> sycl::platform const&
-        {
-            return platform;
-        }
-
-        [[nodiscard]] auto syclDevices() -> std::vector<sycl::device>&
-        {
-            return devices;
-        }
-
-        [[nodiscard]] auto syclDevices() const -> std::vector<sycl::device> const&
-        {
-            return devices;
-        }
-
-        [[nodiscard]] auto syclContext() -> sycl::context&
-        {
-            return context;
-        }
-
-        [[nodiscard]] auto syclContext() const -> sycl::context const&
-        {
-            return context;
-        }
-
-    private:
-        sycl::platform platform;
-        std::vector<sycl::device> devices;
-        sycl::context context;
-    };
-
-    namespace trait
-    {
-        //! The SYCL platform device type trait specialization.
-        template<typename TTag>
-        struct DevType<PlatformGenericSycl<TTag>>
-        {
-            using type = DevGenericSycl<TTag>;
-        };
-
-        //! The SYCL platform device count get trait specialization.
-        template<typename TTag>
-        struct GetDevCount<PlatformGenericSycl<TTag>>
-        {
-            static auto getDevCount(PlatformGenericSycl<TTag> const& platform) -> std::size_t
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                return platform.syclDevices().size();
-            }
-        };
-
-        //! The SYCL platform device get trait specialization.
-        template<typename TTag>
-        struct GetDevByIdx<PlatformGenericSycl<TTag>>
-        {
-            static auto getDevByIdx(PlatformGenericSycl<TTag> const& platform, std::size_t const& devIdx)
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                auto const& devices = platform.syclDevices();
-                if(devIdx >= devices.size())
-                {
-                    auto ss_err = std::stringstream{};
-                    ss_err << "Unable to return device handle for device " << devIdx << ". There are only "
-                           << devices.size() << " SYCL devices!";
-                    throw std::runtime_error(ss_err.str());
-                }
-
-                auto sycl_dev = devices.at(devIdx);
-
-                // Log this device.
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                printDeviceProperties(sycl_dev);
-#    elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                std::cout << __func__ << sycl_dev.template get_info<sycl::info::device::name>() << '\n';
-#    endif
-                using SyclPlatform = alpaka::PlatformGenericSycl<TTag>;
-                return typename DevType<SyclPlatform>::type{sycl_dev, platform.syclContext()};
-            }
-
-        private:
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            //! Prints all the device properties to std::cout.
-            static auto printDeviceProperties(sycl::device const& device) -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                constexpr auto KiB = std::size_t{1024};
-                constexpr auto MiB = KiB * KiB;
-
-                std::cout << "Device type: ";
-                switch(device.get_info<sycl::info::device::device_type>())
-                {
-                case sycl::info::device_type::cpu:
-                    std::cout << "CPU";
-                    break;
-
-                case sycl::info::device_type::gpu:
-                    std::cout << "GPU";
-                    break;
-
-                case sycl::info::device_type::accelerator:
-                    std::cout << "Accelerator";
-                    break;
-
-                case sycl::info::device_type::custom:
-                    std::cout << "Custom";
-                    break;
-
-                case sycl::info::device_type::automatic:
-                    std::cout << "Automatic";
-                    break;
-
-                case sycl::info::device_type::host:
-                    std::cout << "Host";
-                    break;
-
-                // The SYCL spec forbids the return of device_type::all
-                // Including this here to prevent warnings because of
-                // missing cases
-                case sycl::info::device_type::all:
-                    std::cout << "All";
-                    break;
-                }
-                std::cout << '\n';
-
-                std::cout << "Name: " << device.get_info<sycl::info::device::name>() << '\n';
-
-                std::cout << "Vendor: " << device.get_info<sycl::info::device::vendor>() << '\n';
-
-                std::cout << "Vendor ID: " << device.get_info<sycl::info::device::vendor_id>() << '\n';
-
-                std::cout << "Driver version: " << device.get_info<sycl::info::device::driver_version>() << '\n';
-
-                std::cout << "SYCL version: " << device.get_info<sycl::info::device::version>() << '\n';
-
-#        if !defined(BOOST_COMP_ICPX)
-                // Not defined by Level Zero back-end
-                std::cout << "Backend version: " << device.get_info<sycl::info::device::backend_version>() << '\n';
-#        endif
-
-                std::cout << "Aspects: " << '\n';
-
-#        if defined(BOOST_COMP_ICPX)
-#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
-                // These aspects are missing from oneAPI versions < 2023.2.0
-                if(device.has(sycl::aspect::emulated))
-                    std::cout << "\t* emulated\n";
-
-                if(device.has(sycl::aspect::host_debuggable))
-                    std::cout << "\t* debuggable using standard debuggers\n";
-#            endif
-#        endif
-
-                if(device.has(sycl::aspect::fp16))
-                    std::cout << "\t* supports sycl::half precision\n";
-
-                if(device.has(sycl::aspect::fp64))
-                    std::cout << "\t* supports double precision\n";
-
-                if(device.has(sycl::aspect::atomic64))
-                    std::cout << "\t* supports 64-bit atomics\n";
-
-                if(device.has(sycl::aspect::image))
-                    std::cout << "\t* supports images\n";
-
-                if(device.has(sycl::aspect::online_compiler))
-                    std::cout << "\t* supports online compilation of device code\n";
-
-                if(device.has(sycl::aspect::online_linker))
-                    std::cout << "\t* supports online linking of device code\n";
-
-                if(device.has(sycl::aspect::queue_profiling))
-                    std::cout << "\t* supports queue profiling\n";
-
-                if(device.has(sycl::aspect::usm_device_allocations))
-                    std::cout << "\t* supports explicit USM allocations\n";
-
-                if(device.has(sycl::aspect::usm_host_allocations))
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n";
-
-                if(device.has(sycl::aspect::usm_atomic_host_allocations))
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n";
-
-                if(device.has(sycl::aspect::usm_shared_allocations))
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n";
-
-                if(device.has(sycl::aspect::usm_atomic_shared_allocations))
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n";
-
-                if(device.has(sycl::aspect::usm_system_allocations))
-                    std::cout << "\t* can access memory allocated by the system allocator\n";
-
-                std::cout << "Available compute units: " << device.get_info<sycl::info::device::max_compute_units>()
-                          << '\n';
-
-                std::cout << "Maximum work item dimensions: ";
-                auto dims = device.get_info<sycl::info::device::max_work_item_dimensions>();
-                std::cout << dims << std::endl;
-
-                std::cout << "Maximum number of work items:\n";
-                auto const wi_1D = device.get_info<sycl::info::device::max_work_item_sizes<1>>();
-                auto const wi_2D = device.get_info<sycl::info::device::max_work_item_sizes<2>>();
-                auto const wi_3D = device.get_info<sycl::info::device::max_work_item_sizes<3>>();
-                std::cout << "\t* 1D: (" << wi_1D.get(0) << ")\n";
-                std::cout << "\t* 2D: (" << wi_2D.get(0) << ", " << wi_2D.get(1) << ")\n";
-                std::cout << "\t* 3D: (" << wi_3D.get(0) << ", " << wi_3D.get(1) << ", " << wi_3D.get(2) << ")\n";
-
-                std::cout << "Maximum number of work items per work-group: "
-                          << device.get_info<sycl::info::device::max_work_group_size>() << '\n';
-
-                std::cout << "Maximum number of sub-groups per work-group: "
-                          << device.get_info<sycl::info::device::max_num_sub_groups>() << '\n';
-
-                std::cout << "Supported sub-group sizes: ";
-                auto const sg_sizes = device.get_info<sycl::info::device::sub_group_sizes>();
-                for(auto const& sz : sg_sizes)
-                    std::cout << sz << ", ";
-                std::cout << '\n';
-
-                std::cout << "Preferred native vector width (char): "
-                          << device.get_info<sycl::info::device::preferred_vector_width_char>() << '\n';
-
-                std::cout << "Native ISA vector width (char): "
-                          << device.get_info<sycl::info::device::native_vector_width_char>() << '\n';
-
-                std::cout << "Preferred native vector width (short): "
-                          << device.get_info<sycl::info::device::preferred_vector_width_short>() << '\n';
-
-                std::cout << "Native ISA vector width (short): "
-                          << device.get_info<sycl::info::device::native_vector_width_short>() << '\n';
-
-                std::cout << "Preferred native vector width (int): "
-                          << device.get_info<sycl::info::device::preferred_vector_width_int>() << '\n';
-
-                std::cout << "Native ISA vector width (int): "
-                          << device.get_info<sycl::info::device::native_vector_width_int>() << '\n';
-
-                std::cout << "Preferred native vector width (long): "
-                          << device.get_info<sycl::info::device::preferred_vector_width_long>() << '\n';
-
-                std::cout << "Native ISA vector width (long): "
-                          << device.get_info<sycl::info::device::native_vector_width_long>() << '\n';
-
-                std::cout << "Preferred native vector width (float): "
-                          << device.get_info<sycl::info::device::preferred_vector_width_float>() << '\n';
-
-                std::cout << "Native ISA vector width (float): "
-                          << device.get_info<sycl::info::device::native_vector_width_float>() << '\n';
-
-                if(device.has(sycl::aspect::fp64))
-                {
-                    std::cout << "Preferred native vector width (double): "
-                              << device.get_info<sycl::info::device::preferred_vector_width_double>() << '\n';
-
-                    std::cout << "Native ISA vector width (double): "
-                              << device.get_info<sycl::info::device::native_vector_width_double>() << '\n';
-                }
-
-                if(device.has(sycl::aspect::fp16))
-                {
-                    std::cout << "Preferred native vector width (half): "
-                              << device.get_info<sycl::info::device::preferred_vector_width_half>() << '\n';
-
-                    std::cout << "Native ISA vector width (half): "
-                              << device.get_info<sycl::info::device::native_vector_width_half>() << '\n';
-                }
-
-                std::cout << "Maximum clock frequency: " << device.get_info<sycl::info::device::max_clock_frequency>()
-                          << " MHz\n";
-
-                std::cout << "Address space size: " << device.get_info<sycl::info::device::address_bits>() << "-bit\n";
-
-                std::cout << "Maximum size of memory object allocation: "
-                          << device.get_info<sycl::info::device::max_mem_alloc_size>() << " bytes\n";
-
-                if(device.has(sycl::aspect::image))
-                {
-                    std::cout << "Maximum number of simultaneous image object reads per kernel: "
-                              << device.get_info<sycl::info::device::max_read_image_args>() << '\n';
-
-                    std::cout << "Maximum number of simultaneous image writes per kernel: "
-                              << device.get_info<sycl::info::device::max_write_image_args>() << '\n';
-
-                    std::cout << "Maximum 1D/2D image width: "
-                              << device.get_info<sycl::info::device::image2d_max_width>() << " px\n";
-
-                    std::cout << "Maximum 2D image height: "
-                              << device.get_info<sycl::info::device::image2d_max_height>() << " px\n";
-
-                    std::cout << "Maximum 3D image width: " << device.get_info<sycl::info::device::image3d_max_width>()
-                              << " px\n";
-
-                    std::cout << "Maximum 3D image height: "
-                              << device.get_info<sycl::info::device::image3d_max_height>() << " px\n";
-
-                    std::cout << "Maximum 3D image depth: " << device.get_info<sycl::info::device::image3d_max_depth>()
-                              << " px\n";
-
-                    std::cout << "Maximum number of samplers per kernel: "
-                              << device.get_info<sycl::info::device::max_samplers>() << '\n';
-                }
-
-                std::cout << "Maximum kernel argument size: "
-                          << device.get_info<sycl::info::device::max_parameter_size>() << " bytes\n";
-
-                std::cout << "Memory base address alignment: "
-                          << device.get_info<sycl::info::device::mem_base_addr_align>() << " bit\n";
-
-                auto print_fp_config = [](std::string const& fp, std::vector<sycl::info::fp_config> const& conf)
-                {
-                    std::cout << fp << " precision floating-point capabilities:\n";
-
-                    auto find_and_print = [&](sycl::info::fp_config val)
-                    {
-                        auto it = std::find(begin(conf), end(conf), val);
-                        std::cout << (it == std::end(conf) ? "No" : "Yes") << '\n';
-                    };
-
-                    std::cout << "\t* denorm support: ";
-                    find_and_print(sycl::info::fp_config::denorm);
-
-                    std::cout << "\t* INF & quiet NaN support: ";
-                    find_and_print(sycl::info::fp_config::inf_nan);
-
-                    std::cout << "\t* round to nearest even support: ";
-                    find_and_print(sycl::info::fp_config::round_to_nearest);
-
-                    std::cout << "\t* round to zero support: ";
-                    find_and_print(sycl::info::fp_config::round_to_zero);
-
-                    std::cout << "\t* round to infinity support: ";
-                    find_and_print(sycl::info::fp_config::round_to_inf);
-
-                    std::cout << "\t* IEEE754-2008 FMA support: ";
-                    find_and_print(sycl::info::fp_config::fma);
-
-                    std::cout << "\t* correctly rounded divide/sqrt support: ";
-                    find_and_print(sycl::info::fp_config::correctly_rounded_divide_sqrt);
-
-                    std::cout << "\t* software-implemented floating point operations: ";
-                    find_and_print(sycl::info::fp_config::soft_float);
-                };
-
-                if(device.has(sycl::aspect::fp16))
-                {
-                    auto const fp16_conf = device.get_info<sycl::info::device::half_fp_config>();
-                    print_fp_config("Half", fp16_conf);
-                }
-
-                auto const fp32_conf = device.get_info<sycl::info::device::single_fp_config>();
-                print_fp_config("Single", fp32_conf);
-
-                if(device.has(sycl::aspect::fp64))
-                {
-                    auto const fp64_conf = device.get_info<sycl::info::device::double_fp_config>();
-                    print_fp_config("Double", fp64_conf);
-                }
-
-                std::cout << "Global memory cache type: ";
-                auto has_global_mem_cache = false;
-                switch(device.get_info<sycl::info::device::global_mem_cache_type>())
-                {
-                case sycl::info::global_mem_cache_type::none:
-                    std::cout << "none";
-                    break;
-
-                case sycl::info::global_mem_cache_type::read_only:
-                    std::cout << "read-only";
-                    has_global_mem_cache = true;
-                    break;
-
-                case sycl::info::global_mem_cache_type::read_write:
-                    std::cout << "read-write";
-                    has_global_mem_cache = true;
-                    break;
-                }
-                std::cout << '\n';
-
-                if(has_global_mem_cache)
-                {
-                    std::cout << "Global memory cache line size: "
-                              << device.get_info<sycl::info::device::global_mem_cache_line_size>() << " bytes\n";
-
-                    std::cout << "Global memory cache size: "
-                              << device.get_info<sycl::info::device::global_mem_cache_size>() / KiB << " KiB\n";
-                }
-
-                std::cout << "Global memory size: " << device.get_info<sycl::info::device::global_mem_size>() / MiB
-                          << " MiB" << std::endl;
-
-                std::cout << "Local memory type: ";
-                auto has_local_memory = false;
-                switch(device.get_info<sycl::info::device::local_mem_type>())
-                {
-                case sycl::info::local_mem_type::none:
-                    std::cout << "none";
-                    break;
-
-                case sycl::info::local_mem_type::local:
-                    std::cout << "local";
-                    has_local_memory = true;
-                    break;
-
-                case sycl::info::local_mem_type::global:
-                    std::cout << "global";
-                    has_local_memory = true;
-                    break;
-                }
-                std::cout << '\n';
-
-                if(has_local_memory)
-                    std::cout << "Local memory size: " << device.get_info<sycl::info::device::local_mem_size>() / KiB
-                              << " KiB\n";
-
-                std::cout << "Error correction support: "
-                          << (device.get_info<sycl::info::device::error_correction_support>() ? "Yes" : "No") << '\n';
-
-                auto print_memory_orders = [](std::vector<sycl::memory_order> const& mem_orders)
-                {
-                    for(auto const& cap : mem_orders)
-                    {
-                        switch(cap)
-                        {
-                        case sycl::memory_order::relaxed:
-                            std::cout << "relaxed";
-                            break;
-
-                        case sycl::memory_order::acquire:
-                            std::cout << "acquire";
-                            break;
-
-                        case sycl::memory_order::release:
-                            std::cout << "release";
-                            break;
-
-                        case sycl::memory_order::acq_rel:
-                            std::cout << "acq_rel";
-                            break;
-
-                        case sycl::memory_order::seq_cst:
-                            std::cout << "seq_cst";
-                            break;
-#        if defined(BOOST_COMP_ICPX)
-                        // Stop icpx from complaining about its own internals.
-                        case sycl::memory_order::__consume_unsupported:
-                            break;
-#        endif
-                        }
-                        std::cout << ", ";
-                    }
-                    std::cout << '\n';
-                };
-
-                std::cout << "Supported memory orderings for atomic operations: ";
-                auto const mem_orders = device.get_info<sycl::info::device::atomic_memory_order_capabilities>();
-                print_memory_orders(mem_orders);
-
-#        if defined(BOOST_COMP_ICPX)
-#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
-                // Not implemented in oneAPI < 2023.2.0
-                std::cout << "Supported memory orderings for sycl::atomic_fence: ";
-                auto const fence_orders = device.get_info<sycl::info::device::atomic_fence_order_capabilities>();
-                print_memory_orders(fence_orders);
-#            endif
-#        endif
-
-                auto print_memory_scopes = [](std::vector<sycl::memory_scope> const& mem_scopes)
-                {
-                    for(auto const& cap : mem_scopes)
-                    {
-                        switch(cap)
-                        {
-                        case sycl::memory_scope::work_item:
-                            std::cout << "work-item";
-                            break;
-
-                        case sycl::memory_scope::sub_group:
-                            std::cout << "sub-group";
-                            break;
-
-                        case sycl::memory_scope::work_group:
-                            std::cout << "work-group";
-                            break;
-
-                        case sycl::memory_scope::device:
-                            std::cout << "device";
-                            break;
-
-                        case sycl::memory_scope::system:
-                            std::cout << "system";
-                            break;
-                        }
-                        std::cout << ", ";
-                    }
-                    std::cout << '\n';
-                };
-
-                std::cout << "Supported memory scopes for atomic operations: ";
-                auto const mem_scopes = device.get_info<sycl::info::device::atomic_memory_scope_capabilities>();
-                print_memory_scopes(mem_scopes);
-
-#        if defined(BOOST_COMP_ICPX)
-#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
-                // Not implemented in oneAPI < 2023.2.0
-                std::cout << "Supported memory scopes for sycl::atomic_fence: ";
-                auto const fence_scopes = device.get_info<sycl::info::device::atomic_fence_scope_capabilities>();
-                print_memory_scopes(fence_scopes);
-#            endif
-#        endif
-
-                std::cout << "Device timer resolution: "
-                          << device.get_info<sycl::info::device::profiling_timer_resolution>() << " ns\n";
-
-                std::cout << "Built-in kernels: ";
-                auto const builtins = device.get_info<sycl::info::device::built_in_kernel_ids>();
-                for(auto const& b : builtins)
-                    std::cout << b.get_name() << ", ";
-                std::cout << '\n';
-
-                std::cout << "Maximum number of subdevices: ";
-                auto const max_subs = device.get_info<sycl::info::device::partition_max_sub_devices>();
-                std::cout << max_subs << '\n';
-
-                if(max_subs > 1)
-                {
-                    std::cout << "Supported partition properties: ";
-                    auto const part_props = device.get_info<sycl::info::device::partition_properties>();
-                    auto has_affinity_domains = false;
-                    for(auto const& prop : part_props)
-                    {
-                        switch(prop)
-                        {
-                        case sycl::info::partition_property::no_partition:
-                            std::cout << "no partition";
-                            break;
-
-                        case sycl::info::partition_property::partition_equally:
-                            std::cout << "equally";
-                            break;
-
-                        case sycl::info::partition_property::partition_by_counts:
-                            std::cout << "by counts";
-                            break;
-
-                        case sycl::info::partition_property::partition_by_affinity_domain:
-                            std::cout << "by affinity domain";
-                            has_affinity_domains = true;
-                            break;
-#        if defined(BOOST_COMP_ICPX)
-                        case sycl::info::partition_property::ext_intel_partition_by_cslice:
-                            std::cout << "by compute slice (Intel extension; deprecated)";
-                            break;
-#        endif
-                        }
-                        std::cout << ", ";
-                    }
-                    std::cout << '\n';
-
-                    if(has_affinity_domains)
-                    {
-                        std::cout << "Supported partition affinity domains: ";
-                        auto const aff_doms = device.get_info<sycl::info::device::partition_affinity_domains>();
-                        for(auto const& dom : aff_doms)
-                        {
-                            switch(dom)
-                            {
-                            case sycl::info::partition_affinity_domain::not_applicable:
-                                std::cout << "not applicable";
-                                break;
-
-                            case sycl::info::partition_affinity_domain::numa:
-                                std::cout << "NUMA";
-                                break;
-
-                            case sycl::info::partition_affinity_domain::L4_cache:
-                                std::cout << "L4 cache";
-                                break;
-
-                            case sycl::info::partition_affinity_domain::L3_cache:
-                                std::cout << "L3 cache";
-                                break;
-
-                            case sycl::info::partition_affinity_domain::L2_cache:
-                                std::cout << "L2 cache";
-                                break;
-
-                            case sycl::info::partition_affinity_domain::L1_cache:
-                                std::cout << "L1 cache";
-                                break;
-
-                            case sycl::info::partition_affinity_domain::next_partitionable:
-                                std::cout << "next partitionable";
-                                break;
-                            }
-                            std::cout << ", ";
-                        }
-                        std::cout << '\n';
-                    }
-
-                    std::cout << "Current partition property: ";
-                    switch(device.get_info<sycl::info::device::partition_type_property>())
-                    {
-                    case sycl::info::partition_property::no_partition:
-                        std::cout << "no partition";
-                        break;
-
-                    case sycl::info::partition_property::partition_equally:
-                        std::cout << "partitioned equally";
-                        break;
-
-                    case sycl::info::partition_property::partition_by_counts:
-                        std::cout << "partitioned by counts";
-                        break;
-
-                    case sycl::info::partition_property::partition_by_affinity_domain:
-                        std::cout << "partitioned by affinity domain";
-                        break;
-
-#        if defined(BOOST_COMP_ICPX)
-                    case sycl::info::partition_property::ext_intel_partition_by_cslice:
-                        std::cout << "partitioned by compute slice (Intel extension; deprecated)";
-                        break;
-#        endif
-                    }
-                    std::cout << '\n';
-
-                    std::cout << "Current partition affinity domain: ";
-                    switch(device.get_info<sycl::info::device::partition_type_affinity_domain>())
-                    {
-                    case sycl::info::partition_affinity_domain::not_applicable:
-                        std::cout << "not applicable";
-                        break;
-
-                    case sycl::info::partition_affinity_domain::numa:
-                        std::cout << "NUMA";
-                        break;
-
-                    case sycl::info::partition_affinity_domain::L4_cache:
-                        std::cout << "L4 cache";
-                        break;
-
-                    case sycl::info::partition_affinity_domain::L3_cache:
-                        std::cout << "L3 cache";
-                        break;
-
-                    case sycl::info::partition_affinity_domain::L2_cache:
-                        std::cout << "L2 cache";
-                        break;
-
-                    case sycl::info::partition_affinity_domain::L1_cache:
-                        std::cout << "L1 cache";
-                        break;
-
-                    case sycl::info::partition_affinity_domain::next_partitionable:
-                        std::cout << "next partitionable";
-                        break;
-                    }
-                    std::cout << '\n';
-                }
-
-                std::cout.flush();
-            }
-#    endif
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
-
-#endif
diff --git a/include/alpaka/platform/PlatformGpuSyclIntel.hpp b/include/alpaka/platform/PlatformGpuSyclIntel.hpp
deleted file mode 100644
index d49695a..0000000
--- a/include/alpaka/platform/PlatformGpuSyclIntel.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/platform/PlatformGenericSycl.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    namespace detail
-    {
-        template<>
-        struct SYCLDeviceSelector<TagGpuSyclIntel>
-        {
-            auto operator()(sycl::device const& dev) const -> int
-            {
-                auto const& vendor = dev.get_info<sycl::info::device::vendor>();
-                auto const is_intel_gpu = dev.is_gpu() && (vendor.find("Intel(R) Corporation") != std::string::npos);
-
-                return is_intel_gpu ? 1 : -1;
-            }
-        };
-    } // namespace detail
-
-    //! The SYCL device manager.
-    using PlatformGpuSyclIntel = PlatformGenericSycl<TagGpuSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/platform/PlatformHipRt.hpp b/include/alpaka/platform/PlatformHipRt.hpp
deleted file mode 100644
index 25303ae..0000000
--- a/include/alpaka/platform/PlatformHipRt.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiHipRt.hpp"
-#include "alpaka/platform/PlatformUniformCudaHipRt.hpp"
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-namespace alpaka
-{
-    //! The HIP RT platform.
-    using PlatformHipRt = PlatformUniformCudaHipRt<ApiHipRt>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/platform/PlatformUniformCudaHipRt.hpp b/include/alpaka/platform/PlatformUniformCudaHipRt.hpp
deleted file mode 100644
index a3ae0ef..0000000
--- a/include/alpaka/platform/PlatformUniformCudaHipRt.hpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, René Widera, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato,
- *                Christian Kaever
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-#include "alpaka/dev/Traits.hpp"
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <tuple>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    // Forward declarations.
-    struct ApiCudaRt;
-    struct ApiHipRt;
-
-    //! The CUDA/HIP RT platform.
-    template<typename TApi>
-    struct PlatformUniformCudaHipRt : concepts::Implements<ConceptPlatform, PlatformUniformCudaHipRt<TApi>>
-    {
-#    if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0)                                 \
-        && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0)
-        // This is a workaround for g++-11 bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96295
-        // g++-11 complains in *all* places where a PlatformCpu is used, that it "may be used uninitialized"
-        char c = {};
-#    endif
-    };
-
-    namespace trait
-    {
-        //! The CUDA/HIP RT platform device type trait specialization.
-        template<typename TApi>
-        struct DevType<PlatformUniformCudaHipRt<TApi>>
-        {
-            using type = DevUniformCudaHipRt<TApi>;
-        };
-
-        //! The CUDA/HIP RT platform device count get trait specialization.
-        template<typename TApi>
-        struct GetDevCount<PlatformUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto getDevCount(PlatformUniformCudaHipRt<TApi> const&) -> std::size_t
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                int iNumDevices(0);
-                typename TApi::Error_t error = TApi::getDeviceCount(&iNumDevices);
-                if(error != TApi::success)
-                    iNumDevices = 0;
-
-                return static_cast<std::size_t>(iNumDevices);
-            }
-        };
-
-        //! The CUDA/HIP RT platform device get trait specialization.
-        template<typename TApi>
-        struct GetDevByIdx<PlatformUniformCudaHipRt<TApi>>
-        {
-            ALPAKA_FN_HOST static auto getDevByIdx(
-                PlatformUniformCudaHipRt<TApi> const& platform,
-                std::size_t const& devIdx) -> DevUniformCudaHipRt<TApi>
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                std::size_t const devCount = getDevCount(platform);
-                if(devIdx >= devCount)
-                {
-                    std::stringstream ssErr;
-                    ssErr << "Unable to return device handle for device " << devIdx << ". There are only " << devCount
-                          << " devices!";
-                    throw std::runtime_error(ssErr.str());
-                }
-
-                if(isDevUsable(devIdx))
-                {
-                    DevUniformCudaHipRt<TApi> dev(static_cast<int>(devIdx));
-
-                    // Log this device.
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    typename TApi::DeviceProp_t devProp;
-                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::getDeviceProperties(&devProp, dev.getNativeHandle()));
-#    endif
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    printDeviceProperties(devProp);
-#    elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                    std::cout << __func__ << devProp.name << std::endl;
-#    endif
-                    return dev;
-                }
-                else
-                {
-                    std::stringstream ssErr;
-                    ssErr << "Unable to return device handle for device " << devIdx << ". It is not accessible!";
-                    throw std::runtime_error(ssErr.str());
-                }
-            }
-
-        private:
-            //! \return If the device is usable.
-            ALPAKA_FN_HOST static auto isDevUsable(std::size_t iDevice) -> bool
-            {
-                typename TApi::Error_t rc = TApi::setDevice(static_cast<int>(iDevice));
-                typename TApi::Stream_t queue = {};
-                // Create a dummy queue to check if the device is already used by an other process.
-                // cuda/hip-SetDevice never returns an error if another process already uses the selected device and
-                // gpu compute mode is set "process exclusive". \TODO: Check if this workaround is needed!
-                if(rc == TApi::success)
-                {
-                    rc = TApi::streamCreate(&queue);
-                }
-
-                if(rc == TApi::success)
-                {
-                    // Destroy the dummy queue.
-                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamDestroy(queue));
-                    return true;
-                }
-                else
-                {
-                    // Return the previous error from cudaStreamCreate.
-                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(rc);
-                    // Reset the Error state.
-                    std::ignore = TApi::getLastError();
-                    return false;
-                }
-            }
-
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-            //! Prints all the device properties to std::cout.
-            ALPAKA_FN_HOST static auto printDeviceProperties(typename TApi::DeviceProp_t const& devProp) -> void
-            {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                constexpr auto KiB = std::size_t{1024};
-                constexpr auto MiB = KiB * KiB;
-                std::cout << "name: " << devProp.name << std::endl;
-                std::cout << "totalGlobalMem: " << devProp.totalGlobalMem / MiB << " MiB" << std::endl;
-                std::cout << "sharedMemPerBlock: " << devProp.sharedMemPerBlock / KiB << " KiB" << std::endl;
-                std::cout << "regsPerBlock: " << devProp.regsPerBlock << std::endl;
-                std::cout << "warpSize: " << devProp.warpSize << std::endl;
-                std::cout << "maxThreadsPerBlock: " << devProp.maxThreadsPerBlock << std::endl;
-                std::cout << "maxThreadsDim[3]: (" << devProp.maxThreadsDim[0] << ", " << devProp.maxThreadsDim[1]
-                          << ", " << devProp.maxThreadsDim[2] << ")" << std::endl;
-                std::cout << "maxGridSize[3]: (" << devProp.maxGridSize[0] << ", " << devProp.maxGridSize[1] << ", "
-                          << devProp.maxGridSize[2] << ")" << std::endl;
-                std::cout << "clockRate: " << devProp.clockRate << " kHz" << std::endl;
-                std::cout << "totalConstMem: " << devProp.totalConstMem / KiB << " KiB" << std::endl;
-                std::cout << "major: " << devProp.major << std::endl;
-                std::cout << "minor: " << devProp.minor << std::endl;
-
-                // std::cout << "deviceOverlap: " << devProp.deviceOverlap << std::endl;    // Deprecated
-                std::cout << "multiProcessorCount: " << devProp.multiProcessorCount << std::endl;
-                std::cout << "integrated: " << devProp.integrated << std::endl;
-                std::cout << "canMapHostMemory: " << devProp.canMapHostMemory << std::endl;
-                std::cout << "computeMode: " << devProp.computeMode << std::endl;
-                std::cout << "concurrentKernels: " << devProp.concurrentKernels << std::endl;
-                std::cout << "pciBusID: " << devProp.pciBusID << std::endl;
-                std::cout << "pciDeviceID: " << devProp.pciDeviceID << std::endl;
-                std::cout << "pciDomainID: " << devProp.pciDomainID << std::endl;
-                std::cout << "memoryClockRate: " << devProp.memoryClockRate << " kHz" << std::endl;
-                std::cout << "memoryBusWidth: " << devProp.memoryBusWidth << " b" << std::endl;
-                std::cout << "l2CacheSize: " << devProp.l2CacheSize << " B" << std::endl;
-                std::cout << "maxThreadsPerMultiProcessor: " << devProp.maxThreadsPerMultiProcessor << std::endl;
-                std::cout << "isMultiGpuBoard: " << devProp.isMultiGpuBoard << std::endl;
-                if constexpr(std::is_same_v<TApi, ApiCudaRt>)
-                {
-                    std::cout << "memPitch: " << devProp.memPitch << " B" << std::endl;
-                    std::cout << "textureAlignment: " << devProp.textureAlignment << std::endl;
-                    std::cout << "texturePitchAlignment: " << devProp.texturePitchAlignment << std::endl;
-                    std::cout << "kernelExecTimeoutEnabled: " << devProp.kernelExecTimeoutEnabled << std::endl;
-                    std::cout << "unifiedAddressing: " << devProp.unifiedAddressing << std::endl;
-                    std::cout << "multiGpuBoardGroupID: " << devProp.multiGpuBoardGroupID << std::endl;
-                    std::cout << "singleToDoublePrecisionPerfRatio: " << devProp.singleToDoublePrecisionPerfRatio
-                              << std::endl;
-                    std::cout << "pageableMemoryAccess: " << devProp.pageableMemoryAccess << std::endl;
-                    std::cout << "concurrentManagedAccess: " << devProp.concurrentManagedAccess << std::endl;
-                    std::cout << "computePreemptionSupported: " << devProp.computePreemptionSupported << std::endl;
-                    std::cout << "canUseHostPointerForRegisteredMem: " << devProp.canUseHostPointerForRegisteredMem
-                              << std::endl;
-                    std::cout << "cooperativeLaunch: " << devProp.cooperativeLaunch << std::endl;
-                    std::cout << "cooperativeMultiDeviceLaunch: " << devProp.cooperativeMultiDeviceLaunch << std::endl;
-                    std::cout << "maxTexture1D: " << devProp.maxTexture1D << std::endl;
-                    std::cout << "maxTexture1DLinear: " << devProp.maxTexture1DLinear << std::endl;
-                    std::cout << "maxTexture2D[2]: " << devProp.maxTexture2D[0] << "x" << devProp.maxTexture2D[1]
-                              << std::endl;
-                    std::cout << "maxTexture2DLinear[3]: " << devProp.maxTexture2DLinear[0] << "x"
-                              << devProp.maxTexture2DLinear[1] << "x" << devProp.maxTexture2DLinear[2] << std::endl;
-                    std::cout << "maxTexture2DGather[2]: " << devProp.maxTexture2DGather[0] << "x"
-                              << devProp.maxTexture2DGather[1] << std::endl;
-                    std::cout << "maxTexture3D[3]: " << devProp.maxTexture3D[0] << "x" << devProp.maxTexture3D[1]
-                              << "x" << devProp.maxTexture3D[2] << std::endl;
-                    std::cout << "maxTextureCubemap: " << devProp.maxTextureCubemap << std::endl;
-                    std::cout << "maxTexture1DLayered[2]: " << devProp.maxTexture1DLayered[0] << "x"
-                              << devProp.maxTexture1DLayered[1] << std::endl;
-                    std::cout << "maxTexture2DLayered[3]: " << devProp.maxTexture2DLayered[0] << "x"
-                              << devProp.maxTexture2DLayered[1] << "x" << devProp.maxTexture2DLayered[2] << std::endl;
-                    std::cout << "maxTextureCubemapLayered[2]: " << devProp.maxTextureCubemapLayered[0] << "x"
-                              << devProp.maxTextureCubemapLayered[1] << std::endl;
-                    std::cout << "maxSurface1D: " << devProp.maxSurface1D << std::endl;
-                    std::cout << "maxSurface2D[2]: " << devProp.maxSurface2D[0] << "x" << devProp.maxSurface2D[1]
-                              << std::endl;
-                    std::cout << "maxSurface3D[3]: " << devProp.maxSurface3D[0] << "x" << devProp.maxSurface3D[1]
-                              << "x" << devProp.maxSurface3D[2] << std::endl;
-                    std::cout << "maxSurface1DLayered[2]: " << devProp.maxSurface1DLayered[0] << "x"
-                              << devProp.maxSurface1DLayered[1] << std::endl;
-                    std::cout << "maxSurface2DLayered[3]: " << devProp.maxSurface2DLayered[0] << "x"
-                              << devProp.maxSurface2DLayered[1] << "x" << devProp.maxSurface2DLayered[2] << std::endl;
-                    std::cout << "maxSurfaceCubemap: " << devProp.maxSurfaceCubemap << std::endl;
-                    std::cout << "maxSurfaceCubemapLayered[2]: " << devProp.maxSurfaceCubemapLayered[0] << "x"
-                              << devProp.maxSurfaceCubemapLayered[1] << std::endl;
-                    std::cout << "surfaceAlignment: " << devProp.surfaceAlignment << std::endl;
-                    std::cout << "ECCEnabled: " << devProp.ECCEnabled << std::endl;
-                    std::cout << "tccDriver: " << devProp.tccDriver << std::endl;
-                    std::cout << "asyncEngineCount: " << devProp.asyncEngineCount << std::endl;
-                    std::cout << "streamPrioritiesSupported: " << devProp.streamPrioritiesSupported << std::endl;
-                    std::cout << "globalL1CacheSupported: " << devProp.globalL1CacheSupported << std::endl;
-                    std::cout << "localL1CacheSupported: " << devProp.localL1CacheSupported << std::endl;
-                    std::cout << "sharedMemPerMultiprocessor: " << devProp.sharedMemPerMultiprocessor << std::endl;
-                    std::cout << "regsPerMultiprocessor: " << devProp.regsPerMultiprocessor << std::endl;
-                    std::cout << "managedMemory: " << devProp.managedMemory << std::endl;
-                }
-                else
-                { // ApiHipRt
-                    std::cout << "clockInstructionRate: " << devProp.clockInstructionRate << "kHz" << std::endl;
-                    std::cout << "maxSharedMemoryPerMultiProcessor: " << devProp.maxSharedMemoryPerMultiProcessor / KiB
-                              << " KiB" << std::endl;
-                    std::cout << "gcnArchName: " << devProp.gcnArchName << std::endl;
-                    std::cout << "arch: " << std::endl;
-                    std::cout << "    hasGlobalInt32Atomics: " << devProp.arch.hasGlobalInt32Atomics << std::endl;
-                    std::cout << "    hasGlobalFloatAtomicExch: " << devProp.arch.hasGlobalFloatAtomicExch
-                              << std::endl;
-                    std::cout << "    hasSharedInt32Atomics: " << devProp.arch.hasSharedInt32Atomics << std::endl;
-                    std::cout << "    hasSharedFloatAtomicExch: " << devProp.arch.hasSharedFloatAtomicExch
-                              << std::endl;
-                    std::cout << "    hasFloatAtomicAdd: " << devProp.arch.hasFloatAtomicAdd << std::endl;
-                    std::cout << "    hasGlobalInt64Atomics: " << devProp.arch.hasGlobalInt64Atomics << std::endl;
-                    std::cout << "    hasSharedInt64Atomics: " << devProp.arch.hasSharedInt64Atomics << std::endl;
-                    std::cout << "    hasDoubles: " << devProp.arch.hasDoubles << std::endl;
-                    std::cout << "    hasWarpVote: " << devProp.arch.hasWarpVote << std::endl;
-                    std::cout << "    hasWarpBallot: " << devProp.arch.hasWarpBallot << std::endl;
-                    std::cout << "    hasWarpShuffle: " << devProp.arch.hasWarpShuffle << std::endl;
-                    std::cout << "    hasFunnelShift: " << devProp.arch.hasFunnelShift << std::endl;
-                    std::cout << "    hasThreadFenceSystem: " << devProp.arch.hasThreadFenceSystem << std::endl;
-                    std::cout << "    hasSyncThreadsExt: " << devProp.arch.hasSyncThreadsExt << std::endl;
-                    std::cout << "    hasSurfaceFuncs: " << devProp.arch.hasSurfaceFuncs << std::endl;
-                    std::cout << "    has3dGrid: " << devProp.arch.has3dGrid << std::endl;
-                    std::cout << "    hasDynamicParallelism: " << devProp.arch.hasDynamicParallelism << std::endl;
-                }
-            }
-#    endif
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/platform/Traits.hpp b/include/alpaka/platform/Traits.hpp
deleted file mode 100644
index 5c094c3..0000000
--- a/include/alpaka/platform/Traits.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/queue/Traits.hpp"
-
-#include <type_traits>
-#include <vector>
-
-namespace alpaka
-{
-    struct ConceptPlatform
-    {
-    };
-
-    //! True if TPlatform is a platform, i.e. if it implements the ConceptPlatform concept.
-    template<typename TPlatform>
-    inline constexpr bool isPlatform = concepts::ImplementsConcept<ConceptPlatform, TPlatform>::value;
-
-    //! The platform traits.
-    namespace trait
-    {
-        //! The platform type trait.
-        template<typename T, typename TSfinae = void>
-        struct PlatformType;
-
-        template<typename TPlatform>
-        struct PlatformType<
-            TPlatform,
-            std::enable_if_t<concepts::ImplementsConcept<ConceptPlatform, TPlatform>::value>>
-        {
-            using type = typename concepts::ImplementationBase<ConceptDev, TPlatform>;
-        };
-
-        //! The device count get trait.
-        template<typename T, typename TSfinae = void>
-        struct GetDevCount;
-
-        //! The device get trait.
-        template<typename T, typename TSfinae = void>
-        struct GetDevByIdx;
-    } // namespace trait
-
-    //! The platform type trait alias template to remove the ::type.
-    template<typename T>
-    using Platform = typename trait::PlatformType<T>::type;
-
-    //! \return The device identified by its index.
-    template<typename TPlatform>
-    ALPAKA_FN_HOST auto getDevCount(TPlatform const& platform)
-    {
-        return trait::GetDevCount<TPlatform>::getDevCount(platform);
-    }
-
-    //! \return The device identified by its index.
-    template<typename TPlatform>
-    ALPAKA_FN_HOST auto getDevByIdx(TPlatform const& platform, std::size_t const& devIdx) -> Dev<TPlatform>
-    {
-        return trait::GetDevByIdx<TPlatform>::getDevByIdx(platform, devIdx);
-    }
-
-    //! \return All the devices available on this accelerator.
-    template<typename TPlatform>
-    ALPAKA_FN_HOST auto getDevs(TPlatform const& platform) -> std::vector<Dev<TPlatform>>
-    {
-        std::vector<Dev<TPlatform>> devs;
-
-        std::size_t const devCount = getDevCount(platform);
-        devs.reserve(devCount);
-        for(std::size_t devIdx(0); devIdx < devCount; ++devIdx)
-        {
-            devs.push_back(getDevByIdx(platform, devIdx));
-        }
-
-        return devs;
-    }
-
-    namespace trait
-    {
-        template<typename TPlatform, typename TProperty>
-        struct QueueType<
-            TPlatform,
-            TProperty,
-            std::enable_if_t<concepts::ImplementsConcept<ConceptPlatform, TPlatform>::value>>
-        {
-            using type = typename QueueType<typename alpaka::trait::DevType<TPlatform>::type, TProperty>::type;
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/queue/Properties.hpp b/include/alpaka/queue/Properties.hpp
deleted file mode 100644
index d3e3b55..0000000
--- a/include/alpaka/queue/Properties.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2020 Rene Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-namespace alpaka
-{
-    //! Properties to define queue behavior
-    namespace property
-    {
-        //! The caller is waiting until the enqueued task is finished
-        struct Blocking;
-
-        //! The caller is NOT waiting until the enqueued task is finished
-        struct NonBlocking;
-    } // namespace property
-
-    using namespace property;
-} // namespace alpaka
diff --git a/include/alpaka/queue/QueueCpuBlocking.hpp b/include/alpaka/queue/QueueCpuBlocking.hpp
deleted file mode 100644
index 8cf4746..0000000
--- a/include/alpaka/queue/QueueCpuBlocking.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/* Copyright 2020 Jeffrey Kelling, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/event/EventCpu.hpp"
-#include "alpaka/queue/QueueGenericThreadsBlocking.hpp"
-
-namespace alpaka
-{
-    using QueueCpuBlocking = QueueGenericThreadsBlocking<DevCpu>;
-} // namespace alpaka
diff --git a/include/alpaka/queue/QueueCpuNonBlocking.hpp b/include/alpaka/queue/QueueCpuNonBlocking.hpp
deleted file mode 100644
index 78eb028..0000000
--- a/include/alpaka/queue/QueueCpuNonBlocking.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/* Copyright 2020 Jeffrey Kelling, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/event/EventCpu.hpp"
-#include "alpaka/queue/QueueGenericThreadsNonBlocking.hpp"
-
-namespace alpaka
-{
-    using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking<DevCpu>;
-} // namespace alpaka
diff --git a/include/alpaka/queue/QueueCpuSyclBlocking.hpp b/include/alpaka/queue/QueueCpuSyclBlocking.hpp
deleted file mode 100644
index 392740a..0000000
--- a/include/alpaka/queue/QueueCpuSyclBlocking.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
-
-namespace alpaka
-{
-    using QueueCpuSyclBlocking = QueueGenericSyclBlocking<TagCpuSycl>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
deleted file mode 100644
index 19904ba..0000000
--- a/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU)
-
-namespace alpaka
-{
-    using QueueCpuSyclNonBlocking = QueueGenericSyclNonBlocking<TagCpuSycl>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/QueueCudaRtBlocking.hpp b/include/alpaka/queue/QueueCudaRtBlocking.hpp
deleted file mode 100644
index c54a618..0000000
--- a/include/alpaka/queue/QueueCudaRtBlocking.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiCudaRt.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-namespace alpaka
-{
-    //! The CUDA RT blocking queue.
-    using QueueCudaRtBlocking = QueueUniformCudaHipRtBlocking<ApiCudaRt>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/queue/QueueCudaRtNonBlocking.hpp b/include/alpaka/queue/QueueCudaRtNonBlocking.hpp
deleted file mode 100644
index a7180d6..0000000
--- a/include/alpaka/queue/QueueCudaRtNonBlocking.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiCudaRt.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-namespace alpaka
-{
-    //! The CUDA RT non-blocking queue.
-    using QueueCudaRtNonBlocking = QueueUniformCudaHipRtNonBlocking<ApiCudaRt>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
diff --git a/include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp b/include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp
deleted file mode 100644
index 7c2f791..0000000
--- a/include/alpaka/queue/QueueFpgaSyclIntelBlocking.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
-
-namespace alpaka
-{
-    using QueueFpgaSyclIntelBlocking = QueueGenericSyclBlocking<TagFpgaSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp b/include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp
deleted file mode 100644
index de1d7a6..0000000
--- a/include/alpaka/queue/QueueFpgaSyclIntelNonBlocking.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
-
-namespace alpaka
-{
-    using QueueFpgaSyclIntelNonBlocking = QueueGenericSyclNonBlocking<TagFpgaSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/QueueGenericSyclBlocking.hpp b/include/alpaka/queue/QueueGenericSyclBlocking.hpp
deleted file mode 100644
index 44dfb14..0000000
--- a/include/alpaka/queue/QueueGenericSyclBlocking.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/queue/sycl/QueueGenericSyclBase.hpp"
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-namespace alpaka
-{
-    template<typename TTag>
-    using QueueGenericSyclBlocking = detail::QueueGenericSyclBase<TTag, true>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/QueueGenericSyclNonBlocking.hpp b/include/alpaka/queue/QueueGenericSyclNonBlocking.hpp
deleted file mode 100644
index 22615ca..0000000
--- a/include/alpaka/queue/QueueGenericSyclNonBlocking.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/queue/sycl/QueueGenericSyclBase.hpp"
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-namespace alpaka
-{
-    template<typename TTag>
-    using QueueGenericSyclNonBlocking = detail::QueueGenericSyclBase<TTag, false>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/QueueGenericThreadsBlocking.hpp b/include/alpaka/queue/QueueGenericThreadsBlocking.hpp
deleted file mode 100644
index 65361bd..0000000
--- a/include/alpaka/queue/QueueGenericThreadsBlocking.hpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/event/Traits.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/queue/cpu/IGenericThreadsQueue.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <atomic>
-#include <memory>
-#include <mutex>
-
-namespace alpaka
-{
-    template<typename TDev>
-    class EventGenericThreads;
-
-    namespace generic
-    {
-        namespace detail
-        {
-#if BOOST_COMP_CLANG
-// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
-// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-            //! The CPU device queue implementation.
-            template<typename TDev>
-            class QueueGenericThreadsBlockingImpl final : public IGenericThreadsQueue<TDev>
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-            {
-            public:
-                explicit QueueGenericThreadsBlockingImpl(TDev dev) noexcept
-                    : m_dev(std::move(dev))
-                    , m_bCurrentlyExecutingTask(false)
-                {
-                }
-
-                QueueGenericThreadsBlockingImpl(QueueGenericThreadsBlockingImpl<TDev> const&) = delete;
-                auto operator=(QueueGenericThreadsBlockingImpl<TDev> const&)
-                    -> QueueGenericThreadsBlockingImpl<TDev>& = delete;
-
-                void enqueue(EventGenericThreads<TDev>& ev) final
-                {
-                    alpaka::enqueue(*this, ev);
-                }
-
-                void wait(EventGenericThreads<TDev> const& ev) final
-                {
-                    alpaka::wait(*this, ev);
-                }
-
-            public:
-                TDev const m_dev; //!< The device this queue is bound to.
-                std::mutex mutable m_mutex;
-                std::atomic<bool> m_bCurrentlyExecutingTask;
-            };
-        } // namespace detail
-    } // namespace generic
-
-    //! The CPU device queue.
-    template<typename TDev>
-    class QueueGenericThreadsBlocking final
-        : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueGenericThreadsBlocking<TDev>>
-        , public concepts::Implements<ConceptQueue, QueueGenericThreadsBlocking<TDev>>
-        , public concepts::Implements<ConceptGetDev, QueueGenericThreadsBlocking<TDev>>
-    {
-    public:
-        explicit QueueGenericThreadsBlocking(TDev const& dev)
-            : m_spQueueImpl(std::make_shared<generic::detail::QueueGenericThreadsBlockingImpl<TDev>>(dev))
-        {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-            dev.registerQueue(m_spQueueImpl);
-        }
-
-        auto operator==(QueueGenericThreadsBlocking<TDev> const& rhs) const -> bool
-        {
-            return (m_spQueueImpl == rhs.m_spQueueImpl);
-        }
-
-        auto operator!=(QueueGenericThreadsBlocking<TDev> const& rhs) const -> bool
-        {
-            return !((*this) == rhs);
-        }
-
-    public:
-        std::shared_ptr<generic::detail::QueueGenericThreadsBlockingImpl<TDev>> m_spQueueImpl;
-    };
-
-    namespace trait
-    {
-        //! The CPU blocking device queue device type trait specialization.
-        template<typename TDev>
-        struct DevType<QueueGenericThreadsBlocking<TDev>>
-        {
-            using type = TDev;
-        };
-
-        //! The CPU blocking device queue device get trait specialization.
-        template<typename TDev>
-        struct GetDev<QueueGenericThreadsBlocking<TDev>>
-        {
-            ALPAKA_FN_HOST static auto getDev(QueueGenericThreadsBlocking<TDev> const& queue) -> TDev
-            {
-                return queue.m_spQueueImpl->m_dev;
-            }
-        };
-
-        //! The CPU blocking device queue event type trait specialization.
-        template<typename TDev>
-        struct EventType<QueueGenericThreadsBlocking<TDev>>
-        {
-            using type = EventGenericThreads<TDev>;
-        };
-
-        //! The CPU blocking device queue enqueue trait specialization.
-        //! This default implementation for all tasks directly invokes the function call operator of the task.
-        template<typename TDev, typename TTask>
-        struct Enqueue<QueueGenericThreadsBlocking<TDev>, TTask>
-        {
-            ALPAKA_FN_HOST static auto enqueue(QueueGenericThreadsBlocking<TDev>& queue, TTask const& task) -> void
-            {
-                std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-
-                queue.m_spQueueImpl->m_bCurrentlyExecutingTask = true;
-
-                task();
-
-                queue.m_spQueueImpl->m_bCurrentlyExecutingTask = false;
-            }
-        };
-
-        //! The CPU blocking device queue test trait specialization.
-        template<typename TDev>
-        struct Empty<QueueGenericThreadsBlocking<TDev>>
-        {
-            ALPAKA_FN_HOST static auto empty(QueueGenericThreadsBlocking<TDev> const& queue) -> bool
-            {
-                return !queue.m_spQueueImpl->m_bCurrentlyExecutingTask;
-            }
-        };
-
-        //! The CPU blocking device queue thread wait trait specialization.
-        //!
-        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
-        //! tasks (kernels, data copies, ...)
-        template<typename TDev>
-        struct CurrentThreadWaitFor<QueueGenericThreadsBlocking<TDev>>
-        {
-            ALPAKA_FN_HOST static auto currentThreadWaitFor(QueueGenericThreadsBlocking<TDev> const& queue) -> void
-            {
-                std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#include "alpaka/event/EventGenericThreads.hpp"
diff --git a/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp b/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp
deleted file mode 100644
index 4e02a91..0000000
--- a/include/alpaka/queue/QueueGenericThreadsNonBlocking.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber, Jeffrey Kelling
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/CallbackThread.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/event/Traits.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/queue/cpu/IGenericThreadsQueue.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <future>
-#include <memory>
-#include <mutex>
-#include <thread>
-#include <tuple>
-#include <type_traits>
-
-namespace alpaka
-{
-    template<typename TDev>
-    class EventGenericThreads;
-
-    namespace generic
-    {
-        namespace detail
-        {
-#if BOOST_COMP_CLANG
-// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
-// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-            //! The CPU device queue implementation.
-            template<typename TDev>
-            class QueueGenericThreadsNonBlockingImpl final : public IGenericThreadsQueue<TDev>
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-            {
-            public:
-                explicit QueueGenericThreadsNonBlockingImpl(TDev dev) : m_dev(std::move(dev))
-                {
-                }
-
-                QueueGenericThreadsNonBlockingImpl(QueueGenericThreadsNonBlockingImpl<TDev> const&) = delete;
-                QueueGenericThreadsNonBlockingImpl(QueueGenericThreadsNonBlockingImpl<TDev>&&) = delete;
-                auto operator=(QueueGenericThreadsNonBlockingImpl<TDev> const&)
-                    -> QueueGenericThreadsNonBlockingImpl<TDev>& = delete;
-                auto operator=(QueueGenericThreadsNonBlockingImpl&&)
-                    -> QueueGenericThreadsNonBlockingImpl<TDev>& = delete;
-
-                ~QueueGenericThreadsNonBlockingImpl() override
-                {
-                }
-
-                void enqueue(EventGenericThreads<TDev>& ev) final
-                {
-                    alpaka::enqueue(*this, ev);
-                }
-
-                void wait(EventGenericThreads<TDev> const& ev) final
-                {
-                    alpaka::wait(*this, ev);
-                }
-
-            public:
-                TDev const m_dev; //!< The device this queue is bound to.
-                core::CallbackThread m_workerThread;
-            };
-        } // namespace detail
-    } // namespace generic
-
-    //! The CPU device queue.
-    template<typename TDev>
-    class QueueGenericThreadsNonBlocking final
-        : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueGenericThreadsNonBlocking<TDev>>
-        , public concepts::Implements<ConceptQueue, QueueGenericThreadsNonBlocking<TDev>>
-        , public concepts::Implements<ConceptGetDev, QueueGenericThreadsNonBlocking<TDev>>
-    {
-    public:
-        explicit QueueGenericThreadsNonBlocking(TDev const& dev)
-            : m_spQueueImpl(std::make_shared<generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>>(dev))
-        {
-            ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-            dev.registerQueue(m_spQueueImpl);
-        }
-
-        auto operator==(QueueGenericThreadsNonBlocking<TDev> const& rhs) const -> bool
-        {
-            return (m_spQueueImpl == rhs.m_spQueueImpl);
-        }
-
-        auto operator!=(QueueGenericThreadsNonBlocking<TDev> const& rhs) const -> bool
-        {
-            return !((*this) == rhs);
-        }
-
-    public:
-        std::shared_ptr<generic::detail::QueueGenericThreadsNonBlockingImpl<TDev>> m_spQueueImpl;
-    };
-
-    namespace trait
-    {
-        //! The CPU non-blocking device queue device type trait specialization.
-        template<typename TDev>
-        struct DevType<QueueGenericThreadsNonBlocking<TDev>>
-        {
-            using type = TDev;
-        };
-
-        //! The CPU non-blocking device queue device get trait specialization.
-        template<typename TDev>
-        struct GetDev<QueueGenericThreadsNonBlocking<TDev>>
-        {
-            ALPAKA_FN_HOST static auto getDev(QueueGenericThreadsNonBlocking<TDev> const& queue) -> TDev
-            {
-                return queue.m_spQueueImpl->m_dev;
-            }
-        };
-
-        //! The CPU non-blocking device queue event type trait specialization.
-        template<typename TDev>
-        struct EventType<QueueGenericThreadsNonBlocking<TDev>>
-        {
-            using type = EventGenericThreads<TDev>;
-        };
-
-        //! The CPU non-blocking device queue enqueue trait specialization.
-        //! This default implementation for all tasks directly invokes the function call operator of the task.
-        template<typename TDev, typename TTask>
-        struct Enqueue<QueueGenericThreadsNonBlocking<TDev>, TTask>
-        {
-            ALPAKA_FN_HOST static auto enqueue(QueueGenericThreadsNonBlocking<TDev>& queue, TTask const& task) -> void
-            {
-                queue.m_spQueueImpl->m_workerThread.submit(task);
-            }
-        };
-
-        //! The CPU non-blocking device queue test trait specialization.
-        template<typename TDev>
-        struct Empty<QueueGenericThreadsNonBlocking<TDev>>
-        {
-            ALPAKA_FN_HOST static auto empty(QueueGenericThreadsNonBlocking<TDev> const& queue) -> bool
-            {
-                return queue.m_spQueueImpl->m_workerThread.empty();
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#include "alpaka/event/EventGenericThreads.hpp"
diff --git a/include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp b/include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp
deleted file mode 100644
index 37d4bda..0000000
--- a/include/alpaka/queue/QueueGpuSyclIntelBlocking.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
-
-namespace alpaka
-{
-    using QueueGpuSyclIntelBlocking = QueueGenericSyclBlocking<TagGpuSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp b/include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp
deleted file mode 100644
index a50299e..0000000
--- a/include/alpaka/queue/QueueGpuSyclIntelNonBlocking.hpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright 2024 Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Tag.hpp"
-#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU)
-
-namespace alpaka
-{
-    using QueueGpuSyclIntelNonBlocking = QueueGenericSyclNonBlocking<TagGpuSyclIntel>;
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/QueueHipRtBlocking.hpp b/include/alpaka/queue/QueueHipRtBlocking.hpp
deleted file mode 100644
index cdb1dfb..0000000
--- a/include/alpaka/queue/QueueHipRtBlocking.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiHipRt.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtBlocking.hpp"
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-namespace alpaka
-{
-    //! The HIP RT blocking queue.
-    using QueueHipRtBlocking = QueueUniformCudaHipRtBlocking<ApiHipRt>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/queue/QueueHipRtNonBlocking.hpp b/include/alpaka/queue/QueueHipRtNonBlocking.hpp
deleted file mode 100644
index 732609e..0000000
--- a/include/alpaka/queue/QueueHipRtNonBlocking.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/ApiHipRt.hpp"
-#include "alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp"
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-namespace alpaka
-{
-    //! The HIP RT non-blocking queue.
-    using QueueHipRtNonBlocking = QueueUniformCudaHipRtNonBlocking<ApiHipRt>;
-} // namespace alpaka
-
-#endif // ALPAKA_ACC_GPU_HIP_ENABLED
diff --git a/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp b/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp
deleted file mode 100644
index 5add0ef..0000000
--- a/include/alpaka/queue/QueueUniformCudaHipRtBlocking.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    //! The CUDA/HIP RT blocking queue.
-    template<typename TApi>
-    using QueueUniformCudaHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, true>;
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp b/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp
deleted file mode 100644
index 62b0b0f..0000000
--- a/include/alpaka/queue/QueueUniformCudaHipRtNonBlocking.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright 2022 Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    //! The CUDA/HIP RT non-blocking queue.
-    template<typename TApi>
-    using QueueUniformCudaHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, false>;
-
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/Traits.hpp b/include/alpaka/queue/Traits.hpp
deleted file mode 100644
index 71d3ec9..0000000
--- a/include/alpaka/queue/Traits.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    struct ConceptQueue;
-
-    //! True if TQueue is a queue, i.e. if it implements the ConceptQueue concept.
-    template<typename TQueue>
-    inline constexpr bool isQueue = concepts::ImplementsConcept<ConceptQueue, std::decay_t<TQueue>>::value;
-
-    //! The queue traits.
-    namespace trait
-    {
-        //! The queue enqueue trait.
-        template<typename TQueue, typename TTask, typename TSfinae = void>
-        struct Enqueue;
-
-        //! The queue empty trait.
-        template<typename TQueue, typename TSfinae = void>
-        struct Empty;
-
-        //! Queue for an accelerator
-        template<typename TAcc, typename TProperty, typename TSfinae = void>
-        struct QueueType;
-    } // namespace trait
-
-    //! Queues the given task in the given queue.
-    //!
-    //! Special Handling for events:
-    //!   If the event has previously been queued, then this call will overwrite any existing state of the event.
-    //!   Any subsequent calls which examine the status of event will only examine the completion of this most recent
-    //!   call to enqueue.
-    //!   If a queue is waiting for an event the latter's event state at the time of the API call to wait() will be
-    //!   used to release the queue.
-    template<typename TQueue, typename TTask>
-    ALPAKA_FN_HOST auto enqueue(TQueue& queue, TTask&& task) -> void
-    {
-        trait::Enqueue<TQueue, std::decay_t<TTask>>::enqueue(queue, std::forward<TTask>(task));
-    }
-
-    //! Tests if the queue is empty (all ops in the given queue have been completed).
-    //!
-    //! \warning This function is allowed to return false negatives. An empty queue can reported as
-    //! non empty because the status information are not fully propagated by the used alpaka backend.
-    //! \return true queue is empty else false.
-    template<typename TQueue>
-    ALPAKA_FN_HOST auto empty(TQueue const& queue) -> bool
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptQueue, TQueue>;
-        return trait::Empty<ImplementationBase>::empty(queue);
-    }
-
-    //! Queue based on the environment and a property
-    //!
-    //! \tparam TEnv Environment type, e.g.  accelerator, device or a platform.
-    //!              trait::QueueType must be specialized for TEnv
-    //! \tparam TProperty Property to define the behavior of TEnv.
-    template<typename TEnv, typename TProperty>
-    using Queue = typename trait::QueueType<TEnv, TProperty>::type;
-} // namespace alpaka
diff --git a/include/alpaka/queue/cpu/ICpuQueue.hpp b/include/alpaka/queue/cpu/ICpuQueue.hpp
deleted file mode 100644
index cd71072..0000000
--- a/include/alpaka/queue/cpu/ICpuQueue.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/queue/cpu/IGenericThreadsQueue.hpp"
-
-namespace alpaka::cpu
-{
-    //! The CPU queue interface
-    using ICpuQueue = IGenericThreadsQueue<DevCpu>;
-} // namespace alpaka::cpu
diff --git a/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp b/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp
deleted file mode 100644
index 3d82a9c..0000000
--- a/include/alpaka/queue/cpu/IGenericThreadsQueue.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright 2020 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-
-namespace alpaka
-{
-    template<typename TDev>
-    class EventGenericThreads;
-
-#if BOOST_COMP_CLANG
-// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
-// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wweak-vtables"
-#endif
-
-    //! The CPU queue interface
-    template<typename TDev>
-    class IGenericThreadsQueue
-    {
-    public:
-        //! enqueue the event
-        virtual void enqueue(EventGenericThreads<TDev>&) = 0;
-        //! waiting for the event
-        virtual void wait(EventGenericThreads<TDev> const&) = 0;
-        virtual ~IGenericThreadsQueue() = default;
-    };
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-} // namespace alpaka
diff --git a/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp b/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp
deleted file mode 100644
index 3a85fac..0000000
--- a/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, René Widera, Andrea Bocci, Bernhard Manfred Gruber,
- * Antonio Di Pilato
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/CallbackThread.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/event/Traits.hpp"
-#include "alpaka/meta/DependentFalseType.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/traits/Traits.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <condition_variable>
-#include <functional>
-#include <future>
-#include <memory>
-#include <mutex>
-#include <thread>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    template<typename TApi>
-    class EventUniformCudaHipRt;
-
-    template<typename TApi>
-    class DevUniformCudaHipRt;
-
-    namespace uniform_cuda_hip::detail
-    {
-        //! The CUDA/HIP RT queue implementation.
-        template<typename TApi>
-        class QueueUniformCudaHipRtImpl final
-        {
-        public:
-            ALPAKA_FN_HOST QueueUniformCudaHipRtImpl(DevUniformCudaHipRt<TApi> const& dev)
-                : m_dev(dev)
-                , m_UniformCudaHipQueue()
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::setDevice(m_dev.getNativeHandle()));
-
-                // - [cuda/hip]StreamDefault: Default queue creation flag.
-                // - [cuda/hip]StreamNonBlocking: Specifies that work running in the created queue may run
-                // concurrently with work in queue 0 (the NULL queue),
-                //   and that the created queue should perform no implicit synchronization with queue 0.
-                // Create the queue on the current device.
-                // NOTE: [cuda/hip]StreamNonBlocking is required to match the semantic implemented in the alpaka
-                // CPU queue. It would be too much work to implement implicit default queue synchronization on CPU.
-
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    TApi::streamCreateWithFlags(&m_UniformCudaHipQueue, TApi::streamNonBlocking));
-            }
-
-            QueueUniformCudaHipRtImpl(QueueUniformCudaHipRtImpl&&) = default;
-            auto operator=(QueueUniformCudaHipRtImpl&&) -> QueueUniformCudaHipRtImpl& = delete;
-
-            ALPAKA_FN_HOST ~QueueUniformCudaHipRtImpl()
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Make sure all pending async work is finished before destroying the stream to guarantee determinism.
-                // This would not be necessary for plain CUDA/HIP operations, but we can have host functions in the
-                // stream, which reference this queue instance and its CallbackThread. Make sure they are done.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamSynchronize(m_UniformCudaHipQueue));
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::streamDestroy(m_UniformCudaHipQueue));
-            }
-
-            [[nodiscard]] auto getNativeHandle() const noexcept
-            {
-                return m_UniformCudaHipQueue;
-            }
-
-        public:
-            DevUniformCudaHipRt<TApi> const m_dev; //!< The device this queue is bound to.
-            core::CallbackThread m_callbackThread;
-
-        private:
-            typename TApi::Stream_t m_UniformCudaHipQueue;
-        };
-
-        //! The CUDA/HIP RT queue.
-        template<typename TApi, bool TBlocking>
-        class QueueUniformCudaHipRt
-            : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueUniformCudaHipRt<TApi, TBlocking>>
-            , public concepts::Implements<ConceptQueue, QueueUniformCudaHipRt<TApi, TBlocking>>
-            , public concepts::Implements<ConceptGetDev, QueueUniformCudaHipRt<TApi, TBlocking>>
-        {
-        public:
-            ALPAKA_FN_HOST QueueUniformCudaHipRt(DevUniformCudaHipRt<TApi> const& dev)
-                : m_spQueueImpl(std::make_shared<QueueUniformCudaHipRtImpl<TApi>>(dev))
-            {
-                dev.registerQueue(m_spQueueImpl);
-            }
-
-            ALPAKA_FN_HOST auto operator==(QueueUniformCudaHipRt const& rhs) const -> bool
-            {
-                return (m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-
-            ALPAKA_FN_HOST auto operator!=(QueueUniformCudaHipRt const& rhs) const -> bool
-            {
-                return !((*this) == rhs);
-            }
-
-            [[nodiscard]] auto getNativeHandle() const noexcept
-            {
-                return m_spQueueImpl->getNativeHandle();
-            }
-
-            auto getCallbackThread() -> core::CallbackThread&
-            {
-                return m_spQueueImpl->m_callbackThread;
-            }
-
-        public:
-            std::shared_ptr<QueueUniformCudaHipRtImpl<TApi>> m_spQueueImpl;
-        };
-    } // namespace uniform_cuda_hip::detail
-
-    namespace trait
-    {
-        //! The CUDA/HIP RT queue device get trait specialization.
-        template<typename TApi, bool TBlocking>
-        struct GetDev<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
-        {
-            ALPAKA_FN_HOST static auto getDev(
-                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue)
-                -> DevUniformCudaHipRt<TApi>
-            {
-                return queue.m_spQueueImpl->m_dev;
-            }
-        };
-
-        //! The CUDA/HIP RT queue test trait specialization.
-        template<typename TApi, bool TBlocking>
-        struct Empty<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
-        {
-            ALPAKA_FN_HOST static auto empty(
-                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue) -> bool
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Query is allowed even for queues on non current device.
-                typename TApi::Error_t ret = TApi::success;
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
-                    ret = TApi::streamQuery(queue.getNativeHandle()),
-                    TApi::errorNotReady);
-                return (ret == TApi::success);
-            }
-        };
-
-        //! The CUDA/HIP RT queue thread wait trait specialization.
-        //!
-        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
-        //! tasks (kernels, data copies, ...)
-        template<typename TApi, bool TBlocking>
-        struct CurrentThreadWaitFor<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
-        {
-            ALPAKA_FN_HOST static auto currentThreadWaitFor(
-                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Sync is allowed even for queues on non current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
-            }
-        };
-
-        //! The CUDA/HIP RT blocking queue device type trait specialization.
-        template<typename TApi, bool TBlocking>
-        struct DevType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
-        {
-            using type = DevUniformCudaHipRt<TApi>;
-        };
-
-        //! The CUDA/HIP RT blocking queue event type trait specialization.
-        template<typename TApi, bool TBlocking>
-        struct EventType<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
-        {
-            using type = EventUniformCudaHipRt<TApi>;
-        };
-
-        //! The CUDA/HIP RT blocking queue enqueue trait specialization.
-        template<typename TApi, bool TBlocking, typename TTask>
-        struct Enqueue<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>, TTask>
-        {
-            using QueueImpl = uniform_cuda_hip::detail::QueueUniformCudaHipRtImpl<TApi>;
-
-            struct HostFuncData
-            {
-                // We don't need to keep the queue alive, because in it's dtor it will synchronize with the CUDA/HIP
-                // stream and wait until all host functions and the CallbackThread are done. It's actually an error to
-                // copy the queue into the host function. Destroying it here would call CUDA/HIP APIs from the host
-                // function. Passing it further to the Callback thread, would make the Callback thread hold a task
-                // containing the queue with the CallbackThread itself. Destroying the task if no other queue instance
-                // exists will make the CallbackThread join itself and crash.
-                QueueImpl& q;
-                TTask t;
-            };
-
-            ALPAKA_FN_HOST static void uniformCudaHipRtHostFunc(void* arg)
-            {
-                auto data = std::unique_ptr<HostFuncData>(reinterpret_cast<HostFuncData*>(arg));
-                auto& queue = data->q;
-                auto f = queue.m_callbackThread.submit([d = std::move(data)] { d->t(); });
-                f.wait();
-            }
-
-            ALPAKA_FN_HOST static auto enqueue(
-                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>& queue,
-                TTask const& task) -> void
-            {
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::launchHostFunc(
-                    queue.getNativeHandle(),
-                    uniformCudaHipRtHostFunc,
-                    new HostFuncData{*queue.m_spQueueImpl, task}));
-                if constexpr(TBlocking)
-                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::streamSynchronize(queue.getNativeHandle()));
-            }
-        };
-
-        //! The CUDA/HIP RT blocking queue native handle trait specialization.
-        template<typename TApi, bool TBlocking>
-        struct NativeHandle<uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking>>
-        {
-            [[nodiscard]] static auto getNativeHandle(
-                uniform_cuda_hip::detail::QueueUniformCudaHipRt<TApi, TBlocking> const& queue)
-            {
-                return queue.getNativeHandle();
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#endif
diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
deleted file mode 100644
index abf5763..0000000
--- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
+++ /dev/null
@@ -1,289 +0,0 @@
-/* Copyright 2024 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/event/Traits.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/traits/Traits.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <algorithm>
-#include <exception>
-#include <memory>
-#include <mutex>
-#include <shared_mutex>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    template<typename TTag>
-    class DevGenericSycl;
-
-    template<typename TTag>
-    class EventGenericSycl;
-
-    namespace detail
-    {
-        template<typename T, typename = void>
-        inline constexpr auto is_sycl_task = false;
-
-        template<typename T>
-        inline constexpr auto is_sycl_task<T, std::void_t<decltype(T::is_sycl_task)>> = true;
-
-        template<typename T, typename = void>
-        inline constexpr auto is_sycl_kernel = false;
-
-        template<typename T>
-        inline constexpr auto is_sycl_kernel<T, std::void_t<decltype(T::is_sycl_kernel)>> = true;
-
-        class QueueGenericSyclImpl
-        {
-        public:
-            QueueGenericSyclImpl(sycl::context context, sycl::device device)
-                : m_queue{
-                    std::move(context), // This is important. In SYCL a device can belong to multiple contexts.
-                    std::move(device),
-                    {sycl::property::queue::enable_profiling{}, sycl::property::queue::in_order{}}}
-            {
-            }
-
-            // This class will only exist as a pointer. We don't care about copy and move semantics.
-            QueueGenericSyclImpl(QueueGenericSyclImpl const& other) = delete;
-            auto operator=(QueueGenericSyclImpl const& rhs) -> QueueGenericSyclImpl& = delete;
-
-            QueueGenericSyclImpl(QueueGenericSyclImpl&& other) noexcept = delete;
-            auto operator=(QueueGenericSyclImpl&& rhs) noexcept -> QueueGenericSyclImpl& = delete;
-
-            ~QueueGenericSyclImpl()
-            {
-                try
-                {
-                    m_queue.wait_and_throw();
-                }
-                catch(sycl::exception const& err)
-                {
-                    std::cerr << "Caught SYCL exception while destructing a SYCL queue: " << err.what() << " ("
-                              << err.code() << ')' << std::endl;
-                }
-                catch(std::exception const& err)
-                {
-                    std::cerr << "The following runtime error(s) occured while destructing a SYCL queue:" << err.what()
-                              << std::endl;
-                }
-            }
-
-            // Don't call this without locking first!
-            auto clean_dependencies() -> void
-            {
-                // Clean up completed events
-                auto const start = std::begin(m_dependencies);
-                auto const old_end = std::end(m_dependencies);
-                auto const new_end = std::remove_if(
-                    start,
-                    old_end,
-                    [](sycl::event ev) {
-                        return ev.get_info<sycl::info::event::command_execution_status>()
-                               == sycl::info::event_command_status::complete;
-                    });
-
-                m_dependencies.erase(new_end, old_end);
-            }
-
-            auto register_dependency(sycl::event event) -> void
-            {
-                std::lock_guard<std::shared_mutex> lock{m_mutex};
-
-                clean_dependencies();
-                m_dependencies.push_back(event);
-            }
-
-            auto empty() const -> bool
-            {
-                std::shared_lock<std::shared_mutex> lock{m_mutex};
-                return m_last_event.get_info<sycl::info::event::command_execution_status>()
-                       == sycl::info::event_command_status::complete;
-            }
-
-            auto wait() -> void
-            {
-                // SYCL queues are thread-safe.
-                m_queue.wait_and_throw();
-            }
-
-            auto get_last_event() const -> sycl::event
-            {
-                std::shared_lock<std::shared_mutex> lock{m_mutex};
-                return m_last_event;
-            }
-
-            template<bool TBlocking, typename TTask>
-            auto enqueue(TTask const& task) -> void
-            {
-                {
-                    std::lock_guard<std::shared_mutex> lock{m_mutex};
-
-                    clean_dependencies();
-
-                    // Execute task
-                    if constexpr(is_sycl_task<TTask> && !is_sycl_kernel<TTask>) // Copy / Fill
-                    {
-                        m_last_event = task(m_queue, m_dependencies); // Will call queue.{copy, fill} internally
-                    }
-                    else
-                    {
-                        m_last_event = m_queue.submit(
-                            [this, &task](sycl::handler& cgh)
-                            {
-                                if(!m_dependencies.empty())
-                                    cgh.depends_on(m_dependencies);
-
-                                if constexpr(is_sycl_kernel<TTask>) // Kernel
-                                    task(cgh); // Will call cgh.parallel_for internally
-                                else // Host
-                                    cgh.host_task(task);
-                            });
-                    }
-
-                    m_dependencies.clear();
-                }
-
-                if constexpr(TBlocking)
-                    wait();
-            }
-
-            [[nodiscard]] auto getNativeHandle() const noexcept
-            {
-                return m_queue;
-            }
-
-            std::vector<sycl::event> m_dependencies;
-            sycl::event m_last_event;
-            std::shared_mutex mutable m_mutex;
-
-        private:
-            sycl::queue m_queue;
-        };
-
-        template<typename TTag, bool TBlocking>
-        class QueueGenericSyclBase
-            : public concepts::Implements<ConceptCurrentThreadWaitFor, QueueGenericSyclBase<TTag, TBlocking>>
-            , public concepts::Implements<ConceptQueue, QueueGenericSyclBase<TTag, TBlocking>>
-            , public concepts::Implements<ConceptGetDev, QueueGenericSyclBase<TTag, TBlocking>>
-        {
-        public:
-            QueueGenericSyclBase(DevGenericSycl<TTag> const& dev)
-                : m_dev{dev}
-                , m_spQueueImpl{std::make_shared<detail::QueueGenericSyclImpl>(
-                      dev.getNativeHandle().second,
-                      dev.getNativeHandle().first)}
-            {
-                m_dev.m_impl->register_queue(m_spQueueImpl);
-            }
-
-            friend auto operator==(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
-            {
-                return (lhs.m_dev == rhs.m_dev) && (lhs.m_spQueueImpl == rhs.m_spQueueImpl);
-            }
-
-            friend auto operator!=(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
-            {
-                return !(lhs == rhs);
-            }
-
-            [[nodiscard]] auto getNativeHandle() const noexcept
-            {
-                return m_spQueueImpl->getNativeHandle();
-            }
-
-            DevGenericSycl<TTag> m_dev;
-            std::shared_ptr<detail::QueueGenericSyclImpl> m_spQueueImpl;
-        };
-    } // namespace detail
-
-    namespace trait
-    {
-        //! The SYCL blocking queue device type trait specialization.
-        template<typename TTag, bool TBlocking>
-        struct DevType<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
-        {
-            using type = DevGenericSycl<TTag>;
-        };
-
-        //! The SYCL blocking queue device get trait specialization.
-        template<typename TTag, bool TBlocking>
-        struct GetDev<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
-        {
-            static auto getDev(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue)
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                return queue.m_dev;
-            }
-        };
-
-        //! The SYCL blocking queue event type trait specialization.
-        template<typename TTag, bool TBlocking>
-        struct EventType<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
-        {
-            using type = EventGenericSycl<TTag>;
-        };
-
-        //! The SYCL blocking queue enqueue trait specialization.
-        template<typename TTag, bool TBlocking, typename TTask>
-        struct Enqueue<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>, TTask>
-        {
-            static auto enqueue(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>& queue, TTask const& task)
-                -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                queue.m_spQueueImpl->template enqueue<TBlocking>(task);
-            }
-        };
-
-        //! The SYCL blocking queue test trait specialization.
-        template<typename TTag, bool TBlocking>
-        struct Empty<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
-        {
-            static auto empty(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue) -> bool
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                return queue.m_spQueueImpl->empty();
-            }
-        };
-
-        //! The SYCL blocking queue thread wait trait specialization.
-        //!
-        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
-        //! tasks (kernels, data copies, ...)
-        template<typename TTag, bool TBlocking>
-        struct CurrentThreadWaitFor<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
-        {
-            static auto currentThreadWaitFor(alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue)
-                -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-                queue.m_spQueueImpl->wait();
-            }
-        };
-
-        //! The SYCL queue native handle trait specialization.
-        template<typename TTag, bool TBlocking>
-        struct NativeHandle<alpaka::detail::QueueGenericSyclBase<TTag, TBlocking>>
-        {
-            [[nodiscard]] static auto getNativeHandle(
-                alpaka::detail::QueueGenericSyclBase<TTag, TBlocking> const& queue)
-            {
-                return queue.getNativeHandle();
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-#endif
diff --git a/include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp b/include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp
deleted file mode 100644
index e0c0361..0000000
--- a/include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2023 Jiří Vyskočil, Bernhard Manfred Gruber, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-#include <cstdint>
-
-namespace alpaka::rand
-{
-    /// Get high 32 bits of a 64-bit number
-    ALPAKA_FN_HOST_ACC inline constexpr auto high32Bits(std::uint64_t const x) -> std::uint32_t
-    {
-        return static_cast<std::uint32_t>(x >> 32);
-    }
-
-    /// Get low 32 bits of a 64-bit number
-    ALPAKA_FN_HOST_ACC inline constexpr auto low32Bits(std::uint64_t const x) -> std::uint32_t
-    {
-        return static_cast<std::uint32_t>(x & 0xffff'ffff);
-    }
-
-    /** Multiply two 64-bit numbers and split the result into high and low 32 bits, also known as "mulhilo32"
-     *
-     * @param a first 64-bit multiplier
-     * @param b second 64-bit multiplier
-     * @param resultHigh high 32 bits of the product a*b
-     * @param resultLow low 32 bits of the product a*b
-     */
-    // TODO: See single-instruction implementations in original Philox source code
-    ALPAKA_FN_HOST_ACC inline constexpr void multiplyAndSplit64to32(
-        std::uint64_t const a,
-        std::uint64_t const b,
-        std::uint32_t& resultHigh,
-        std::uint32_t& resultLow)
-    {
-        std::uint64_t res64 = a * b;
-        resultHigh = high32Bits(res64);
-        resultLow = low32Bits(res64);
-    }
-} // namespace alpaka::rand
diff --git a/include/alpaka/rand/Philox/PhiloxBaseCommon.hpp b/include/alpaka/rand/Philox/PhiloxBaseCommon.hpp
deleted file mode 100644
index e80d8a1..0000000
--- a/include/alpaka/rand/Philox/PhiloxBaseCommon.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber, Jeffrey Kelling
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/rand/Philox/PhiloxStateless.hpp"
-
-#include <utility>
-
-namespace alpaka::rand::engine
-{
-    /** Common class for Philox family engines
-     *
-     * Relies on `PhiloxStateless` to provide the PRNG and adds state to handling the counting.
-     *
-     * @tparam TParams Philox algorithm parameters \sa PhiloxParams
-     * @tparam TImpl engine type implementation (CRTP)
-     *
-     * static const data members are transformed into functions, because GCC
-     * assumes types with static data members to be not mappable and makes not
-     * exception for constexpr ones. This is a valid interpretation of the
-     * OpenMP <= 4.5 standard. In OpenMP >= 5.0 types with any kind of static
-     * data member are mappable.
-     */
-    template<typename TParams, typename TImpl>
-    class PhiloxBaseCommon : public PhiloxStateless<TParams>
-    {
-    public:
-        using Counter = typename PhiloxStateless<TParams>::Counter;
-        using Key = typename PhiloxStateless<TParams>::Key;
-
-        /// Distribution container type
-        template<typename TDistributionResultScalar>
-        using ResultContainer = typename alpaka::Vec<alpaka::DimInt<TParams::counterSize>, TDistributionResultScalar>;
-
-    protected:
-        /** Advance the \a counter to the next state
-         *
-         * Increments the passed-in \a counter by one with a 128-bit carry.
-         *
-         * @param counter reference to the counter which is to be advanced
-         */
-        ALPAKA_FN_HOST_ACC void advanceCounter(Counter& counter)
-        {
-            counter[0]++;
-            /* 128-bit carry */
-            if(counter[0] == 0)
-            {
-                counter[1]++;
-                if(counter[1] == 0)
-                {
-                    counter[2]++;
-                    if(counter[2] == 0)
-                    {
-                        counter[3]++;
-                    }
-                }
-            }
-        }
-
-        /** Advance the internal state counter by \a offset N-vectors (N = counter size)
-         *
-         * Advances the internal value of this->state.counter
-         *
-         * @param offset number of N-vectors to skip
-         */
-        ALPAKA_FN_HOST_ACC void skip4(uint64_t offset)
-        {
-            Counter& counter = static_cast<TImpl*>(this)->state.counter;
-            Counter temp = counter;
-            counter[0] += low32Bits(offset);
-            counter[1] += high32Bits(offset) + (counter[0] < temp[0] ? 1 : 0);
-            counter[2] += (counter[0] < temp[1] ? 1u : 0u);
-            counter[3] += (counter[0] < temp[2] ? 1u : 0u);
-        }
-
-        /** Advance the counter by the length of \a subsequence
-         *
-         * Advances the internal value of this->state.counter
-         *
-         * @param subsequence number of subsequences to skip
-         */
-        ALPAKA_FN_HOST_ACC void skipSubsequence(uint64_t subsequence)
-        {
-            Counter& counter = static_cast<TImpl*>(this)->state.counter;
-            Counter temp = counter;
-            counter[2] += low32Bits(subsequence);
-            counter[3] += high32Bits(subsequence) + (counter[2] < temp[2] ? 1 : 0);
-        }
-    };
-} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/Philox/PhiloxConstants.hpp b/include/alpaka/rand/Philox/PhiloxConstants.hpp
deleted file mode 100644
index 831a1de..0000000
--- a/include/alpaka/rand/Philox/PhiloxConstants.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
-
-#include <cstdint>
-#include <utility>
-
-namespace alpaka::rand::engine
-{
-    /** Constants used in the Philox algorithm
-     *
-     * The numbers are taken from the reference Philox implementation:
-     *
-     * J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
-     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking,
-     * Storage and Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
-     *
-     * @tparam TParams basic Philox algorithm parameters
-     *
-     * static const data members are transformed into functions, because GCC
-     * assumes types with static data members to be not mappable and makes not
-     * exception for constexpr ones. This is a valid interpretation of the
-     * OpenMP <= 4.5 standard. In OpenMP >= 5.0 types with any kind of static
-     * data member are mappable.
-     */
-    template<typename TParams>
-    class PhiloxConstants
-    {
-    public:
-        /// First Weyl sequence parameter: the golden ratio
-        static constexpr std::uint64_t WEYL_64_0()
-        {
-            return 0x9E37'79B9'7F4A'7C15;
-        }
-
-        /// Second Weyl sequence parameter: \f$ \sqrt{3}-1 \f$
-        static constexpr std::uint64_t WEYL_64_1()
-        {
-            return 0xBB67'AE85'84CA'A73B;
-        }
-
-        /// 1st Weyl sequence parameter, 32 bits
-        static constexpr std::uint32_t WEYL_32_0()
-        {
-            return high32Bits(WEYL_64_0());
-        }
-
-        /// 2nd Weyl sequence parameter, 32 bits
-        static constexpr std::uint32_t WEYL_32_1()
-        {
-            return high32Bits(WEYL_64_1());
-        }
-
-        /// First Philox S-box multiplier
-        static constexpr std::uint32_t MULTIPLITER_4x32_0()
-        {
-            return 0xCD9E'8D57;
-        }
-
-        /// Second Philox S-box multiplier
-        static constexpr std::uint32_t MULTIPLITER_4x32_1()
-        {
-            return 0xD251'1F53;
-        }
-    };
-} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/Philox/PhiloxSingle.hpp b/include/alpaka/rand/Philox/PhiloxSingle.hpp
deleted file mode 100644
index 3f7b6ff..0000000
--- a/include/alpaka/rand/Philox/PhiloxSingle.hpp
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright 2022 Jiri Vyskocil, Rene Widera, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
-#include "alpaka/rand/Philox/PhiloxBaseCommon.hpp"
-
-#include <utility>
-
-namespace alpaka::rand::engine
-{
-    /** Philox state for single value engine
-     *
-     * @tparam TCounter Type of the Counter array
-     * @tparam TKey Type of the Key array
-     */
-    template<typename TCounter, typename TKey>
-    struct PhiloxStateSingle
-    {
-        using Counter = TCounter;
-        using Key = TKey;
-
-        /// Counter array
-        Counter counter;
-        /// Key array
-        Key key;
-        /// Intermediate result array
-        Counter result;
-        /// Pointer to the active intermediate result element
-        std::uint32_t position;
-        // TODO: Box-Muller states
-    };
-
-    /** Philox engine generating a single number
-     *
-     * This engine's operator() will return a single number. Since the result is the same size as the counter,
-     * and so it contains more than one number, it has to be stored between individual invocations of
-     * operator(). Additionally a pointer has to be stored indicating which part of the result array is to be
-     * returned next.
-     *
-     * @tparam TParams Basic parameters for the Philox algorithm
-     */
-    template<typename TParams>
-    class PhiloxSingle : public PhiloxBaseCommon<TParams, PhiloxSingle<TParams>>
-    {
-    public:
-        using Base = PhiloxBaseCommon<TParams, PhiloxSingle<TParams>>;
-
-        /// Counter type
-        using Counter = typename Base::Counter;
-        /// Key type
-        using Key = typename Base::Key;
-        /// State type
-        using State = PhiloxStateSingle<Counter, Key>;
-
-        /// Internal engine state
-        State state;
-
-    protected:
-        /** Advance internal counter to the next value
-         *
-         * Advances the full internal counter array, resets the position pointer and stores the intermediate
-         * result to be recalled when the user requests a number.
-         */
-        ALPAKA_FN_HOST_ACC void advanceState()
-        {
-            this->advanceCounter(state.counter);
-            state.result = this->nRounds(state.counter, state.key);
-            state.position = 0;
-        }
-
-        /** Get the next random number and advance internal state
-         *
-         * The intermediate result stores N = TParams::counterSize numbers. Check if we've already given out
-         * all of them. If so, generate a new intermediate result (this also resets the pointer to the position
-         * of the actual number). Finally, we return the actual number.
-         *
-         * @return The next random number
-         */
-        ALPAKA_FN_HOST_ACC auto nextNumber()
-        {
-            // Element zero will always contain the next valid random number.
-            auto result = state.result[0];
-            state.position++;
-            if(state.position == TParams::counterSize)
-            {
-                advanceState();
-            }
-            else
-            {
-                // Shift state results to allow hard coded access to element zero.
-                // This will avoid high register usage on NVIDIA devices.
-                // \todo Check if this shifting of the result vector is decreasing CPU performance.
-                //       If so this optimization for GPUs (mostly NVIDIA) should be moved into
-                //       PhiloxBaseCudaArray.
-                state.result[0] = state.result[1];
-                state.result[1] = state.result[2];
-                state.result[2] = state.result[3];
-            }
-
-            return result;
-        }
-
-        /// Skips the next \a offset numbers
-        ALPAKA_FN_HOST_ACC void skip(uint64_t offset)
-        {
-            static_assert(TParams::counterSize == 4, "Only counterSize is supported.");
-            state.position = static_cast<decltype(state.position)>(state.position + (offset & 3));
-            offset += state.position < 4 ? 0 : 4;
-            state.position -= state.position < 4 ? 0 : 4u;
-            for(auto numShifts = state.position; numShifts > 0; --numShifts)
-            {
-                // Shift state results to allow hard coded access to element zero.
-                // This will avoid high register usage on NVIDIA devices.
-                state.result[0] = state.result[1];
-                state.result[1] = state.result[2];
-                state.result[2] = state.result[3];
-            }
-            this->skip4(offset / 4);
-        }
-
-    public:
-        /** Construct a new Philox engine with single-value output
-         *
-         * @param seed Set the Philox generator key
-         * @param subsequence Select a subsequence of size 2^64
-         * @param offset Skip \a offset numbers form the start of the subsequence
-         */
-        ALPAKA_FN_HOST_ACC PhiloxSingle(uint64_t seed = 0, uint64_t subsequence = 0, uint64_t offset = 0)
-            : state{{0, 0, 0, 0}, {low32Bits(seed), high32Bits(seed)}, {0, 0, 0, 0}, 0}
-        {
-            this->skipSubsequence(subsequence);
-            skip(offset);
-            advanceState();
-        }
-
-        /** Get the next random number
-         *
-         * @return The next random number
-         */
-        ALPAKA_FN_HOST_ACC auto operator()()
-        {
-            return nextNumber();
-        }
-    };
-} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/Philox/PhiloxStateless.hpp b/include/alpaka/rand/Philox/PhiloxStateless.hpp
deleted file mode 100644
index 3011d44..0000000
--- a/include/alpaka/rand/Philox/PhiloxStateless.hpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber, Jeffrey Kelling
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Unroll.hpp"
-#include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
-#include "alpaka/rand/Philox/PhiloxConstants.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <utility>
-
-namespace alpaka::rand::engine
-{
-    /** Philox algorithm parameters
-     *
-     * @tparam TCounterSize number of elements in the counter
-     * @tparam TWidth width of one counter element (in bits)
-     * @tparam TRounds number of S-box rounds
-     */
-    template<unsigned TCounterSize, unsigned TWidth, unsigned TRounds>
-    struct PhiloxParams
-    {
-        static constexpr unsigned counterSize = TCounterSize;
-        static constexpr unsigned width = TWidth;
-        static constexpr unsigned rounds = TRounds;
-    };
-
-    /** Class basic Philox family counter-based PRNG
-     *
-     * Checks the validity of passed-in parameters and calls the backend methods to perform N rounds of the
-     * Philox shuffle.
-     *
-     * @tparam TParams Philox algorithm parameters \sa PhiloxParams
-     */
-    template<typename TParams>
-    class PhiloxStateless : public PhiloxConstants<TParams>
-    {
-        static constexpr unsigned numRounds()
-        {
-            return TParams::rounds;
-        }
-
-        static constexpr unsigned vectorSize()
-        {
-            return TParams::counterSize;
-        }
-
-        static constexpr unsigned numberWidth()
-        {
-            return TParams::width;
-        }
-
-        static_assert(numRounds() > 0, "Number of Philox rounds must be > 0.");
-        static_assert(vectorSize() % 2 == 0, "Philox counter size must be an even number.");
-        static_assert(vectorSize() <= 16, "Philox SP network is not specified for sizes > 16.");
-        static_assert(numberWidth() % 8 == 0, "Philox number width in bits must be a multiple of 8.");
-
-        static_assert(numberWidth() == 32, "Philox implemented only for 32 bit numbers.");
-
-    public:
-        using Counter = alpaka::Vec<alpaka::DimInt<TParams::counterSize>, std::uint32_t>;
-        using Key = alpaka::Vec<alpaka::DimInt<TParams::counterSize / 2>, std::uint32_t>;
-        using Constants = PhiloxConstants<TParams>;
-
-    protected:
-        /** Single round of the Philox shuffle
-         *
-         * @param counter state of the counter
-         * @param key value of the key
-         * @return shuffled counter
-         */
-        static ALPAKA_FN_HOST_ACC auto singleRound(Counter const& counter, Key const& key)
-        {
-            std::uint32_t H0, L0, H1, L1;
-            multiplyAndSplit64to32(counter[0], Constants::MULTIPLITER_4x32_0(), H0, L0);
-            multiplyAndSplit64to32(counter[2], Constants::MULTIPLITER_4x32_1(), H1, L1);
-            return Counter{H1 ^ counter[1] ^ key[0], L1, H0 ^ counter[3] ^ key[1], L0};
-        }
-
-        /** Bump the \a key by the Weyl sequence step parameter
-         *
-         * @param key the key to be bumped
-         * @return the bumped key
-         */
-        static ALPAKA_FN_HOST_ACC auto bumpKey(Key const& key)
-        {
-            return Key{key[0] + Constants::WEYL_32_0(), key[1] + Constants::WEYL_32_1()};
-        }
-
-        /** Performs N rounds of the Philox shuffle
-         *
-         * @param counter_in initial state of the counter
-         * @param key_in initial state of the key
-         * @return result of the PRNG shuffle; has the same size as the counter
-         */
-        static ALPAKA_FN_HOST_ACC auto nRounds(Counter const& counter_in, Key const& key_in) -> Counter
-        {
-            Key key{key_in};
-            Counter counter = singleRound(counter_in, key);
-
-            ALPAKA_UNROLL(numRounds())
-            for(unsigned int n = 0; n < numRounds(); ++n)
-            {
-                key = bumpKey(key);
-                counter = singleRound(counter, key);
-            }
-
-            return counter;
-        }
-
-    public:
-        /** Generates a random number (\p TCounterSize x32-bit)
-         *
-         * @param counter initial state of the counter
-         * @param key initial state of the key
-         * @return result of the PRNG shuffle; has the same size as the counter
-         */
-        static ALPAKA_FN_HOST_ACC auto generate(Counter const& counter, Key const& key) -> Counter
-        {
-            return nRounds(counter, key);
-        }
-    };
-} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp b/include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp
deleted file mode 100644
index bb6795b..0000000
--- a/include/alpaka/rand/Philox/PhiloxStatelessKeyedBase.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright 2022 Jeffrey Kelling
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/rand/Philox/PhiloxStateless.hpp"
-
-namespace alpaka::rand::engine
-{
-    /** Common class for Philox family engines
-     *
-     * Checks the validity of passed-in parameters and calls the backend methods to perform N rounds of the
-     * Philox shuffle.
-     *
-     * @tparam TParams Philox algorithm parameters \sa PhiloxParams
-     */
-    template<typename TParams>
-    struct PhiloxStatelessKeyedBase : public PhiloxStateless<TParams>
-    {
-    public:
-        using Counter = typename PhiloxStateless<TParams>::Counter;
-        using Key = typename PhiloxStateless<TParams>::Key;
-
-        Key const m_key;
-
-        PhiloxStatelessKeyedBase(Key&& key) : m_key(std::move(key))
-        {
-        }
-
-        ALPAKA_FN_HOST_ACC auto operator()(Counter const& counter) const
-        {
-            return this->generate(counter, m_key);
-        }
-    };
-} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/Philox/PhiloxVector.hpp b/include/alpaka/rand/Philox/PhiloxVector.hpp
deleted file mode 100644
index 64c89b4..0000000
--- a/include/alpaka/rand/Philox/PhiloxVector.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/rand/Philox/MultiplyAndSplit64to32.hpp"
-#include "alpaka/rand/Philox/PhiloxBaseCommon.hpp"
-
-#include <utility>
-
-namespace alpaka::rand::engine
-{
-    /** Philox state for vector generator
-     *
-     * @tparam TCounter Type of the Counter array
-     * @tparam TKey Type of the Key array
-     */
-    template<typename TCounter, typename TKey>
-    struct PhiloxStateVector
-    {
-        using Counter = TCounter;
-        using Key = TKey;
-
-        /// Counter array
-        Counter counter;
-        /// Key array
-        Key key;
-    };
-
-    /** Philox engine generating a vector of numbers
-     *
-     * This engine's operator() will return a vector of numbers corresponding to the full size of its counter.
-     * This is a convenience vs. memory size tradeoff since the user has to deal with the output array
-     * themselves, but the internal state comprises only of a single counter and a key.
-     *
-     * @tparam TParams Basic parameters for the Philox algorithm
-     */
-    template<typename TParams>
-    class PhiloxVector : public PhiloxBaseCommon<TParams, PhiloxVector<TParams>>
-    {
-    public:
-        using Base = PhiloxBaseCommon<TParams, PhiloxVector<TParams>>;
-
-        /// Counter type
-        using Counter = typename Base::Counter;
-        /// Key type
-        using Key = typename Base::Key;
-        /// State type
-        using State = PhiloxStateVector<Counter, Key>;
-
-        template<typename TDistributionResultScalar>
-        using ResultContainer = typename Base::template ResultContainer<TDistributionResultScalar>;
-
-        State state;
-
-    protected:
-        /** Get the next array of random numbers and advance internal state
-         *
-         * @return The next array of random numbers
-         */
-        ALPAKA_FN_HOST_ACC auto nextVector()
-        {
-            this->advanceCounter(state.counter);
-            return this->nRounds(state.counter, state.key);
-        }
-
-        /** Skips the next \a offset vectors
-         *
-         * Unlike its counterpart in \a PhiloxSingle, this function advances the state in multiples of the
-         * counter size thus skipping the entire array of numbers.
-         */
-        ALPAKA_FN_HOST_ACC void skip(uint64_t offset)
-        {
-            this->skip4(offset);
-        }
-
-    public:
-        /** Construct a new Philox engine with vector output
-         *
-         * @param seed Set the Philox generator key
-         * @param subsequence Select a subsequence of size 2^64
-         * @param offset Skip \a offset numbers form the start of the subsequence
-         */
-        ALPAKA_FN_HOST_ACC PhiloxVector(uint64_t seed = 0, uint64_t subsequence = 0, uint64_t offset = 0)
-            : state{{0, 0, 0, 0}, {low32Bits(seed), high32Bits(seed)}}
-        {
-            this->skipSubsequence(subsequence);
-            skip(offset);
-            nextVector();
-        }
-
-        /** Get the next vector of random numbers
-         *
-         * @return The next vector of random numbers
-         */
-        ALPAKA_FN_HOST_ACC auto operator()()
-        {
-            return nextVector();
-        }
-    };
-} // namespace alpaka::rand::engine
diff --git a/include/alpaka/rand/RandDefault.hpp b/include/alpaka/rand/RandDefault.hpp
deleted file mode 100644
index bbe763c..0000000
--- a/include/alpaka/rand/RandDefault.hpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/* Copyright 2022 Jeffrey Kelling, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/math/Traits.hpp"
-#include "alpaka/rand/RandPhilox.hpp"
-#include "alpaka/rand/Traits.hpp"
-
-#include <algorithm>
-#include <limits>
-#include <type_traits>
-
-namespace alpaka::rand
-{
-    class RandDefault : public concepts::Implements<ConceptRand, RandDefault>
-    {
-    };
-
-    namespace distribution::gpu
-    {
-        namespace detail
-        {
-            template<typename TFloat>
-            struct BitsType;
-
-            template<>
-            struct BitsType<float>
-            {
-                using type = std::uint32_t;
-            };
-
-            template<>
-            struct BitsType<double>
-            {
-                using type = std::uint64_t;
-            };
-        } // namespace detail
-
-        //! The GPU random number normal distribution.
-        template<typename T>
-        class UniformUint
-        {
-            static_assert(std::is_integral_v<T>, "Return type of UniformUint must be integral.");
-
-        public:
-            UniformUint() = default;
-
-            template<typename TEngine>
-            ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> T
-            {
-                using BitsT = typename TEngine::result_type;
-                T ret = 0;
-                constexpr auto N = sizeof(T) / sizeof(BitsT);
-                for(unsigned int a = 0; a < N; ++a)
-                {
-                    ret
-                        ^= (static_cast<T>(engine())
-                            << (sizeof(BitsT) * std::numeric_limits<unsigned char>::digits * a));
-                }
-                return ret;
-            }
-        };
-
-        //! The GPU random number uniform distribution.
-        template<typename T>
-        class UniformReal
-        {
-            static_assert(std::is_floating_point_v<T>, "Return type of UniformReal must be floating point.");
-
-            using BitsT = typename detail::BitsType<T>::type;
-
-        public:
-            UniformReal() = default;
-
-            template<typename TEngine>
-            ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> T
-            {
-                constexpr BitsT limit = static_cast<BitsT>(1) << std::numeric_limits<T>::digits;
-                BitsT const b = UniformUint<BitsT>()(engine);
-                auto const ret = static_cast<T>(b & (limit - 1)) / limit;
-                return ret;
-            }
-        };
-
-        /*! The GPU random number normal distribution.
-         *
-         * \note
-         * This type contains state and is not thread-safe: To be used
-         * per thread, not shared.
-         *
-         * \note When reproducibility is a concern, each instance of
-         * this class should be used with only on random engine
-         * instance, or two consecutive number should be generated with
-         * each engine used. This is due to the implicit caching of one
-         * Gaussian random number.
-         */
-        template<typename Acc, typename T>
-        class NormalReal
-        {
-            static_assert(std::is_floating_point_v<T>, "Return type of NormalReal must be floating point.");
-
-            Acc const* m_acc;
-            T m_cache = std::numeric_limits<T>::quiet_NaN();
-
-        public:
-            /*! \warning Retains a reference to \p acc, thus must not outlive it.
-             */
-            ALPAKA_FN_HOST_ACC constexpr NormalReal(Acc const& acc) : m_acc(&acc)
-            {
-            }
-
-            // All copy operations (and thus also move since we don't declare those and they fall back to copy) do NOT
-            // copy m_cache. This way we can ensure that the following holds:
-            // NormalReal<Acc> a(acc), b(acc);
-            // Engine<Acc> e(acc);
-            // assert(a(e) != b(e)); // because of two engine invocations
-            // b = a;
-            // assert(a(e) != b(e)); // because of two engine invocations
-
-            ALPAKA_FN_HOST_ACC constexpr NormalReal(NormalReal const& other) : m_acc(other.m_acc)
-            {
-            }
-
-            ALPAKA_FN_HOST_ACC constexpr auto operator=(NormalReal const& other) -> NormalReal&
-            {
-                m_acc = other.m_acc;
-                return *this;
-            }
-
-            template<typename TEngine>
-            ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> T
-            {
-                constexpr auto sigma = T{1};
-                constexpr auto mu = T{0};
-                if(math::isnan(*m_acc, m_cache))
-                {
-                    UniformReal<T> uni;
-
-                    T u1, u2;
-                    do
-                    {
-                        u1 = uni(engine);
-                        u2 = uni(engine);
-                    } while(u1 <= std::numeric_limits<T>::epsilon());
-
-                    // compute z0 and z1
-                    T const mag = sigma * math::sqrt(*m_acc, static_cast<T>(-2.) * math::log(*m_acc, u1));
-                    constexpr T twoPi = static_cast<T>(2. * math::constants::pi);
-                    // getting two normal number out of this, store one for later
-                    m_cache = mag * static_cast<T>(math::cos(*m_acc, twoPi * u2)) + mu;
-
-                    return mag * static_cast<T>(math::sin(*m_acc, twoPi * u2)) + mu;
-                }
-
-                T const ret = m_cache;
-                m_cache = std::numeric_limits<T>::quiet_NaN();
-                return ret;
-            }
-        };
-    } // namespace distribution::gpu
-
-    namespace distribution::trait
-    {
-        //! The GPU device random number float normal distribution get trait specialization.
-        template<typename T>
-        struct CreateNormalReal<RandDefault, T, std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            template<typename TAcc>
-            ALPAKA_FN_HOST_ACC static auto createNormalReal(TAcc const& acc) -> gpu::NormalReal<TAcc, T>
-            {
-                return {acc};
-            }
-        };
-
-        //! The GPU device random number float uniform distribution get trait specialization.
-        template<typename T>
-        struct CreateUniformReal<RandDefault, T, std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            ALPAKA_FN_HOST_ACC static auto createUniformReal(RandDefault const& /* rand */) -> gpu::UniformReal<T>
-            {
-                return {};
-            }
-        };
-
-        //! The GPU device random number integer uniform distribution get trait specialization.
-        template<typename T>
-        struct CreateUniformUint<RandDefault, T, std::enable_if_t<std::is_integral_v<T>>>
-        {
-            ALPAKA_FN_HOST_ACC static auto createUniformUint(RandDefault const& /* rand */) -> gpu::UniformUint<T>
-            {
-                return {};
-            }
-        };
-    } // namespace distribution::trait
-
-    namespace engine::trait
-    {
-        //! The GPU device random number default generator get trait specialization.
-        template<>
-        struct CreateDefault<RandDefault>
-        {
-            template<typename TAcc>
-            ALPAKA_FN_HOST_ACC static auto createDefault(
-                TAcc const& /* acc */,
-                std::uint32_t const& seed,
-                std::uint32_t const& subsequence,
-                std::uint32_t const& offset) -> Philox4x32x10
-            {
-                return {seed, subsequence, offset};
-            }
-        };
-    } // namespace engine::trait
-} // namespace alpaka::rand
diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp
deleted file mode 100644
index c114a4f..0000000
--- a/include/alpaka/rand/RandGenericSycl.hpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/* Copyright 2023 Luca Ferragina, Aurora Perego, Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/rand/Traits.hpp"
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && !defined(ALPAKA_DISABLE_VENDOR_RNG)
-
-// Backend specific imports.
-#    include <sycl/sycl.hpp>
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wcast-align"
-#        pragma clang diagnostic ignored "-Wcast-qual"
-#        pragma clang diagnostic ignored "-Wextra-semi"
-#        pragma clang diagnostic ignored "-Wfloat-equal"
-#        pragma clang diagnostic ignored "-Wold-style-cast"
-#        pragma clang diagnostic ignored "-Wreserved-identifier"
-#        pragma clang diagnostic ignored "-Wreserved-macro-identifier"
-#        pragma clang diagnostic ignored "-Wsign-compare"
-#        pragma clang diagnostic ignored "-Wundef"
-#    endif
-#    include <oneapi/dpl/random>
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
-
-#    include <type_traits>
-
-namespace alpaka::rand
-{
-    //! The SYCL rand implementation.
-    template<typename TDim>
-    struct RandGenericSycl : concepts::Implements<ConceptRand, RandGenericSycl<TDim>>
-    {
-        explicit RandGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_rand{my_item}
-        {
-        }
-
-        sycl::nd_item<TDim::value> m_item_rand;
-    };
-
-#    if !defined(ALPAKA_HOST_ONLY)
-    namespace distribution::sycl_rand
-    {
-        //! The SYCL random number floating point normal distribution.
-        template<typename T>
-        struct NormalReal;
-
-        //! The SYCL random number uniform distribution.
-        template<typename T>
-        struct Uniform;
-    } // namespace distribution::sycl_rand
-
-    namespace engine::sycl_rand
-    {
-        //! The SYCL linear congruential random number generator engine.
-        template<typename TDim>
-        class Minstd
-        {
-        public:
-            // After calling this constructor the instance is not valid initialized and
-            // need to be overwritten with a valid object
-            Minstd() = default;
-
-            Minstd(RandGenericSycl<TDim> rand, std::uint32_t const& seed)
-            {
-                oneapi::dpl::minstd_rand engine(seed, rand.m_item_rand.get_global_linear_id());
-                rng_engine = engine;
-            }
-
-        private:
-            template<typename T>
-            friend struct distribution::sycl_rand::NormalReal;
-            template<typename T>
-            friend struct distribution::sycl_rand::Uniform;
-
-            oneapi::dpl::minstd_rand rng_engine;
-
-        public:
-            using result_type = float;
-
-            ALPAKA_FN_HOST_ACC static result_type min()
-            {
-                return std::numeric_limits<result_type>::min();
-            }
-
-            ALPAKA_FN_HOST_ACC static result_type max()
-            {
-                return std::numeric_limits<result_type>::max();
-            }
-
-            result_type operator()()
-            {
-                oneapi::dpl::uniform_real_distribution<float> distr;
-                return distr(rng_engine);
-            }
-        };
-    } // namespace engine::sycl_rand
-
-    namespace distribution::sycl_rand
-    {
-
-        //! The SYCL random number double normal distribution.
-        template<typename F>
-        struct NormalReal
-        {
-            static_assert(std::is_floating_point_v<F>);
-
-            template<typename TEngine>
-            auto operator()(TEngine& engine) -> F
-            {
-                oneapi::dpl::normal_distribution<F> distr;
-                return distr(engine.rng_engine);
-            }
-        };
-
-        //! The SYCL random number float uniform distribution.
-        template<typename T>
-        struct Uniform
-        {
-            static_assert(std::is_floating_point_v<T> || std::is_unsigned_v<T>);
-
-            template<typename TEngine>
-            auto operator()(TEngine& engine) -> T
-            {
-                if constexpr(std::is_floating_point_v<T>)
-                {
-                    oneapi::dpl::uniform_real_distribution<T> distr;
-                    return distr(engine.rng_engine);
-                }
-                else
-                {
-                    oneapi::dpl::uniform_int_distribution<T> distr;
-                    return distr(engine.rng_engine);
-                }
-            }
-        };
-    } // namespace distribution::sycl_rand
-
-    namespace distribution::trait
-    {
-        //! The SYCL random number float normal distribution get trait specialization.
-        template<typename TDim, typename T>
-        struct CreateNormalReal<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            static auto createNormalReal(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::NormalReal<T>
-            {
-                return {};
-            }
-        };
-
-        //! The SYCL random number float uniform distribution get trait specialization.
-        template<typename TDim, typename T>
-        struct CreateUniformReal<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            static auto createUniformReal(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::Uniform<T>
-            {
-                return {};
-            }
-        };
-
-        //! The SYCL random number integer uniform distribution get trait specialization.
-        template<typename TDim, typename T>
-        struct CreateUniformUint<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_integral_v<T>>>
-        {
-            static auto createUniformUint(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::Uniform<T>
-            {
-                return {};
-            }
-        };
-    } // namespace distribution::trait
-
-    namespace engine::trait
-    {
-        //! The SYCL random number default generator get trait specialization.
-        template<typename TDim>
-        struct CreateDefault<RandGenericSycl<TDim>>
-        {
-            static auto createDefault(
-                RandGenericSycl<TDim> const& rand,
-                std::uint32_t const& seed = 0,
-                std::uint32_t const& /* subsequence */ = 0,
-                std::uint32_t const& /* offset */ = 0) -> sycl_rand::Minstd<TDim>
-            {
-                return {rand, seed};
-            }
-        };
-    } // namespace engine::trait
-#    endif
-} // namespace alpaka::rand
-
-#endif
diff --git a/include/alpaka/rand/RandPhilox.hpp b/include/alpaka/rand/RandPhilox.hpp
deleted file mode 100644
index d11cacb..0000000
--- a/include/alpaka/rand/RandPhilox.hpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright 2022 Jiří Vyskočil, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/meta/IsArrayOrVector.hpp"
-#include "alpaka/rand/Philox/PhiloxSingle.hpp"
-#include "alpaka/rand/Philox/PhiloxVector.hpp"
-#include "alpaka/rand/Traits.hpp"
-
-#include <cstdint>
-#include <limits>
-#include <random>
-#include <type_traits>
-
-namespace alpaka::rand
-{
-    /** Most common Philox engine variant, outputs single number
-     *
-     * This is a variant of the Philox engine generator which outputs a single float. The counter size is \f$4
-     * \times 32 = 128\f$ bits. Since the engine returns a single number, the generated result, which has the same
-     * size as the counter, has to be stored between invocations. Additionally a 32 bit pointer is stored. The
-     * total size of the state is 352 bits = 44 bytes.
-     *
-     * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
-     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
-     * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
-     */
-    class Philox4x32x10 : public concepts::Implements<ConceptRand, Philox4x32x10>
-    {
-    public:
-        /// Philox algorithm: 10 rounds, 4 numbers of size 32.
-        using EngineParams = engine::PhiloxParams<4, 32, 10>;
-        /// Engine outputs a single number
-        using EngineVariant = engine::PhiloxSingle<EngineParams>;
-
-        /** Initialize a new Philox engine
-         *
-         * @param seed Set the Philox generator key
-         * @param subsequence Select a subsequence of size 2^64
-         * @param offset Skip \a offset numbers form the start of the subsequence
-         */
-        ALPAKA_FN_HOST_ACC Philox4x32x10(
-            std::uint64_t const seed = 0,
-            std::uint64_t const subsequence = 0,
-            std::uint64_t const offset = 0)
-            : engineVariant(seed, subsequence, offset)
-        {
-        }
-
-        // STL UniformRandomBitGenerator concept
-        // https://en.cppreference.com/w/cpp/named_req/UniformRandomBitGenerator
-        using result_type = std::uint32_t;
-
-        ALPAKA_FN_HOST_ACC constexpr auto min() -> result_type
-        {
-            return 0;
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr auto max() -> result_type
-        {
-            return std::numeric_limits<result_type>::max();
-        }
-
-        ALPAKA_FN_HOST_ACC auto operator()() -> result_type
-        {
-            return engineVariant();
-        }
-
-    private:
-        EngineVariant engineVariant;
-    };
-
-    /** Most common Philox engine variant, outputs a 4-vector of floats
-     *
-     * This is a variant of the Philox engine generator which outputs a vector containing 4 floats. The counter
-     * size is \f$4 \times 32 = 128\f$ bits. Since the engine returns the whole generated vector, it is up to the
-     * user to extract individual floats as they need. The benefit is smaller state size since the state does not
-     * contain the intermediate results. The total size of the state is 192 bits = 24 bytes.
-     *
-     * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
-     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
-     * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
-     */
-    class Philox4x32x10Vector : public concepts::Implements<ConceptRand, Philox4x32x10Vector>
-    {
-    public:
-        using EngineParams = engine::PhiloxParams<4, 32, 10>;
-        using EngineVariant = engine::PhiloxVector<EngineParams>;
-
-        /** Initialize a new Philox engine
-         *
-         * @param seed Set the Philox generator key
-         * @param subsequence Select a subsequence of size 2^64
-         * @param offset Number of numbers to skip form the start of the subsequence.
-         */
-        ALPAKA_FN_HOST_ACC Philox4x32x10Vector(
-            std::uint32_t const seed = 0,
-            std::uint32_t const subsequence = 0,
-            std::uint32_t const offset = 0)
-            : engineVariant(seed, subsequence, offset)
-        {
-        }
-
-        template<typename TScalar>
-        using ResultContainer = typename EngineVariant::template ResultContainer<TScalar>;
-
-        using ResultInt = std::uint32_t;
-        using ResultVec = decltype(std::declval<EngineVariant>()());
-
-        ALPAKA_FN_HOST_ACC constexpr auto min() -> ResultInt
-        {
-            return 0;
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr auto max() -> ResultInt
-        {
-            return std::numeric_limits<ResultInt>::max();
-        }
-
-        ALPAKA_FN_HOST_ACC auto operator()() -> ResultVec
-        {
-            return engineVariant();
-        }
-
-    private:
-        EngineVariant engineVariant;
-    };
-
-    // The following exists because you "cannot call __device__ function from a __host__ __device__ function"
-    // directly, but wrapping that call in a struct is just fine.
-    template<typename TEngine>
-    struct EngineCallHostAccProxy
-    {
-        ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> decltype(engine())
-        {
-            return engine();
-        }
-    };
-
-    /// TEMP: Distributions to be decided on later. The generator should be compatible with STL as of now.
-    template<typename TResult, typename TSfinae = void>
-    class UniformReal : public concepts::Implements<ConceptRand, UniformReal<TResult>>
-    {
-        template<typename TRes, typename TEnable = void>
-        struct ResultType
-        {
-            using type = TRes;
-        };
-
-        template<typename TRes>
-        struct ResultType<TRes, std::enable_if_t<meta::IsArrayOrVector<TRes>::value>>
-        {
-            using type = typename TRes::value_type;
-        };
-
-        using T = typename ResultType<TResult>::type;
-        static_assert(std::is_floating_point_v<T>, "Only floating-point types are supported");
-
-    public:
-        ALPAKA_FN_HOST_ACC UniformReal() : UniformReal(0, 1)
-        {
-        }
-
-        ALPAKA_FN_HOST_ACC UniformReal(T min, T max) : _min(min), _max(max), _range(_max - _min)
-        {
-        }
-
-        template<typename TEngine>
-        ALPAKA_FN_HOST_ACC auto operator()(TEngine& engine) -> TResult
-        {
-            if constexpr(meta::IsArrayOrVector<TResult>::value)
-            {
-                auto result = engine();
-                T scale = static_cast<T>(1) / static_cast<T>(engine.max()) * _range;
-                TResult ret{
-                    static_cast<T>(result[0]) * scale + _min,
-                    static_cast<T>(result[1]) * scale + _min,
-                    static_cast<T>(result[2]) * scale + _min,
-                    static_cast<T>(result[3]) * scale + _min};
-                return ret;
-            }
-            else
-            {
-                // Since it's possible to get a host-only engine here, the call has to go through proxy
-                return static_cast<T>(EngineCallHostAccProxy<TEngine>{}(engine)) / static_cast<T>(engine.max())
-                           * _range
-                       + _min;
-            }
-
-            ALPAKA_UNREACHABLE(TResult{});
-        }
-
-    private:
-        T const _min;
-        T const _max;
-        T const _range;
-    };
-} // namespace alpaka::rand
diff --git a/include/alpaka/rand/RandPhiloxStateless.hpp b/include/alpaka/rand/RandPhiloxStateless.hpp
deleted file mode 100644
index b2530d1..0000000
--- a/include/alpaka/rand/RandPhiloxStateless.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2022 Jeffrey Kelling
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/rand/Philox/PhiloxStateless.hpp"
-#include "alpaka/rand/Traits.hpp"
-
-namespace alpaka::rand
-{
-    /** Most common Philox engine variant, stateless, outputs a 4-vector of floats
-     *
-     * This is a variant of the Philox engine generator which outputs a vector containing 4 floats. The counter
-     * size is \f$4 \times 32 = 128\f$ bits. Since the engine returns the whole generated vector, it is up to the
-     * user to extract individual floats as they need. The benefit is smaller state size since the state does not
-     * contain the intermediate results. The total size of the state is 192 bits = 24 bytes.
-     *
-     * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
-     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
-     * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
-     */
-    class PhiloxStateless4x32x10Vector
-        : public alpaka::rand::engine::PhiloxStateless<engine::PhiloxParams<4, 32, 10>>
-        , public concepts::Implements<ConceptRand, PhiloxStateless4x32x10Vector>
-    {
-    public:
-        using EngineParams = engine::PhiloxParams<4, 32, 10>;
-    };
-} // namespace alpaka::rand
diff --git a/include/alpaka/rand/RandStdLib.hpp b/include/alpaka/rand/RandStdLib.hpp
deleted file mode 100644
index ec507e0..0000000
--- a/include/alpaka/rand/RandStdLib.hpp
+++ /dev/null
@@ -1,279 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/rand/TinyMT/Engine.hpp"
-#include "alpaka/rand/Traits.hpp"
-
-#include <cstdint>
-#include <limits>
-#include <random>
-#include <type_traits>
-
-namespace alpaka::rand
-{
-    //! "Tiny" state mersenne twister implementation
-    class TinyMersenneTwister : public concepts::Implements<ConceptRand, TinyMersenneTwister>
-    {
-    };
-
-    using RandStdLib = TinyMersenneTwister;
-
-    //! The standard library mersenne twister implementation.
-    class MersenneTwister : public concepts::Implements<ConceptRand, MersenneTwister>
-    {
-    };
-
-    //! The standard library rand device implementation.
-    class RandomDevice : public concepts::Implements<ConceptRand, RandomDevice>
-    {
-    };
-
-    namespace engine::cpu
-    {
-        //! The standard library mersenne twister random number generator.
-        //!
-        //! size of state: 19937 bytes
-        class MersenneTwister
-        {
-            std::mt19937 state;
-
-        public:
-            MersenneTwister() = default;
-
-            ALPAKA_FN_HOST MersenneTwister(
-                std::uint32_t const& seed,
-                std::uint32_t const& subsequence = 0,
-                std::uint32_t const& offset = 0)
-                : // NOTE: XOR the seed and the subsequence to generate a unique seed.
-                state((seed ^ subsequence) + offset)
-            {
-            }
-
-            // STL UniformRandomBitGenerator concept interface
-            using result_type = std::mt19937::result_type;
-
-            ALPAKA_FN_HOST static constexpr auto min() -> result_type
-            {
-                return std::mt19937::min();
-            }
-
-            ALPAKA_FN_HOST static constexpr auto max() -> result_type
-            {
-                return std::mt19937::max();
-            }
-
-            ALPAKA_FN_HOST auto operator()() -> result_type
-            {
-                return state();
-            }
-        };
-
-        //! "Tiny" state mersenne twister implementation
-        //!
-        //! repository: github.com/MersenneTwister-Lab/TinyMT
-        //!
-        //! license: 3-clause BSD
-        //!
-        //! @author Mutsuo Saito (Hiroshima University)Tokio University.
-        //! @author Makoto Matsumoto (The University of Tokyo)
-        //!
-        //! size of state: 28 bytes (127 bits?!)
-        class TinyMersenneTwister
-        {
-            TinyMTengine state;
-
-        public:
-            TinyMersenneTwister() = default;
-
-            ALPAKA_FN_HOST TinyMersenneTwister(
-                std::uint32_t const& seed,
-                std::uint32_t const& subsequence = 0,
-                std::uint32_t const& offset = 0)
-                : // NOTE: XOR the seed and the subsequence to generate a unique seed.
-                state((seed ^ subsequence) + offset)
-            {
-            }
-
-            // STL UniformRandomBitGenerator concept interface
-            using result_type = TinyMTengine::result_type;
-
-            ALPAKA_FN_HOST static constexpr auto min() -> result_type
-            {
-                return TinyMTengine::min();
-            }
-
-            ALPAKA_FN_HOST static constexpr auto max() -> result_type
-            {
-                return TinyMTengine::max();
-            }
-
-            ALPAKA_FN_HOST auto operator()() -> result_type
-            {
-                return state();
-            }
-        };
-
-        //! The standard library's random device based on the local entropy pool.
-        //!
-        //! Warning: the entropy pool on many devices degrates quickly and performance
-        //!          will drop significantly when this point occures.
-        //!
-        //! size of state: 1 byte
-        class RandomDevice
-        {
-            std::random_device state;
-
-        public:
-            RandomDevice() = default;
-
-            ALPAKA_FN_HOST RandomDevice(std::uint32_t const&, std::uint32_t const& = 0, std::uint32_t const& = 0)
-            {
-            }
-
-            // STL UniformRandomBitGenerator concept interface
-            using result_type = std::random_device::result_type;
-
-            ALPAKA_FN_HOST static constexpr auto min() -> result_type
-            {
-                return std::random_device::min();
-            }
-
-            ALPAKA_FN_HOST static constexpr auto max() -> result_type
-            {
-                return std::random_device::max();
-            }
-
-            ALPAKA_FN_HOST auto operator()() -> result_type
-            {
-                return state();
-            }
-        };
-    } // namespace engine::cpu
-
-    namespace distribution::cpu
-    {
-        //! The CPU random number normal distribution.
-        template<typename T>
-        struct NormalReal
-        {
-            template<typename TEngine>
-            ALPAKA_FN_HOST auto operator()(TEngine& engine) -> T
-            {
-                return m_dist(engine);
-            }
-
-        private:
-            std::normal_distribution<T> m_dist;
-        };
-
-        //! The CPU random number uniform distribution.
-        template<typename T>
-        struct UniformReal
-        {
-            template<typename TEngine>
-            ALPAKA_FN_HOST auto operator()(TEngine& engine) -> T
-            {
-                return m_dist(engine);
-            }
-
-        private:
-            std::uniform_real_distribution<T> m_dist;
-        };
-
-        //! The CPU random number normal distribution.
-        template<typename T>
-        struct UniformUint
-        {
-            template<typename TEngine>
-            ALPAKA_FN_HOST auto operator()(TEngine& engine) -> T
-            {
-                return m_dist(engine);
-            }
-
-        private:
-            std::uniform_int_distribution<T> m_dist{
-                0, // For signed integer: std::numeric_limits<T>::lowest()
-                std::numeric_limits<T>::max()};
-        };
-    } // namespace distribution::cpu
-
-    namespace distribution::trait
-    {
-        //! The CPU device random number float normal distribution get trait specialization.
-        template<typename T>
-        struct CreateNormalReal<RandStdLib, T, std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            ALPAKA_FN_HOST static auto createNormalReal(RandStdLib const& /* rand */) -> cpu::NormalReal<T>
-            {
-                return {};
-            }
-        };
-
-        //! The CPU device random number float uniform distribution get trait specialization.
-        template<typename T>
-        struct CreateUniformReal<RandStdLib, T, std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            ALPAKA_FN_HOST static auto createUniformReal(RandStdLib const& /* rand */) -> cpu::UniformReal<T>
-            {
-                return {};
-            }
-        };
-
-        //! The CPU device random number integer uniform distribution get trait specialization.
-        template<typename T>
-        struct CreateUniformUint<RandStdLib, T, std::enable_if_t<std::is_integral_v<T>>>
-        {
-            ALPAKA_FN_HOST static auto createUniformUint(RandStdLib const& /* rand */) -> cpu::UniformUint<T>
-            {
-                return {};
-            }
-        };
-    } // namespace distribution::trait
-
-    namespace engine::trait
-    {
-        //! The CPU device random number default generator get trait specialization.
-        template<>
-        struct CreateDefault<TinyMersenneTwister>
-        {
-            ALPAKA_FN_HOST static auto createDefault(
-                TinyMersenneTwister const& /* rand */,
-                std::uint32_t const& seed = 0,
-                std::uint32_t const& subsequence = 0,
-                std::uint32_t const& offset = 0) -> cpu::TinyMersenneTwister
-            {
-                return {seed, subsequence, offset};
-            }
-        };
-
-        template<>
-        struct CreateDefault<MersenneTwister>
-        {
-            ALPAKA_FN_HOST static auto createDefault(
-                MersenneTwister const& /* rand */,
-                std::uint32_t const& seed = 0,
-                std::uint32_t const& subsequence = 0,
-                std::uint32_t const& offset = 0) -> cpu::MersenneTwister
-            {
-                return {seed, subsequence, offset};
-            }
-        };
-
-        template<>
-        struct CreateDefault<RandomDevice>
-        {
-            ALPAKA_FN_HOST static auto createDefault(
-                RandomDevice const& /* rand */,
-                std::uint32_t const& seed = 0,
-                std::uint32_t const& subsequence = 0,
-                std::uint32_t const& offset = 0) -> cpu::RandomDevice
-            {
-                return {seed, subsequence, offset};
-            }
-        };
-    } // namespace engine::trait
-} // namespace alpaka::rand
diff --git a/include/alpaka/rand/RandUniformCudaHipRand.hpp b/include/alpaka/rand/RandUniformCudaHipRand.hpp
deleted file mode 100644
index 63ffea9..0000000
--- a/include/alpaka/rand/RandUniformCudaHipRand.hpp
+++ /dev/null
@@ -1,283 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, René Widera, Andrea Bocci, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/dev/DevUniformCudaHipRt.hpp"
-#include "alpaka/rand/Traits.hpp"
-
-#include <type_traits>
-
-#if(defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)) && !defined(ALPAKA_DISABLE_VENDOR_RNG)
-
-#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-#        include <curand_kernel.h>
-#    elif defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-#        if BOOST_COMP_CLANG
-#            pragma clang diagnostic push
-#            pragma clang diagnostic ignored "-Wduplicate-decl-specifier"
-#        endif
-
-#        if HIP_VERSION >= 50'200'000
-#            include <hiprand/hiprand_kernel.h>
-#        else
-#            include <hiprand_kernel.h>
-#        endif
-
-#        if BOOST_COMP_CLANG
-#            pragma clang diagnostic pop
-#        endif
-#    endif
-
-namespace alpaka::rand
-{
-    //! The CUDA/HIP rand implementation.
-    template<typename TApi>
-    class RandUniformCudaHipRand : public concepts::Implements<ConceptRand, RandUniformCudaHipRand<TApi>>
-    {
-    };
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-    namespace distribution::uniform_cuda_hip
-    {
-        //! The CUDA/HIP random number floating point normal distribution.
-        template<typename T>
-        class NormalReal;
-
-        //! The CUDA/HIP random number floating point uniform distribution.
-        template<typename T>
-        class UniformReal;
-
-        //! The CUDA/HIP random number integer uniform distribution.
-        template<typename T>
-        class UniformUint;
-    } // namespace distribution::uniform_cuda_hip
-
-    namespace engine::uniform_cuda_hip
-    {
-        //! The CUDA/HIP Xor random number generator engine.
-        class Xor
-        {
-        public:
-            // After calling this constructor the instance is not valid initialized and
-            // need to be overwritten with a valid object
-            Xor() = default;
-
-            __device__ Xor(
-                std::uint32_t const& seed,
-                std::uint32_t const& subsequence = 0,
-                std::uint32_t const& offset = 0)
-            {
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                curand_init(seed, subsequence, offset, &state);
-#        else
-                hiprand_init(seed, subsequence, offset, &state);
-#        endif
-            }
-
-        private:
-            template<typename T>
-            friend class distribution::uniform_cuda_hip::NormalReal;
-            template<typename T>
-            friend class distribution::uniform_cuda_hip::UniformReal;
-            template<typename T>
-            friend class distribution::uniform_cuda_hip::UniformUint;
-
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-            curandStateXORWOW_t state = curandStateXORWOW_t{};
-#        else
-            hiprandStateXORWOW_t state = hiprandStateXORWOW_t{};
-#        endif
-
-        public:
-            // STL UniformRandomBitGenerator concept. This is not strictly necessary as the distributions
-            // contained in this file are aware of the API specifics of the CUDA/HIP XORWOW engine and STL
-            // distributions might not work on the device, but it servers a compatibility bridge to other
-            // potentially compatible alpaka distributions.
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-            using result_type = decltype(curand(&state));
-#        else
-            using result_type = decltype(hiprand(&state));
-#        endif
-            ALPAKA_FN_HOST_ACC static constexpr result_type min()
-            {
-                return std::numeric_limits<result_type>::min();
-            }
-
-            ALPAKA_FN_HOST_ACC static constexpr result_type max()
-            {
-                return std::numeric_limits<result_type>::max();
-            }
-
-            __device__ result_type operator()()
-            {
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                return curand(&state);
-#        else
-                return hiprand(&state);
-#        endif
-            }
-        };
-    } // namespace engine::uniform_cuda_hip
-
-    namespace distribution::uniform_cuda_hip
-    {
-        //! The CUDA/HIP random number float normal distribution.
-        template<>
-        class NormalReal<float>
-        {
-        public:
-            template<typename TEngine>
-            __device__ auto operator()(TEngine& engine) -> float
-            {
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                return curand_normal(&engine.state);
-#        else
-                return hiprand_normal(&engine.state);
-#        endif
-            }
-        };
-
-        //! The CUDA/HIP random number float normal distribution.
-        template<>
-        class NormalReal<double>
-        {
-        public:
-            template<typename TEngine>
-            __device__ auto operator()(TEngine& engine) -> double
-            {
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                return curand_normal_double(&engine.state);
-#        else
-                return hiprand_normal_double(&engine.state);
-#        endif
-            }
-        };
-
-        //! The CUDA/HIP random number float uniform distribution.
-        template<>
-        class UniformReal<float>
-        {
-        public:
-            template<typename TEngine>
-            __device__ auto operator()(TEngine& engine) -> float
-            {
-                // (0.f, 1.0f]
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                float const fUniformRand(curand_uniform(&engine.state));
-#        else
-                float const fUniformRand(hiprand_uniform(&engine.state));
-#        endif
-                // NOTE: (1.0f - curand_uniform) does not work, because curand_uniform seems to return
-                // denormalized floats around 0.f. [0.f, 1.0f)
-                return fUniformRand * static_cast<float>(fUniformRand != 1.0f);
-            }
-        };
-
-        //! The CUDA/HIP random number float uniform distribution.
-        template<>
-        class UniformReal<double>
-        {
-        public:
-            template<typename TEngine>
-            __device__ auto operator()(TEngine& engine) -> double
-            {
-                // (0.f, 1.0f]
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                double const fUniformRand(curand_uniform_double(&engine.state));
-#        else
-                double const fUniformRand(hiprand_uniform_double(&engine.state));
-#        endif
-                // NOTE: (1.0f - curand_uniform_double) does not work, because curand_uniform_double seems to
-                // return denormalized floats around 0.f. [0.f, 1.0f)
-                return fUniformRand * static_cast<double>(fUniformRand != 1.0);
-            }
-        };
-
-        //! The CUDA/HIP random number unsigned integer uniform distribution.
-        template<>
-        class UniformUint<unsigned int>
-        {
-        public:
-            template<typename TEngine>
-            __device__ auto operator()(TEngine& engine) -> unsigned int
-            {
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                return curand(&engine.state);
-#        else
-                return hiprand(&engine.state);
-#        endif
-            }
-        };
-    } // namespace distribution::uniform_cuda_hip
-
-    namespace distribution::trait
-    {
-        //! The CUDA/HIP random number float normal distribution get trait specialization.
-        template<typename TApi, typename T>
-        struct CreateNormalReal<RandUniformCudaHipRand<TApi>, T, std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            static __device__ auto createNormalReal(RandUniformCudaHipRand<TApi> const& /*rand*/)
-                -> uniform_cuda_hip::NormalReal<T>
-            {
-                return {};
-            }
-        };
-
-        //! The CUDA/HIP random number float uniform distribution get trait specialization.
-        template<typename TApi, typename T>
-        struct CreateUniformReal<RandUniformCudaHipRand<TApi>, T, std::enable_if_t<std::is_floating_point_v<T>>>
-        {
-            static __device__ auto createUniformReal(RandUniformCudaHipRand<TApi> const& /*rand*/)
-                -> uniform_cuda_hip::UniformReal<T>
-            {
-                return {};
-            }
-        };
-
-        //! The CUDA/HIP random number integer uniform distribution get trait specialization.
-        template<typename TApi, typename T>
-        struct CreateUniformUint<RandUniformCudaHipRand<TApi>, T, std::enable_if_t<std::is_integral_v<T>>>
-        {
-            static __device__ auto createUniformUint(RandUniformCudaHipRand<TApi> const& /*rand*/)
-                -> uniform_cuda_hip::UniformUint<T>
-            {
-                return {};
-            }
-        };
-    } // namespace distribution::trait
-
-    namespace engine::trait
-    {
-        //! The CUDA/HIP random number default generator get trait specialization.
-        template<typename TApi>
-        struct CreateDefault<RandUniformCudaHipRand<TApi>>
-        {
-            static __device__ auto createDefault(
-                RandUniformCudaHipRand<TApi> const& /*rand*/,
-                std::uint32_t const& seed = 0,
-                std::uint32_t const& subsequence = 0,
-                std::uint32_t const& offset = 0) -> uniform_cuda_hip::Xor
-            {
-                return {seed, subsequence, offset};
-            }
-        };
-    } // namespace engine::trait
-#    endif
-} // namespace alpaka::rand
-
-#endif
diff --git a/include/alpaka/rand/TinyMT/Engine.hpp b/include/alpaka/rand/TinyMT/Engine.hpp
deleted file mode 100644
index 9f5d05e..0000000
--- a/include/alpaka/rand/TinyMT/Engine.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/rand/TinyMT/tinymt32.h"
-
-#include <cstdint>
-
-namespace alpaka::rand::engine::cpu
-{
-    //! Implementation of std::UniformRandomBitGenerator for TinyMT32
-    struct TinyMTengine
-    {
-        using result_type = std::uint32_t;
-
-        static constexpr auto default_seed() -> result_type
-        {
-            return 42u;
-        }
-
-        void seed(result_type value = default_seed())
-        {
-            // parameters from TinyMT/jump/sample.c
-            prng.mat1 = 0x8f70'11ee;
-            prng.mat2 = 0xfc78'ff1f;
-            prng.tmat = 0x3793'fdff;
-
-            tinymt32_init(&prng, value);
-        }
-
-        TinyMTengine(std::uint32_t const& seedValue)
-        {
-            seed(seedValue);
-        }
-
-        TinyMTengine()
-        {
-            seed(default_seed());
-        }
-
-        auto operator()() -> result_type
-        {
-            return tinymt32_generate_uint32(&prng);
-        }
-
-        static constexpr auto min() -> result_type
-        {
-            return 0u;
-        }
-
-        static constexpr auto max() -> result_type
-        {
-            return UINT32_MAX;
-        }
-
-        void discard(unsigned long long) // z
-        {
-            // not implemented
-            // tinymt32_jump( &prng, z, z );
-        }
-
-        tinymt32_t prng;
-    };
-} // namespace alpaka::rand::engine::cpu
diff --git a/include/alpaka/rand/TinyMT/LICENSE.txt b/include/alpaka/rand/TinyMT/LICENSE.txt
deleted file mode 100644
index 88bd896..0000000
--- a/include/alpaka/rand/TinyMT/LICENSE.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2019 Mutsuo Saito
- *
- * This file is part of alpaka.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-Copyright (c) 2011, 2013 Mutsuo Saito, Makoto Matsumoto,
-Hiroshima University and The University of Tokyo.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-      copyright notice, this list of conditions and the following
-      disclaimer in the documentation and/or other materials provided
-      with the distribution.
-    * Neither the name of the Hiroshima University nor the names of
-      its contributors may be used to endorse or promote products
-      derived from this software without specific prior written
-      permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/include/alpaka/rand/TinyMT/tinymt32.h b/include/alpaka/rand/TinyMT/tinymt32.h
deleted file mode 100644
index 55a946f..0000000
--- a/include/alpaka/rand/TinyMT/tinymt32.h
+++ /dev/null
@@ -1,429 +0,0 @@
-/* Copyright 2011 - 2023 Mutsuo Saito, Makoto Matsumoto, Axel Hübl, Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: BSD-3-Clause
- */
-// clang-format off
-#ifndef TINYMT32_H
-#define TINYMT32_H
-/**
- * @file tinymt32.h
- *
- * @brief Tiny Mersenne Twister only 127 bit internal state
- *
- * @author Mutsuo Saito (Hiroshima University)
- * @author Makoto Matsumoto (University of Tokyo)
- *
- * Copyright (C) 2011 Mutsuo Saito, Makoto Matsumoto,
- * Hiroshima University and The University of Tokyo.
- * All rights reserved.
- *
- * The 3-clause BSD License is applied to this software, see
- * LICENSE.txt
- */
-
-#include "alpaka/core/BoostPredef.hpp"
-
-#include <cstdint>
-/* work-around for glibc < 2.18 according to bug
- * https://sourceware.org/bugzilla/show_bug.cgi?id=15366
- */
-#ifndef UINT32_MAX
-#   define UINT32_MAX ((uint32_t)-1u)
-#endif
-#ifndef UINT32_C
-#   define UINT32_C(value) uint_least32_t(value)
-#endif
-#include <cinttypes>
-
-#if BOOST_COMP_CLANG
-#   pragma clang diagnostic push
-#   pragma clang diagnostic ignored "-Wold-style-cast"
-#   pragma clang diagnostic ignored "-Wunused-function"
-#endif
-#if BOOST_COMP_GNUC
-#   pragma GCC diagnostic push
-#   pragma GCC diagnostic ignored "-Wold-style-cast"
-#endif
-#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
-    #pragma warning(push)
-    #pragma warning(disable: 4100)  // tinymt32.h(60): warning C4100: 'random': unreferenced formal parameter
-#endif
-
-#define TINYMT32_MEXP 127
-#define TINYMT32_SH0 1
-#define TINYMT32_SH1 10
-#define TINYMT32_SH8 8
-#define TINYMT32_MASK UINT32_C(0x7fffffff)
-#define TINYMT32_MUL (1.0f / 16777216.0f)
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/**
- * tinymt32 internal state vector and parameters
- */
-struct TINYMT32_T {
-    uint32_t status[4];
-    uint32_t mat1;
-    uint32_t mat2;
-    uint32_t tmat;
-};
-
-typedef struct TINYMT32_T tinymt32_t;
-
-inline void tinymt32_init(tinymt32_t * random, uint32_t seed);
-inline void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[],
-                            int key_length);
-
-#if defined(__GNUC__)
-/**
- * This function always returns 127
- * @param random not used
- * @return always 127
- */
-inline static int tinymt32_get_mexp(
-    tinymt32_t * random  __attribute__((unused))) {
-    return TINYMT32_MEXP;
-}
-#else
-inline static int tinymt32_get_mexp(tinymt32_t * random) {
-    return TINYMT32_MEXP;
-}
-#endif
-
-/**
- * This function changes internal state of tinymt32.
- * Users should not call this function directly.
- * @param random tinymt internal status
- */
-inline static void tinymt32_next_state(tinymt32_t * random) {
-    uint32_t x;
-    uint32_t y;
-
-    y = random->status[3];
-    x = (random->status[0] & TINYMT32_MASK)
-        ^ random->status[1]
-        ^ random->status[2];
-    x ^= (x << TINYMT32_SH0);
-    y ^= (y >> TINYMT32_SH0) ^ x;
-    random->status[0] = random->status[1];
-    random->status[1] = random->status[2];
-    random->status[2] = x ^ (y << TINYMT32_SH1);
-    random->status[3] = y;
-    int32_t const a = -((int32_t)(y & 1)) & (int32_t)random->mat1;
-    int32_t const b = -((int32_t)(y & 1)) & (int32_t)random->mat2;
-    random->status[1] ^= (uint32_t)a;
-    random->status[2] ^= (uint32_t)b;
-}
-
-/**
- * This function outputs 32-bit unsigned integer from internal state.
- * Users should not call this function directly.
- * @param random tinymt internal status
- * @return 32-bit unsigned pseudorandom number
- */
-inline static uint32_t tinymt32_temper(tinymt32_t * random) {
-    uint32_t t0, t1;
-    t0 = random->status[3];
-#if defined(LINEARITY_CHECK)
-    t1 = random->status[0]
-        ^ (random->status[2] >> TINYMT32_SH8);
-#else
-    t1 = random->status[0]
-        + (random->status[2] >> TINYMT32_SH8);
-#endif
-    t0 ^= t1;
-    if ((t1 & 1) != 0) {
-        t0 ^= random->tmat;
-    }
-    return t0;
-}
-
-/**
- * This function outputs floating point number from internal state.
- * Users should not call this function directly.
- * @param random tinymt internal status
- * @return floating point number r (1.0 <= r < 2.0)
- */
-inline static float tinymt32_temper_conv(tinymt32_t * random) {
-    uint32_t t0, t1;
-    union {
-        uint32_t u;
-        float f;
-    } conv;
-
-    t0 = random->status[3];
-#if defined(LINEARITY_CHECK)
-    t1 = random->status[0]
-        ^ (random->status[2] >> TINYMT32_SH8);
-#else
-    t1 = random->status[0]
-        + (random->status[2] >> TINYMT32_SH8);
-#endif
-    t0 ^= t1;
-    if ((t1 & 1) != 0) {
-        conv.u  = ((t0 ^ random->tmat) >> 9) | UINT32_C(0x3f800000);
-    } else {
-        conv.u  = (t0 >> 9) | UINT32_C(0x3f800000);
-    }
-    return conv.f;
-}
-
-/**
- * This function outputs floating point number from internal state.
- * Users should not call this function directly.
- * @param random tinymt internal status
- * @return floating point number r (1.0 < r < 2.0)
- */
-inline static float tinymt32_temper_conv_open(tinymt32_t * random) {
-    uint32_t t0, t1;
-    union {
-        uint32_t u;
-        float f;
-    } conv;
-
-    t0 = random->status[3];
-#if defined(LINEARITY_CHECK)
-    t1 = random->status[0]
-        ^ (random->status[2] >> TINYMT32_SH8);
-#else
-    t1 = random->status[0]
-        + (random->status[2] >> TINYMT32_SH8);
-#endif
-    t0 ^= t1;
-    if ((t1 & 1) != 0) {
-        conv.u  = ((t0 ^ random->tmat) >> 9) | UINT32_C(0x3f800001);
-    } else {
-        conv.u  = (t0 >> 9) | UINT32_C(0x3f800001);
-    }
-    return conv.f;
-}
-
-/**
- * This function outputs 32-bit unsigned integer from internal state.
- * @param random tinymt internal status
- * @return 32-bit unsigned integer r (0 <= r < 2^32)
- */
-inline static uint32_t tinymt32_generate_uint32(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return tinymt32_temper(random);
-}
-
-/**
- * This function outputs floating point number from internal state.
- * This function is implemented using multiplying by (1 / 2^24).
- * floating point multiplication is faster than using union trick in
- * my Intel CPU.
- * @param random tinymt internal status
- * @return floating point number r (0.0 <= r < 1.0)
- */
-inline static float tinymt32_generate_float(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return (float)(tinymt32_temper(random) >> 8) * TINYMT32_MUL;
-}
-
-/**
- * This function outputs floating point number from internal state.
- * This function is implemented using union trick.
- * @param random tinymt internal status
- * @return floating point number r (1.0 <= r < 2.0)
- */
-inline static float tinymt32_generate_float12(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return tinymt32_temper_conv(random);
-}
-
-/**
- * This function outputs floating point number from internal state.
- * This function is implemented using union trick.
- * @param random tinymt internal status
- * @return floating point number r (0.0 <= r < 1.0)
- */
-inline static float tinymt32_generate_float01(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return tinymt32_temper_conv(random) - 1.0f;
-}
-
-/**
- * This function outputs floating point number from internal state.
- * This function may return 1.0 and never returns 0.0.
- * @param random tinymt internal status
- * @return floating point number r (0.0 < r <= 1.0)
- */
-inline static float tinymt32_generate_floatOC(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return 1.0f - tinymt32_generate_float(random);
-}
-
-/**
- * This function outputs floating point number from internal state.
- * This function returns neither 0.0 nor 1.0.
- * @param random tinymt internal status
- * @return floating point number r (0.0 < r < 1.0)
- */
-inline static float tinymt32_generate_floatOO(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return tinymt32_temper_conv_open(random) - 1.0f;
-}
-
-/**
- * This function outputs double precision floating point number from
- * internal state. The returned value has 32-bit precision.
- * In other words, this function makes one double precision floating point
- * number from one 32-bit unsigned integer.
- * @param random tinymt internal status
- * @return floating point number r (0.0 <= r < 1.0)
- */
-inline static double tinymt32_generate_32double(tinymt32_t * random) {
-    tinymt32_next_state(random);
-    return tinymt32_temper(random) * (1.0 / 4294967296.0);
-}
-
-#if defined(__cplusplus)
-}
-#endif
-
-#define MIN_LOOP 8
-#define PRE_LOOP 8
-
-/**
- * This function represents a function used in the initialization
- * by init_by_array
- * @param x 32-bit integer
- * @return 32-bit integer
- */
-static uint32_t ini_func1(uint32_t x) {
-    return (x ^ (x >> 27)) * UINT32_C(1664525);
-}
-
-/**
- * This function represents a function used in the initialization
- * by init_by_array
- * @param x 32-bit integer
- * @return 32-bit integer
- */
-static uint32_t ini_func2(uint32_t x) {
-    return (x ^ (x >> 27)) * UINT32_C(1566083941);
-}
-
-/**
- * This function certificate the period of 2^127-1.
- * @param random tinymt state vector.
- */
-static void period_certification(tinymt32_t * random) {
-    if ((random->status[0] & TINYMT32_MASK) == 0 &&
-        random->status[1] == 0 &&
-        random->status[2] == 0 &&
-        random->status[3] == 0) {
-        random->status[0] = 'T';
-        random->status[1] = 'I';
-        random->status[2] = 'N';
-        random->status[3] = 'Y';
-    }
-}
-
-/**
- * This function initializes the internal state array with a 32-bit
- * unsigned integer seed.
- * @param random tinymt state vector.
- * @param seed a 32-bit unsigned integer used as a seed.
- */
-void tinymt32_init(tinymt32_t * random, uint32_t seed) {
-    random->status[0] = seed;
-    random->status[1] = random->mat1;
-    random->status[2] = random->mat2;
-    random->status[3] = random->tmat;
-    for (unsigned int i = 1; i < MIN_LOOP; i++) {
-        random->status[i & 3] ^= i + UINT32_C(1812433253)
-            * (random->status[(i - 1) & 3]
-               ^ (random->status[(i - 1) & 3] >> 30));
-    }
-    period_certification(random);
-    for (unsigned int i = 0; i < PRE_LOOP; i++) {
-        tinymt32_next_state(random);
-    }
-}
-
-/**
- * This function initializes the internal state array,
- * with an array of 32-bit unsigned integers used as seeds
- * @param random tinymt state vector.
- * @param init_key the array of 32-bit integers, used as a seed.
- * @param key_length the length of init_key.
- */
-void tinymt32_init_by_array(tinymt32_t * random, uint32_t init_key[],
-                            int key_length) {
-    const unsigned int lag = 1;
-    const unsigned int mid = 1;
-    const unsigned int size = 4;
-    unsigned int i, j;
-    unsigned int count;
-    uint32_t r;
-    uint32_t * st = &random->status[0];
-
-    st[0] = 0;
-    st[1] = random->mat1;
-    st[2] = random->mat2;
-    st[3] = random->tmat;
-    if (key_length + 1 > MIN_LOOP) {
-        count = (unsigned int)key_length + 1;
-    } else {
-        count = MIN_LOOP;
-    }
-    r = ini_func1(st[0] ^ st[mid % size]
-                  ^ st[(size - 1) % size]);
-    st[mid % size] += r;
-    r += (unsigned int)key_length;
-    st[(mid + lag) % size] += r;
-    st[0] = r;
-    count--;
-    for (i = 1, j = 0; (j < count) && (j < (unsigned int)key_length); j++) {
-        r = ini_func1(st[i % size]
-                      ^ st[(i + mid) % size]
-                      ^ st[(i + size - 1) % size]);
-        st[(i + mid) % size] += r;
-        r += init_key[j] + i;
-        st[(i + mid + lag) % size] += r;
-        st[i % size] = r;
-        i = (i + 1) % size;
-    }
-    for (; j < count; j++) {
-        r = ini_func1(st[i % size]
-                      ^ st[(i + mid) % size]
-                      ^ st[(i + size - 1) % size]);
-        st[(i + mid) % size] += r;
-        r += i;
-        st[(i + mid + lag) % size] += r;
-        st[i % size] = r;
-        i = (i + 1) % size;
-    }
-    for (j = 0; j < size; j++) {
-        r = ini_func2(st[i % size]
-                      + st[(i + mid) % size]
-                      + st[(i + size - 1) % size]);
-        st[(i + mid) % size] ^= r;
-        r -= i;
-        st[(i + mid + lag) % size] ^= r;
-        st[i % size] = r;
-        i = (i + 1) % size;
-    }
-    period_certification(random);
-    for (i = 0; i < PRE_LOOP; i++) {
-        tinymt32_next_state(random);
-    }
-}
-
-#undef MIN_LOOP
-#undef PRE_LOOP
-
-#if BOOST_COMP_CLANG
-#   pragma clang diagnostic pop
-#endif
-#if BOOST_COMP_GNUC
-#   pragma GCC diagnostic pop
-#endif
-#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
-#   pragma warning(pop)
-#endif
-
-#endif
diff --git a/include/alpaka/rand/Traits.hpp b/include/alpaka/rand/Traits.hpp
deleted file mode 100644
index 1ccd1ba..0000000
--- a/include/alpaka/rand/Traits.hpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, Bernhard Manfred Gruber, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#include <cstdint>
-#include <type_traits>
-
-namespace alpaka::rand
-{
-    struct ConceptRand
-    {
-    };
-
-    //! The random number generator distribution specifics.
-    namespace distribution
-    {
-        //! The random number generator distribution trait.
-        namespace trait
-        {
-            //! The random number float normal distribution get trait.
-            template<typename TRand, typename T, typename TSfinae = void>
-            struct CreateNormalReal;
-
-            //! The random number float uniform distribution get trait.
-            template<typename TRand, typename T, typename TSfinae = void>
-            struct CreateUniformReal;
-
-            //! The random number integer uniform distribution get trait.
-            template<typename TRand, typename T, typename TSfinae = void>
-            struct CreateUniformUint;
-        } // namespace trait
-
-        //! \return A normal float distribution with mean 0.0f and standard deviation 1.0f.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T, typename TRand>
-        ALPAKA_FN_HOST_ACC auto createNormalReal(TRand const& rand)
-        {
-            static_assert(std::is_floating_point_v<T>, "The value type T has to be a floating point type!");
-
-            using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-            return trait::CreateNormalReal<ImplementationBase, T>::createNormalReal(rand);
-        }
-
-        //! \return A uniform floating point distribution [0.0, 1.0).
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T, typename TRand>
-        ALPAKA_FN_HOST_ACC auto createUniformReal(TRand const& rand)
-        {
-            static_assert(std::is_floating_point_v<T>, "The value type T has to be a floating point type!");
-
-            using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-            return trait::CreateUniformReal<ImplementationBase, T>::createUniformReal(rand);
-        }
-
-        //! \return A uniform integer distribution [0, UINT_MAX].
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename T, typename TRand>
-        ALPAKA_FN_HOST_ACC auto createUniformUint(TRand const& rand)
-        {
-            static_assert(
-                std::is_integral_v<T> && std::is_unsigned_v<T>,
-                "The value type T has to be a unsigned integral type!");
-
-            using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-            return trait::CreateUniformUint<ImplementationBase, T>::createUniformUint(rand);
-        }
-    } // namespace distribution
-
-    //! The random number generator engine specifics.
-    namespace engine
-    {
-        //! The random number generator engine trait.
-        namespace trait
-        {
-            //! The random number default generator engine get trait.
-            template<typename TRand, typename TSfinae = void>
-            struct CreateDefault;
-        } // namespace trait
-
-        //! \return A default random number generator engine. Its type is guaranteed to be trivially copyable.
-        //!         Except HIP accelerator for HIP versions below 5.2 as its internal state was not trivially copyable.
-        //!         The limitation was discussed in PR #1778.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TRand>
-        ALPAKA_FN_HOST_ACC auto createDefault(
-            TRand const& rand,
-            std::uint32_t const& seed = 0,
-            std::uint32_t const& subsequence = 0,
-            std::uint32_t const& offset = 0)
-        {
-            using ImplementationBase = concepts::ImplementationBase<ConceptRand, TRand>;
-            return trait::CreateDefault<ImplementationBase>::createDefault(rand, seed, subsequence, offset);
-        }
-    } // namespace engine
-} // namespace alpaka::rand
diff --git a/include/alpaka/standalone/CpuOmp2Blocks.hpp b/include/alpaka/standalone/CpuOmp2Blocks.hpp
deleted file mode 100644
index 34c69d5..0000000
--- a/include/alpaka/standalone/CpuOmp2Blocks.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-#    define ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-#endif
diff --git a/include/alpaka/standalone/CpuOmp2Threads.hpp b/include/alpaka/standalone/CpuOmp2Threads.hpp
deleted file mode 100644
index b48139a..0000000
--- a/include/alpaka/standalone/CpuOmp2Threads.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-#    define ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
-#endif
diff --git a/include/alpaka/standalone/CpuSerial.hpp b/include/alpaka/standalone/CpuSerial.hpp
deleted file mode 100644
index 338a5c0..0000000
--- a/include/alpaka/standalone/CpuSerial.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-#    define ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
-#endif
diff --git a/include/alpaka/standalone/CpuSycl.hpp b/include/alpaka/standalone/CpuSycl.hpp
deleted file mode 100644
index 7e42735..0000000
--- a/include/alpaka/standalone/CpuSycl.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#include "alpaka/standalone/GenericSycl.hpp"
-
-#ifndef ALPAKA_SYCL_ONEAPI_CPU
-#    define ALPAKA_SYCL_ONEAPI_CPU
-#endif
-
-#ifndef ALPAKA_SYCL_TARGET_CPU
-#    define ALPAKA_SYCL_TARGET_CPU
-#endif
diff --git a/include/alpaka/standalone/CpuTbbBlocks.hpp b/include/alpaka/standalone/CpuTbbBlocks.hpp
deleted file mode 100644
index 87e7548..0000000
--- a/include/alpaka/standalone/CpuTbbBlocks.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-#    define ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
-#endif
diff --git a/include/alpaka/standalone/CpuThreads.hpp b/include/alpaka/standalone/CpuThreads.hpp
deleted file mode 100644
index cd28f09..0000000
--- a/include/alpaka/standalone/CpuThreads.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-#    define ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
-#endif
diff --git a/include/alpaka/standalone/FpgaSyclIntel.hpp b/include/alpaka/standalone/FpgaSyclIntel.hpp
deleted file mode 100644
index 35a44bc..0000000
--- a/include/alpaka/standalone/FpgaSyclIntel.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/* Copyright 2023 Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#include "alpaka/standalone/GenericSycl.hpp"
-
-#ifndef ALPAKA_SYCL_ONEAPI_FPGA
-#    define ALPAKA_SYCL_ONEAPI_FPGA
-#endif
-
-#ifndef ALPAKA_SYCL_TARGET_FPGA
-#    define ALPAKA_SYCL_TARGET_FPGA
-#endif
diff --git a/include/alpaka/standalone/GenericSycl.hpp b/include/alpaka/standalone/GenericSycl.hpp
deleted file mode 100644
index c75e0a1..0000000
--- a/include/alpaka/standalone/GenericSycl.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright 2022 Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_SYCL_ENABLED
-#    define ALPAKA_ACC_SYCL_ENABLED
-#endif
diff --git a/include/alpaka/standalone/GpuCudaRt.hpp b/include/alpaka/standalone/GpuCudaRt.hpp
deleted file mode 100644
index eeaae15..0000000
--- a/include/alpaka/standalone/GpuCudaRt.hpp
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
-#    define ALPAKA_ACC_GPU_CUDA_ENABLED
-#endif
-
-#include "alpaka/core/BoostPredef.hpp"
-
-#if defined(BOOST_COMP_CLANG_CUDA) && (BOOST_COMP_CLANG_CUDA == BOOST_VERSION_NUMBER(14, 0, 0))
-
-#    include <cuda.h>
-
-#    if(CUDART_VERSION == 11030)
-#        error "clang-14 cannot be used as CUDA compiler when using CUDA v11.3. See alpaka GitHub issue 1857."
-#    endif
-
-#endif
diff --git a/include/alpaka/standalone/GpuHipRt.hpp b/include/alpaka/standalone/GpuHipRt.hpp
deleted file mode 100644
index 494d3d4..0000000
--- a/include/alpaka/standalone/GpuHipRt.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Copyright 2019 Benjamin Worpitz
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#ifndef ALPAKA_ACC_GPU_HIP_ENABLED
-#    define ALPAKA_ACC_GPU_HIP_ENABLED
-#endif
diff --git a/include/alpaka/standalone/GpuSyclIntel.hpp b/include/alpaka/standalone/GpuSyclIntel.hpp
deleted file mode 100644
index 8911e39..0000000
--- a/include/alpaka/standalone/GpuSyclIntel.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/* Copyright 2023 Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#include "alpaka/standalone/GenericSycl.hpp"
-
-#ifndef ALPAKA_SYCL_ONEAPI_GPU
-#    define ALPAKA_SYCL_ONEAPI_GPU
-#endif
-
-#ifndef ALPAKA_SYCL_TARGET_GPU
-#    define ALPAKA_SYCL_TARGET_GPU
-#endif
diff --git a/include/alpaka/test/Array.hpp b/include/alpaka/test/Array.hpp
deleted file mode 100644
index 08cc9f0..0000000
--- a/include/alpaka/test/Array.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-#include "alpaka/alpaka.hpp"
-
-#include <cstddef>
-
-namespace alpaka::test
-{
-    template<typename TType, size_t TSize>
-    struct Array
-    {
-        TType m_data[TSize];
-
-        template<typename T_Idx>
-        ALPAKA_FN_HOST_ACC auto operator[](const T_Idx idx) const -> TType const&
-        {
-            return m_data[idx];
-        }
-
-        template<typename TIdx>
-        ALPAKA_FN_HOST_ACC auto operator[](const TIdx idx) -> TType&
-        {
-            return m_data[idx];
-        }
-    };
-} // namespace alpaka::test
diff --git a/include/alpaka/test/Check.hpp b/include/alpaka/test/Check.hpp
deleted file mode 100644
index 39545e7..0000000
--- a/include/alpaka/test/Check.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, Jan Stephan, Luca Ferragina, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Sycl.hpp"
-
-#include <cstdio>
-
-#define ALPAKA_CHECK(success, expression)                                                                             \
-    do                                                                                                                \
-    {                                                                                                                 \
-        if(!(expression))                                                                                             \
-        {                                                                                                             \
-            printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);                                             \
-            success = false;                                                                                          \
-        }                                                                                                             \
-    } while(0)
diff --git a/include/alpaka/test/Extent.hpp b/include/alpaka/test/Extent.hpp
deleted file mode 100644
index 56ccfaf..0000000
--- a/include/alpaka/test/Extent.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/alpaka.hpp"
-
-#include <cstddef>
-
-namespace alpaka::test
-{
-    template<typename TDim, typename TVal>
-    inline constexpr auto extentBuf = []
-    {
-        Vec<TDim, TVal> v;
-        if constexpr(TDim::value > 0)
-            for(TVal i = 0; i < TVal{TDim::value}; i++)
-                v[i] = 11 - i;
-        return v;
-    }();
-
-    template<typename TDim, typename TVal>
-    inline constexpr auto extentSubView = []
-    {
-        Vec<TDim, TVal> v;
-        if constexpr(TDim::value > 0)
-            for(TVal i = 0; i < TVal{TDim::value}; i++)
-                v[i] = 8 - i * 2;
-        return v;
-    }();
-
-    template<typename TDim, typename TVal>
-    inline constexpr auto offset = []
-    {
-        Vec<TDim, TVal> v;
-        if constexpr(TDim::value > 0)
-            for(TVal i = 0; i < TVal{TDim::value}; i++)
-                v[i] = 2 + i;
-        return v;
-    }();
-} // namespace alpaka::test
diff --git a/include/alpaka/test/KernelExecutionFixture.hpp b/include/alpaka/test/KernelExecutionFixture.hpp
deleted file mode 100644
index 0e59344..0000000
--- a/include/alpaka/test/KernelExecutionFixture.hpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/alpaka.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#    error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#endif
-
-#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#    error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#endif
-
-#include "alpaka/test/Check.hpp"
-#include "alpaka/test/queue/Queue.hpp"
-
-#include <utility>
-
-namespace alpaka::test
-{
-    //! The fixture for executing a kernel on a given accelerator.
-    template<typename TAcc>
-    class KernelExecutionFixture
-    {
-    public:
-        using Acc = TAcc;
-        using Dim = alpaka::Dim<Acc>;
-        using Idx = alpaka::Idx<Acc>;
-        using Platform = alpaka::Platform<Acc>;
-        using Device = Dev<Acc>;
-        using Queue = test::DefaultQueue<Device>;
-        using WorkDiv = WorkDivMembers<Dim, Idx>;
-
-        KernelExecutionFixture(WorkDiv workDiv) : m_queue{m_device}, m_workDiv{std::move(workDiv)}
-        {
-        }
-
-        template<typename TExtent>
-        KernelExecutionFixture(TExtent const& extent) : m_queue{m_device}
-                                                      , m_extent{extent}
-        {
-        }
-
-        KernelExecutionFixture(Queue queue, WorkDiv workDiv)
-            : m_platform{} // if the platform is not stateless, this is wrong; we ignore it because it is not be used
-            , m_device{alpaka::getDev(queue)}
-            , m_queue{std::move(queue)}
-            , m_workDiv{std::move(workDiv)}
-        {
-        }
-
-        template<typename TExtent>
-        KernelExecutionFixture(Queue queue, TExtent const& extent)
-            : m_platform{} // if the platform is not stateless, this is wrong; we ignore it because it is not be used
-            , m_device{alpaka::getDev(queue)}
-            , m_queue{std::move(queue)}
-            , m_extent{extent}
-        {
-        }
-
-        template<typename TKernelFnObj, typename... TArgs>
-        auto operator()(TKernelFnObj kernelFnObj, TArgs&&... args) -> bool
-        {
-            // Allocate the result value
-            auto bufAccResult = allocBuf<bool, Idx>(m_device, static_cast<Idx>(1u));
-            memset(m_queue, bufAccResult, static_cast<std::uint8_t>(true));
-
-
-            alpaka::KernelCfg<Acc> const kernelCfg = {m_extent, Vec<Dim, Idx>::ones()};
-
-            // set workdiv if it is not before
-            if(m_workDiv == WorkDiv{Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0)})
-                m_workDiv = alpaka::getValidWorkDiv(
-                    kernelCfg,
-                    m_device,
-                    kernelFnObj,
-                    getPtrNative(bufAccResult),
-                    std::forward<TArgs>(args)...);
-
-            exec<Acc>(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), std::forward<TArgs>(args)...);
-
-            // Copy the result value to the host
-            auto bufHostResult = allocBuf<bool, Idx>(m_devHost, static_cast<Idx>(1u));
-            memcpy(m_queue, bufHostResult, bufAccResult);
-            wait(m_queue);
-
-            auto const result = *getPtrNative(bufHostResult);
-
-            return result;
-        }
-
-    private:
-        PlatformCpu m_platformHost{};
-        DevCpu m_devHost{getDevByIdx(m_platformHost, 0)};
-        Platform m_platform{};
-        Device m_device{getDevByIdx(m_platform, 0)};
-        Queue m_queue;
-        WorkDiv m_workDiv{Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0), Vec<Dim, Idx>::all(0)};
-        Vec<Dim, Idx> m_extent;
-    };
-
-} // namespace alpaka::test
diff --git a/include/alpaka/test/MeasureKernelRunTime.hpp b/include/alpaka/test/MeasureKernelRunTime.hpp
deleted file mode 100644
index 8ef4f45..0000000
--- a/include/alpaka/test/MeasureKernelRunTime.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/alpaka.hpp"
-#include "alpaka/core/DemangleTypeNames.hpp"
-
-#include <type_traits>
-#include <utility>
-
-namespace alpaka::test::integ
-{
-    //! Measures and returns the runtime in ms of the passed callable.
-    //! \param callable An object with operator().
-    template<typename TCallable>
-    auto measureRunTimeMs(TCallable&& callable) -> std::chrono::milliseconds::rep
-    {
-        auto const start = std::chrono::high_resolution_clock::now();
-        std::forward<TCallable>(callable)();
-        auto const end = std::chrono::high_resolution_clock::now();
-        return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-    }
-
-    //! \return The run time of the given kernel.
-    template<typename TQueue, typename TTask>
-    auto measureTaskRunTimeMs(TQueue& queue, TTask&& task) -> std::chrono::milliseconds::rep
-    {
-#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-        std::cout << "measureKernelRunTime("
-                  << " queue: " << core::demangled<TQueue> << " task: " << core::demangled<std::decay_t<TTask>> << ")"
-                  << std::endl;
-#endif
-        // Wait for the queue to finish all tasks enqueued prior to the given task.
-        alpaka::wait(queue);
-
-        return measureRunTimeMs(
-            [&]
-            {
-                alpaka::enqueue(queue, std::forward<TTask>(task));
-
-                // Wait for the queue to finish the task execution to measure its run time.
-                alpaka::wait(queue);
-            });
-    }
-} // namespace alpaka::test::integ
diff --git a/include/alpaka/test/acc/TestAccs.hpp b/include/alpaka/test/acc/TestAccs.hpp
deleted file mode 100644
index 2370fa4..0000000
--- a/include/alpaka/test/acc/TestAccs.hpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, Erik Zenker, Matthias Werner, Andrea Bocci, Bernhard Manfred Gruber, Jan Stephan,
- * Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/alpaka.hpp"
-#include "alpaka/test/dim/TestDims.hpp"
-#include "alpaka/test/idx/TestIdxs.hpp"
-
-#include <iosfwd>
-#include <tuple>
-#include <type_traits>
-
-// When compiling the tests with CUDA enabled (nvcc or native clang) on the CI infrastructure
-// we have to dramatically reduce the number of tested combinations.
-// Else the log length would be exceeded.
-#if defined(ALPAKA_CI)
-#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && BOOST_LANG_CUDA                                                       \
-        || defined(ALPAKA_ACC_GPU_HIP_ENABLED) && BOOST_LANG_HIP
-#        define ALPAKA_CUDA_CI
-#    endif
-#endif
-
-namespace alpaka::test
-{
-    //! The detail namespace is used to separate implementation details from user accessible code.
-    namespace detail
-    {
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-        template<typename TDim, typename TIdx>
-        using AccCpuSerialIfAvailableElseInt = AccCpuSerial<TDim, TIdx>;
-#else
-        template<typename TDim, typename TIdx>
-        using AccCpuSerialIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) && !defined(ALPAKA_CUDA_CI)
-        template<typename TDim, typename TIdx>
-        using AccCpuThreadsIfAvailableElseInt = AccCpuThreads<TDim, TIdx>;
-#else
-        template<typename TDim, typename TIdx>
-        using AccCpuThreadsIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
-        template<typename TDim, typename TIdx>
-        using AccCpuTbbIfAvailableElseInt = AccCpuTbbBlocks<TDim, TIdx>;
-#else
-        template<typename TDim, typename TIdx>
-        using AccCpuTbbIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
-        template<typename TDim, typename TIdx>
-        using AccCpuOmp2BlocksIfAvailableElseInt = AccCpuOmp2Blocks<TDim, TIdx>;
-#else
-        template<typename TDim, typename TIdx>
-        using AccCpuOmp2BlocksIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED) && !defined(ALPAKA_CUDA_CI)
-        template<typename TDim, typename TIdx>
-        using AccCpuOmp2ThreadsIfAvailableElseInt = AccCpuOmp2Threads<TDim, TIdx>;
-#else
-        template<typename TDim, typename TIdx>
-        using AccCpuOmp2ThreadsIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && (BOOST_LANG_CUDA || defined(ALPAKA_HOST_ONLY))
-        template<typename TDim, typename TIdx>
-        using AccGpuCudaRtIfAvailableElseInt = AccGpuCudaRt<TDim, TIdx>;
-#else
-        template<typename TDim, typename TIdx>
-        using AccGpuCudaRtIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && (BOOST_LANG_HIP || defined(ALPAKA_HOST_ONLY))
-        template<typename TDim, typename TIdx>
-        using AccGpuHipRtIfAvailableElseInt =
-            typename std::conditional<std::is_same_v<TDim, DimInt<3u>> == false, AccGpuHipRt<TDim, TIdx>, int>::type;
-#else
-        template<typename TDim, typename TIdx>
-        using AccGpuHipRtIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_CPU)
-        template<typename TDim, typename TIdx>
-        using AccCpuSyclIfAvailableElseInt = AccCpuSycl<TDim, TIdx>;
-#else
-        template<typename TDim, typename TIdx>
-        using AccCpuSyclIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_FPGA)
-        template<typename TDim, typename TIdx>
-        using AccFpgaSyclIntelIfAvailableElseInt = AccFpgaSyclIntel<TDim, TIdx>;
-#else
-        template<typename TDim, typename TIdx>
-        using AccFpgaSyclIntelIfAvailableElseInt = int;
-#endif
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_TARGET_GPU)
-        template<typename TDim, typename TIdx>
-        using AccGpuSyclIntelIfAvailableElseInt = AccGpuSyclIntel<TDim, TIdx>;
-#else
-        template<typename TDim, typename TIdx>
-        using AccGpuSyclIntelIfAvailableElseInt = int;
-#endif
-
-        //! A vector containing all available accelerators and int's.
-        template<typename TDim, typename TIdx>
-        using EnabledAccsElseInt = std::tuple<
-            AccCpuSerialIfAvailableElseInt<TDim, TIdx>,
-            AccCpuThreadsIfAvailableElseInt<TDim, TIdx>,
-            AccCpuTbbIfAvailableElseInt<TDim, TIdx>,
-            AccCpuOmp2BlocksIfAvailableElseInt<TDim, TIdx>,
-            AccCpuOmp2ThreadsIfAvailableElseInt<TDim, TIdx>,
-            AccGpuCudaRtIfAvailableElseInt<TDim, TIdx>,
-            AccGpuHipRtIfAvailableElseInt<TDim, TIdx>,
-            AccCpuSyclIfAvailableElseInt<TDim, TIdx>,
-            AccFpgaSyclIntelIfAvailableElseInt<TDim, TIdx>,
-            AccGpuSyclIntelIfAvailableElseInt<TDim, TIdx>>;
-    } // namespace detail
-
-    //! A vector containing all available accelerators.
-    template<typename TDim, typename TIdx>
-    using EnabledAccs = typename meta::Filter<detail::EnabledAccsElseInt<TDim, TIdx>, std::is_class>;
-
-    namespace detail
-    {
-        //! The accelerator name write wrapper.
-        struct StreamOutAccName
-        {
-            template<typename TAcc>
-            ALPAKA_FN_HOST auto operator()(std::ostream& os) -> void
-            {
-                os << getAccName<TAcc>();
-                os << " ";
-            }
-        };
-    } // namespace detail
-
-    //! Writes the enabled accelerators to the given stream.
-    template<typename TDim, typename TIdx>
-    ALPAKA_FN_HOST auto writeEnabledAccs(std::ostream& os) -> void
-    {
-        os << "Accelerators enabled: ";
-
-        meta::forEachType<EnabledAccs<TDim, TIdx>>(detail::StreamOutAccName(), std::ref(os));
-
-        os << std::endl;
-    }
-
-    namespace detail
-    {
-        //! A std::tuple holding multiple std::tuple consisting of a dimension and a idx type.
-        //!
-        //! TestDimIdxTuples =
-        //!     tuple<
-        //!         tuple<Dim1,Idx1>,
-        //!         tuple<Dim2,Idx1>,
-        //!         tuple<Dim3,Idx1>,
-        //!         ...,
-        //!         tuple<DimN,IdxN>>
-        using TestDimIdxTuples = meta::CartesianProduct<std::tuple, NonZeroTestDims, TestIdxs>;
-
-        template<typename TList>
-        using ApplyEnabledAccs = meta::Apply<TList, EnabledAccs>;
-
-        //! A std::tuple containing std::tuple with fully instantiated accelerators.
-        //!
-        //! TestEnabledAccs =
-        //!     tuple<
-        //!         tuple<Acc1<Dim1,Idx1>, ..., AccN<Dim1,Idx1>>,
-        //!         tuple<Acc1<Dim2,Idx1>, ..., AccN<Dim2,Idx1>>,
-        //!         ...,
-        //!         tuple<Acc1<DimN,IdxN>, ..., AccN<DimN,IdxN>>>
-        using InstantiatedEnabledAccs = meta::Transform<TestDimIdxTuples, ApplyEnabledAccs>;
-    } // namespace detail
-
-    //! A std::tuple containing fully instantiated accelerators.
-    //!
-    //! TestAccs =
-    //!     tuple<
-    //!         Acc1<Dim1,Idx1>, ..., AccN<Dim1,Idx1>,
-    //!         Acc1<Dim2,Idx1>, ..., AccN<Dim2,Idx1>,
-    //!         ...,
-    //!         Acc1<DimN,IdxN>, ..., AccN<DimN,IdxN>>
-    using TestAccs = meta::Apply<detail::InstantiatedEnabledAccs, meta::Concatenate>;
-} // namespace alpaka::test
diff --git a/include/alpaka/test/dim/TestDims.hpp b/include/alpaka/test/dim/TestDims.hpp
deleted file mode 100644
index 395c97e..0000000
--- a/include/alpaka/test/dim/TestDims.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/meta/Filter.hpp"
-#include "alpaka/meta/NonZero.hpp"
-
-#include <tuple>
-
-namespace alpaka::test
-{
-    //! A std::tuple holding dimensions.
-    using TestDims = std::tuple<
-        DimInt<0u>,
-        DimInt<1u>,
-        DimInt<2u>,
-        DimInt<3u>
-    // CUDA, HIP and SYCL accelerators do not support 4D buffers and 4D acceleration.
-#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !defined(ALPAKA_ACC_SYCL_ENABLED)
-        ,
-        DimInt<4u>
-#endif
-        >;
-
-    //! A std::tuple holding non-zero dimensions.
-    //!
-    //! NonZeroTestDims = std::tuple<Dim1, Dim2, ... DimN>
-    using NonZeroTestDims = meta::Filter<TestDims, meta::NonZero>;
-
-} // namespace alpaka::test
diff --git a/include/alpaka/test/event/EventHostManualTrigger.hpp b/include/alpaka/test/event/EventHostManualTrigger.hpp
deleted file mode 100644
index 653dbbb..0000000
--- a/include/alpaka/test/event/EventHostManualTrigger.hpp
+++ /dev/null
@@ -1,779 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, Matthias Werner, Jan Stephan, Jeffrey Kelling, Andrea Bocci,
- *                Bernhard Manfred Gruber, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/alpaka.hpp"
-
-#include <condition_variable>
-#include <mutex>
-#include <utility>
-
-namespace alpaka::test
-{
-    namespace trait
-    {
-        template<typename TDev>
-        struct EventHostManualTriggerType;
-
-        template<typename TDev>
-        struct IsEventHostManualTriggerSupported;
-    } // namespace trait
-
-    //! The event host manual trigger type trait alias template to remove the ::type.
-    template<typename TDev>
-    using EventHostManualTrigger = typename trait::EventHostManualTriggerType<TDev>::type;
-
-    template<typename TDev>
-    ALPAKA_FN_HOST auto isEventHostManualTriggerSupported(TDev const& dev) -> bool
-    {
-        return trait::IsEventHostManualTriggerSupported<TDev>::isSupported(dev);
-    }
-
-    namespace cpu::detail
-    {
-        //! Event that can be enqueued into a queue and can be triggered by the Host.
-        template<class TDev = DevCpu>
-        class EventHostManualTriggerCpuImpl
-        {
-        public:
-            //! Constructor.
-            ALPAKA_FN_HOST EventHostManualTriggerCpuImpl(TDev dev) noexcept
-                : m_dev(std::move(dev))
-                , m_mutex()
-                , m_enqueueCount(0u)
-                , m_bIsReady(true)
-            {
-            }
-
-            EventHostManualTriggerCpuImpl(EventHostManualTriggerCpuImpl const& other) = delete;
-            auto operator=(EventHostManualTriggerCpuImpl const&) -> EventHostManualTriggerCpuImpl& = delete;
-
-            void trigger()
-            {
-                {
-                    std::unique_lock<std::mutex> lock(m_mutex);
-                    m_bIsReady = true;
-                }
-                m_conditionVariable.notify_one();
-                // Give alpaka time to update into the new state, process all events and tasks.
-                std::this_thread::sleep_for(std::chrono::milliseconds(200u));
-            }
-
-        public:
-            TDev const m_dev; //!< The device this event is bound to.
-
-            mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.
-
-            mutable std::condition_variable m_conditionVariable; //!< The condition signaling the event completion.
-            std::size_t m_enqueueCount; //!< The number of times this event has been enqueued.
-
-            bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
-                             //!< completed).
-        };
-    } // namespace cpu::detail
-
-    //! Event that can be enqueued into a queue and can be triggered by the Host.
-    template<class TDev = DevCpu>
-    class EventHostManualTriggerCpu
-    {
-    public:
-        //! Constructor.
-        ALPAKA_FN_HOST EventHostManualTriggerCpu(TDev const& dev)
-            : m_spEventImpl(std::make_shared<cpu::detail::EventHostManualTriggerCpuImpl<TDev>>(dev))
-        {
-        }
-
-        //! Equality comparison operator.
-        ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCpu const& rhs) const -> bool
-        {
-            return (m_spEventImpl == rhs.m_spEventImpl);
-        }
-
-        //! Inequality comparison operator.
-        ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCpu const& rhs) const -> bool
-        {
-            return !((*this) == rhs);
-        }
-
-        void trigger()
-        {
-            m_spEventImpl->trigger();
-            // Give alpaka time to update into the new state, process all events and tasks.
-            std::this_thread::sleep_for(std::chrono::milliseconds(200u));
-        }
-
-    public:
-        std::shared_ptr<cpu::detail::EventHostManualTriggerCpuImpl<TDev>> m_spEventImpl;
-    };
-
-    namespace trait
-    {
-        template<>
-        struct EventHostManualTriggerType<DevCpu>
-        {
-            using type = test::EventHostManualTriggerCpu<DevCpu>;
-        };
-
-        //! The CPU event host manual trigger support get trait specialization.
-        template<>
-        struct IsEventHostManualTriggerSupported<DevCpu>
-        {
-            ALPAKA_FN_HOST static auto isSupported(DevCpu const&) -> bool
-            {
-                return true;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka::test
-
-namespace alpaka::trait
-{
-    //! The CPU device event device get trait specialization.
-    template<typename TDev>
-    struct GetDev<test::EventHostManualTriggerCpu<TDev>>
-    {
-        //
-        ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerCpu<TDev> const& event) -> TDev
-        {
-            return event.m_spEventImpl->m_dev;
-        }
-    };
-
-    //! The CPU device event test trait specialization.
-    template<typename TDev>
-    struct IsComplete<test::EventHostManualTriggerCpu<TDev>>
-    {
-        //! \return If the event is not waiting within a queue (not enqueued or already handled).
-        ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerCpu<TDev> const& event) -> bool
-        {
-            std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-            return event.m_spEventImpl->m_bIsReady;
-        }
-    };
-
-    template<typename TDev>
-    struct Enqueue<QueueGenericThreadsNonBlocking<TDev>, test::EventHostManualTriggerCpu<TDev>>
-    {
-        //
-        ALPAKA_FN_HOST static auto enqueue(
-            QueueGenericThreadsNonBlocking<TDev>& queue,
-            test::EventHostManualTriggerCpu<TDev>& event) -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-            auto spEventImpl = event.m_spEventImpl;
-
-            // Setting the event state and enqueuing it has to be atomic.
-            std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-            // The event should not yet be enqueued.
-            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-            // Set its state to enqueued.
-            spEventImpl->m_bIsReady = false;
-
-            // Increment the enqueue counter. This is used to skip waits for events that had already been finished
-            // and re-enqueued which would lead to deadlocks.
-            ++spEventImpl->m_enqueueCount;
-
-            auto const enqueueCount = spEventImpl->m_enqueueCount;
-
-            // Enqueue a task that only resets the events flag if it is completed.
-            queue.m_spQueueImpl->m_workerThread.submit(
-                [spEventImpl, enqueueCount]() mutable
-                {
-                    std::unique_lock<std::mutex> lk2(spEventImpl->m_mutex);
-                    spEventImpl->m_conditionVariable.wait(
-                        lk2,
-                        [spEventImpl, enqueueCount]
-                        { return (enqueueCount != spEventImpl->m_enqueueCount) || spEventImpl->m_bIsReady; });
-                });
-        }
-    };
-
-    template<typename TDev>
-    struct Enqueue<QueueGenericThreadsBlocking<TDev>, test::EventHostManualTriggerCpu<TDev>>
-    {
-        //
-        ALPAKA_FN_HOST static auto enqueue(
-            QueueGenericThreadsBlocking<TDev>&,
-            test::EventHostManualTriggerCpu<TDev>& event) -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-            auto spEventImpl = event.m_spEventImpl;
-
-            // Setting the event state and enqueuing it has to be atomic.
-            std::unique_lock<std::mutex> lk(spEventImpl->m_mutex);
-
-            // The event should not yet be enqueued.
-            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-            // Set its state to enqueued.
-            spEventImpl->m_bIsReady = false;
-
-            // Increment the enqueue counter. This is used to skip waits for events that had already been finished
-            // and re-enqueued which would lead to deadlocks.
-            ++spEventImpl->m_enqueueCount;
-
-            auto const enqueueCount = spEventImpl->m_enqueueCount;
-
-            spEventImpl->m_conditionVariable.wait(
-                lk,
-                [spEventImpl, enqueueCount]
-                { return (enqueueCount != spEventImpl->m_enqueueCount) || spEventImpl->m_bIsReady; });
-        }
-    };
-} // namespace alpaka::trait
-
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-
-#    include "alpaka/core/BoostPredef.hpp"
-
-#    include <cuda.h>
-
-#    if !BOOST_LANG_CUDA && !defined(ALPAKA_HOST_ONLY)
-#        error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#    endif
-
-#    include "alpaka/core/Cuda.hpp"
-
-namespace alpaka::test
-{
-    namespace uniform_cuda_hip::detail
-    {
-        class EventHostManualTriggerCudaImpl final
-        {
-            using TApi = alpaka::ApiCudaRt;
-
-        public:
-            ALPAKA_FN_HOST EventHostManualTriggerCudaImpl(DevCudaRt const& dev)
-                : m_dev(dev)
-                , m_mutex()
-                , m_bIsReady(true)
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaSetDevice(m_dev.getNativeHandle()));
-                // Allocate the buffer on this device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaMalloc(&m_devMem, static_cast<size_t>(sizeof(int32_t))));
-                // Initiate the memory set.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    cudaMemset(m_devMem, static_cast<int>(0u), static_cast<size_t>(sizeof(int32_t))));
-            }
-
-            EventHostManualTriggerCudaImpl(EventHostManualTriggerCudaImpl const&) = delete;
-            auto operator=(EventHostManualTriggerCudaImpl const&) -> EventHostManualTriggerCudaImpl& = delete;
-
-            ALPAKA_FN_HOST ~EventHostManualTriggerCudaImpl()
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Free the buffer.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(cudaFree(m_devMem));
-            }
-
-            void trigger()
-            {
-                std::unique_lock<std::mutex> lock(m_mutex);
-                m_bIsReady = true;
-
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaSetDevice(m_dev.getNativeHandle()));
-                // Initiate the memory set.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    cudaMemset(m_devMem, static_cast<int>(1u), static_cast<size_t>(sizeof(int32_t))));
-                // Give alpaka time to update into the new state, process all events and tasks.
-                std::this_thread::sleep_for(std::chrono::milliseconds(200u));
-            }
-
-        public:
-            DevCudaRt const m_dev; //!< The device this event is bound to.
-
-            mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.
-            void* m_devMem;
-
-            bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
-                             //!< completed).
-        };
-    } // namespace uniform_cuda_hip::detail
-
-    class EventHostManualTriggerCuda final
-    {
-    public:
-        ALPAKA_FN_HOST EventHostManualTriggerCuda(DevCudaRt const& dev)
-            : m_spEventImpl(std::make_shared<uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl>(dev))
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-        }
-
-        ALPAKA_FN_HOST auto operator==(EventHostManualTriggerCuda const& rhs) const -> bool
-        {
-            return (m_spEventImpl == rhs.m_spEventImpl);
-        }
-
-        ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerCuda const& rhs) const -> bool
-        {
-            return !((*this) == rhs);
-        }
-
-        void trigger()
-        {
-            m_spEventImpl->trigger();
-            // Give alpaka time to update into the new state, process all events and tasks.
-            std::this_thread::sleep_for(std::chrono::milliseconds(200u));
-        }
-
-    public:
-        std::shared_ptr<uniform_cuda_hip::detail::EventHostManualTriggerCudaImpl> m_spEventImpl;
-    };
-
-    namespace trait
-    {
-        template<>
-        struct EventHostManualTriggerType<DevCudaRt>
-        {
-            using type = test::EventHostManualTriggerCuda;
-        };
-
-        //! The CPU event host manual trigger support get trait specialization.
-        template<>
-        struct IsEventHostManualTriggerSupported<DevCudaRt>
-        {
-            ALPAKA_FN_HOST static auto isSupported([[maybe_unused]] DevCudaRt const& dev) -> bool
-            {
-#    if CUDA_VERSION < 11070
-                int result = 0;
-                cuDeviceGetAttribute(&result, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev.getNativeHandle());
-                return result != 0;
-#    else
-                return true; // Always enabled as of CUDA 11.7
-#    endif
-            }
-        };
-    } // namespace trait
-} // namespace alpaka::test
-
-namespace alpaka::trait
-{
-    namespace detail
-    {
-        // TODO: Replace with cuStreamWaitValue32 once support for CUDA < 12 is dropped.
-        inline auto streamWaitValue(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags)
-            -> CUresult
-        {
-            // NVIDIA introduced a new stream memory ops API with CUDA 11.7 (called v2). The corresponding CUDA
-            // functions were suffixed with `_v2`. With CUDA 12.0 v1 of the API was removed and the `_v2` removed
-            // from the new functions. So CUDA <= 11.6 and CUDA >= 12.0 share the same function signature but
-            // internally do different things.
-#    if(CUDA_VERSION < 11070) || (CUDA_VERSION >= 12000)
-            return cuStreamWaitValue32(stream, addr, value, flags);
-#    else
-            return cuStreamWaitValue32_v2(stream, addr, value, flags);
-#    endif
-        }
-    } // namespace detail
-
-    //! The CPU device event device get trait specialization.
-    template<>
-    struct GetDev<test::EventHostManualTriggerCuda>
-    {
-        ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerCuda const& event) -> DevCudaRt
-        {
-            return event.m_spEventImpl->m_dev;
-        }
-    };
-
-    //! The CPU device event test trait specialization.
-    template<>
-    struct IsComplete<test::EventHostManualTriggerCuda>
-    {
-        //! \return If the event is not waiting within a queue (not enqueued or already handled).
-        ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerCuda const& event) -> bool
-        {
-            std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-            return event.m_spEventImpl->m_bIsReady;
-        }
-    };
-
-    template<>
-    struct Enqueue<QueueCudaRtNonBlocking, test::EventHostManualTriggerCuda>
-    {
-        ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, test::EventHostManualTriggerCuda& event)
-            -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-            auto spEventImpl(event.m_spEventImpl);
-
-            // Setting the event state and enqueuing it has to be atomic.
-            std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-            // The event should not yet be enqueued.
-            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-            // Set its state to enqueued.
-            spEventImpl->m_bIsReady = false;
-
-            // PGI Profiler`s User Guide:
-            // The following are known issues related to Events and Metrics:
-            // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-            //   on host updates may hang. This includes synchronization between the host and
-            //   the device build upon value-based CUDA queue synchronization APIs such as
-            //   cuStreamWaitValue32() and cuStreamWriteValue32().
-            ALPAKA_CUDA_DRV_CHECK(detail::streamWaitValue(
-                static_cast<CUstream>(queue.getNativeHandle()),
-                reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
-                0x0101'0101u,
-                CU_STREAM_WAIT_VALUE_GEQ));
-        }
-    };
-
-    template<>
-    struct Enqueue<QueueCudaRtBlocking, test::EventHostManualTriggerCuda>
-    {
-        ALPAKA_FN_HOST static auto enqueue(QueueCudaRtBlocking& queue, test::EventHostManualTriggerCuda& event) -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-            auto spEventImpl(event.m_spEventImpl);
-
-            // Setting the event state and enqueuing it has to be atomic.
-            std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-            // The event should not yet be enqueued.
-            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-            // Set its state to enqueued.
-            spEventImpl->m_bIsReady = false;
-
-            // PGI Profiler`s User Guide:
-            // The following are known issues related to Events and Metrics:
-            // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-            //   on host updates may hang. This includes synchronization between the host and
-            //   the device build upon value-based CUDA queue synchronization APIs such as
-            //   cuStreamWaitValue32() and cuStreamWriteValue32().
-            ALPAKA_CUDA_DRV_CHECK(detail::streamWaitValue(
-                static_cast<CUstream>(queue.getNativeHandle()),
-                reinterpret_cast<CUdeviceptr>(event.m_spEventImpl->m_devMem),
-                0x0101'0101u,
-                CU_STREAM_WAIT_VALUE_GEQ));
-        }
-    };
-} // namespace alpaka::trait
-#endif
-
-
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-
-#    include <hip/hip_runtime.h>
-
-#    if !BOOST_LANG_HIP && !defined(ALPAKA_HOST_ONLY)
-#        error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#    endif
-
-#    include "alpaka/core/Hip.hpp"
-
-namespace alpaka::test
-{
-    namespace hip::detail
-    {
-        class EventHostManualTriggerHipImpl final
-        {
-            using TApi = alpaka::ApiHipRt;
-
-        public:
-            ALPAKA_FN_HOST EventHostManualTriggerHipImpl(DevHipRt const& dev) : m_dev(dev), m_mutex(), m_bIsReady(true)
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipSetDevice(m_dev.getNativeHandle()));
-                // Allocate the buffer on this device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipMalloc(&m_devMem, static_cast<size_t>(sizeof(int32_t))));
-                // Initiate the memory set.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    hipMemset(m_devMem, static_cast<int>(0u), static_cast<size_t>(sizeof(int32_t))));
-            }
-
-            EventHostManualTriggerHipImpl(EventHostManualTriggerHipImpl const&) = delete;
-            auto operator=(EventHostManualTriggerHipImpl const&) -> EventHostManualTriggerHipImpl& = delete;
-
-            ALPAKA_FN_HOST ~EventHostManualTriggerHipImpl()
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                // Free the buffer.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(hipFree(m_devMem));
-            }
-
-            void trigger()
-            {
-                std::unique_lock<std::mutex> lock(m_mutex);
-                m_bIsReady = true;
-
-                // Set the current device.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipSetDevice(m_dev.getNativeHandle()));
-                // Initiate the memory set.
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    hipMemset(m_devMem, static_cast<int>(1u), static_cast<size_t>(sizeof(int32_t))));
-                // Give alpaka time to update into the new state, process all events and tasks.
-                std::this_thread::sleep_for(std::chrono::milliseconds(200u));
-            }
-
-        public:
-            DevHipRt const m_dev; //!< The device this event is bound to.
-
-            mutable std::mutex m_mutex; //!< The mutex used to synchronize access to the event.
-            void* m_devMem;
-
-            bool m_bIsReady; //!< If the event is not waiting within a queue (not enqueued or already
-                             //!< completed).
-        };
-    } // namespace hip::detail
-
-    class EventHostManualTriggerHip final
-    {
-    public:
-        ALPAKA_FN_HOST EventHostManualTriggerHip(DevHipRt const& dev)
-            : m_spEventImpl(std::make_shared<hip::detail::EventHostManualTriggerHipImpl>(dev))
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-        }
-
-        ALPAKA_FN_HOST auto operator==(EventHostManualTriggerHip const& rhs) const -> bool
-        {
-            return (m_spEventImpl == rhs.m_spEventImpl);
-        }
-
-        ALPAKA_FN_HOST auto operator!=(EventHostManualTriggerHip const& rhs) const -> bool
-        {
-            return !((*this) == rhs);
-        }
-
-        void trigger()
-        {
-            m_spEventImpl->trigger();
-            // Give alpaka time to update into the new state, process all events and tasks.
-            std::this_thread::sleep_for(std::chrono::milliseconds(200u));
-        }
-
-    public:
-        std::shared_ptr<hip::detail::EventHostManualTriggerHipImpl> m_spEventImpl;
-    };
-
-    namespace trait
-    {
-        template<>
-        struct EventHostManualTriggerType<DevHipRt>
-        {
-            using type = test::EventHostManualTriggerHip;
-        };
-
-        //! The HIP event host manual trigger support get trait specialization.
-        template<>
-        struct IsEventHostManualTriggerSupported<DevHipRt>
-        {
-            // TODO: there is no CUDA_VERSION in the HIP compiler path.
-            // TODO: there is a hipDeviceGetAttribute, but there is no pendant for
-            // CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
-            ALPAKA_FN_HOST static auto isSupported(DevHipRt const&) -> bool
-            {
-                return false;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka::test
-
-namespace alpaka::trait
-{
-    //! The CPU device event device get trait specialization.
-    template<>
-    struct GetDev<test::EventHostManualTriggerHip>
-    {
-        ALPAKA_FN_HOST static auto getDev(test::EventHostManualTriggerHip const& event) -> DevHipRt
-        {
-            return event.m_spEventImpl->m_dev;
-        }
-    };
-
-    //! The CPU device event test trait specialization.
-    template<>
-    struct IsComplete<test::EventHostManualTriggerHip>
-    {
-        //! \return If the event is not waiting within a queue (not enqueued or already handled).
-        ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerHip const& event) -> bool
-        {
-            std::lock_guard<std::mutex> lk(event.m_spEventImpl->m_mutex);
-
-            return event.m_spEventImpl->m_bIsReady;
-        }
-    };
-
-    template<>
-    struct Enqueue<QueueHipRtNonBlocking, test::EventHostManualTriggerHip>
-    {
-        using TApi = alpaka::ApiHipRt;
-
-        ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, test::EventHostManualTriggerHip& event)
-            -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-            auto spEventImpl(event.m_spEventImpl);
-
-            // Setting the event state and enqueuing it has to be atomic.
-            std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-            // The event should not yet be enqueued.
-            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-            // Set its state to enqueued.
-            spEventImpl->m_bIsReady = false;
-
-            // PGI Profiler`s User Guide:
-            // The following are known issues related to Events and Metrics:
-            // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-            //   on host updates may hang. This includes synchronization between the host and
-            //   the device build upon value-based CUDA queue synchronization APIs such as
-            //   cuStreamWaitValue32() and cuStreamWriteValue32().
-            int32_t hostMem = 0;
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-            std::cerr << "[Workaround] polling of device-located value in stream, as hipStreamWaitValue32 is not "
-                         "available.\n";
-#    endif
-            while(hostMem < 0x0101'0101)
-            {
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipMemcpyDtoHAsync(
-                    &hostMem,
-                    reinterpret_cast<hipDeviceptr_t>(event.m_spEventImpl->m_devMem),
-                    sizeof(int32_t),
-                    queue.getNativeHandle()));
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipStreamSynchronize(queue.getNativeHandle()));
-            }
-        }
-    };
-
-    template<>
-    struct Enqueue<QueueHipRtBlocking, test::EventHostManualTriggerHip>
-    {
-        using TApi = alpaka::ApiHipRt;
-
-        ALPAKA_FN_HOST static auto enqueue(QueueHipRtBlocking& /* queue */, test::EventHostManualTriggerHip& event)
-            -> void
-        {
-            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-            // Copy the shared pointer to ensure that the event implementation is alive as long as it is enqueued.
-            auto spEventImpl(event.m_spEventImpl);
-
-            // Setting the event state and enqueuing it has to be atomic.
-            std::lock_guard<std::mutex> lk(spEventImpl->m_mutex);
-
-            // The event should not yet be enqueued.
-            ALPAKA_ASSERT(spEventImpl->m_bIsReady);
-
-            // Set its state to enqueued.
-            spEventImpl->m_bIsReady = false;
-
-            // PGI Profiler`s User Guide:
-            // The following are known issues related to Events and Metrics:
-            // * In event or metric profiling, kernel launches are blocking. Thus kernels waiting
-            //   on host updates may hang. This includes synchronization between the host and
-            //   the device build upon value-based HIP queue synchronization APIs such as
-            //   cuStreamWaitValue32() and cuStreamWriteValue32().
-
-            // workaround for missing cuStreamWaitValue32 in HIP
-            std::uint32_t hmem = 0;
-            do
-            {
-                std::this_thread::sleep_for(std::chrono::milliseconds(10u));
-                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
-                    hipMemcpy(&hmem, event.m_spEventImpl->m_devMem, sizeof(std::uint32_t), hipMemcpyDefault));
-            } while(hmem < 0x0101'0101u);
-        }
-    };
-} // namespace alpaka::trait
-#endif
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-namespace alpaka
-{
-    namespace test
-    {
-        template<typename TTag>
-        class EventHostManualTriggerSycl
-        {
-        public:
-            EventHostManualTriggerSycl(DevGenericSycl<TTag> const&)
-            {
-            }
-
-            auto trigger()
-            {
-            }
-        };
-
-        namespace trait
-        {
-            template<typename TTag>
-            struct EventHostManualTriggerType<DevGenericSycl<TTag>>
-            {
-                using type = alpaka::test::EventHostManualTriggerSycl<TTag>;
-            };
-
-            template<typename TTag>
-            struct IsEventHostManualTriggerSupported<DevGenericSycl<TTag>>
-            {
-                ALPAKA_FN_HOST static auto isSupported(DevGenericSycl<TTag> const&) -> bool
-                {
-                    return false;
-                }
-            };
-        } // namespace trait
-    } // namespace test
-
-    namespace trait
-    {
-        template<typename TTag>
-        struct Enqueue<QueueGenericSyclBlocking<TTag>, test::EventHostManualTriggerSycl<TTag>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueGenericSyclBlocking<TTag>& /* queue */,
-                test::EventHostManualTriggerSycl<TTag>& /* event */) -> void
-            {
-            }
-        };
-
-        template<typename TTag>
-        struct Enqueue<QueueGenericSyclNonBlocking<TTag>, test::EventHostManualTriggerSycl<TTag>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueGenericSyclNonBlocking<TTag>& /* queue */,
-                test::EventHostManualTriggerSycl<TTag>& /* event */) -> void
-            {
-            }
-        };
-
-        template<typename TTag>
-        struct IsComplete<test::EventHostManualTriggerSycl<TTag>>
-        {
-            ALPAKA_FN_HOST static auto isComplete(test::EventHostManualTriggerSycl<TTag> const& /* event */) -> bool
-            {
-                return true;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-#endif
diff --git a/include/alpaka/test/idx/TestIdxs.hpp b/include/alpaka/test/idx/TestIdxs.hpp
deleted file mode 100644
index 19bf5a9..0000000
--- a/include/alpaka/test/idx/TestIdxs.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Matthias Werner, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <cstdint>
-#include <tuple>
-
-namespace alpaka::test
-{
-    //! A std::tuple holding idx types.
-    using TestIdxs = std::tuple<
-    // size_t is most probably identical to either std::uint64_t or std::uint32_t.
-    // This would lead to duplicate tests (especially test names) which is not allowed.
-    // std::size_t,
-#if !defined(ALPAKA_CI)
-        std::int64_t,
-#endif
-        std::uint64_t,
-        std::int32_t
-#if !defined(ALPAKA_CI)
-        ,
-        std::uint32_t
-#endif
-        // index type must be >=32bit
-        >;
-} // namespace alpaka::test
diff --git a/include/alpaka/test/mem/view/Iterator.hpp b/include/alpaka/test/mem/view/Iterator.hpp
deleted file mode 100644
index 314d1c0..0000000
--- a/include/alpaka/test/mem/view/Iterator.hpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Bernhard Manfred Gruber, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/alpaka.hpp"
-
-#include <type_traits>
-
-namespace alpaka::test
-{
-    namespace trait
-    {
-        // \tparam T Type to conditionally make const.
-        // \tparam TSource Type to mimic the constness of.
-        template<typename T, typename TSource>
-        using MimicConst = std::conditional_t<std::is_const_v<TSource>, std::add_const_t<T>, std::remove_const_t<T>>;
-
-        template<typename TView, typename TSfinae = void>
-        class IteratorView
-        {
-            using TViewDecayed = std::decay_t<TView>;
-            using Dim = alpaka::Dim<TViewDecayed>;
-            using Idx = alpaka::Idx<TViewDecayed>;
-            using Elem = MimicConst<alpaka::Elem<TViewDecayed>, TView>;
-
-        public:
-            ALPAKA_FN_HOST IteratorView(TView& view, Idx const idx)
-                : m_nativePtr(getPtrNative(view))
-                , m_currentIdx(idx)
-                , m_extents(getExtents(view))
-                , m_pitchBytes(getPitchesInBytes(view))
-            {
-            }
-
-            ALPAKA_FN_HOST explicit IteratorView(TView& view) : IteratorView(view, 0)
-            {
-            }
-
-            ALPAKA_FN_HOST_ACC auto operator++() -> IteratorView&
-            {
-                ++m_currentIdx;
-                return *this;
-            }
-
-            ALPAKA_FN_HOST_ACC auto operator--() -> IteratorView&
-            {
-                --m_currentIdx;
-                return *this;
-            }
-
-            ALPAKA_FN_HOST_ACC auto operator++(int) -> IteratorView
-            {
-                IteratorView iterCopy = *this;
-                m_currentIdx++;
-                return iterCopy;
-            }
-
-            ALPAKA_FN_HOST_ACC auto operator--(int) -> IteratorView
-            {
-                IteratorView iterCopy = *this;
-                m_currentIdx--;
-                return iterCopy;
-            }
-
-            template<typename TIter>
-            ALPAKA_FN_HOST_ACC auto operator==(TIter& other) const -> bool
-            {
-                return m_currentIdx == other.m_currentIdx;
-            }
-
-            template<typename TIter>
-            ALPAKA_FN_HOST_ACC auto operator!=(TIter& other) const -> bool
-            {
-                return m_currentIdx != other.m_currentIdx;
-            }
-
-            ALPAKA_FN_HOST_ACC auto operator*() const -> Elem&
-            {
-                if constexpr(Dim::value == 0)
-                    return *m_nativePtr;
-                else
-                {
-                    Vec<Dim, Idx> const currentIdxDimx
-                        = mapIdx<Dim::value>(Vec<DimInt<1>, Idx>{m_currentIdx}, m_extents);
-                    auto const offsetInBytes = (currentIdxDimx * m_pitchBytes).sum();
-                    using QualifiedByte = MimicConst<std::byte, Elem>;
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-                    // "cast from 'Byte*' to 'Elem*' increases required alignment of target type"
-#    pragma GCC diagnostic ignored "-Wcast-align"
-#endif
-                    return *reinterpret_cast<Elem*>(reinterpret_cast<QualifiedByte*>(m_nativePtr) + offsetInBytes);
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-                }
-                ALPAKA_UNREACHABLE(*m_nativePtr);
-            }
-
-        private:
-            Elem* m_nativePtr;
-            Idx m_currentIdx;
-            Vec<Dim, Idx> m_extents;
-            Vec<Dim, Idx> m_pitchBytes;
-        };
-
-        template<typename TView, typename TSfinae = void>
-        struct Begin
-        {
-            ALPAKA_FN_HOST static auto begin(TView& view) -> IteratorView<TView>
-            {
-                return IteratorView<TView>(view);
-            }
-        };
-
-        template<typename TView, typename TSfinae = void>
-        struct End
-        {
-            ALPAKA_FN_HOST static auto end(TView& view) -> IteratorView<TView>
-            {
-                auto extents = getExtents(view);
-                return IteratorView<TView>(view, extents.prod());
-            }
-        };
-    } // namespace trait
-
-    template<typename TView>
-    using Iterator = trait::IteratorView<TView>;
-
-    template<typename TView>
-    ALPAKA_FN_HOST auto begin(TView& view) -> Iterator<TView>
-    {
-        return trait::Begin<TView>::begin(view);
-    }
-
-    template<typename TView>
-    ALPAKA_FN_HOST auto end(TView& view) -> Iterator<TView>
-    {
-        return trait::End<TView>::end(view);
-    }
-} // namespace alpaka::test
diff --git a/include/alpaka/test/mem/view/ViewTest.hpp b/include/alpaka/test/mem/view/ViewTest.hpp
deleted file mode 100644
index eef3b5a..0000000
--- a/include/alpaka/test/mem/view/ViewTest.hpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, Sergei Bastrakov, René Widera, Bernhard Manfred Gruber, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/alpaka.hpp"
-#include "alpaka/test/KernelExecutionFixture.hpp"
-#include "alpaka/test/mem/view/Iterator.hpp"
-
-#include <catch2/catch_test_macros.hpp>
-
-#include <numeric>
-#include <type_traits>
-
-//! The test specifics.
-namespace alpaka::test
-{
-    template<typename TElem, typename TDim, typename TIdx, typename TDev, typename TView>
-    ALPAKA_FN_HOST auto testViewImmutable(
-        TView const& view,
-        TDev const& dev,
-        Vec<TDim, TIdx> const& extent,
-        Vec<TDim, TIdx> const& offset) -> void
-    {
-        // trait::DevType
-        {
-            static_assert(
-                std::is_same_v<Dev<TView>, TDev>,
-                "The device type of the view has to be equal to the specified one.");
-        }
-
-        // trait::GetDev
-        {
-            REQUIRE(dev == getDev(view));
-        }
-
-        // trait::DimType
-        {
-            static_assert(
-                Dim<TView>::value == TDim::value,
-                "The dimensionality of the view has to be equal to the specified one.");
-        }
-
-        // trait::ElemType
-        {
-            static_assert(
-                std::is_same_v<Elem<TView>, TElem>,
-                "The element type of the view has to be equal to the specified one.");
-        }
-
-        // trait::GetExtents
-        {
-            REQUIRE(extent == getExtents(view));
-        }
-
-        // trait::GetPitchBytes
-        {
-            auto const pitchMinimum = alpaka::detail::calculatePitchesFromExtents<TElem>(extent);
-            auto const pitchView = getPitchesInBytes(view);
-
-            for(TIdx i = TDim::value; i > static_cast<TIdx>(0u); --i)
-            {
-                REQUIRE(pitchView[i - 1] >= pitchMinimum[i - 1]);
-            }
-        }
-
-        // trait::GetPtrNative
-        {
-            // The view is a const& so the pointer has to point to a const value.
-            using NativePtr = decltype(getPtrNative(view));
-            static_assert(std::is_pointer_v<NativePtr>, "The value returned by getPtrNative has to be a pointer.");
-            static_assert(
-                std::is_const_v<std::remove_pointer_t<NativePtr>>,
-                "The value returned by getPtrNative has to be const when the view is const.");
-
-            if(getExtentProduct(view) != static_cast<TIdx>(0u))
-            {
-                // The pointer is only required to be non-null when the extent is > 0.
-                TElem const* const invalidPtr(nullptr);
-                REQUIRE(invalidPtr != getPtrNative(view));
-            }
-            else
-            {
-                // When the extent is 0, the pointer is undefined but it should still be possible get it.
-                getPtrNative(view);
-            }
-        }
-
-        // trait::GetOffsets
-        {
-            REQUIRE(offset == getOffsets(view));
-        }
-
-        // trait::IdxType
-        {
-            static_assert(
-                std::is_same_v<Idx<TView>, TIdx>,
-                "The idx type of the view has to be equal to the specified one.");
-        }
-    }
-
-    //! Compares element-wise that all bytes are set to the same value.
-    struct VerifyBytesSetKernel
-    {
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TAcc, typename TIter>
-        ALPAKA_FN_ACC void operator()(
-            TAcc const& acc [[maybe_unused]], // used by SYCL back-end
-            bool* success,
-            TIter const& begin,
-            TIter const& end,
-            std::uint8_t const& byte) const
-        {
-            constexpr auto elemSizeInByte = static_cast<unsigned>(sizeof(decltype(*begin)));
-            for(auto it = begin; it != end; ++it)
-            {
-                auto const& elem = *it;
-                auto const pBytes = reinterpret_cast<std::uint8_t const*>(&elem);
-                for(unsigned i = 0; i < elemSizeInByte; ++i)
-                {
-                    if(pBytes[i] != byte)
-                    {
-                        printf("Byte at offset %u is different: %u != %u\n", i, unsigned{pBytes[i]}, unsigned{byte});
-                        *success = false;
-                    }
-                }
-            }
-        }
-    };
-
-    template<typename TAcc, typename TView>
-    ALPAKA_FN_HOST auto verifyBytesSet(TView const& view, std::uint8_t const& byte) -> void
-    {
-        using Dim = Dim<TView>;
-        using Idx = Idx<TView>;
-
-        KernelExecutionFixture<TAcc> fixture(Vec<Dim, Idx>::ones());
-
-        VerifyBytesSetKernel verifyBytesSet;
-
-        REQUIRE(fixture(verifyBytesSet, test::begin(view), test::end(view), byte));
-    }
-
-    //! Compares iterators element-wise
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wfloat-equal" // "comparing floating point with == or != is unsafe"
-#endif
-    struct VerifyViewsEqualKernel
-    {
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TAcc, typename TIterA, typename TIterB>
-        ALPAKA_FN_ACC void operator()(
-            TAcc const& acc [[maybe_unused]], // used by SYCL back-end
-            bool* success,
-            TIterA beginA,
-            TIterA const& endA,
-            TIterB beginB) const
-        {
-            for(; beginA != endA; ++beginA, ++beginB)
-            {
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wfloat-equal" // "comparing floating point with == or != is unsafe"
-#endif
-                ALPAKA_CHECK(*success, *beginA == *beginB);
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
-            }
-        }
-    };
-#if BOOST_COMP_GNUC
-#    pragma GCC diagnostic pop
-#endif
-
-    template<typename TAcc, typename TViewB, typename TViewA>
-    ALPAKA_FN_HOST auto verifyViewsEqual(TViewA const& viewA, TViewB const& viewB) -> void
-    {
-        using DimA = Dim<TViewA>;
-        using DimB = Dim<TViewB>;
-        static_assert(DimA::value == DimB::value, "viewA and viewB are required to have identical Dim");
-        using IdxA = Idx<TViewA>;
-        using IdxB = Idx<TViewB>;
-        static_assert(std::is_same_v<IdxA, IdxB>, "viewA and viewB are required to have identical Idx");
-
-        test::KernelExecutionFixture<TAcc> fixture(Vec<DimA, IdxA>::ones());
-
-        VerifyViewsEqualKernel verifyViewsEqualKernel;
-
-        REQUIRE(fixture(verifyViewsEqualKernel, test::begin(viewA), test::end(viewA), test::begin(viewB)));
-    }
-
-    //! Fills the given view with increasing values starting at 0.
-    template<typename TView, typename TQueue>
-    ALPAKA_FN_HOST auto iotaFillView(TQueue& queue, TView& view) -> void
-    {
-        using Elem = Elem<TView>;
-
-        auto const platformHost = alpaka::PlatformCpu{};
-        auto const devHost = alpaka::getDevByIdx(platformHost, 0);
-
-        auto const extent = getExtents(view);
-
-        // Init buf with increasing values
-        std::vector<Elem> v(static_cast<std::size_t>(extent.prod()), static_cast<Elem>(0));
-        std::iota(std::begin(v), std::end(v), static_cast<Elem>(0));
-        auto plainBuf = createView(devHost, v, extent);
-
-        // Copy the generated content into the given view.
-        memcpy(queue, view, plainBuf);
-
-        wait(queue);
-    }
-
-    template<typename TAcc, typename TView, typename TQueue>
-    ALPAKA_FN_HOST auto testViewMutable(TQueue& queue, TView& view) -> void
-    {
-        // trait::GetPtrNative
-        {
-            // The view is a non-const so the pointer has to point to a non-const value.
-            using NativePtr = decltype(getPtrNative(view));
-            static_assert(std::is_pointer_v<NativePtr>, "The value returned by getPtrNative has to be a pointer.");
-            static_assert(
-                !std::is_const_v<std::remove_pointer_t<NativePtr>>,
-                "The value returned by getPtrNative has to be non-const when the view is non-const.");
-        }
-
-        // set
-        {
-            auto const byte(static_cast<uint8_t>(42u));
-            memset(queue, view, byte);
-            wait(queue);
-            verifyBytesSet<TAcc>(view, byte);
-        }
-
-        // copy
-        {
-            using Elem = Elem<TView>;
-            using Idx = Idx<TView>;
-
-            auto const devAcc = getDev(view);
-            auto const extent = getExtents(view);
-
-            // copy into given view
-            {
-                auto srcBufAcc = allocBuf<Elem, Idx>(devAcc, extent);
-                iotaFillView(queue, srcBufAcc);
-                memcpy(queue, view, srcBufAcc);
-                wait(queue);
-                verifyViewsEqual<TAcc>(view, srcBufAcc);
-            }
-
-            // copy from given view
-            {
-                auto dstBufAcc = allocBuf<Elem, Idx>(devAcc, extent);
-                memcpy(queue, dstBufAcc, view);
-                wait(queue);
-                verifyViewsEqual<TAcc>(dstBufAcc, view);
-            }
-        }
-    }
-} // namespace alpaka::test
diff --git a/include/alpaka/test/queue/Queue.hpp b/include/alpaka/test/queue/Queue.hpp
deleted file mode 100644
index 0518e6d..0000000
--- a/include/alpaka/test/queue/Queue.hpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright 2024 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci,
- * Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/alpaka.hpp"
-
-namespace alpaka::test
-{
-    namespace trait
-    {
-        //! The default queue type trait for devices.
-        template<typename TDev, typename TSfinae = void>
-        struct DefaultQueueType;
-
-        //! The default queue type trait specialization for the CPU device.
-        template<>
-        struct DefaultQueueType<DevCpu>
-        {
-#if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            using type = QueueCpuBlocking;
-#else
-            using type = QueueCpuNonBlocking;
-#endif
-        };
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-        //! The default queue type trait specialization for the CUDA/HIP device.
-        template<typename TApi>
-        struct DefaultQueueType<DevUniformCudaHipRt<TApi>>
-        {
-#    if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            using type = QueueUniformCudaHipRtBlocking<TApi>;
-#    else
-            using type = QueueUniformCudaHipRtNonBlocking<TApi>;
-#    endif
-        };
-#endif
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-        //! The default queue type trait specialization for the SYCL device.
-        template<typename TTag>
-        struct DefaultQueueType<DevGenericSycl<TTag>>
-        {
-#    if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            using type = QueueGenericSyclBlocking<TTag>;
-#    else
-            using type = QueueGenericSyclNonBlocking<TTag>;
-#    endif
-        };
-#endif
-
-        //! The blocking queue trait.
-        template<typename TQueue, typename TSfinae = void>
-        struct IsBlockingQueue;
-
-        //! The blocking queue trait specialization for a blocking CPU queue.
-        template<typename TDev>
-        struct IsBlockingQueue<QueueGenericThreadsBlocking<TDev>>
-        {
-            static constexpr bool value = true;
-        };
-
-        //! The blocking queue trait specialization for a non-blocking CPU queue.
-        template<typename TDev>
-        struct IsBlockingQueue<QueueGenericThreadsNonBlocking<TDev>>
-        {
-            static constexpr bool value = false;
-        };
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-        //! The blocking queue trait specialization for a blocking CUDA/HIP RT queue.
-        template<typename TApi>
-        struct IsBlockingQueue<QueueUniformCudaHipRtBlocking<TApi>>
-        {
-            static constexpr bool value = true;
-        };
-
-        //! The blocking queue trait specialization for a non-blocking CUDA/HIP RT queue.
-        template<typename TApi>
-        struct IsBlockingQueue<QueueUniformCudaHipRtNonBlocking<TApi>>
-        {
-            static constexpr bool value = false;
-        };
-#endif
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-        template<typename TTag>
-        struct IsBlockingQueue<QueueGenericSyclBlocking<TTag>>
-        {
-            static constexpr auto value = true;
-        };
-
-        template<typename TTag>
-        struct IsBlockingQueue<QueueGenericSyclNonBlocking<TTag>>
-        {
-            static constexpr auto value = false;
-        };
-#endif
-    } // namespace trait
-
-    //! The queue type that should be used for the given device.
-    template<typename TDev>
-    using DefaultQueue = typename trait::DefaultQueueType<TDev>::type;
-
-    //! The queue type that should be used for the given accelerator.
-    template<typename TQueue>
-    using IsBlockingQueue = trait::IsBlockingQueue<TQueue>;
-
-    //! A std::tuple holding tuples of devices and corresponding queue types.
-    using TestQueues = std::tuple<
-        std::tuple<DevCpu, QueueCpuBlocking>,
-        std::tuple<DevCpu, QueueCpuNonBlocking>
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-        ,
-        std::tuple<DevCudaRt, QueueCudaRtBlocking>,
-        std::tuple<DevCudaRt, QueueCudaRtNonBlocking>
-#endif
-#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-        ,
-        std::tuple<DevHipRt, QueueHipRtBlocking>,
-        std::tuple<DevHipRt, QueueHipRtNonBlocking>
-#endif
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-#    ifdef ALPAKA_SYCL_ONEAPI_CPU
-        ,
-        std::tuple<alpaka::DevCpuSycl, alpaka::QueueCpuSyclBlocking>,
-        std::tuple<alpaka::DevCpuSycl, alpaka::QueueCpuSyclNonBlocking>
-#    endif
-#    ifdef ALPAKA_SYCL_ONEAPI_FPGA
-        ,
-        std::tuple<alpaka::DevFpgaSyclIntel, alpaka::QueueFpgaSyclIntelBlocking>,
-        std::tuple<alpaka::DevFpgaSyclIntel, alpaka::QueueFpgaSyclIntelNonBlocking>
-#    endif
-#    ifdef ALPAKA_SYCL_ONEAPI_GPU
-        ,
-        std::tuple<alpaka::DevGpuSyclIntel, alpaka::QueueGpuSyclIntelBlocking>,
-        std::tuple<alpaka::DevGpuSyclIntel, alpaka::QueueGpuSyclIntelNonBlocking>
-#    endif
-#endif
-        >;
-} // namespace alpaka::test
diff --git a/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp b/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
deleted file mode 100644
index 4b346c8..0000000
--- a/include/alpaka/test/queue/QueueCpuOmp2Collective.hpp
+++ /dev/null
@@ -1,297 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/event/EventCpu.hpp"
-#include "alpaka/event/Traits.hpp"
-#include "alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp"
-#include "alpaka/queue/QueueCpuBlocking.hpp"
-#include "alpaka/queue/Traits.hpp"
-#include "alpaka/queue/cpu/ICpuQueue.hpp"
-#include "alpaka/test/event/EventHostManualTrigger.hpp"
-#include "alpaka/test/queue/Queue.hpp"
-#include "alpaka/wait/Traits.hpp"
-
-#include <atomic>
-#include <mutex>
-
-#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
-
-#    if _OPENMP < 200203
-#        error If ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED is set, the compiler has to support OpenMP 2.0 or higher!
-#    endif
-
-#    include <omp.h>
-
-namespace alpaka
-{
-    namespace cpu::detail
-    {
-#    if BOOST_COMP_CLANG
-// avoid diagnostic warning: "has no out-of-line virtual method definitions; its vtable will be emitted in every
-// translation unit [-Werror,-Wweak-vtables]" https://stackoverflow.com/a/29288300
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wweak-vtables"
-#    endif
-        //! The CPU collective device queue implementation.
-        struct QueueCpuOmp2CollectiveImpl final : cpu::ICpuQueue
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
-        {
-            explicit QueueCpuOmp2CollectiveImpl(DevCpu const& dev) noexcept : m_dev(dev), blockingQueue(dev)
-            {
-            }
-
-            void enqueue(EventCpu& ev) final
-            {
-                alpaka::enqueue(*this, ev);
-            }
-
-            void wait(EventCpu const& ev) final
-            {
-                alpaka::wait(*this, ev);
-            }
-
-            void busyWaitUntilBlockingQueueEmpty()
-            {
-                while(!empty(blockingQueue))
-                    ;
-            }
-
-            DevCpu const m_dev; //!< The device this queue is bound to.
-            std::mutex mutable m_mutex;
-            QueueCpuBlocking blockingQueue;
-            std::atomic<uint32_t> m_uCurrentlyExecutingTask = 0;
-        };
-    } // namespace cpu::detail
-
-    //! The CPU collective device queue.
-    //
-    // @attention Queue can only be used together with the accelerator AccCpuOmp2Blocks.
-    //
-    // This queue is an example for a user provided queue and the behavior is strongly coupled
-    // to the user workflows.
-    //
-    // Within an OpenMP parallel region kernel will be performed collectively.
-    // All other operations will be performed from one thread (it is not defined which thread) and there will be no
-    // implicit synchronization between other operations within the parallel OpenMP parallel region. Operations
-    // executed within a OpenMP parallel region will be executed after already queued tasks before the parallel region
-    // was created.
-    //
-    // Outside of an OpenMP parallel region the queue behaves like QueueCpuBlocking.
-    struct QueueCpuOmp2Collective final : concepts::Implements<ConceptCurrentThreadWaitFor, QueueCpuOmp2Collective>
-    {
-        explicit QueueCpuOmp2Collective(DevCpu const& dev)
-            : m_spQueueImpl(std::make_shared<cpu::detail::QueueCpuOmp2CollectiveImpl>(dev))
-        {
-            dev.registerQueue(m_spQueueImpl);
-        }
-
-        auto operator==(QueueCpuOmp2Collective const& rhs) const -> bool
-        {
-            return m_spQueueImpl == rhs.m_spQueueImpl;
-        }
-
-        auto operator!=(QueueCpuOmp2Collective const& rhs) const -> bool
-        {
-            return !((*this) == rhs);
-        }
-
-        std::shared_ptr<cpu::detail::QueueCpuOmp2CollectiveImpl> m_spQueueImpl;
-    };
-
-    namespace trait
-    {
-        //! The CPU blocking device queue device type trait specialization.
-        template<>
-        struct DevType<QueueCpuOmp2Collective>
-        {
-            using type = DevCpu;
-        };
-
-        //! The CPU blocking device queue device get trait specialization.
-        template<>
-        struct GetDev<QueueCpuOmp2Collective>
-        {
-            ALPAKA_FN_HOST static auto getDev(QueueCpuOmp2Collective const& queue) -> DevCpu
-            {
-                return queue.m_spQueueImpl->m_dev;
-            }
-        };
-
-        //! The CPU blocking device queue event type trait specialization.
-        template<>
-        struct EventType<QueueCpuOmp2Collective>
-        {
-            using type = EventCpu;
-        };
-
-        //! The CPU blocking device queue enqueue trait specialization.
-        //! This default implementation for all tasks directly invokes the function call operator of the task.
-        template<typename TTask>
-        struct Enqueue<QueueCpuOmp2Collective, TTask>
-        {
-            ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, TTask const& task) -> void
-            {
-                if(::omp_in_parallel() != 0)
-                {
-                    // wait for all tasks enqueued before the parallel region
-                    queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
-                    ++queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
-#    pragma omp single nowait
-                    task();
-                    --queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
-                }
-                else
-                {
-                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                    alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, task);
-                }
-            }
-        };
-
-        //! The CPU blocking device queue test trait specialization.
-        template<>
-        struct Empty<QueueCpuOmp2Collective>
-        {
-            ALPAKA_FN_HOST static auto empty(QueueCpuOmp2Collective const& queue) -> bool
-            {
-                return queue.m_spQueueImpl->m_uCurrentlyExecutingTask == 0u
-                       && alpaka::empty(queue.m_spQueueImpl->blockingQueue);
-            }
-        };
-
-        //! The CPU OpenMP2 collective device queue enqueue trait specialization.
-        template<>
-        struct Enqueue<cpu::detail::QueueCpuOmp2CollectiveImpl, EventCpu>
-        {
-            ALPAKA_FN_HOST static auto enqueue(cpu::detail::QueueCpuOmp2CollectiveImpl&, EventCpu&) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-#    pragma omp barrier
-            }
-        };
-
-        //! The CPU OpenMP2 collective device queue enqueue trait specialization.
-        template<>
-        struct Enqueue<QueueCpuOmp2Collective, EventCpu>
-        {
-            ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective& queue, EventCpu& event) -> void
-            {
-                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-
-                if(::omp_in_parallel() != 0)
-                {
-                    // wait for all tasks en-queued before the parallel region
-                    queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
-#    pragma omp barrier
-                }
-                else
-                {
-                    alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, event);
-                }
-            }
-        };
-
-        //! The CPU blocking device queue enqueue trait specialization.
-        //! This default implementation for all tasks directly invokes the function call operator of the task.
-        template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-        struct Enqueue<QueueCpuOmp2Collective, TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(
-                QueueCpuOmp2Collective& queue,
-                TaskKernelCpuOmp2Blocks<TDim, TIdx, TKernelFnObj, TArgs...> const& task) -> void
-            {
-                if(::omp_in_parallel() != 0)
-                {
-                    queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
-                    ++queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
-                    // execute task within an OpenMP parallel region
-                    task();
-                    --queue.m_spQueueImpl->m_uCurrentlyExecutingTask;
-                }
-                else
-                {
-                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                    alpaka::enqueue(queue.m_spQueueImpl->blockingQueue, task);
-                }
-            }
-        };
-
-        template<>
-        struct Enqueue<QueueCpuOmp2Collective, test::EventHostManualTriggerCpu<>>
-        {
-            ALPAKA_FN_HOST static auto enqueue(QueueCpuOmp2Collective&, test::EventHostManualTriggerCpu<>&) -> void
-            {
-                // EventHostManualTriggerCpu are not supported for together with the queue
-                // QueueCpuOmp2Collective but a specialization is needed to path the EventTests
-            }
-        };
-
-        //! The CPU blocking device queue thread wait trait specialization.
-        //!
-        //! Blocks execution of the calling thread until the queue has finished processing all previously requested
-        //! tasks (kernels, data copies, ...)
-        template<>
-        struct CurrentThreadWaitFor<QueueCpuOmp2Collective>
-        {
-            ALPAKA_FN_HOST static auto currentThreadWaitFor(QueueCpuOmp2Collective const& queue) -> void
-            {
-                if(::omp_in_parallel() != 0)
-                {
-                    // wait for all tasks en-queued before the parallel region
-                    queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
-#    pragma omp barrier
-                }
-                else
-                {
-                    std::lock_guard<std::mutex> lk(queue.m_spQueueImpl->m_mutex);
-                    wait(queue.m_spQueueImpl->blockingQueue);
-                }
-            }
-        };
-
-        //! The CPU OpenMP2 collective device queue event wait trait specialization.
-        template<>
-        struct WaiterWaitFor<cpu::detail::QueueCpuOmp2CollectiveImpl, EventCpu>
-        {
-            ALPAKA_FN_HOST static auto waiterWaitFor(cpu::detail::QueueCpuOmp2CollectiveImpl&, EventCpu const&) -> void
-            {
-#    pragma omp barrier
-            }
-        };
-
-        //! The CPU OpenMP2 collective queue event wait trait specialization.
-        template<>
-        struct WaiterWaitFor<QueueCpuOmp2Collective, EventCpu>
-        {
-            ALPAKA_FN_HOST static auto waiterWaitFor(QueueCpuOmp2Collective& queue, EventCpu const& event) -> void
-            {
-                if(::omp_in_parallel() != 0)
-                {
-                    // wait for all tasks en-queued before the parallel region
-                    queue.m_spQueueImpl->busyWaitUntilBlockingQueueEmpty();
-                    wait(queue);
-                }
-                else
-                    wait(queue.m_spQueueImpl->blockingQueue, event);
-            }
-        };
-    } // namespace trait
-
-    //! The blocking queue trait specialization for a OpenMP2 collective CPU queue.
-    template<>
-    struct test::trait::IsBlockingQueue<QueueCpuOmp2Collective> : std::true_type
-    {
-    };
-} // namespace alpaka
-
-#    include "alpaka/event/EventCpu.hpp"
-
-#endif
diff --git a/include/alpaka/test/queue/QueueTestFixture.hpp b/include/alpaka/test/queue/QueueTestFixture.hpp
deleted file mode 100644
index ad6f815..0000000
--- a/include/alpaka/test/queue/QueueTestFixture.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright 2023 Benjamin Worpitz, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-#include "alpaka/alpaka.hpp"
-
-#include <tuple>
-
-namespace alpaka::test
-{
-    template<typename TDevQueue>
-    struct QueueTestFixture
-    {
-        using Dev = std::tuple_element_t<0, TDevQueue>;
-        using Queue = std::tuple_element_t<1, TDevQueue>;
-        using Platform = alpaka::Platform<Dev>;
-
-        Platform m_platform{};
-        Dev m_dev{getDevByIdx(m_platform, 0)};
-        Queue m_queue{m_dev};
-    };
-} // namespace alpaka::test
diff --git a/include/alpaka/traits/Traits.hpp b/include/alpaka/traits/Traits.hpp
deleted file mode 100644
index 987a48a..0000000
--- a/include/alpaka/traits/Traits.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2022 Antonio Di Pilato
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-
-namespace alpaka
-{
-    //! The common trait.
-    namespace trait
-    {
-        //! The native handle trait.
-        template<typename TImpl, typename TSfinae = void>
-        struct NativeHandle
-        {
-            static auto getNativeHandle(TImpl const&)
-            {
-                static_assert(!sizeof(TImpl), "This type does not have a native handle!");
-                return 0;
-            }
-        };
-    } // namespace trait
-
-    //! Get the native handle of the alpaka object.
-    //! It will return the alpaka object handle if there is any, otherwise it generates a compile time error.
-    template<typename TImpl>
-    ALPAKA_FN_HOST auto getNativeHandle(TImpl const& impl)
-    {
-        return trait::NativeHandle<TImpl>::getNativeHandle(impl);
-    }
-
-    //! Alias to the type of the native handle.
-    template<typename TImpl>
-    using NativeHandle = decltype(getNativeHandle(std::declval<TImpl>()));
-} // namespace alpaka
diff --git a/include/alpaka/vec/Traits.hpp b/include/alpaka/vec/Traits.hpp
deleted file mode 100644
index 531fe04..0000000
--- a/include/alpaka/vec/Traits.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/meta/IntegerSequence.hpp"
-
-#include <utility>
-
-namespace alpaka
-{
-    //! The vec traits.
-    namespace trait
-    {
-        //! Trait for selecting a sub-vector.
-        template<typename TVec, typename TIndexSequence, typename TSfinae = void>
-        struct SubVecFromIndices;
-
-        //! Trait for casting a vector.
-        template<typename TVal, typename TVec, typename TSfinae = void>
-        struct CastVec;
-
-        //! Trait for reversing a vector.
-        template<typename TVec, typename TSfinae = void>
-        struct ReverseVec;
-
-        //! Trait for concatenating two vectors.
-        template<typename TVecL, typename TVecR, typename TSfinae = void>
-        struct ConcatVec;
-    } // namespace trait
-
-    //! Builds a new vector by selecting the elements of the source vector in the given order.
-    //! Repeating and swizzling elements is allowed.
-    //! \return The sub-vector consisting of the elements specified by the indices.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TIndexSequence, typename TVec>
-    ALPAKA_FN_HOST_ACC constexpr auto subVecFromIndices(TVec const& vec)
-    {
-        return trait::SubVecFromIndices<TVec, TIndexSequence>::subVecFromIndices(vec);
-    }
-
-    //! \tparam TVec has to specialize SubVecFromIndices.
-    //! \return The sub-vector consisting of the first N elements of the source vector.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TSubDim, typename TVec>
-    ALPAKA_FN_HOST_ACC constexpr auto subVecBegin(TVec const& vec)
-    {
-        static_assert(
-            TSubDim::value <= Dim<TVec>::value,
-            "The sub-Vec has to be smaller (or same size) then the original Vec.");
-
-        //! A sequence of integers from 0 to dim-1.
-        using IdxSubSequence = std::make_integer_sequence<std::size_t, TSubDim::value>;
-        return subVecFromIndices<IdxSubSequence>(vec);
-    }
-
-    //! \tparam TVec has to specialize SubVecFromIndices.
-    //! \return The sub-vector consisting of the last N elements of the source vector.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TSubDim, typename TVec>
-    ALPAKA_FN_HOST_ACC constexpr auto subVecEnd(TVec const& vec)
-    {
-        static_assert(
-            TSubDim::value <= Dim<TVec>::value,
-            "The sub-Vec has to be smaller (or same size) then the original Vec.");
-
-        constexpr std::size_t idxOffset = Dim<TVec>::value - TSubDim::value;
-
-        //! A sequence of integers from 0 to dim-1.
-        using IdxSubSequence = meta::MakeIntegerSequenceOffset<std::size_t, idxOffset, TSubDim::value>;
-        return subVecFromIndices<IdxSubSequence>(vec);
-    }
-
-    //! \return The casted vector.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TVal, typename TVec>
-    ALPAKA_FN_HOST_ACC constexpr auto castVec(TVec const& vec)
-    {
-        return trait::CastVec<TVal, TVec>::castVec(vec);
-    }
-
-    //! \return The reverseVec vector.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TVec>
-    ALPAKA_FN_HOST_ACC constexpr auto reverseVec(TVec const& vec)
-    {
-        return trait::ReverseVec<TVec>::reverseVec(vec);
-    }
-
-    //! \return The concatenated vector.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TVecL, typename TVecR>
-    ALPAKA_FN_HOST_ACC constexpr auto concatVec(TVecL const& vecL, TVecR const& vecR)
-    {
-        return trait::ConcatVec<TVecL, TVecR>::concatVec(vecL, vecR);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/vec/Vec.hpp b/include/alpaka/vec/Vec.hpp
deleted file mode 100644
index d327f60..0000000
--- a/include/alpaka/vec/Vec.hpp
+++ /dev/null
@@ -1,799 +0,0 @@
-/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Andrea Bocci, Jan Stephan,
- *                Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Align.hpp"
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Unreachable.hpp"
-#include "alpaka/dim/DimIntegralConst.hpp"
-#include "alpaka/dim/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/meta/Fold.hpp"
-#include "alpaka/meta/Functional.hpp"
-#include "alpaka/meta/IntegerSequence.hpp"
-#include "alpaka/vec/Traits.hpp"
-
-#include <algorithm>
-#include <cstdint>
-#include <functional>
-#include <limits>
-#include <ostream>
-#include <tuple>
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    template<typename TDim, typename TVal>
-    class Vec;
-
-    //! A n-dimensional vector.
-    template<typename TDim, typename TVal>
-    class Vec final
-    {
-    public:
-        static_assert(TDim::value >= 0u, "Invalid dimensionality");
-
-        using Dim = TDim;
-        using Val = TVal;
-        using value_type = Val; //!< STL-like value_type.
-
-    private:
-        //! A sequence of integers from 0 to dim-1.
-        //! This can be used to write compile time indexing algorithms.
-        using IdxSequence = std::make_integer_sequence<std::size_t, TDim::value>;
-
-    public:
-        ALPAKA_FN_HOST_ACC constexpr Vec() : m_data{}
-        {
-        }
-
-        //! Value constructor.
-        //! This constructor is only available if the number of parameters matches the vector idx.
-        ALPAKA_NO_HOST_ACC_WARNING
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(11, 3, 0)                                              \
-    && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 4, 0)
-        // This constructor tries to avoid SFINAE, which crashes nvcc 11.3. We also need to have a first
-        // argument, so an unconstrained ctor with forwarding references does not hijack the compiler provided
-        // copy-ctor.
-        template<typename... TArgs>
-        ALPAKA_FN_HOST_ACC constexpr Vec(TVal arg0, TArgs&&... args)
-            : m_data{std::move(arg0), static_cast<TVal>(std::forward<TArgs>(args))...}
-        {
-            static_assert(
-                1 + sizeof...(TArgs) == TDim::value && (std::is_convertible_v<std::decay_t<TArgs>, TVal> && ...),
-                "Wrong number of arguments to Vec constructor or types are not convertible to TVal.");
-        }
-#else
-        template<
-            typename... TArgs,
-            typename = std::enable_if_t<
-                sizeof...(TArgs) == TDim::value && (std::is_convertible_v<std::decay_t<TArgs>, TVal> && ...)>>
-        ALPAKA_FN_HOST_ACC constexpr Vec(TArgs&&... args) : m_data{static_cast<TVal>(std::forward<TArgs>(args))...}
-        {
-        }
-#endif
-
-        //! Generator constructor.
-        //! Initializes the vector with the values returned from generator(IC) in order, where IC::value runs from 0 to
-        //! TDim - 1 (inclusive).
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC >= BOOST_VERSION_NUMBER(11, 3, 0)                                              \
-    && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 4, 0)
-        template<typename F>
-        ALPAKA_FN_HOST_ACC constexpr explicit Vec(
-            F&& generator,
-            std::void_t<decltype(generator(std::integral_constant<std::size_t, 0>{}))>* ignore = nullptr)
-            : Vec(std::forward<F>(generator), std::make_index_sequence<TDim::value>{})
-        {
-            static_cast<void>(ignore);
-        }
-#else
-        template<typename F, std::enable_if_t<std::is_invocable_v<F, std::integral_constant<std::size_t, 0>>, int> = 0>
-        ALPAKA_FN_HOST_ACC constexpr explicit Vec(F&& generator)
-            : Vec(std::forward<F>(generator), std::make_index_sequence<TDim::value>{})
-        {
-        }
-#endif
-
-    private:
-        template<typename F, std::size_t... Is>
-        ALPAKA_FN_HOST_ACC constexpr explicit Vec(F&& generator, std::index_sequence<Is...>)
-            : m_data{generator(std::integral_constant<std::size_t, Is>{})...}
-        {
-        }
-
-    public:
-        //! \brief Single value constructor.
-        //!
-        //! Creates a vector with all values set to val.
-        //! \param val The initial value.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC static constexpr auto all(TVal const& val) -> Vec<TDim, TVal>
-        {
-            Vec<TDim, TVal> v;
-            for(auto& e : v)
-                e = val;
-            return v;
-        }
-
-        //! Zero value constructor.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC static constexpr auto zeros() -> Vec<TDim, TVal>
-        {
-            return all(static_cast<TVal>(0));
-        }
-
-        //! One value constructor.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC static constexpr auto ones() -> Vec<TDim, TVal>
-        {
-            return all(static_cast<TVal>(1));
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr auto begin() -> TVal*
-        {
-            return m_data;
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr auto begin() const -> TVal const*
-        {
-            return m_data;
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr auto end() -> TVal*
-        {
-            return m_data + TDim::value;
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr auto end() const -> TVal const*
-        {
-            return m_data + TDim::value;
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr auto front() -> TVal&
-        {
-            return m_data[0];
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr auto front() const -> TVal const&
-        {
-            return m_data[0];
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr auto back() -> TVal&
-        {
-            return m_data[Dim::value - 1];
-        }
-
-        ALPAKA_FN_HOST_ACC constexpr auto back() const -> TVal const&
-        {
-            return m_data[Dim::value - 1];
-        }
-
-        //! access elements by name
-        //!
-        //! names: x,y,z,w
-        //! @{
-        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 1, int> = 0>
-        ALPAKA_FN_HOST_ACC constexpr decltype(auto) x() const
-        {
-            return m_data[Dim::value - 1];
-        }
-
-        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 1, int> = 0>
-        ALPAKA_FN_HOST_ACC constexpr decltype(auto) x()
-        {
-            return m_data[Dim::value - 1];
-        }
-
-        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 2, int> = 0>
-        ALPAKA_FN_HOST_ACC constexpr decltype(auto) y() const
-        {
-            return m_data[Dim::value - 2];
-        }
-
-        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 2, int> = 0>
-        ALPAKA_FN_HOST_ACC constexpr decltype(auto) y()
-        {
-            return m_data[Dim::value - 2];
-        }
-
-        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 3, int> = 0>
-        ALPAKA_FN_HOST_ACC constexpr decltype(auto) z() const
-        {
-            return m_data[Dim::value - 3];
-        }
-
-        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 3, int> = 0>
-        ALPAKA_FN_HOST_ACC constexpr decltype(auto) z()
-        {
-            return m_data[Dim::value - 3];
-        }
-
-        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 4, int> = 0>
-        ALPAKA_FN_HOST_ACC constexpr decltype(auto) w() const
-        {
-            return m_data[Dim::value - 4];
-        }
-
-        template<typename TDefer = Dim, std::enable_if_t<std::is_same_v<TDefer, Dim> && Dim::value >= 4, int> = 0>
-        ALPAKA_FN_HOST_ACC constexpr decltype(auto) w()
-        {
-            return m_data[Dim::value - 4];
-        }
-
-        //! @}
-
-        //! Value reference accessor at the given non-unsigned integer index.
-        //! \return A reference to the value at the given index.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TIdx, typename = std::enable_if_t<std::is_integral_v<TIdx>>>
-        ALPAKA_FN_HOST_ACC constexpr auto operator[](TIdx const iIdx) -> TVal&
-        {
-            core::assertValueUnsigned(iIdx);
-            auto const idx = static_cast<typename TDim::value_type>(iIdx);
-            core::assertGreaterThan<TDim>(idx);
-            return m_data[idx];
-        }
-
-        //! Value accessor at the given non-unsigned integer index.
-        //! \return The value at the given index.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TIdx, typename = std::enable_if_t<std::is_integral_v<TIdx>>>
-        ALPAKA_FN_HOST_ACC constexpr auto operator[](TIdx const iIdx) const -> TVal
-        {
-            core::assertValueUnsigned(iIdx);
-            auto const idx = static_cast<typename TDim::value_type>(iIdx);
-            core::assertGreaterThan<TDim>(idx);
-            return m_data[idx];
-        }
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TFnObj, std::size_t... TIndices>
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto foldrByIndices(
-            TFnObj const& f,
-            std::integer_sequence<std::size_t, TIndices...>) const
-        {
-            return meta::foldr(f, (*this)[TIndices]...);
-        }
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TFnObj, std::size_t... TIndices>
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto foldrByIndices(
-            TFnObj const& f,
-            std::integer_sequence<std::size_t, TIndices...>,
-            TVal initial) const
-        {
-            return meta::foldr(f, (*this)[TIndices]..., initial);
-        }
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TFnObj>
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto foldrAll(TFnObj const& f) const
-        {
-            return foldrByIndices(f, IdxSequence());
-        }
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TFnObj>
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto foldrAll(TFnObj const& f, TVal initial) const
-        {
-            return foldrByIndices(f, IdxSequence(), initial);
-        }
-
-// suppress strange warning produced by nvcc+MSVC in release mode
-#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
-#    pragma warning(push)
-#    pragma warning(disable : 4702) // unreachable code
-#endif
-        //! \return The product of all values.
-        ALPAKA_NO_HOST_ACC_WARNING
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto prod() const -> TVal
-        {
-            return foldrAll(std::multiplies<TVal>{}, TVal{1});
-        }
-#if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
-#    pragma warning(pop)
-#endif
-        //! \return The sum of all values.
-        ALPAKA_NO_HOST_ACC_WARNING
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto sum() const -> TVal
-        {
-            return foldrAll(std::plus<TVal>{}, TVal{0});
-        }
-
-        //! \return The min of all values.
-        ALPAKA_NO_HOST_ACC_WARNING
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto min() const -> TVal
-        {
-            return foldrAll(meta::min<TVal>{}, std::numeric_limits<TVal>::max());
-        }
-
-        //! \return The max of all values.
-        ALPAKA_NO_HOST_ACC_WARNING
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto max() const -> TVal
-        {
-            return foldrAll(meta::max<TVal>{}, std::numeric_limits<TVal>::min());
-        }
-
-        //! \return True if all values are true, i.e., the "logical and" of all values.
-        ALPAKA_NO_HOST_ACC_WARNING
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto all() const -> bool
-        {
-            return foldrAll(std::logical_and<TVal>{}, true);
-        }
-
-        //! \return True if any value is true, i.e., the "logical or" of all values.
-        ALPAKA_NO_HOST_ACC_WARNING
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto any() const -> bool
-        {
-            return foldrAll(std::logical_or<TVal>{}, false);
-        }
-
-        //! \return True if none of the values are true
-        ALPAKA_NO_HOST_ACC_WARNING
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto none() const -> bool
-        {
-            return !foldrAll(std::logical_or<TVal>{}, false);
-        }
-
-        //! \return The index of the minimal element.
-        [[nodiscard]] ALPAKA_FN_HOST constexpr auto minElem() const -> typename TDim::value_type
-        {
-            return static_cast<typename TDim::value_type>(
-                std::distance(std::begin(m_data), std::min_element(std::begin(m_data), std::end(m_data))));
-        }
-
-        //! \return The index of the maximal element.
-        [[nodiscard]] ALPAKA_FN_HOST constexpr auto maxElem() const -> typename TDim::value_type
-        {
-            return static_cast<typename TDim::value_type>(
-                std::distance(std::begin(m_data), std::max_element(std::begin(m_data), std::end(m_data))));
-        }
-
-        template<size_t I>
-        ALPAKA_FN_HOST_ACC constexpr auto get() -> TVal&
-        {
-            return (*this)[I];
-        }
-
-        template<size_t I>
-        [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto get() const -> TVal
-        {
-            return (*this)[I];
-        }
-
-        //! \return The element-wise sum of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator+(Vec const& p, Vec const& q) -> Vec
-        {
-            Vec r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                    r[i] = p[i] + q[i];
-            }
-            return r;
-        }
-
-        //! \return The element-wise difference of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator-(Vec const& p, Vec const& q) -> Vec
-        {
-            Vec r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-#    pragma diag_suppress = unsigned_compare_with_zero
-#endif
-                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-#    pragma diag_default = unsigned_compare_with_zero
-#endif
-                    r[i] = p[i] - q[i];
-            }
-            return r;
-        }
-
-        //! \return The element-wise product of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator*(Vec const& p, Vec const& q) -> Vec
-        {
-            Vec r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                    r[i] = p[i] * q[i];
-            }
-            return r;
-        }
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator==(Vec const& a, Vec const& b) -> bool
-        {
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-#    pragma diag_suppress = unsigned_compare_with_zero
-#endif
-                for(typename TDim::value_type i(0); i < TDim::value; ++i)
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-#    pragma diag_default = unsigned_compare_with_zero
-#endif
-                {
-                    if(a[i] != b[i])
-                        return false;
-                }
-            }
-            return true;
-        }
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator!=(Vec const& a, Vec const& b) -> bool
-        {
-            return !(a == b);
-        }
-
-        //! \return The element-wise less than relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator<(Vec const& p, Vec const& q) -> Vec<TDim, bool>
-        {
-            Vec<TDim, bool> r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                    r[i] = p[i] < q[i];
-            }
-            return r;
-        }
-
-        //! \return The element-wise less than relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator<=(Vec const& p, Vec const& q) -> Vec<TDim, bool>
-        {
-            Vec<TDim, bool> r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                    r[i] = p[i] <= q[i];
-            }
-            return r;
-        }
-
-        //! \return The element-wise greater than relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator>(Vec const& p, Vec const& q) -> Vec<TDim, bool>
-        {
-            Vec<TDim, bool> r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                    r[i] = p[i] > q[i];
-            }
-            return r;
-        }
-
-        //! \return The element-wise greater equal than relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator>=(Vec const& p, Vec const& q) -> Vec<TDim, bool>
-        {
-            Vec<TDim, bool> r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                    r[i] = p[i] >= q[i];
-            }
-            return r;
-        }
-
-        //! \return The element-wise logical and relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator&&(Vec const& p, Vec const& q) -> Vec<TDim, bool>
-        {
-            Vec<TDim, bool> r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                    r[i] = p[i] && q[i];
-            }
-            return r;
-        }
-
-        //! \return The element-wise logical or relation of two vectors.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator||(Vec const& p, Vec const& q) -> Vec<TDim, bool>
-        {
-            Vec<TDim, bool> r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                    r[i] = p[i] || q[i];
-            }
-            return r;
-        }
-
-        ALPAKA_FN_HOST friend constexpr auto operator<<(std::ostream& os, Vec const& v) -> std::ostream&
-        {
-            os << "(";
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-            if(TDim::value > 0)
-#else
-            if constexpr(TDim::value > 0)
-#endif
-            {
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-#    pragma diag_suppress = unsigned_compare_with_zero
-#endif
-                for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-#    pragma diag_default = unsigned_compare_with_zero
-#endif
-                {
-                    os << v[i];
-                    if(i != TDim::value - 1)
-                        os << ", ";
-                }
-            }
-            else
-                os << ".";
-            os << ")";
-
-            return os;
-        }
-
-    private:
-        // Zero sized arrays are not allowed, therefore zero-dimensional vectors have one member.
-        TVal m_data[TDim::value == 0u ? 1u : TDim::value];
-    };
-
-    template<typename TFirstIndex, typename... TRestIndices>
-    ALPAKA_FN_HOST_ACC Vec(TFirstIndex&&, TRestIndices&&...)
-        -> Vec<DimInt<1 + sizeof...(TRestIndices)>, std::decay_t<TFirstIndex>>;
-
-    template<typename T>
-    inline constexpr bool isVec = false;
-
-    template<typename TDim, typename TVal>
-    inline constexpr bool isVec<Vec<TDim, TVal>> = true;
-
-    //! Converts a Vec to a std::array
-    template<typename TDim, typename TVal>
-    ALPAKA_FN_HOST_ACC constexpr auto toArray(Vec<TDim, TVal> const& v) -> std::array<TVal, TDim::value>
-    {
-        std::array<TVal, TDim::value> a{};
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-        if(TDim::value > 0)
-#else
-        if constexpr(TDim::value > 0)
-#endif
-        {
-            for(unsigned i = 0; i < TDim::value; i++)
-                a[i] = v[i];
-        }
-        return a;
-    }
-
-    //! \return The element-wise minimum of one or more vectors.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TDim,
-        typename TVal,
-        typename... Vecs,
-        typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
-    ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal>
-    {
-        Vec<TDim, TVal> r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-        if(TDim::value > 0)
-#else
-        if constexpr(TDim::value > 0)
-#endif
-        {
-            for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                r[i] = std::min({p[i], qs[i]...});
-        }
-        return r;
-    }
-
-    //! \return The element-wise maximum of one or more vectors.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<
-        typename TDim,
-        typename TVal,
-        typename... Vecs,
-        typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
-    ALPAKA_FN_HOST_ACC constexpr auto elementwise_max(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal>
-    {
-        Vec<TDim, TVal> r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-        if(TDim::value > 0)
-#else
-        if constexpr(TDim::value > 0)
-#endif
-        {
-            for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                r[i] = std::max({p[i], qs[i]...});
-        }
-        return r;
-    }
-
-    namespace trait
-    {
-        //! The Vec dimension get trait specialization.
-        template<typename TDim, typename TVal>
-        struct DimType<Vec<TDim, TVal>>
-        {
-            using type = TDim;
-        };
-
-        //! The Vec idx type trait specialization.
-        template<typename TDim, typename TVal>
-        struct IdxType<Vec<TDim, TVal>>
-        {
-            using type = TVal;
-        };
-
-        //! Specialization for selecting a sub-vector.
-        template<typename TDim, typename TVal, std::size_t... TIndices>
-        struct SubVecFromIndices<Vec<TDim, TVal>, std::index_sequence<TIndices...>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static constexpr auto subVecFromIndices(
-                Vec<TDim, TVal> const& vec) -> Vec<DimInt<sizeof...(TIndices)>, TVal>
-            {
-                if constexpr(std::is_same_v<std::index_sequence<TIndices...>, std::make_index_sequence<TDim::value>>)
-                {
-                    return vec; // Return whole vector.
-                }
-                else
-                {
-                    static_assert(
-                        sizeof...(TIndices) <= TDim::value,
-                        "The sub-vector's dimensionality must be smaller than or equal to the original "
-                        "dimensionality.");
-                    return {vec[TIndices]...}; // Return sub-vector.
-                }
-                ALPAKA_UNREACHABLE({});
-            }
-        };
-
-        template<typename TValNew, typename TDim, typename TVal>
-        struct CastVec<TValNew, Vec<TDim, TVal>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static constexpr auto castVec(Vec<TDim, TVal> const& vec) -> Vec<TDim, TValNew>
-            {
-                if constexpr(std::is_same_v<TValNew, TVal>)
-                {
-                    return vec;
-                }
-                else
-                {
-                    Vec<TDim, TValNew> r;
-#if BOOST_COMP_NVCC && BOOST_COMP_NVCC < BOOST_VERSION_NUMBER(11, 3, 0)
-                    if(TDim::value > 0)
-#else
-                    if constexpr(TDim::value > 0)
-#endif
-                    {
-                        for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                            r[i] = static_cast<TValNew>(vec[i]);
-                    }
-                    return r;
-                }
-                ALPAKA_UNREACHABLE({});
-            }
-        };
-
-        //! ReverseVec specialization for Vec.
-        template<typename TDim, typename TVal>
-        struct ReverseVec<Vec<TDim, TVal>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static constexpr auto reverseVec(Vec<TDim, TVal> const& vec) -> Vec<TDim, TVal>
-            {
-                if constexpr(TDim::value <= 1)
-                {
-                    return vec;
-                }
-                else
-                {
-                    Vec<TDim, TVal> r;
-                    for(typename TDim::value_type i = 0; i < TDim::value; ++i)
-                        r[i] = vec[TDim::value - 1u - i];
-                    return r;
-                }
-                ALPAKA_UNREACHABLE({});
-            }
-        };
-
-        //! Concatenation specialization for Vec.
-        template<typename TDimL, typename TDimR, typename TVal>
-        struct ConcatVec<Vec<TDimL, TVal>, Vec<TDimR, TVal>>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static constexpr auto concatVec(
-                Vec<TDimL, TVal> const& vecL,
-                Vec<TDimR, TVal> const& vecR) -> Vec<DimInt<TDimL::value + TDimR::value>, TVal>
-            {
-                Vec<DimInt<TDimL::value + TDimR::value>, TVal> r;
-                if constexpr(TDimL::value > 0)
-                {
-                    for(typename TDimL::value_type i = 0; i < TDimL::value; ++i)
-                        r[i] = vecL[i];
-                }
-                if constexpr(TDimR::value > 0)
-                {
-                    for(typename TDimR::value_type i = 0; i < TDimR::value; ++i)
-                        r[TDimL::value + i] = vecR[i];
-                }
-                return r;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
-
-#if defined(__clang__)
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wmismatched-tags"
-#endif
-namespace std
-{
-    template<typename TDim, typename TVal>
-    struct tuple_size<alpaka::Vec<TDim, TVal>> : integral_constant<size_t, TDim::value>
-    {
-    };
-
-    template<size_t I, typename TDim, typename TVal>
-    struct tuple_element<I, alpaka::Vec<TDim, TVal>>
-    {
-        using type = TVal;
-    };
-} // namespace std
-#if defined(__clang__)
-#    pragma GCC diagnostic pop
-#endif
diff --git a/include/alpaka/version.hpp b/include/alpaka/version.hpp
deleted file mode 100644
index 9ea2db7..0000000
--- a/include/alpaka/version.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Erik Zenker, Jan Stephan
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include <boost/predef/version_number.h>
-
-#define ALPAKA_VERSION_MAJOR 1
-#define ALPAKA_VERSION_MINOR 2
-#define ALPAKA_VERSION_PATCH 0
-
-//! The alpaka library version number
-#define ALPAKA_VERSION BOOST_VERSION_NUMBER(ALPAKA_VERSION_MAJOR, ALPAKA_VERSION_MINOR, ALPAKA_VERSION_PATCH)
diff --git a/include/alpaka/wait/Traits.hpp b/include/alpaka/wait/Traits.hpp
deleted file mode 100644
index c0cfa89..0000000
--- a/include/alpaka/wait/Traits.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-namespace alpaka
-{
-    struct ConceptCurrentThreadWaitFor
-    {
-    };
-
-    //! The wait traits.
-    namespace trait
-    {
-        //! The thread wait trait.
-        template<typename TAwaited, typename TSfinae = void>
-        struct CurrentThreadWaitFor;
-
-        //! The waiter wait trait.
-        template<typename TWaiter, typename TAwaited, typename TSfinae = void>
-        struct WaiterWaitFor;
-    } // namespace trait
-
-    //! Waits the thread for the completion of the given awaited action to complete.
-    //!
-    //! Special Handling for events:
-    //!   If the event is re-enqueued wait() will terminate when the re-enqueued event will be ready and previously
-    //!   enqueued states of the event will be ignored.
-    template<typename TAwaited>
-    ALPAKA_FN_HOST auto wait(TAwaited const& awaited) -> void
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptCurrentThreadWaitFor, TAwaited>;
-        trait::CurrentThreadWaitFor<ImplementationBase>::currentThreadWaitFor(awaited);
-    }
-
-    //! The waiter waits for the given awaited action to complete.
-    //!
-    //! Special Handling if \p waiter is a queue and \p awaited an event:
-    //!   The \p waiter waits for the event state to become ready based on the recently captured event state at the
-    //!   time of the API call even if the event is being re-enqueued later.
-    template<typename TWaiter, typename TAwaited>
-    ALPAKA_FN_HOST auto wait(TWaiter& waiter, TAwaited const& awaited) -> void
-    {
-        trait::WaiterWaitFor<TWaiter, TAwaited>::waiterWaitFor(waiter, awaited);
-    }
-} // namespace alpaka
diff --git a/include/alpaka/warp/Traits.hpp b/include/alpaka/warp/Traits.hpp
deleted file mode 100644
index f4cfb4d..0000000
--- a/include/alpaka/warp/Traits.hpp
+++ /dev/null
@@ -1,317 +0,0 @@
-/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-
-#include <cstdint>
-#include <type_traits>
-
-namespace alpaka::warp
-{
-    struct ConceptWarp
-    {
-    };
-
-    //! The warp traits.
-    namespace trait
-    {
-        //! The warp size trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct GetSize;
-
-        //! The all warp vote trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct All;
-
-        //! The any warp vote trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct Any;
-
-        //! The ballot warp vote trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct Ballot;
-
-        //! The shfl warp swizzling trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct Shfl;
-
-        //! The shfl up warp swizzling trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct ShflUp;
-
-        //! The shfl down warp swizzling trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct ShflDown;
-
-        //! The shfl xor warp swizzling trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct ShflXor;
-
-        //! The active mask trait.
-        template<typename TWarp, typename TSfinae = void>
-        struct Activemask;
-    } // namespace trait
-
-    //! Returns warp size.
-    //!
-    //! \tparam TWarp The warp implementation type.
-    //! \param warp The warp implementation.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp>
-    ALPAKA_FN_ACC auto getSize(TWarp const& warp) -> std::int32_t
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::GetSize<ImplementationBase>::getSize(warp);
-    }
-
-    //! Returns a 32- or 64-bit unsigned integer (depending on the
-    //! accelerator) whose Nth bit is set if and only if the Nth thread
-    //! of the warp is active.
-    //!
-    //! Note: decltype for return type is required there, otherwise
-    //! compilcation with a CPU and a GPU accelerator enabled fails as it
-    //! tries to call device function from a host-device one. The reason
-    //! is unclear, but likely related to deducing the return type.
-    //!
-    //! Note:
-    //! * The programmer must ensure that all threads calling this function are executing
-    //!   the same line of code. In particular it is not portable to write
-    //!   if(a) {activemask} else {activemask}.
-    //!
-    //! \tparam TWarp The warp implementation type.
-    //! \param warp The warp implementation.
-    //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp>
-    ALPAKA_FN_ACC auto activemask(TWarp const& warp)
-        -> decltype(trait::Activemask<concepts::ImplementationBase<ConceptWarp, TWarp>>::activemask(warp))
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::Activemask<ImplementationBase>::activemask(warp);
-    }
-
-    //! Evaluates predicate for all active threads of the warp and returns
-    //! non-zero if and only if predicate evaluates to non-zero for all of them.
-    //!
-    //! It follows the logic of __all(predicate) in CUDA before version 9.0 and HIP,
-    //! the operation is applied for all active threads.
-    //! The modern CUDA counterpart would be __all_sync(__activemask(), predicate).
-    //!
-    //! Note:
-    //! * The programmer must ensure that all threads calling this function are executing
-    //!   the same line of code. In particular it is not portable to write
-    //!   if(a) {all} else {all}.
-    //!
-    //! \tparam TWarp The warp implementation type.
-    //! \param warp The warp implementation.
-    //! \param predicate The predicate value for current thread.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp>
-    ALPAKA_FN_ACC auto all(TWarp const& warp, std::int32_t predicate) -> std::int32_t
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::All<ImplementationBase>::all(warp, predicate);
-    }
-
-    //! Evaluates predicate for all active threads of the warp and returns
-    //! non-zero if and only if predicate evaluates to non-zero for any of them.
-    //!
-    //! It follows the logic of __any(predicate) in CUDA before version 9.0 and HIP,
-    //! the operation is applied for all active threads.
-    //! The modern CUDA counterpart would be __any_sync(__activemask(), predicate).
-    //!
-    //! Note:
-    //! * The programmer must ensure that all threads calling this function are executing
-    //!   the same line of code. In particular it is not portable to write
-    //!   if(a) {any} else {any}.
-    //!
-    //! \tparam TWarp The warp implementation type.
-    //! \param warp The warp implementation.
-    //! \param predicate The predicate value for current thread.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp>
-    ALPAKA_FN_ACC auto any(TWarp const& warp, std::int32_t predicate) -> std::int32_t
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::Any<ImplementationBase>::any(warp, predicate);
-    }
-
-    //! Evaluates predicate for all non-exited threads in a warp and returns
-    //! a 32- or 64-bit unsigned integer (depending on the accelerator)
-    //! whose Nth bit is set if and only if predicate evaluates to non-zero
-    //! for the Nth thread of the warp and the Nth thread is active.
-    //!
-    //! It follows the logic of __ballot(predicate) in CUDA before version 9.0 and HIP,
-    //! the operation is applied for all active threads.
-    //! The modern CUDA counterpart would be __ballot_sync(__activemask(), predicate).
-    //! Return type is 64-bit to fit all platforms.
-    //!
-    //! Note:
-    //! * The programmer must ensure that all threads calling this function are executing
-    //!   the same line of code. In particular it is not portable to write
-    //!   if(a) {ballot} else {ballot}.
-    //!
-    //! \tparam TWarp The warp implementation type.
-    //! \param warp The warp implementation.
-    //! \param predicate The predicate value for current thread.
-    //! \return 32-bit or 64-bit unsigned type depending on the accelerator.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp>
-    ALPAKA_FN_ACC auto ballot(TWarp const& warp, std::int32_t predicate)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::Ballot<ImplementationBase>::ballot(warp, predicate);
-    }
-
-    //! Exchange data between threads within a warp.
-    //!
-    //! Effectively executes:
-    //!
-    //!     __shared__ int32_t values[warpsize];
-    //!     values[threadIdx.x] = value;
-    //!     __syncthreads();
-    //!     return values[width*(threadIdx.x/width) + srcLane%width];
-    //!
-    //! However, it does not use shared memory.
-    //!
-    //! Notes:
-    //! * The programmer must ensure that all threads calling this
-    //!   function (and the srcLane) are executing the same line of code.
-    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
-    //!
-    //! * Commonly used with width = warpsize (the default), (returns values[srcLane])
-    //!
-    //! * Width must be a power of 2.
-    //!
-    //! \tparam TWarp   warp implementation type
-    //! \param  warp    warp implementation
-    //! \param  value   value to broadcast (only meaningful from threadIdx == srcLane)
-    //! \param  srcLane source lane sending value
-    //! \param  width   number of threads receiving a single value
-    //! \return val from the thread index srcLane.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp, typename T>
-    ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
-    }
-
-    //! Exchange data between threads within a warp.
-    //! It copies from a lane with lower ID relative to caller.
-    //! The lane ID is calculated by subtracting delta from the caller’s lane ID.
-    //!
-    //! Effectively executes:
-    //!
-    //!     __shared__ int32_t values[warpsize];
-    //!     values[threadIdx.x] = value;
-    //!     __syncthreads();
-    //!     return (threadIdx.x % width >= delta) ? values[threadIdx.x - delta] : values[threadIdx.x];
-    //!
-    //! However, it does not use shared memory.
-    //!
-    //! Notes:
-    //! * The programmer must ensure that all threads calling this
-    //!   function (and the srcLane) are executing the same line of code.
-    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
-    //!
-    //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x - delta] if threadIdx.x >=
-    //! delta)
-    //!
-    //! * Width must be a power of 2.
-    //!
-    //! \tparam TWarp   warp implementation type
-    //! \tparam T       value type
-    //! \param  warp    warp implementation
-    //! \param  value   value to broadcast
-    //! \param  offset  corresponds to the delta used to compute the lane ID
-    //! \param  width   size of the group participating in the shuffle operation
-    //! \return val from the thread index lane ID.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp, typename T>
-    ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::ShflUp<ImplementationBase>::shfl_up(warp, value, offset, width ? width : getSize(warp));
-    }
-
-    //! Exchange data between threads within a warp.
-    //! It copies from a lane with higher ID relative to caller.
-    //! The lane ID is calculated by adding delta to the caller’s lane ID.
-    //!
-    //! Effectively executes:
-    //!
-    //!     __shared__ int32_t values[warpsize];
-    //!     values[threadIdx.x] = value;
-    //!     __syncthreads();
-    //!     return (threadIdx.x % width + delta < width) ? values[threadIdx.x + delta] : values[threadIdx.x];
-    //!
-    //! However, it does not use shared memory.
-    //!
-    //! Notes:
-    //! * The programmer must ensure that all threads calling this
-    //!   function (and the srcLane) are executing the same line of code.
-    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
-    //!
-    //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x+delta] if threadIdx.x+delta <
-    //! warpsize)
-    //!
-    //! * Width must be a power of 2.
-    //!
-    //! \tparam TWarp   warp implementation type
-    //! \tparam T       value type
-    //! \param  warp    warp implementation
-    //! \param  value   value to broadcast
-    //! \param  offset  corresponds to the delta used to compute the lane ID
-    //! \param  width   size of the group participating in the shuffle operation
-    //! \return val from the thread index lane ID.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp, typename T>
-    ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::ShflDown<ImplementationBase>::shfl_down(warp, value, offset, width ? width : getSize(warp));
-    }
-
-    //! Exchange data between threads within a warp.
-    //! It copies from a lane based on bitwise XOR of own lane ID.
-    //! The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with mask
-    //!
-    //! Effectively executes:
-    //!
-    //!     __shared__ int32_t values[warpsize];
-    //!     values[threadIdx.x] = value;
-    //!     __syncthreads();
-    //!     int lane = threadIdx.x ^ mask;
-    //!     return values[lane / width > threadIdx.x / width ? threadIdx.x : lane];
-    //!
-    //! However, it does not use shared memory.
-    //!
-    //! Notes:
-    //! * The programmer must ensure that all threads calling this
-    //!   function (and the srcLane) are executing the same line of code.
-    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
-    //!
-    //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x^mask])
-    //!
-    //! * Width must be a power of 2.
-    //!
-    //! \tparam TWarp   warp implementation type
-    //! \tparam T       value type
-    //! \param  warp    warp implementation
-    //! \param  value   value to broadcast
-    //! \param  mask    corresponds to the mask used to compute the lane ID
-    //! \param  width   size of the group participating in the shuffle operation
-    //! \return val from the thread index lane ID.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp, typename T>
-    ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t mask, std::int32_t width = 0)
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::ShflXor<ImplementationBase>::shfl_xor(warp, value, mask, width ? width : getSize(warp));
-    }
-} // namespace alpaka::warp
diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp
deleted file mode 100644
index 51957ba..0000000
--- a/include/alpaka/warp/WarpGenericSycl.hpp
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- *
- * The implementations of Shfl::shfl(), ShflUp::shfl_up(), ShflDown::shfl_down() and ShflXor::shfl_xor() are derived
- * from Intel DPCT.
- * Copyright (C) Intel Corporation.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- * See https://llvm.org/LICENSE.txt for license information.
- */
-
-#pragma once
-
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/warp/Traits.hpp"
-
-#include <cstdint>
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka::warp
-{
-    //! The SYCL warp.
-    template<typename TDim>
-    class WarpGenericSycl : public concepts::Implements<alpaka::warp::ConceptWarp, WarpGenericSycl<TDim>>
-    {
-    public:
-        WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_warp{my_item}
-        {
-        }
-
-        sycl::nd_item<TDim::value> m_item_warp;
-    };
-} // namespace alpaka::warp
-
-namespace alpaka::warp::trait
-{
-    template<typename TDim>
-    struct GetSize<warp::WarpGenericSycl<TDim>>
-    {
-        static auto getSize(warp::WarpGenericSycl<TDim> const& warp) -> std::int32_t
-        {
-            auto const sub_group = warp.m_item_warp.get_sub_group();
-            // SYCL sub-groups are always 1D
-            return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
-        }
-    };
-
-    template<typename TDim>
-    struct Activemask<warp::WarpGenericSycl<TDim>>
-    {
-        // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
-        // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
-        // Restrict to warpSize <= 32 for now.
-        static auto activemask(warp::WarpGenericSycl<TDim> const& warp) -> std::uint32_t
-        {
-            static_assert(!sizeof(warp), "activemask is not supported on SYCL");
-            // SYCL does not have an API to get the activemask. It is also questionable (to me, bgruber) whether an
-            // "activemask" even exists on some hardware architectures, since the idea is bound to threads being
-            // "turned off" when they take different control flow in a warp. A SYCL implementation could run each
-            // thread as a SIMD lane, in which cause the "thread" is always active, but some SIMD lanes are either
-            // predicated off, or side-effects are masked out when writing them back.
-            //
-            // An implementation via oneAPI's sycl::ext::oneapi::group_ballot causes UB, because activemask is expected
-            // to be callable when less than all threads are active in a warp (CUDA). But SYCL requires all threads of
-            // a group to call the function.
-            //
-            // Intel's CUDA -> SYCL migration tool also suggests that there is no direct equivalent and the user must
-            // rewrite their kernel logic. See also:
-            // https://oneapi-src.github.io/SYCLomatic/dev_guide/diagnostic_ref/dpct1086.html
-
-            return ~std::uint32_t{0};
-        }
-    };
-
-    template<typename TDim>
-    struct All<warp::WarpGenericSycl<TDim>>
-    {
-        static auto all(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
-        {
-            auto const sub_group = warp.m_item_warp.get_sub_group();
-            return static_cast<std::int32_t>(sycl::all_of_group(sub_group, static_cast<bool>(predicate)));
-        }
-    };
-
-    template<typename TDim>
-    struct Any<warp::WarpGenericSycl<TDim>>
-    {
-        static auto any(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
-        {
-            auto const sub_group = warp.m_item_warp.get_sub_group();
-            return static_cast<std::int32_t>(sycl::any_of_group(sub_group, static_cast<bool>(predicate)));
-        }
-    };
-
-    template<typename TDim>
-    struct Ballot<warp::WarpGenericSycl<TDim>>
-    {
-        // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
-        // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
-        // Restrict to warpSize <= 32 for now.
-        static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::uint32_t
-        {
-            auto const sub_group = warp.m_item_warp.get_sub_group();
-            auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
-            // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
-            // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
-            // Restrict to warpSize <= 32 for now.
-            std::uint32_t bits = 0;
-            mask.extract_bits(bits);
-            return bits;
-        }
-    };
-
-    template<typename TDim>
-    struct Shfl<warp::WarpGenericSycl<TDim>>
-    {
-        template<typename T>
-        static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
-        {
-            ALPAKA_ASSERT_ACC(width > 0);
-            ALPAKA_ASSERT_ACC(srcLane >= 0);
-
-            /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
-               subdivision has the assumed index 0. The srcLane index is relative to the subdivisions.
-
-               Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions:
-               The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
-               first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
-            auto const actual_group = warp.m_item_warp.get_sub_group();
-            std::uint32_t const w = static_cast<std::uint32_t>(width);
-            std::uint32_t const start_index = actual_group.get_local_linear_id() / w * w;
-            return sycl::select_from_group(actual_group, value, start_index + static_cast<std::uint32_t>(srcLane) % w);
-        }
-    };
-
-    template<typename TDim>
-    struct ShflUp<warp::WarpGenericSycl<TDim>>
-    {
-        template<typename T>
-        static auto shfl_up(
-            warp::WarpGenericSycl<TDim> const& warp,
-            T value,
-            std::uint32_t offset, /* must be the same for all work-items in the group */
-            std::int32_t width)
-        {
-            auto const actual_group = warp.m_item_warp.get_sub_group();
-            std::uint32_t const w = static_cast<std::uint32_t>(width);
-            std::uint32_t const id = actual_group.get_local_linear_id();
-            std::uint32_t const start_index = id / w * w;
-            T result = sycl::shift_group_right(actual_group, value, offset);
-            if((id - start_index) < offset)
-            {
-                result = value;
-            }
-            return result;
-        }
-    };
-
-    template<typename TDim>
-    struct ShflDown<warp::WarpGenericSycl<TDim>>
-    {
-        template<typename T>
-        static auto shfl_down(
-            warp::WarpGenericSycl<TDim> const& warp,
-            T value,
-            std::uint32_t offset,
-            std::int32_t width)
-        {
-            auto const actual_group = warp.m_item_warp.get_sub_group();
-            std::uint32_t const w = static_cast<std::uint32_t>(width);
-            std::uint32_t const id = actual_group.get_local_linear_id();
-            std::uint32_t const end_index = (id / w + 1) * w;
-            T result = sycl::shift_group_left(actual_group, value, offset);
-            if((id + offset) >= end_index)
-            {
-                result = value;
-            }
-            return result;
-        }
-    };
-
-    template<typename TDim>
-    struct ShflXor<warp::WarpGenericSycl<TDim>>
-    {
-        template<typename T>
-        static auto shfl_xor(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t mask, std::int32_t width)
-        {
-            auto const actual_group = warp.m_item_warp.get_sub_group();
-            std::uint32_t const w = static_cast<std::uint32_t>(width);
-            std::uint32_t const id = actual_group.get_local_linear_id();
-            std::uint32_t const start_index = id / w * w;
-            std::uint32_t const target_offset = (id % w) ^ static_cast<std::uint32_t>(mask);
-            return sycl::select_from_group(actual_group, value, target_offset < w ? start_index + target_offset : id);
-        }
-    };
-} // namespace alpaka::warp::trait
-
-#endif
diff --git a/include/alpaka/warp/WarpSingleThread.hpp b/include/alpaka/warp/WarpSingleThread.hpp
deleted file mode 100644
index d271303..0000000
--- a/include/alpaka/warp/WarpSingleThread.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/warp/Traits.hpp"
-
-#include <cstdint>
-
-namespace alpaka::warp
-{
-    //! The single-threaded warp to emulate it on CPUs.
-    class WarpSingleThread : public concepts::Implements<ConceptWarp, WarpSingleThread>
-    {
-    };
-
-    namespace trait
-    {
-        template<>
-        struct GetSize<WarpSingleThread>
-        {
-            static auto getSize(warp::WarpSingleThread const& /*warp*/)
-            {
-                return 1;
-            }
-        };
-
-        template<>
-        struct Activemask<WarpSingleThread>
-        {
-            static auto activemask(warp::WarpSingleThread const& /*warp*/)
-            {
-                return 1u;
-            }
-        };
-
-        template<>
-        struct All<WarpSingleThread>
-        {
-            static auto all(warp::WarpSingleThread const& /*warp*/, std::int32_t predicate)
-            {
-                return predicate;
-            }
-        };
-
-        template<>
-        struct Any<WarpSingleThread>
-        {
-            static auto any(warp::WarpSingleThread const& /*warp*/, std::int32_t predicate)
-            {
-                return predicate;
-            }
-        };
-
-        template<>
-        struct Ballot<WarpSingleThread>
-        {
-            static auto ballot(warp::WarpSingleThread const& /*warp*/, std::int32_t predicate)
-            {
-                return predicate ? 1u : 0u;
-            }
-        };
-
-        template<>
-        struct Shfl<WarpSingleThread>
-        {
-            template<typename T>
-            static auto shfl(
-                warp::WarpSingleThread const& /*warp*/,
-                T val,
-                std::int32_t /*srcLane*/,
-                std::int32_t /*width*/)
-            {
-                return val;
-            }
-        };
-
-        template<>
-        struct ShflUp<WarpSingleThread>
-        {
-            template<typename T>
-            static auto shfl_up(
-                warp::WarpSingleThread const& /*warp*/,
-                T val,
-                std::uint32_t /*srcLane*/,
-                std::int32_t /*width*/)
-            {
-                return val;
-            }
-        };
-
-        template<>
-        struct ShflDown<WarpSingleThread>
-        {
-            template<typename T>
-            static auto shfl_down(
-                warp::WarpSingleThread const& /*warp*/,
-                T val,
-                std::uint32_t /*srcLane*/,
-                std::int32_t /*width*/)
-            {
-                return val;
-            }
-        };
-
-        template<>
-        struct ShflXor<WarpSingleThread>
-        {
-            template<typename T>
-            static auto shfl_xor(
-                warp::WarpSingleThread const& /*warp*/,
-                T val,
-                std::int32_t /*srcLane*/,
-                std::int32_t /*width*/)
-            {
-                return val;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka::warp
diff --git a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index 3a6d495..0000000
--- a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright 2023 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/warp/Traits.hpp"
-
-#include <cstdint>
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka::warp
-{
-    //! The GPU CUDA/HIP warp.
-    class WarpUniformCudaHipBuiltIn : public concepts::Implements<ConceptWarp, WarpUniformCudaHipBuiltIn>
-    {
-    };
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-    namespace trait
-    {
-        template<>
-        struct GetSize<WarpUniformCudaHipBuiltIn>
-        {
-            __device__ static auto getSize(warp::WarpUniformCudaHipBuiltIn const& /*warp*/) -> std::int32_t
-            {
-                return warpSize;
-            }
-        };
-
-        template<>
-        struct Activemask<WarpUniformCudaHipBuiltIn>
-        {
-            __device__ static auto activemask(warp::WarpUniformCudaHipBuiltIn const& /*warp*/)
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                -> std::uint32_t
-#        else
-                -> std::uint64_t
-#        endif
-            {
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return __activemask();
-#        else
-                // No HIP intrinsic for it, emulate via ballot
-                return __ballot(1);
-#        endif
-            }
-        };
-
-        template<>
-        struct All<WarpUniformCudaHipBuiltIn>
-        {
-            __device__ static auto all(
-                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                std::int32_t predicate) -> std::int32_t
-            {
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return __all_sync(0xffff'ffff, predicate);
-#        else
-                return __all(predicate);
-#        endif
-            }
-        };
-
-        template<>
-        struct Any<WarpUniformCudaHipBuiltIn>
-        {
-            __device__ static auto any(
-                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                std::int32_t predicate) -> std::int32_t
-            {
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return __any_sync(0xffff'ffff, predicate);
-#        else
-                return __any(predicate);
-#        endif
-            }
-        };
-
-        template<>
-        struct Ballot<WarpUniformCudaHipBuiltIn>
-        {
-            __device__ static auto ballot(
-                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                std::int32_t predicate)
-            // return type is required by the compiler
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                -> std::uint32_t
-#        else
-                -> std::uint64_t
-#        endif
-            {
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return __ballot_sync(0xffff'ffff, predicate);
-#        else
-                return __ballot(predicate);
-#        endif
-            }
-        };
-
-        template<>
-        struct Shfl<WarpUniformCudaHipBuiltIn>
-        {
-            template<typename T>
-            __device__ static auto shfl(
-                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                T val,
-                int srcLane,
-                std::int32_t width) -> T
-            {
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return __shfl_sync(0xffff'ffff, val, srcLane, width);
-#        else
-                return __shfl(val, srcLane, width);
-#        endif
-            }
-        };
-
-        template<>
-        struct ShflUp<WarpUniformCudaHipBuiltIn>
-        {
-            template<typename T>
-            __device__ static auto shfl_up(
-                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                T val,
-                std::uint32_t offset,
-                std::int32_t width) -> T
-            {
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return __shfl_up_sync(0xffff'ffff, val, offset, width);
-#        else
-                return __shfl_up(val, offset, width);
-#        endif
-            }
-        };
-
-        template<>
-        struct ShflDown<WarpUniformCudaHipBuiltIn>
-        {
-            template<typename T>
-            __device__ static auto shfl_down(
-                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                T val,
-                std::uint32_t offset,
-                std::int32_t width) -> T
-            {
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return __shfl_down_sync(0xffff'ffff, val, offset, width);
-#        else
-                return __shfl_down(val, offset, width);
-#        endif
-            }
-        };
-
-        template<>
-        struct ShflXor<WarpUniformCudaHipBuiltIn>
-        {
-            template<typename T>
-            __device__ static auto shfl_xor(
-                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                T val,
-                std::int32_t mask,
-                std::int32_t width) -> T
-            {
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return __shfl_xor_sync(0xffff'ffff, val, mask, width);
-#        else
-                return __shfl_xor(val, mask, width);
-#        endif
-            }
-        };
-
-    } // namespace trait
-#    endif
-} // namespace alpaka::warp
-
-#endif
diff --git a/include/alpaka/workdiv/Traits.hpp b/include/alpaka/workdiv/Traits.hpp
deleted file mode 100644
index 211d688..0000000
--- a/include/alpaka/workdiv/Traits.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Positioning.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-
-#include <type_traits>
-#include <utility>
-
-namespace alpaka
-{
-    struct ConceptWorkDiv
-    {
-    };
-
-    //! The work division trait.
-    namespace trait
-    {
-        //! The work div trait.
-        template<typename TWorkDiv, typename TOrigin, typename TUnit, typename TSfinae = void>
-        struct GetWorkDiv;
-    } // namespace trait
-
-    //! Get the extent requested.
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TOrigin, typename TUnit, typename TWorkDiv>
-    ALPAKA_FN_HOST_ACC auto getWorkDiv(TWorkDiv const& workDiv) -> Vec<Dim<TWorkDiv>, Idx<TWorkDiv>>
-    {
-        using ImplementationBase = concepts::ImplementationBase<ConceptWorkDiv, TWorkDiv>;
-        return trait::GetWorkDiv<ImplementationBase, TOrigin, TUnit>::getWorkDiv(workDiv);
-    }
-
-    namespace trait
-    {
-        //! The work div grid thread extent trait specialization.
-        template<typename TWorkDiv>
-        struct GetWorkDiv<TWorkDiv, origin::Grid, unit::Threads>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto getWorkDiv(TWorkDiv const& workDiv)
-            {
-                return alpaka::getWorkDiv<origin::Grid, unit::Blocks>(workDiv)
-                       * alpaka::getWorkDiv<origin::Block, unit::Threads>(workDiv);
-            }
-        };
-
-        //! The work div grid element extent trait specialization.
-        template<typename TWorkDiv>
-        struct GetWorkDiv<TWorkDiv, origin::Grid, unit::Elems>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto getWorkDiv(TWorkDiv const& workDiv)
-            {
-                return alpaka::getWorkDiv<origin::Grid, unit::Threads>(workDiv)
-                       * alpaka::getWorkDiv<origin::Thread, unit::Elems>(workDiv);
-            }
-        };
-
-        //! The work div block element extent trait specialization.
-        template<typename TWorkDiv>
-        struct GetWorkDiv<TWorkDiv, origin::Block, unit::Elems>
-        {
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto getWorkDiv(TWorkDiv const& workDiv)
-            {
-                return alpaka::getWorkDiv<origin::Block, unit::Threads>(workDiv)
-                       * alpaka::getWorkDiv<origin::Thread, unit::Elems>(workDiv);
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
deleted file mode 100644
index 26e0075..0000000
--- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-#include "alpaka/workdiv/Traits.hpp"
-
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-
-#    include <sycl/sycl.hpp>
-
-namespace alpaka
-{
-    //! The SYCL accelerator work division.
-    template<typename TDim, typename TIdx>
-    class WorkDivGenericSycl : public concepts::Implements<ConceptWorkDiv, WorkDivGenericSycl<TDim, TIdx>>
-    {
-        static_assert(TDim::value > 0, "The SYCL work division must have a dimension greater than zero.");
-
-    public:
-        using WorkDivBase = WorkDivGenericSycl;
-
-        WorkDivGenericSycl(Vec<TDim, TIdx> const& threadElemExtent, sycl::nd_item<TDim::value> work_item)
-            : m_threadElemExtent{threadElemExtent}
-            , m_item_workdiv{work_item}
-        {
-        }
-
-        Vec<TDim, TIdx> const& m_threadElemExtent;
-        sycl::nd_item<TDim::value> m_item_workdiv;
-    };
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    //! The SYCL accelerator work division dimension get trait specialization.
-    template<typename TDim, typename TIdx>
-    struct DimType<WorkDivGenericSycl<TDim, TIdx>>
-    {
-        using type = TDim;
-    };
-
-    //! The SYCL accelerator work division idx type trait specialization.
-    template<typename TDim, typename TIdx>
-    struct IdxType<WorkDivGenericSycl<TDim, TIdx>>
-    {
-        using type = TIdx;
-    };
-
-    //! The SYCL accelerator work division grid block extent trait specialization.
-    template<typename TDim, typename TIdx>
-    struct GetWorkDiv<WorkDivGenericSycl<TDim, TIdx>, origin::Grid, unit::Blocks>
-    {
-        //! \return The number of blocks in each dimension of the grid.
-        static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
-        {
-            if constexpr(TDim::value == 0)
-                return Vec<TDim, TIdx>{};
-            else if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
-            else if constexpr(TDim::value == 2)
-            {
-                return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(1)),
-                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
-            }
-            else
-            {
-                return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(2)),
-                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(1)),
-                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
-            }
-        }
-    };
-
-    //! The SYCL accelerator work division block thread extent trait specialization.
-    template<typename TDim, typename TIdx>
-    struct GetWorkDiv<WorkDivGenericSycl<TDim, TIdx>, origin::Block, unit::Threads>
-    {
-        //! \return The number of threads in each dimension of a block.
-        static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
-        {
-            if constexpr(TDim::value == 0)
-                return Vec<TDim, TIdx>{};
-            else if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
-            else if constexpr(TDim::value == 2)
-            {
-                return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(1)),
-                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
-            }
-            else
-            {
-                return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(2)),
-                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(1)),
-                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
-            }
-        }
-    };
-
-    //! The SYCL accelerator work division thread element extent trait specialization.
-    template<typename TDim, typename TIdx>
-    struct GetWorkDiv<WorkDivGenericSycl<TDim, TIdx>, origin::Thread, unit::Elems>
-    {
-        //! \return The number of elements in each dimension of the thread.
-        static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
-        {
-            return workDiv.m_threadElemExtent;
-        }
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/workdiv/WorkDivHelpers.hpp b/include/alpaka/workdiv/WorkDivHelpers.hpp
deleted file mode 100644
index c15319c..0000000
--- a/include/alpaka/workdiv/WorkDivHelpers.hpp
+++ /dev/null
@@ -1,554 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/acc/Traits.hpp"
-#include "alpaka/core/Assert.hpp"
-#include "alpaka/core/Common.hpp"
-#include "alpaka/core/Utility.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/kernel/KernelFunctionAttributes.hpp"
-#include "alpaka/kernel/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-#include "alpaka/workdiv/WorkDivMembers.hpp"
-
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <functional>
-#include <set>
-#include <type_traits>
-
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wswitch-default"
-#endif
-
-//! The alpaka library.
-namespace alpaka
-{
-    //! The grid block extent subdivision restrictions.
-    enum class GridBlockExtentSubDivRestrictions
-    {
-        EqualExtent, //!< The block thread extent will be equal in all dimensions.
-        CloseToEqualExtent, //!< The block thread extent will be as close to equal as possible in all dimensions.
-        Unrestricted, //!< The block thread extent will not have any restrictions.
-    };
-
-    namespace detail
-    {
-        //! Finds the largest divisor where divident % divisor == 0
-        //! \param dividend The dividend.
-        //! \param maxDivisor The maximum divisor.
-        //! \return The biggest number that satisfies the following conditions:
-        //!     1) dividend%ret==0
-        //!     2) ret<=maxDivisor
-        template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
-        ALPAKA_FN_HOST auto nextDivisorLowerOrEqual(T const& dividend, T const& maxDivisor) -> T
-        {
-            core::assertValueUnsigned(dividend);
-            core::assertValueUnsigned(maxDivisor);
-            ALPAKA_ASSERT(dividend >= maxDivisor);
-
-            T divisor = maxDivisor;
-            while(dividend % divisor != 0)
-                --divisor;
-            return divisor;
-        }
-
-        //! \param val The value to find divisors of.
-        //! \param maxDivisor The maximum.
-        //! \return A list of all divisors less then or equal to the given maximum.
-        template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
-        ALPAKA_FN_HOST auto allDivisorsLessOrEqual(T const& val, T const& maxDivisor) -> std::set<T>
-        {
-            std::set<T> divisorSet;
-
-            core::assertValueUnsigned(val);
-            core::assertValueUnsigned(maxDivisor);
-            ALPAKA_ASSERT(maxDivisor <= val);
-
-            for(T i(1); i <= std::min(val, maxDivisor); ++i)
-            {
-                if(val % i == 0)
-                {
-                    divisorSet.insert(static_cast<T>(val / i));
-                }
-            }
-
-            return divisorSet;
-        }
-    } // namespace detail
-
-    //! \tparam TDim The dimensionality of the accelerator device properties.
-    //! \tparam TIdx The idx type of the accelerator device properties.
-    //! \param accDevProps The maxima for the work division.
-    //! \return If the accelerator device properties are valid.
-    template<typename TDim, typename TIdx>
-    ALPAKA_FN_HOST auto isValidAccDevProps(AccDevProps<TDim, TIdx> const& accDevProps) -> bool
-    {
-        // Check that the maximum counts are greater or equal 1.
-        if((accDevProps.m_gridBlockCountMax < 1) || (accDevProps.m_blockThreadCountMax < 1)
-           || (accDevProps.m_threadElemCountMax < 1))
-        {
-            return false;
-        }
-
-        // Store the maxima allowed for extents of grid, blocks and threads.
-        auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
-        auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
-        auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
-
-        // Check that the extents for all dimensions are correct.
-        for(typename TDim::value_type i(0); i < TDim::value; ++i)
-        {
-            // Check that the maximum extents are greater or equal 1.
-            if((gridBlockExtentMax[i] < 1) || (blockThreadExtentMax[i] < 1) || (threadElemExtentMax[i] < 1))
-            {
-                return false;
-            }
-        }
-
-        return true;
-    }
-
-    //! Subdivides the given grid thread extent into blocks restricted by the maxima allowed.
-    //! 1. The the maxima block, thread and element extent and counts
-    //! 2. The requirement of the block thread extent to divide the grid thread extent without remainder
-    //! 3. The requirement of the block extent.
-    //!
-    //! \param gridElemExtent The full extent of elements in the grid.
-    //! \param threadElemExtent the number of elements computed per thread.
-    //! \param accDevProps The maxima for the work division.
-    //! \param kernelBlockThreadCountMax The maximum number of threads per block. If it is zero this argument is not
-    //! used, device hard limits are used.
-    //! \param blockThreadMustDivideGridThreadExtent If this is true, the grid thread extent will be multiples of the
-    //! corresponding block thread extent.
-    //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
-    //!     thread extent will be one in this dimension.
-    //! \param gridBlockExtentSubDivRestrictions The grid block extent subdivision restrictions.
-    template<typename TDim, typename TIdx>
-    ALPAKA_FN_HOST auto subDivideGridElems(
-        Vec<TDim, TIdx> const& gridElemExtent,
-        Vec<TDim, TIdx> const& threadElemExtent,
-        AccDevProps<TDim, TIdx> const& accDevProps,
-        TIdx kernelBlockThreadCountMax = static_cast<TIdx>(0u),
-        bool blockThreadMustDivideGridThreadExtent = true,
-        GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
-        = GridBlockExtentSubDivRestrictions::Unrestricted) -> WorkDivMembers<TDim, TIdx>
-    {
-        using Vec = Vec<TDim, TIdx>;
-        using DimLoopInd = typename TDim::value_type;
-
-        for(DimLoopInd i(0); i < TDim::value; ++i)
-        {
-            ALPAKA_ASSERT(gridElemExtent[i] >= 1);
-            ALPAKA_ASSERT(threadElemExtent[i] >= 1);
-            ALPAKA_ASSERT(threadElemExtent[i] <= accDevProps.m_threadElemExtentMax[i]);
-        }
-        ALPAKA_ASSERT(threadElemExtent.prod() <= accDevProps.m_threadElemCountMax);
-        ALPAKA_ASSERT(isValidAccDevProps(accDevProps));
-
-        // Handle threadElemExtent and compute gridThreadExtent. Afterwards, only the blockThreadExtent has to be
-        // optimized.
-        auto clippedThreadElemExtent = elementwise_min(threadElemExtent, gridElemExtent);
-        auto const gridThreadExtent = [&]
-        {
-            Vec r;
-            for(DimLoopInd i(0u); i < TDim::value; ++i)
-                r[i] = core::divCeil(gridElemExtent[i], clippedThreadElemExtent[i]);
-            return r;
-        }();
-
-        ///////////////////////////////////////////////////////////////////
-        // Try to calculate an optimal blockThreadExtent.
-
-        // Restrict the max block thread extent from the maximum possible to the grid thread extent.
-        // This removes dimensions not required in the grid thread extent.
-        // This has to be done before the blockThreadCountMax clipping to get the maximum correctly.
-        auto blockThreadExtent = elementwise_min(accDevProps.m_blockThreadExtentMax, gridThreadExtent);
-
-        // For equal block thread extent, restrict it to its minimum component.
-        // For example (512, 256, 1024) will get (256, 256, 256).
-        if(gridBlockExtentSubDivRestrictions == GridBlockExtentSubDivRestrictions::EqualExtent)
-            blockThreadExtent = Vec::all(blockThreadExtent.min() != TIdx(0) ? blockThreadExtent.min() : TIdx(1));
-
-        // Choose kernelBlockThreadCountMax if it is not zero. It is less than the accelerator properties.
-        TIdx const& blockThreadCountMax
-            = (kernelBlockThreadCountMax != 0) ? kernelBlockThreadCountMax : accDevProps.m_blockThreadCountMax;
-
-        // Block thread extent could be {1024,1024,1024} although max threads per block is 1024. Block thread extent
-        // shows the max number of threads along each axis, it is not a measure to get max number of threads per block.
-        // It must be further limited (clipped above) by the kernel limit along each axis, using device limits is not
-        // enough.
-        for(typename TDim::value_type i(0); i < TDim::value; ++i)
-        {
-            blockThreadExtent[i] = std::min(blockThreadExtent[i], blockThreadCountMax);
-        }
-
-        // Make the blockThreadExtent product smaller or equal to the accelerator's limit.
-        if(blockThreadCountMax == 1)
-        {
-            blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
-        }
-        else if(blockThreadExtent.prod() > blockThreadCountMax)
-        {
-            switch(gridBlockExtentSubDivRestrictions)
-            {
-            case GridBlockExtentSubDivRestrictions::EqualExtent:
-                blockThreadExtent = Vec::all(core::nthRootFloor(blockThreadCountMax, TIdx{TDim::value}));
-                break;
-            case GridBlockExtentSubDivRestrictions::CloseToEqualExtent:
-                // Very primitive clipping. Just halve the largest value until it fits.
-                while(blockThreadExtent.prod() > blockThreadCountMax)
-                    blockThreadExtent[blockThreadExtent.maxElem()] /= TIdx{2};
-                break;
-            case GridBlockExtentSubDivRestrictions::Unrestricted:
-                // Very primitive clipping. Just halve the smallest value (which is not 1) until it fits.
-                while(blockThreadExtent.prod() > blockThreadCountMax)
-                {
-                    auto const it = std::min_element(
-                        blockThreadExtent.begin(),
-                        blockThreadExtent.end() - 1, //! \todo why omit the last element?
-                        [](TIdx const& a, TIdx const& b)
-                        {
-                            if(a == TIdx{1})
-                                return false;
-                            if(b == TIdx{1})
-                                return true;
-                            return a < b;
-                        });
-                    *it /= TIdx{2};
-                }
-                break;
-            }
-        }
-
-
-        // Make the block thread extent divide the grid thread extent.
-        if(blockThreadMustDivideGridThreadExtent)
-        {
-            switch(gridBlockExtentSubDivRestrictions)
-            {
-            case GridBlockExtentSubDivRestrictions::EqualExtent:
-                {
-                    // For equal size block extent we have to compute the gcd of all grid thread extent that is less
-                    // then the current maximal block thread extent. For this we compute the divisors of all grid
-                    // thread extent less then the current maximal block thread extent.
-                    std::array<std::set<TIdx>, TDim::value> gridThreadExtentDivisors;
-                    for(DimLoopInd i(0u); i < TDim::value; ++i)
-                    {
-                        gridThreadExtentDivisors[i]
-                            = detail::allDivisorsLessOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
-                    }
-                    // The maximal common divisor of all block thread extent is the optimal solution.
-                    std::set<TIdx> intersects[2u];
-                    for(DimLoopInd i(1u); i < TDim::value; ++i)
-                    {
-                        intersects[(i - 1u) % 2u] = gridThreadExtentDivisors[0];
-                        intersects[(i) % 2u].clear();
-                        set_intersection(
-                            std::begin(intersects[(i - 1u) % 2u]),
-                            std::end(intersects[(i - 1u) % 2u]),
-                            std::begin(gridThreadExtentDivisors[i]),
-                            std::end(gridThreadExtentDivisors[i]),
-                            std::inserter(intersects[i % 2], std::begin(intersects[i % 2u])));
-                    }
-                    TIdx const maxCommonDivisor = *(--std::end(intersects[(TDim::value - 1) % 2u]));
-                    blockThreadExtent = Vec::all(maxCommonDivisor);
-                    break;
-                }
-            case GridBlockExtentSubDivRestrictions::CloseToEqualExtent:
-                [[fallthrough]];
-            case GridBlockExtentSubDivRestrictions::Unrestricted:
-                for(DimLoopInd i(0u); i < TDim::value; ++i)
-                {
-                    blockThreadExtent[i] = detail::nextDivisorLowerOrEqual(gridThreadExtent[i], blockThreadExtent[i]);
-                }
-                break;
-            }
-        }
-
-        // grid blocks extent = grid thread / block thread extent. quotient is rounded up.
-        auto gridBlockExtent = [&]
-        {
-            Vec r;
-            for(DimLoopInd i = 0; i < TDim::value; ++i)
-                r[i] = core::divCeil(gridThreadExtent[i], blockThreadExtent[i]);
-            return r;
-        }();
-
-
-        // Store the maxima allowed for extents of grid, blocks and threads.
-        auto const gridBlockExtentMax = subVecEnd<TDim>(accDevProps.m_gridBlockExtentMax);
-        auto const blockThreadExtentMax = subVecEnd<TDim>(accDevProps.m_blockThreadExtentMax);
-        auto const threadElemExtentMax = subVecEnd<TDim>(accDevProps.m_threadElemExtentMax);
-
-        // Check that the extents for all dimensions are correct.
-        for(typename TDim::value_type i(0); i < TDim::value; ++i)
-        {
-            // Check that the maximum extents are greater or equal 1.
-            if(gridBlockExtentMax[i] < gridBlockExtent[i])
-            {
-                gridBlockExtent[i] = gridBlockExtentMax[i];
-            }
-            if(blockThreadExtentMax[i] < blockThreadExtent[i])
-            {
-                blockThreadExtent[i] = blockThreadExtentMax[i];
-            }
-            if(threadElemExtentMax[i] < threadElemExtent[i])
-            {
-                clippedThreadElemExtent[i] = threadElemExtentMax[i];
-            }
-        }
-
-        return WorkDivMembers<TDim, TIdx>(gridBlockExtent, blockThreadExtent, clippedThreadElemExtent);
-    }
-
-    //! Kernel start configuration to determine a valid work division
-    //!
-    //! \tparam TGridElemExtent The type of the grid element extent.
-    //! \tparam TThreadElemExtent The type of the thread element extent.
-    template<
-        typename TAcc,
-        typename TGridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>,
-        typename TThreadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>>
-    struct KernelCfg
-    {
-        //! The full extent of elements in the grid.
-        TGridElemExtent const gridElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
-        //! The number of elements computed per thread.
-        TThreadElemExtent const threadElemExtent = alpaka::Vec<Dim<TAcc>, Idx<TAcc>>::ones();
-        //! If this is true, the grid thread extent will be multiples of
-        //! the corresponding block thread extent.
-        //!     NOTE: If this is true and gridThreadExtent is prime (or otherwise bad chosen) in a dimension, the block
-        //!     thread extent will be one in this dimension.
-        bool blockThreadMustDivideGridThreadExtent = true;
-        //! The grid block extent subdivision restrictions.
-        GridBlockExtentSubDivRestrictions gridBlockExtentSubDivRestrictions
-            = GridBlockExtentSubDivRestrictions::Unrestricted;
-
-        static_assert(
-            Dim<TGridElemExtent>::value == Dim<TAcc>::value,
-            "The dimension of Acc and the dimension of TGridElemExtent have to be identical!");
-        static_assert(
-            Dim<TGridElemExtent>::value == Dim<TAcc>::value,
-            "The dimension of Acc and the dimension of TThreadElemExtent have to be identical!");
-        static_assert(
-            std::is_same_v<Idx<TGridElemExtent>, Idx<TAcc>>,
-            "The idx type of Acc and the idx type of TGridElemExtent have to be identical!");
-        static_assert(
-            std::is_same_v<Idx<TThreadElemExtent>, Idx<TAcc>>,
-            "The idx type of Acc and the idx type of TThreadElemExtent have to be identical!");
-    };
-
-    //! \tparam TDev The type of the device.
-    //! \tparam TGridElemExtent The type of the grid element extent.
-    //! \tparam TThreadElemExtent The type of the thread element extent.
-    //! \param dev The device the work division should be valid for.
-    //! \param kernelFnObj The kernel function object which should be executed.
-    //! \param args The kernel invocation arguments.
-    //! \return The work division for the accelerator based on the kernel and argument types
-    template<
-        typename TAcc,
-        typename TDev,
-        typename TGridElemExtent,
-        typename TThreadElemExtent,
-        typename TKernelFnObj,
-        typename... TArgs>
-    ALPAKA_FN_HOST auto getValidWorkDiv(
-        KernelCfg<TAcc, TGridElemExtent, TThreadElemExtent> const& kernelCfg,
-        [[maybe_unused]] TDev const& dev,
-        TKernelFnObj const& kernelFnObj,
-        TArgs&&... args) -> WorkDivMembers<Dim<TAcc>, Idx<TAcc>>
-    {
-        using Acc = TAcc;
-
-        // Get max number of threads per block depending on the kernel function attributes.
-        // For GPU backend; number of registers used by the kernel, local and shared memory usage of the kernel
-        // determines the max number of threads per block. This number could be equal or less than the max number of
-        // threads per block defined by device properties.
-        auto const kernelFunctionAttributes
-            = getFunctionAttributes<Acc>(dev, kernelFnObj, std::forward<TArgs>(args)...);
-        auto const threadsPerBlock = kernelFunctionAttributes.maxThreadsPerBlock;
-
-        if constexpr(Dim<TGridElemExtent>::value == 0)
-        {
-            auto const zero = Vec<DimInt<0>, Idx<Acc>>{};
-            ALPAKA_ASSERT(kernelCfg.gridElemExtent == zero);
-            ALPAKA_ASSERT(kernelCfg.threadElemExtent == zero);
-            return WorkDivMembers<DimInt<0>, Idx<Acc>>{zero, zero, zero};
-        }
-        else
-            return subDivideGridElems(
-                getExtents(kernelCfg.gridElemExtent),
-                getExtents(kernelCfg.threadElemExtent),
-                getAccDevProps<Acc>(dev),
-                static_cast<Idx<Acc>>(threadsPerBlock),
-                kernelCfg.blockThreadMustDivideGridThreadExtent,
-                kernelCfg.gridBlockExtentSubDivRestrictions);
-
-        using V [[maybe_unused]] = Vec<Dim<TGridElemExtent>, Idx<TGridElemExtent>>;
-        ALPAKA_UNREACHABLE(WorkDivMembers<Dim<TGridElemExtent>, Idx<TGridElemExtent>>{V{}, V{}, V{}});
-    }
-
-    //! Checks if the work division is supported
-    //!
-    //! \tparam TWorkDiv The type of the work division.
-    //! \tparam TDim The dimensionality of the accelerator device properties.
-    //! \tparam TIdx The idx type of the accelerator device properties.
-    //! \param workDiv The work division to test for validity.
-    //! \param accDevProps The maxima for the work division.
-    //! \return If the work division is valid for the given accelerator device properties.
-    template<typename TWorkDiv, typename TDim, typename TIdx>
-    ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, AccDevProps<TDim, TIdx> const& accDevProps) -> bool
-    {
-        // Get the extents of grid, blocks and threads of the work division to check.
-        auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
-        auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
-        auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
-
-        // Check that the maximal counts are satisfied.
-        if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
-        {
-            return false;
-        }
-        if(accDevProps.m_blockThreadCountMax < blockThreadExtent.prod())
-        {
-            return false;
-        }
-        if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
-        {
-            return false;
-        }
-
-        // Check that the extents for all dimensions are correct.
-        if constexpr(Dim<TWorkDiv>::value > 0)
-        {
-            // Store the maxima allowed for extents of grid, blocks and threads.
-            auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
-            auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
-            auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
-
-            for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
-            {
-                // No extent is allowed to be zero or greater then the allowed maximum.
-                if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
-                   || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
-                   || (threadElemExtentMax[i] < threadElemExtent[i]))
-                {
-                    return false;
-                }
-            }
-        }
-
-        return true;
-    }
-
-    //! Checks if the work division is supported
-    //!
-    //! \tparam TWorkDiv The type of the work division.
-    //! \tparam TDim The dimensionality of the accelerator device properties.
-    //! \tparam TIdx The idx type of the accelerator device properties.
-    //! \param workDiv The work division to test for validity.
-    //! \param accDevProps The maxima for the work division.
-    //! \param kernelFunctionAttributes Kernel attributes, including the maximum number of threads per block that can
-    //! be used by this kernel on the given device. This number can be equal to or smaller than the the number of
-    //! threads per block supported by the device.
-    //! \return Returns true if the work division is valid for the given accelerator device properties and for the
-    //! given kernel. Otherwise returns false.
-    template<typename TAcc, typename TWorkDiv, typename TDim, typename TIdx>
-    ALPAKA_FN_HOST auto isValidWorkDiv(
-        TWorkDiv const& workDiv,
-        AccDevProps<TDim, TIdx> const& accDevProps,
-        KernelFunctionAttributes const& kernelFunctionAttributes) -> bool
-    {
-        // Get the extents of grid, blocks and threads of the work division to check.
-        auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
-        auto const blockThreadExtent = getWorkDiv<Block, Threads>(workDiv);
-        auto const threadElemExtent = getWorkDiv<Thread, Elems>(workDiv);
-        // Use kernel properties to find the max threads per block for the kernel
-        auto const threadsPerBlockForKernel = kernelFunctionAttributes.maxThreadsPerBlock;
-        // Select the minimum to find the upper bound for the threads per block
-        auto const allowedThreadsPerBlock = std::min(
-            static_cast<TIdx>(threadsPerBlockForKernel),
-            static_cast<TIdx>(accDevProps.m_blockThreadCountMax));
-        // Check that the maximal counts are satisfied.
-        if(accDevProps.m_gridBlockCountMax < gridBlockExtent.prod())
-        {
-            return false;
-        }
-        if(allowedThreadsPerBlock < blockThreadExtent.prod())
-        {
-            return false;
-        }
-        if(accDevProps.m_threadElemCountMax < threadElemExtent.prod())
-        {
-            return false;
-        }
-
-        // Check that the extents for all dimensions are correct.
-        if constexpr(Dim<TWorkDiv>::value > 0)
-        {
-            // Store the maxima allowed for extents of grid, blocks and threads.
-            auto const gridBlockExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_gridBlockExtentMax);
-            auto const blockThreadExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_blockThreadExtentMax);
-            auto const threadElemExtentMax = subVecEnd<Dim<TWorkDiv>>(accDevProps.m_threadElemExtentMax);
-
-            for(typename Dim<TWorkDiv>::value_type i(0); i < Dim<TWorkDiv>::value; ++i)
-            {
-                // No extent is allowed to be zero or greater then the allowed maximum.
-                if((gridBlockExtent[i] < 1) || (blockThreadExtent[i] < 1) || (threadElemExtent[i] < 1)
-                   || (gridBlockExtentMax[i] < gridBlockExtent[i]) || (blockThreadExtentMax[i] < blockThreadExtent[i])
-                   || (threadElemExtentMax[i] < threadElemExtent[i]))
-                {
-                    return false;
-                }
-            }
-        }
-
-        return true;
-    }
-
-    //! Checks if the work division is supported for the kernel on the device
-    //!
-    //! \tparam TAcc The accelerator to test the validity on.
-    //! \tparam TDev The type of the device.
-    //! \tparam TWorkDiv The type of work division to test for validity.
-    //! \param workDiv The work division to test for validity.
-    //! \param dev The device to test the work division for validity on.
-    //! \param kernelFnObj The kernel function object which should be executed.
-    //! \param args The kernel invocation arguments.
-    //! \return Returns the value of isValidWorkDiv function.
-    template<typename TAcc, typename TWorkDiv, typename TDev, typename TKernelFnObj, typename... TArgs>
-    ALPAKA_FN_HOST auto isValidWorkDiv(
-        TWorkDiv const& workDiv,
-        TDev const& dev,
-        TKernelFnObj const& kernelFnObj,
-        TArgs&&... args) -> bool
-    {
-        return isValidWorkDiv<TAcc>(
-            workDiv,
-            getAccDevProps<TAcc>(dev),
-            getFunctionAttributes<TAcc>(dev, kernelFnObj, std::forward<TArgs>(args)...));
-    }
-
-    //! Checks if the work division is supported by the device
-    //!
-    //! \tparam TAcc The accelerator to test the validity on.
-    //! \param workDiv The work division to test for validity.
-    //! \param dev The device to test the work division for validity on.
-    //! \return If the work division is valid on this accelerator.
-    template<typename TAcc, typename TWorkDiv, typename TDev>
-    ALPAKA_FN_HOST auto isValidWorkDiv(TWorkDiv const& workDiv, TDev const& dev) -> bool
-    {
-        return isValidWorkDiv(workDiv, getAccDevProps<TAcc>(dev));
-    }
-} // namespace alpaka
-
-#if BOOST_COMP_CLANG
-#    pragma clang diagnostic pop
-#endif
diff --git a/include/alpaka/workdiv/WorkDivMembers.hpp b/include/alpaka/workdiv/WorkDivMembers.hpp
deleted file mode 100644
index 3d36450..0000000
--- a/include/alpaka/workdiv/WorkDivMembers.hpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/Common.hpp"
-#include "alpaka/extent/Traits.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-#include "alpaka/workdiv/Traits.hpp"
-
-#include <iosfwd>
-
-namespace alpaka
-{
-    //! A basic class holding the work division as grid block extent, block thread and thread element extent.
-    template<typename TDim, typename TIdx>
-    class WorkDivMembers : public concepts::Implements<ConceptWorkDiv, WorkDivMembers<TDim, TIdx>>
-    {
-    public:
-        ALPAKA_FN_HOST_ACC WorkDivMembers() = delete;
-
-        //! Accepts different alpaka vector types and takes the last TDim number of items.
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TGridBlockExtent, typename TBlockThreadExtent, typename TThreadElemExtent>
-        ALPAKA_FN_HOST_ACC explicit WorkDivMembers(
-            TGridBlockExtent const& gridBlockExtent = TGridBlockExtent(),
-            TBlockThreadExtent const& blockThreadExtent = TBlockThreadExtent(),
-            TThreadElemExtent const& threadElemExtent = TThreadElemExtent())
-            : m_gridBlockExtent(getExtentVecEnd<TDim>(gridBlockExtent))
-            , m_blockThreadExtent(getExtentVecEnd<TDim>(blockThreadExtent))
-            , m_threadElemExtent(getExtentVecEnd<TDim>(threadElemExtent))
-        {
-        }
-
-        //! \brief Accepts single specific type and is called without explicit template parameters.
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC WorkDivMembers(
-            alpaka::Vec<TDim, TIdx> const& gridBlockExtent,
-            alpaka::Vec<TDim, TIdx> const& blockThreadExtent,
-            alpaka::Vec<TDim, TIdx> const& elemExtent)
-            : m_gridBlockExtent(gridBlockExtent)
-            , m_blockThreadExtent(blockThreadExtent)
-            , m_threadElemExtent(elemExtent)
-        {
-        }
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        ALPAKA_FN_HOST_ACC WorkDivMembers(WorkDivMembers const& other)
-            : m_gridBlockExtent(other.m_gridBlockExtent)
-            , m_blockThreadExtent(other.m_blockThreadExtent)
-            , m_threadElemExtent(other.m_threadElemExtent)
-        {
-        }
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST_ACC explicit WorkDivMembers(TWorkDiv const& other)
-            : m_gridBlockExtent(subVecEnd<TDim>(getWorkDiv<Grid, Blocks>(other)))
-            , m_blockThreadExtent(subVecEnd<TDim>(getWorkDiv<Block, Threads>(other)))
-            , m_threadElemExtent(subVecEnd<TDim>(getWorkDiv<Thread, Elems>(other)))
-        {
-        }
-
-        WorkDivMembers(WorkDivMembers&&) = default;
-        auto operator=(WorkDivMembers const&) -> WorkDivMembers& = default;
-        auto operator=(WorkDivMembers&&) -> WorkDivMembers& = default;
-
-        ALPAKA_NO_HOST_ACC_WARNING
-        template<typename TWorkDiv>
-        ALPAKA_FN_HOST_ACC auto operator=(TWorkDiv const& other) -> WorkDivMembers<TDim, TIdx>&
-        {
-            m_gridBlockExtent = subVecEnd<TDim>(getWorkDiv<Grid, Blocks>(other));
-            m_blockThreadExtent = subVecEnd<TDim>(getWorkDiv<Block, Threads>(other));
-            m_threadElemExtent = subVecEnd<TDim>(getWorkDiv<Thread, Elems>(other));
-            return *this;
-        }
-
-        ALPAKA_FN_HOST_ACC friend constexpr auto operator==(WorkDivMembers const& a, WorkDivMembers const& b) -> bool
-        {
-            return a.m_gridBlockExtent == b.m_gridBlockExtent && a.m_blockThreadExtent == b.m_blockThreadExtent
-                   && a.m_threadElemExtent == b.m_threadElemExtent;
-        }
-
-        ALPAKA_FN_HOST friend auto operator<<(std::ostream& os, WorkDivMembers const& workDiv) -> std::ostream&
-        {
-            return os << "{gridBlockExtent: " << workDiv.m_gridBlockExtent
-                      << ", blockThreadExtent: " << workDiv.m_blockThreadExtent
-                      << ", threadElemExtent: " << workDiv.m_threadElemExtent << "}";
-        }
-
-    public:
-        Vec<TDim, TIdx> m_gridBlockExtent;
-        Vec<TDim, TIdx> m_blockThreadExtent;
-        Vec<TDim, TIdx> m_threadElemExtent;
-    };
-
-    //! Deduction guide for the constructor which can be called without explicit template type parameters
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TDim, typename TIdx>
-    ALPAKA_FN_HOST_ACC WorkDivMembers(
-        alpaka::Vec<TDim, TIdx> const& gridBlockExtent,
-        alpaka::Vec<TDim, TIdx> const& blockThreadExtent,
-        alpaka::Vec<TDim, TIdx> const& elemExtent) -> WorkDivMembers<TDim, TIdx>;
-
-    namespace trait
-    {
-        //! The WorkDivMembers dimension get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<WorkDivMembers<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The WorkDivMembers idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<WorkDivMembers<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-
-        //! The WorkDivMembers grid block extent trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Grid, unit::Blocks>
-        {
-            //! \return The number of blocks in each dimension of the grid.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto getWorkDiv(WorkDivMembers<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
-            {
-                return workDiv.m_gridBlockExtent;
-            }
-        };
-
-        //! The WorkDivMembers block thread extent trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Block, unit::Threads>
-        {
-            //! \return The number of threads in each dimension of a block.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto getWorkDiv(WorkDivMembers<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
-            {
-                return workDiv.m_blockThreadExtent;
-            }
-        };
-
-        //! The WorkDivMembers thread element extent trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetWorkDiv<WorkDivMembers<TDim, TIdx>, origin::Thread, unit::Elems>
-        {
-            //! \return The number of elements in each dimension of a thread.
-            ALPAKA_NO_HOST_ACC_WARNING
-            ALPAKA_FN_HOST_ACC static auto getWorkDiv(WorkDivMembers<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
-            {
-                return workDiv.m_threadElemExtent;
-            }
-        };
-    } // namespace trait
-} // namespace alpaka
diff --git a/include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp b/include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp
deleted file mode 100644
index 8915267..0000000
--- a/include/alpaka/workdiv/WorkDivUniformCudaHipBuiltIn.hpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/core/BoostPredef.hpp"
-#include "alpaka/core/Concepts.hpp"
-#include "alpaka/core/Cuda.hpp"
-#include "alpaka/core/Hip.hpp"
-#include "alpaka/idx/Traits.hpp"
-#include "alpaka/vec/Vec.hpp"
-#include "alpaka/workdiv/Traits.hpp"
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED)
-
-namespace alpaka
-{
-    //! The GPU CUDA/HIP accelerator work division.
-    template<typename TDim, typename TIdx>
-    class WorkDivUniformCudaHipBuiltIn
-        : public concepts::Implements<ConceptWorkDiv, WorkDivUniformCudaHipBuiltIn<TDim, TIdx>>
-    {
-    public:
-        ALPAKA_FN_HOST_ACC WorkDivUniformCudaHipBuiltIn(Vec<TDim, TIdx> const& threadElemExtent)
-            : m_threadElemExtent(threadElemExtent)
-        {
-        }
-
-        // \TODO: Optimize! Add WorkDivUniformCudaHipBuiltInNoElems that has no member m_threadElemExtent as well as
-        // AccGpuUniformCudaHipRtNoElems. Use it instead of AccGpuUniformCudaHipRt if the thread element extent is one
-        // to reduce the register usage.
-        Vec<TDim, TIdx> const& m_threadElemExtent;
-    };
-
-#    if !defined(ALPAKA_HOST_ONLY)
-
-#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
-#            error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
-#        endif
-
-#        if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
-#            error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
-#        endif
-
-    namespace trait
-    {
-        //! The GPU CUDA/HIP accelerator work division dimension get trait specialization.
-        template<typename TDim, typename TIdx>
-        struct DimType<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>>
-        {
-            using type = TDim;
-        };
-
-        //! The GPU CUDA/HIP accelerator work division idx type trait specialization.
-        template<typename TDim, typename TIdx>
-        struct IdxType<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>>
-        {
-            using type = TIdx;
-        };
-
-        //! The GPU CUDA/HIP accelerator work division grid block extent trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetWorkDiv<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>, origin::Grid, unit::Blocks>
-        {
-            //! \return The number of blocks in each dimension of the grid.
-            __device__ static auto getWorkDiv(WorkDivUniformCudaHipBuiltIn<TDim, TIdx> const& /* workDiv */)
-                -> Vec<TDim, TIdx>
-            {
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                return castVec<TIdx>(getExtentVecEnd<TDim>(gridDim));
-#        else
-                return getExtentVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                    static_cast<TIdx>(hipGridDim_z),
-                    static_cast<TIdx>(hipGridDim_y),
-                    static_cast<TIdx>(hipGridDim_x)));
-#        endif
-            }
-        };
-
-        //! The GPU CUDA/HIP accelerator work division block thread extent trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetWorkDiv<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>, origin::Block, unit::Threads>
-        {
-            //! \return The number of threads in each dimension of a block.
-            __device__ static auto getWorkDiv(WorkDivUniformCudaHipBuiltIn<TDim, TIdx> const& /* workDiv */)
-                -> Vec<TDim, TIdx>
-            {
-#        ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
-                return castVec<TIdx>(getExtentVecEnd<TDim>(blockDim));
-#        else
-                return getExtentVecEnd<TDim>(Vec<std::integral_constant<typename TDim::value_type, 3>, TIdx>(
-                    static_cast<TIdx>(hipBlockDim_z),
-                    static_cast<TIdx>(hipBlockDim_y),
-                    static_cast<TIdx>(hipBlockDim_x)));
-#        endif
-            }
-        };
-
-        //! The GPU CUDA/HIP accelerator work division thread element extent trait specialization.
-        template<typename TDim, typename TIdx>
-        struct GetWorkDiv<WorkDivUniformCudaHipBuiltIn<TDim, TIdx>, origin::Thread, unit::Elems>
-        {
-            //! \return The number of blocks in each dimension of the grid.
-            __device__ static auto getWorkDiv(WorkDivUniformCudaHipBuiltIn<TDim, TIdx> const& workDiv)
-                -> Vec<TDim, TIdx>
-            {
-                return workDiv.m_threadElemExtent;
-            }
-        };
-    } // namespace trait
-
-#    endif
-
-} // namespace alpaka
-
-#endif

From d2ae502e680602d638e6fb17e62d432525d47d2a Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Tue, 16 Dec 2025 11:37:27 +0100
Subject: [PATCH 03/33] Stop tracking external code

---
 .gitattributes | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .gitattributes

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..a195421
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+# External code
+models/ linguist-vendored

From 15289a6108432ad3d39aefdbdaa5032cfdbc4c13 Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Tue, 16 Dec 2025 12:47:53 +0100
Subject: [PATCH 04/33] Working on run.py

---
 run.py                   | 101 +++++++++++++++++++++++++++++++--------
 tests/test_transpose.cpp |  67 ++++++++++++++++++--------
 2 files changed, 127 insertions(+), 41 deletions(-)

diff --git a/run.py b/run.py
index b38f59f..8f02b8f 100644
--- a/run.py
+++ b/run.py
@@ -1,13 +1,28 @@
 import subprocess
 import sys
 import os
+import re
+
+# Configuration
+EXECUTABLE_PATHS = [
+    "./bin/test_transpose.out",
+    "./bin/test_concat.out",
+    "./bin/test_where.out",
+    "./bin/test_topk.out"
+]
+
+BENCHMARK_SIZES = [
+    512,
+    1024,
+    # 2048 # Be careful
+]
 
 def build_kernel_tests():
     """
     Calls the Makefile to build the kernel tests.
     Returns True if successful, False otherwise.
     """
-    print("Building project with Make...")
+    print("Building kernel tests with Make...")
     try:
         # Check if Makefile exists
         if not os.path.exists("Makefile"):
@@ -15,17 +30,16 @@ def build_kernel_tests():
             return False
 
         # Run 'make'.
-        # capture_output=False lets the user see the compiler output in real-time
         subprocess.run(["make"], check=True)
         
-        print("✅ Build successful\n")
+        print("Build successful\n")
         return True
         
     except subprocess.CalledProcessError:
-        print("❌ Build failed. Please fix C++ errors before running benchmarks.")
+        print("Build failed. Please fix C++ errors before running benchmarks")
         return False
     except FileNotFoundError:
-        print("❌ Error: 'make' command not found. Is it installed?")
+        print("Error: 'make' command not found. Is it installed?")
         return False
 
 def run_benchmark(executable_path, args):
@@ -33,36 +47,81 @@ def run_benchmark(executable_path, args):
     Runs the compiled executable with arguments.
     """
     if not os.path.exists(executable_path):
-        print(f"❌ Error: Executable '{executable_path}' not found after build.")
+        print(f"Error: Executable '{executable_path}' not found after build")
         return
 
-    print(f"🚀 Running {executable_path} with args: {args}...")
+    N = args[0]
+
     try:
         # Construct the command
         cmd = [executable_path] + [str(a) for a in args]
         
         # Run and capture output for parsing
         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
-        
-        print("--- Output ---")
-        print(result.stdout)
-        
-        # TODO: Add your parsing logic here (regex or string split) to get the time
+        output = result.stdout
+
+        kernel_match = re.search(r"TIME_KERNEL_MS:\s+(\d+\.?\d*)", output)
+        total_match = re.search(r"TIME_TOTAL_MS:\s+(\d+\.?\d*)", output)
+
+        if kernel_match and total_match:
+            return float(kernel_match.group(1)), float(total_match.group(1))
+        else:
+            print(f"Output parsing failed for size {N}x{N}.")
+            print("--- Raw Output ---")
+            print(output)
+            print("------------------\n")
+            return None
         
     except subprocess.CalledProcessError as e:
-        print(f"❌ Execution failed with return code {e.returncode}")
+        print(f"Execution failed with return code {e.returncode}")
         print("Stderr:", e.stderr)
 
-if __name__ == "__main__":
+def main():
     # Build Phase
     if not build_kernel_tests():
         sys.exit(1)
 
+    print("Bandwidth calculated based on kernel execution time only,")
+    print("if the result is 0 you're probably using the CPU itself as the accelerator")
+
     # Benchmark Phase
-    # Adjust this path to match where your Makefile outputs the binary
-    binary_path = "./build/alpaka_test_kernel" 
-    
-    input_sizes = [1024, 2048, 4096]
-    
-    for size in input_sizes:
-        run_benchmark(binary_path, [size])
+    for EXECUTABLE_PATH in EXECUTABLE_PATHS:
+        print(f"Benchmarking {EXECUTABLE_PATH}")
+        print(f"{'SIZE (NxN)':<12} | {'KERNEL (ms)':<12} | {'TOTAL (ms)':<12} | {'BANDWIDTH (GB/s)*':<18}")
+        print("-" * 65)
+
+        for N in BENCHMARK_SIZES:
+        
+            res = run_benchmark(EXECUTABLE_PATH, [N])
+        
+            if res:
+                k_ms, t_ms = res
+            
+                # Bandwidth Calculation (approximate)
+                # Transpose reads N*N floats and writes N*N floats
+                # Total Bytes = 2 * N * N * 4 bytes (for float32)
+                total_bytes = 0.0
+
+                if EXECUTABLE_PATH == "./bin/test_transpose.out":
+                    total_bytes = 8 * N * N
+                elif EXECUTABLE_PATH == "./bin/test_concat.out":
+                    total_bytes = 24 * N * N
+                elif EXECUTABLE_PATH == "./bin/test_where.out":
+                    total_bytes = 13 * N * N 
+                else:
+                    k = 4
+                    total_bytes = 4 * N * N + 4 * N * k
+            
+                # GB/s = (Bytes / 1e9) / (Seconds)
+                # Time is in ms, so divide by 1000.0
+                if k_ms > 0:
+                    bandwidth_gbs = (total_bytes / 1e9) / (k_ms / 1000.0)
+                else:
+                    bandwidth_gbs = 0.0
+
+                print(f"{N:<12} | {k_ms:<12.4f} | {t_ms:<12.4f} | {bandwidth_gbs:<18.2f}")
+
+        print("-" * 65)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index 8f89690..dad9fdc 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -1,4 +1,5 @@
 #include <alpaka/alpaka.hpp>
+#include <chrono>
 #include <iostream>
 #include <random>
 #include <vector>
@@ -29,7 +30,9 @@ using DevHost = alpaka::DevCpu;
 using PlatAcc = alpaka::Platform<DevAcc>;
 using PlatHost = alpaka::PlatformCpu;
 
-int main() {
+auto now() { return std::chrono::high_resolution_clock::now(); }
+
+int main(int argc, char* argv[]) {
     using namespace alpaka_kernels;
     using T = float;
 
@@ -40,11 +43,19 @@ int main() {
     std::uniform_real_distribution<float> distrib_real(-1.0f, 1.0f);
 
     // Input matrix dimensions
-    const std::size_t rows = distrib_int(gen);
-    const std::size_t cols = distrib_int(gen);
-    const std::size_t numElems = rows * cols;
+    std::size_t rows = distrib_int(gen);
+    std::size_t cols = distrib_int(gen);
+
+    if (argc >= 2) {
+        rows = std::atoi(argv[1]);
+        cols = rows;
+        std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
+    }
+    else {
+        std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
+    }
 
-    std::cout << "Input is of shape " << rows << "x" << cols << "\n";
+    const std::size_t numElems = rows * cols;
 
     std::vector<T> INPUT(numElems);
     for (auto& val : INPUT) val = distrib_real(gen);
@@ -69,17 +80,6 @@ int main() {
     auto hIn = alpaka::allocBuf<T, Idx>(devHost, extentIn);
     auto hOut = alpaka::allocBuf<T, Idx>(devHost, extentOut);
 
-    // Initial data transfer
-    // 1) INPUT -> host buffer (safe via raw pointer)
-    {
-        T* pHost = alpaka::getPtrNative(hIn);
-        for (Idx i = 0; i < numElems; ++i) pHost[i] = INPUT[i];
-    }
-
-    // 2) host -> accelerator
-    alpaka::memcpy(queue, aIn, hIn);
-    alpaka::wait(queue);
-
     // Prepare kernel arguments
     auto input_strides = alpaka::Vec<Dim, Idx>(cols, 1);
     auto output_strides = alpaka::Vec<Dim, Idx>(rows, 1);
@@ -93,20 +93,39 @@ int main() {
     const std::size_t blocksX = (cols + threadsX - 1) / threadsX;
     const std::size_t blocksY = (rows + threadsY - 1) / threadsY;
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
+        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+
+    // Initial data transfer
+    // 1) INPUT -> host buffer (safe via raw pointer)
+    {
+        T* pHost = alpaka::getPtrNative(hIn);
+        for (Idx i = 0; i < numElems; ++i) pHost[i] = INPUT[i];
+    }
+
+    auto start_total = now();
+
+    // 2) host -> accelerator
+    alpaka::memcpy(queue, aIn, hIn);
+    alpaka::wait(queue);
 
     // Launch kernel
     TransposeKernel kernel;
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), input_strides,
-                      output_strides, extentOut, perm);
+    auto start_kernel = now();
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
+                      alpaka::getPtrNative(aOut), input_strides, output_strides,
+                      extentOut, perm);
 
     alpaka::wait(queue);
+    auto end_kernel = now();
 
     // Final data transfer: accelerator -> host
     alpaka::memcpy(queue, hOut, aOut);
     alpaka::wait(queue);
+    auto end_total = now();
 
     // Print result
     std::cout << "Output is of shape " << cols << "x" << rows << "\n";
@@ -127,5 +146,13 @@ int main() {
     }
 
     std::cout << "Correct!\n";
+
+    std::chrono::duration<double, std::milli> kernel_ms =
+        end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms =
+        end_total - start_total;
+
+    std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
+    std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
     return 0;
 }

From 4fdbc1645a66b3a7044ae6e6dc9de98317d0d3ea Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Tue, 16 Dec 2025 18:39:57 +0100
Subject: [PATCH 05/33] New tests

---
 kernels/transpose.hpp    |  9 +++--
 run.py                   |  9 ++---
 tests/test_concat.cpp    | 86 ++++++++++++++++++++++++++++------------
 tests/test_topk.cpp      | 71 +++++++++++++++++++++++----------
 tests/test_transpose.cpp |  6 +--
 tests/test_where.cpp     | 70 ++++++++++++++++++++++----------
 6 files changed, 171 insertions(+), 80 deletions(-)

diff --git a/kernels/transpose.hpp b/kernels/transpose.hpp
index 4732544..edf4ea5 100644
--- a/kernels/transpose.hpp
+++ b/kernels/transpose.hpp
@@ -7,11 +7,14 @@ namespace alpaka_kernels {
 
 struct TransposeKernel {
     template <typename TAcc, typename T, typename Dim, typename Idx>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output, alpaka::Vec<Dim, Idx> input_strides,
-                                  alpaka::Vec<Dim, Idx> output_strides, alpaka::Vec<Dim, Idx> output_shape,
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output,
+                                  alpaka::Vec<Dim, Idx> input_strides,
+                                  alpaka::Vec<Dim, Idx> output_strides,
+                                  alpaka::Vec<Dim, Idx> output_shape,
                                   alpaka::Vec<Dim, Idx> perm) const {
         using DimAcc = alpaka::Dim<TAcc>;
-        static_assert(DimAcc::value == Dim::value, "Accelerator and data dimensions must match!");
+        static_assert(DimAcc::value == Dim::value,
+                      "Accelerator and data dimensions must match!");
 
         constexpr std::size_t D = Dim::value;
         auto elements = alpaka::uniformElementsND(acc, output_shape);
diff --git a/run.py b/run.py
index 8f02b8f..e12d460 100644
--- a/run.py
+++ b/run.py
@@ -67,9 +67,8 @@ def run_benchmark(executable_path, args):
             return float(kernel_match.group(1)), float(total_match.group(1))
         else:
             print(f"Output parsing failed for size {N}x{N}.")
-            print("--- Raw Output ---")
+            print("Printing raw output from the cpp executable")
             print(output)
-            print("------------------\n")
             return None
         
     except subprocess.CalledProcessError as e:
@@ -82,12 +81,12 @@ def main():
         sys.exit(1)
 
     print("Bandwidth calculated based on kernel execution time only,")
-    print("if the result is 0 you're probably using the CPU itself as the accelerator")
+    print("if the result is 0 you're probably using the CPU itself as the accelerator\n")
 
     # Benchmark Phase
     for EXECUTABLE_PATH in EXECUTABLE_PATHS:
         print(f"Benchmarking {EXECUTABLE_PATH}")
-        print(f"{'SIZE (NxN)':<12} | {'KERNEL (ms)':<12} | {'TOTAL (ms)':<12} | {'BANDWIDTH (GB/s)*':<18}")
+        print(f"{'SIZE (NxN)':<12} | {'KERNEL (ms)':<12} | {'TOTAL (ms)':<12} | {'BANDWIDTH (GB/s)':<18}")
         print("-" * 65)
 
         for N in BENCHMARK_SIZES:
@@ -121,7 +120,7 @@ def main():
 
                 print(f"{N:<12} | {k_ms:<12.4f} | {t_ms:<12.4f} | {bandwidth_gbs:<18.2f}")
 
-        print("-" * 65)
+        print("-" * 65, "\n")
 
 if __name__ == "__main__":
     main()
diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index f3bfd7e..cd86549 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -1,5 +1,6 @@
 #include <alpaka/alpaka.hpp>
 #include <array>
+#include <chrono>
 #include <iostream>
 #include <random>
 #include <vector>
@@ -32,7 +33,9 @@ using DevHost = alpaka::DevCpu;
 using PlatAcc = alpaka::Platform<DevAcc>;
 using PlatHost = alpaka::PlatformCpu;
 
-int main() {
+auto now() { return std::chrono::high_resolution_clock::now(); }
+
+int main(int argc, char* argv[]) {
     using namespace alpaka_kernels;
     using T = float;
 
@@ -43,19 +46,31 @@ int main() {
     std::uniform_real_distribution<float> distrib_real(-1.0f, 1.0f);
 
     // Input matrix dimensions
-    const std::size_t cols = distrib_int(gen);
+    std::array<std::size_t, NumInputs> in_rows;
+    std::size_t cols = distrib_int(gen);
     std::size_t total_rows = 0;
 
-    std::array<std::size_t, NumInputs> in_rows;
-    for (auto& val : in_rows) {
-        val = distrib_int(gen);
-        total_rows += val;
+    std::cout << "Number of inputs: " << NumInputs << "\n";
+
+    if (argc >= 2) {
+        std::cout << "Using input dimensions ";
+        cols = std::atoi(argv[1]);
+        for (auto& val : in_rows) {
+            val = cols;
+            total_rows += val;
+        }
+    }
+    else {
+        std::cout << "Using random dimensions ";
+        for (auto& val : in_rows) {
+            val = distrib_int(gen);
+            total_rows += val;
+        }
     }
 
-    std::cout << "Number of inputs: " << NumInputs << "\n";
-    std::cout << "Inputs are of shape: ";
     for (std::size_t k = 0; k < NumInputs; ++k)
-        std::cout << in_rows[k] << "x" << cols << ((k < NumInputs - 1) ? ", " : "\n");
+        std::cout << in_rows[k] << "x" << cols
+                  << ((k < NumInputs - 1) ? ", " : "\n");
 
     std::array<std::vector<T>, NumInputs> INPUT;
     for (std::size_t k = 0; k < NumInputs; ++k) {
@@ -68,9 +83,11 @@ int main() {
     auto devHost = alpaka::getDevByIdx(PlatHost{}, 0u);
     alpaka::Queue<Acc, alpaka::Blocking> queue{devAcc};
 
-    // Allocate buffers & initial data transfer
-    using BufAcc = decltype(alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim, Idx>{}));
-    using BufHost = decltype(alpaka::allocBuf<T, Idx>(devHost, alpaka::Vec<Dim, Idx>{}));
+    // Allocate buffers
+    using BufAcc =
+        decltype(alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim, Idx>{}));
+    using BufHost =
+        decltype(alpaka::allocBuf<T, Idx>(devHost, alpaka::Vec<Dim, Idx>{}));
 
     std::vector<BufAcc> aIn_bufs;
     aIn_bufs.reserve(NumInputs);
@@ -92,21 +109,16 @@ int main() {
         // 2) Host input buffers
         hIn_bufs.push_back(alpaka::allocBuf<T, Idx>(devHost, extentIn));
 
-        // Initial data transfer
-        // 1) INPUT -> host buffer (safe via raw pointers)
+        // INPUT to host buffer data transfer (safe via raw pointers)
         T* pHost = alpaka::getPtrNative(hIn_bufs.back());
-        for (std::size_t i = 0; i < INPUT[k].size(); ++i) pHost[i] = INPUT[k][i];
-
-        // 2) host -> accelerator
-        alpaka::memcpy(queue, aIn_bufs.back(), hIn_bufs.back());
+        for (std::size_t i = 0; i < INPUT[k].size(); ++i)
+            pHost[i] = INPUT[k][i];
     }
 
     // Allocate output buffers
     auto aOut = alpaka::allocBuf<T, Idx>(devAcc, extentOut);
     auto hOut = alpaka::allocBuf<T, Idx>(devHost, extentOut);
 
-    alpaka::wait(queue);
-
     // Prepare kernel arguments
     std::array<T const*, NumInputs> aIn_ptrs;
     for (std::size_t k = 0; k < NumInputs; ++k) {
@@ -130,26 +142,42 @@ int main() {
     const std::size_t blocksX = (out_rows + threadsX - 1) / threadsX;
     const std::size_t blocksY = (out_cols + threadsY - 1) / threadsY;
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
+        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+
+    // Host to accelerator data transfer
+    auto start_total = now();
+
+    for (std::size_t k = 0; k < NumInputs; ++k) {
+        alpaka::memcpy(queue, aIn_bufs[k], hIn_bufs[k]);
+    }
+
+    alpaka::wait(queue);
 
     // Launch kernel
     ConcatKernel kernel;
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs, alpaka::getPtrNative(aOut), input_strides_vec, output_strides,
-                      extentOut, axis_sizes, ConcatAxis);
+    auto start_kernel = now();
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs,
+                      alpaka::getPtrNative(aOut), input_strides_vec,
+                      output_strides, extentOut, axis_sizes, ConcatAxis);
 
     alpaka::wait(queue);
+    auto end_kernel = now();
 
     // Final data transfer: accelerator -> host
     alpaka::memcpy(queue, hOut, aOut);
     alpaka::wait(queue);
+    auto end_total = now();
 
     // Print result
     std::cout << "Output is of shape " << out_rows << "x" << out_cols << "\n";
 
     std::vector<T> expected;
-    for (const auto& vec : INPUT) expected.insert(expected.end(), vec.begin(), vec.end());
+    for (const auto& vec : INPUT)
+        expected.insert(expected.end(), vec.begin(), vec.end());
 
     {
         T* pHost = alpaka::getPtrNative(hOut);
@@ -162,5 +190,13 @@ int main() {
     }
 
     std::cout << "Correct!\n";
+
+    std::chrono::duration<double, std::milli> kernel_ms =
+        end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms =
+        end_total - start_total;
+
+    std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
+    std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
     return 0;
 }
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index b92a076..2c247f1 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -1,5 +1,6 @@
 #include <alpaka/alpaka.hpp>
 #include <array>
+#include <chrono>
 #include <iostream>
 #include <random>
 #include <vector>
@@ -33,7 +34,9 @@ using DevHost = alpaka::DevCpu;
 using PlatAcc = alpaka::Platform<DevAcc>;
 using PlatHost = alpaka::PlatformCpu;
 
-int main() {
+auto now() { return std::chrono::high_resolution_clock::now(); }
+
+int main(int argc, char* argv[]) {
     using namespace alpaka_kernels;
     using T = float;
 
@@ -44,11 +47,19 @@ int main() {
     std::uniform_real_distribution<float> distrib_real(-1.0f, 1.0f);
 
     // Input matrix dimensions
-    const std::size_t rows = distrib_int(gen);
-    const std::size_t cols = distrib_int(gen);
-    const std::size_t numElems = rows * cols;
+    std::size_t rows = distrib_int(gen);
+    std::size_t cols = distrib_int(gen);
+
+    if (argc >= 2) {
+        rows = std::atoi(argv[1]);
+        cols = rows;
+        std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
+    }
+    else {
+        std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
+    }
 
-    std::cout << "Input is of shape " << rows << "x" << cols << "\n";
+    const std::size_t numElems = rows * cols;
 
     std::vector<T> INPUT(numElems);
     for (auto& val : INPUT) val = distrib_real(gen);
@@ -70,17 +81,6 @@ int main() {
     auto hIn = alpaka::allocBuf<T, Idx>(devHost, extentIn);
     auto hOut = alpaka::allocBuf<T, Idx>(devHost, extentOut);
 
-    // Initial data transfer
-    // 1) INPUT -> host buffer (safe via raw pointer)
-    {
-        T* pHost = alpaka::getPtrNative(hIn);
-        for (Idx i = 0; i < numElems; ++i) pHost[i] = INPUT[i];
-    }
-
-    // 2) host -> accelerator
-    alpaka::memcpy(queue, aIn, hIn);
-    alpaka::wait(queue);
-
     // Prepare kernel arguments
     T const padding_value = -1.0;
     auto input_strides = alpaka::Vec<Dim, Idx>(cols, 1);
@@ -98,25 +98,46 @@ int main() {
         if (d == TopkAxis) {
             threadsPerBlock[d] = 1;
             blocksPerGrid[d] = 1;
-        } else {
+        }
+        else {
             threadsPerBlock[d] = TARGET_BLOCK_SIZE;
-            blocksPerGrid[d] = (grid_elements[d] + threadsPerBlock[d] - 1) / threadsPerBlock[d];
+            blocksPerGrid[d] = (grid_elements[d] + threadsPerBlock[d] - 1) /
+                               threadsPerBlock[d];
         }
     }
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{blocksPerGrid, threadsPerBlock, grid_elements};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
+        blocksPerGrid, threadsPerBlock, grid_elements};
+
+    // Initial data transfer
+    // 1) INPUT -> host buffer (safe via raw pointer)
+    {
+        T* pHost = alpaka::getPtrNative(hIn);
+        for (Idx i = 0; i < numElems; ++i) pHost[i] = INPUT[i];
+    }
+
+    // 2) host -> accelerator
+    auto start_total = now();
+    alpaka::memcpy(queue, aIn, hIn);
+    alpaka::wait(queue);
 
     // Launch kernel
     TopKKernel<K, MaxRegisters> kernel;
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), input_strides,
-                      output_strides, grid_elements, TopkAxis, extentIn[TopkAxis], padding_value);
+    auto start_kernel = now();
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
+                      alpaka::getPtrNative(aOut), input_strides, output_strides,
+                      grid_elements, TopkAxis, extentIn[TopkAxis],
+                      padding_value);
 
     alpaka::wait(queue);
+    auto end_kernel = now();
 
     // Final data transfer: accelerator -> host
     alpaka::memcpy(queue, hOut, aOut);
     alpaka::wait(queue);
+    auto end_total = now();
 
     // Print result
     std::cout << "Output is of shape " << rows << "x" << K << "\n";
@@ -161,5 +182,13 @@ int main() {
     }
 
     std::cout << "Correct!\n";
+
+    std::chrono::duration<double, std::milli> kernel_ms =
+        end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms =
+        end_total - start_total;
+
+    std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
+    std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
     return 0;
 }
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index dad9fdc..27f583c 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -74,9 +74,6 @@ int main(int argc, char* argv[]) {
     auto aOut = alpaka::allocBuf<T, Idx>(devAcc, extentOut);
 
     // 2) Host buffers
-    // Note that host and accelerator may coincide when using CPU backend,
-    // still it's better to allocate buffers separately for portability and
-    // because this ensures memory is pinned and not paged
     auto hIn = alpaka::allocBuf<T, Idx>(devHost, extentIn);
     auto hOut = alpaka::allocBuf<T, Idx>(devHost, extentOut);
 
@@ -104,9 +101,8 @@ int main(int argc, char* argv[]) {
         for (Idx i = 0; i < numElems; ++i) pHost[i] = INPUT[i];
     }
 
-    auto start_total = now();
-
     // 2) host -> accelerator
+    auto start_total = now();
     alpaka::memcpy(queue, aIn, hIn);
     alpaka::wait(queue);
 
diff --git a/tests/test_where.cpp b/tests/test_where.cpp
index 3ec5feb..af080b6 100644
--- a/tests/test_where.cpp
+++ b/tests/test_where.cpp
@@ -1,4 +1,5 @@
 #include <alpaka/alpaka.hpp>
+#include <chrono>
 #include <iostream>
 #include <random>
 #include <vector>
@@ -29,7 +30,9 @@ using DevHost = alpaka::DevCpu;
 using PlatAcc = alpaka::Platform<DevAcc>;
 using PlatHost = alpaka::PlatformCpu;
 
-int main() {
+auto now() { return std::chrono::high_resolution_clock::now(); }
+
+int main(int argc, char* argv[]) {
     using namespace alpaka_kernels;
     using T = float;
     using TCond = bool;
@@ -42,18 +45,27 @@ int main() {
     std::bernoulli_distribution distrib_bool(0.5);
 
     // Input matrix dimensions
-    const std::size_t rows = distrib_int(gen);
-    const std::size_t cols = distrib_int(gen);
-    const std::size_t numElems = rows * cols;
+    std::size_t rows = distrib_int(gen);
+    std::size_t cols = distrib_int(gen);
 
-    std::cout << "Inputs are of shape " << rows << "x" << cols << "\n";
+    if (argc >= 2) {
+        rows = std::atoi(argv[1]);
+        cols = rows;
+        std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
+    }
+    else {
+        std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
+    }
+
+    const std::size_t numElems = rows * cols;
 
     std::vector<T> INPUT_X(numElems), INPUT_Y(numElems);
     std::vector<TCond> INPUT_COND(numElems);
 
     for (auto& val : INPUT_X) val = distrib_real(gen) * 100.0;
     for (auto& val : INPUT_Y) val = distrib_real(gen);
-    for (std::size_t i = 0; i < numElems; ++i) INPUT_COND[i] = distrib_bool(gen);
+    for (std::size_t i = 0; i < numElems; ++i)
+        INPUT_COND[i] = distrib_bool(gen);
 
     // Setup the accelerator, host and queue
     auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0u);
@@ -75,6 +87,18 @@ int main() {
     auto hIn_Cond = alpaka::allocBuf<T, Idx>(devHost, extent);
     auto hOut = alpaka::allocBuf<T, Idx>(devHost, extent);
 
+    // Prepare kernel arguments
+    auto strides = alpaka::Vec<Dim, Idx>(cols, 1);
+
+    // Work division: 2D mapping of threads to elements
+    const std::size_t threadsX = 16, threadsY = 16;
+    const std::size_t blocksX = (cols + threadsX - 1) / threadsX;
+    const std::size_t blocksY = (rows + threadsY - 1) / threadsY;
+
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
+        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
+
     // Initial data transfer
     // 1) INPUT -> host buffer (safe via raw pointer)
     {
@@ -89,34 +113,29 @@ int main() {
     }
 
     // 2) host -> accelerator
+    auto start_total = now();
     alpaka::memcpy(queue, aIn_X, hIn_X);
     alpaka::memcpy(queue, aIn_Y, hIn_Y);
     alpaka::memcpy(queue, aIn_Cond, hIn_Cond);
     alpaka::wait(queue);
 
-    // Prepare kernel arguments
-    auto strides = alpaka::Vec<Dim, Idx>(cols, 1);
-
-    // Work division: 2D mapping of threads to elements
-    const std::size_t threadsX = 16, threadsY = 16;
-    const std::size_t blocksX = (cols + threadsX - 1) / threadsX;
-    const std::size_t blocksY = (rows + threadsY - 1) / threadsY;
-
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
-
     // Launch kernel
     WhereKernel kernel;
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond), alpaka::getPtrNative(aIn_X),
-                      alpaka::getPtrNative(aIn_Y), alpaka::getPtrNative(aOut), strides, strides, strides, strides,
-                      extent);
+    auto start_kernel = now();
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond),
+                      alpaka::getPtrNative(aIn_X), alpaka::getPtrNative(aIn_Y),
+                      alpaka::getPtrNative(aOut), strides, strides, strides,
+                      strides, extent);
 
     alpaka::wait(queue);
+    auto end_kernel = now();
 
     // Final data transfer: accelerator -> host
     alpaka::memcpy(queue, hOut, aOut);
     alpaka::wait(queue);
+    auto end_total = now();
 
     // Print result
     std::cout << "Output is of shape " << rows << "x" << cols << "\n";
@@ -126,7 +145,8 @@ int main() {
         for (std::size_t i = 0; i < rows; ++i) {
             for (std::size_t j = 0; j < cols; ++j) {
                 T valOut = pHost[i * cols + j];
-                T valIn = INPUT_COND[i * cols + j] ? INPUT_X[i * cols + j] : INPUT_Y[i * cols + j];
+                T valIn = INPUT_COND[i * cols + j] ? INPUT_X[i * cols + j]
+                                                   : INPUT_Y[i * cols + j];
 
                 if (valIn != valOut) {
                     std::cerr << "Failed!\n";
@@ -137,5 +157,13 @@ int main() {
     }
 
     std::cout << "Correct!\n";
+
+    std::chrono::duration<double, std::milli> kernel_ms =
+        end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms =
+        end_total - start_total;
+
+    std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
+    std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
     return 0;
 }

From 5b7c40cac0e009c22ce57e7af2a48148a49ea435 Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Tue, 16 Dec 2025 18:41:03 +0100
Subject: [PATCH 06/33] removed requirements

---
 requirements.txt | 8 --------
 1 file changed, 8 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 916ec8d..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-h5py==3.15.1
-jupyter==1.1.1
-keras==3.12.0
-matplotlib==3.10.8
-numpy==2.3.5
-pandas==2.3.3
-pre-commit==4.5.0
-scikit-learn==1.8.0

From 2f5ff5f9167dc322d2ce7ad7acd074ef3c540c84 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 16 Dec 2025 17:41:54 +0000
Subject: [PATCH 07/33] style: pre-commit fixes

---
 kernels/transpose.hpp    |  9 +++------
 run.py                   | 18 +++++++++---------
 tests/test_concat.cpp    | 34 ++++++++++++----------------------
 tests/test_topk.cpp      | 24 ++++++++----------------
 tests/test_transpose.cpp | 19 +++++++------------
 tests/test_where.cpp     | 27 ++++++++++-----------------
 6 files changed, 49 insertions(+), 82 deletions(-)

diff --git a/kernels/transpose.hpp b/kernels/transpose.hpp
index edf4ea5..4732544 100644
--- a/kernels/transpose.hpp
+++ b/kernels/transpose.hpp
@@ -7,14 +7,11 @@ namespace alpaka_kernels {
 
 struct TransposeKernel {
     template <typename TAcc, typename T, typename Dim, typename Idx>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output,
-                                  alpaka::Vec<Dim, Idx> input_strides,
-                                  alpaka::Vec<Dim, Idx> output_strides,
-                                  alpaka::Vec<Dim, Idx> output_shape,
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output, alpaka::Vec<Dim, Idx> input_strides,
+                                  alpaka::Vec<Dim, Idx> output_strides, alpaka::Vec<Dim, Idx> output_shape,
                                   alpaka::Vec<Dim, Idx> perm) const {
         using DimAcc = alpaka::Dim<TAcc>;
-        static_assert(DimAcc::value == Dim::value,
-                      "Accelerator and data dimensions must match!");
+        static_assert(DimAcc::value == Dim::value, "Accelerator and data dimensions must match!");
 
         constexpr std::size_t D = Dim::value;
         auto elements = alpaka::uniformElementsND(acc, output_shape);
diff --git a/run.py b/run.py
index e12d460..225f7d1 100644
--- a/run.py
+++ b/run.py
@@ -31,10 +31,10 @@ def build_kernel_tests():
 
         # Run 'make'.
         subprocess.run(["make"], check=True)
-        
+
         print("Build successful\n")
         return True
-        
+
     except subprocess.CalledProcessError:
         print("Build failed. Please fix C++ errors before running benchmarks")
         return False
@@ -55,7 +55,7 @@ def run_benchmark(executable_path, args):
     try:
         # Construct the command
         cmd = [executable_path] + [str(a) for a in args]
-        
+
         # Run and capture output for parsing
         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
         output = result.stdout
@@ -70,7 +70,7 @@ def run_benchmark(executable_path, args):
             print("Printing raw output from the cpp executable")
             print(output)
             return None
-        
+
     except subprocess.CalledProcessError as e:
         print(f"Execution failed with return code {e.returncode}")
         print("Stderr:", e.stderr)
@@ -90,12 +90,12 @@ def main():
         print("-" * 65)
 
         for N in BENCHMARK_SIZES:
-        
+
             res = run_benchmark(EXECUTABLE_PATH, [N])
-        
+
             if res:
                 k_ms, t_ms = res
-            
+
                 # Bandwidth Calculation (approximate)
                 # Transpose reads N*N floats and writes N*N floats
                 # Total Bytes = 2 * N * N * 4 bytes (for float32)
@@ -106,11 +106,11 @@ def main():
                 elif EXECUTABLE_PATH == "./bin/test_concat.out":
                     total_bytes = 24 * N * N
                 elif EXECUTABLE_PATH == "./bin/test_where.out":
-                    total_bytes = 13 * N * N 
+                    total_bytes = 13 * N * N
                 else:
                     k = 4
                     total_bytes = 4 * N * N + 4 * N * k
-            
+
                 # GB/s = (Bytes / 1e9) / (Seconds)
                 # Time is in ms, so divide by 1000.0
                 if k_ms > 0:
diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index cd86549..bfffd29 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -59,8 +59,7 @@ int main(int argc, char* argv[]) {
             val = cols;
             total_rows += val;
         }
-    }
-    else {
+    } else {
         std::cout << "Using random dimensions ";
         for (auto& val : in_rows) {
             val = distrib_int(gen);
@@ -69,8 +68,7 @@ int main(int argc, char* argv[]) {
     }
 
     for (std::size_t k = 0; k < NumInputs; ++k)
-        std::cout << in_rows[k] << "x" << cols
-                  << ((k < NumInputs - 1) ? ", " : "\n");
+        std::cout << in_rows[k] << "x" << cols << ((k < NumInputs - 1) ? ", " : "\n");
 
     std::array<std::vector<T>, NumInputs> INPUT;
     for (std::size_t k = 0; k < NumInputs; ++k) {
@@ -84,10 +82,8 @@ int main(int argc, char* argv[]) {
     alpaka::Queue<Acc, alpaka::Blocking> queue{devAcc};
 
     // Allocate buffers
-    using BufAcc =
-        decltype(alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim, Idx>{}));
-    using BufHost =
-        decltype(alpaka::allocBuf<T, Idx>(devHost, alpaka::Vec<Dim, Idx>{}));
+    using BufAcc = decltype(alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim, Idx>{}));
+    using BufHost = decltype(alpaka::allocBuf<T, Idx>(devHost, alpaka::Vec<Dim, Idx>{}));
 
     std::vector<BufAcc> aIn_bufs;
     aIn_bufs.reserve(NumInputs);
@@ -111,8 +107,7 @@ int main(int argc, char* argv[]) {
 
         // INPUT to host buffer data transfer (safe via raw pointers)
         T* pHost = alpaka::getPtrNative(hIn_bufs.back());
-        for (std::size_t i = 0; i < INPUT[k].size(); ++i)
-            pHost[i] = INPUT[k][i];
+        for (std::size_t i = 0; i < INPUT[k].size(); ++i) pHost[i] = INPUT[k][i];
     }
 
     // Allocate output buffers
@@ -142,9 +137,8 @@ int main(int argc, char* argv[]) {
     const std::size_t blocksX = (out_rows + threadsX - 1) / threadsX;
     const std::size_t blocksY = (out_cols + threadsY - 1) / threadsY;
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
-        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
 
     // Host to accelerator data transfer
     auto start_total = now();
@@ -160,9 +154,8 @@ int main(int argc, char* argv[]) {
 
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs,
-                      alpaka::getPtrNative(aOut), input_strides_vec,
-                      output_strides, extentOut, axis_sizes, ConcatAxis);
+    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs, alpaka::getPtrNative(aOut), input_strides_vec, output_strides,
+                      extentOut, axis_sizes, ConcatAxis);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -176,8 +169,7 @@ int main(int argc, char* argv[]) {
     std::cout << "Output is of shape " << out_rows << "x" << out_cols << "\n";
 
     std::vector<T> expected;
-    for (const auto& vec : INPUT)
-        expected.insert(expected.end(), vec.begin(), vec.end());
+    for (const auto& vec : INPUT) expected.insert(expected.end(), vec.begin(), vec.end());
 
     {
         T* pHost = alpaka::getPtrNative(hOut);
@@ -191,10 +183,8 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms =
-        end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms =
-        end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 2c247f1..5de7f38 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -54,8 +54,7 @@ int main(int argc, char* argv[]) {
         rows = std::atoi(argv[1]);
         cols = rows;
         std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
-    }
-    else {
+    } else {
         std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
     }
 
@@ -98,16 +97,13 @@ int main(int argc, char* argv[]) {
         if (d == TopkAxis) {
             threadsPerBlock[d] = 1;
             blocksPerGrid[d] = 1;
-        }
-        else {
+        } else {
             threadsPerBlock[d] = TARGET_BLOCK_SIZE;
-            blocksPerGrid[d] = (grid_elements[d] + threadsPerBlock[d] - 1) /
-                               threadsPerBlock[d];
+            blocksPerGrid[d] = (grid_elements[d] + threadsPerBlock[d] - 1) / threadsPerBlock[d];
         }
     }
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
-        blocksPerGrid, threadsPerBlock, grid_elements};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{blocksPerGrid, threadsPerBlock, grid_elements};
 
     // Initial data transfer
     // 1) INPUT -> host buffer (safe via raw pointer)
@@ -126,10 +122,8 @@ int main(int argc, char* argv[]) {
 
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
-                      alpaka::getPtrNative(aOut), input_strides, output_strides,
-                      grid_elements, TopkAxis, extentIn[TopkAxis],
-                      padding_value);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), input_strides,
+                      output_strides, grid_elements, TopkAxis, extentIn[TopkAxis], padding_value);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -183,10 +177,8 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms =
-        end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms =
-        end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index 27f583c..7416272 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -50,8 +50,7 @@ int main(int argc, char* argv[]) {
         rows = std::atoi(argv[1]);
         cols = rows;
         std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
-    }
-    else {
+    } else {
         std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
     }
 
@@ -90,9 +89,8 @@ int main(int argc, char* argv[]) {
     const std::size_t blocksX = (cols + threadsX - 1) / threadsX;
     const std::size_t blocksY = (rows + threadsY - 1) / threadsY;
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
-        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
 
     // Initial data transfer
     // 1) INPUT -> host buffer (safe via raw pointer)
@@ -111,9 +109,8 @@ int main(int argc, char* argv[]) {
 
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
-                      alpaka::getPtrNative(aOut), input_strides, output_strides,
-                      extentOut, perm);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), input_strides,
+                      output_strides, extentOut, perm);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -143,10 +140,8 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms =
-        end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms =
-        end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
diff --git a/tests/test_where.cpp b/tests/test_where.cpp
index af080b6..d97356a 100644
--- a/tests/test_where.cpp
+++ b/tests/test_where.cpp
@@ -52,8 +52,7 @@ int main(int argc, char* argv[]) {
         rows = std::atoi(argv[1]);
         cols = rows;
         std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
-    }
-    else {
+    } else {
         std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
     }
 
@@ -64,8 +63,7 @@ int main(int argc, char* argv[]) {
 
     for (auto& val : INPUT_X) val = distrib_real(gen) * 100.0;
     for (auto& val : INPUT_Y) val = distrib_real(gen);
-    for (std::size_t i = 0; i < numElems; ++i)
-        INPUT_COND[i] = distrib_bool(gen);
+    for (std::size_t i = 0; i < numElems; ++i) INPUT_COND[i] = distrib_bool(gen);
 
     // Setup the accelerator, host and queue
     auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0u);
@@ -95,9 +93,8 @@ int main(int argc, char* argv[]) {
     const std::size_t blocksX = (cols + threadsX - 1) / threadsX;
     const std::size_t blocksY = (rows + threadsY - 1) / threadsY;
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
-        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
 
     // Initial data transfer
     // 1) INPUT -> host buffer (safe via raw pointer)
@@ -124,10 +121,9 @@ int main(int argc, char* argv[]) {
 
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond),
-                      alpaka::getPtrNative(aIn_X), alpaka::getPtrNative(aIn_Y),
-                      alpaka::getPtrNative(aOut), strides, strides, strides,
-                      strides, extent);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond), alpaka::getPtrNative(aIn_X),
+                      alpaka::getPtrNative(aIn_Y), alpaka::getPtrNative(aOut), strides, strides, strides, strides,
+                      extent);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -145,8 +141,7 @@ int main(int argc, char* argv[]) {
         for (std::size_t i = 0; i < rows; ++i) {
             for (std::size_t j = 0; j < cols; ++j) {
                 T valOut = pHost[i * cols + j];
-                T valIn = INPUT_COND[i * cols + j] ? INPUT_X[i * cols + j]
-                                                   : INPUT_Y[i * cols + j];
+                T valIn = INPUT_COND[i * cols + j] ? INPUT_X[i * cols + j] : INPUT_Y[i * cols + j];
 
                 if (valIn != valOut) {
                     std::cerr << "Failed!\n";
@@ -158,10 +153,8 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms =
-        end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms =
-        end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;

From 7327a74ab36cab3fa7221103e31cb5af9fde2250 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 01:54:21 +0530
Subject: [PATCH 08/33] transpose working on GPU!!

---
 .gitignore               |  1 +
 CMakeLists.txt           | 39 --------------------------------
 README.md                | 13 +++--------
 tests/test_transpose.cpp | 48 +++++++++++++++++++++++++++-------------
 4 files changed, 37 insertions(+), 64 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6993547..3a5d118 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
 .bin/
 .DS_Store
 build/
+.vscode/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8830fc9..0aa6202 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,6 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
 # --- User-configurable options ---
 set(CUDA_BASE "/usr/local/cuda-13.1" CACHE PATH "CUDA base path")
-set(TBB_BASE "/usr" CACHE PATH "TBB base path")
 set(ALPAKA_BASE "external/alpaka" CACHE PATH "Alpaka base path")
 
 # --- Compiler flags ---
@@ -46,14 +45,6 @@ target_include_directories(test_transpose PRIVATE
 
 target_link_directories(test_transpose PRIVATE ${CUDA_BASE}/lib64)
 
-target_link_libraries(test_transpose
-    PRIVATE
-        cublas
-        cublasLt
-        cudart
-        nvidia-ml
-)
-
 add_executable(test_concat tests/test_concat.cpp)
 set_source_files_properties(tests/test_concat.cpp PROPERTIES LANGUAGE CUDA)
 enable_language(CUDA)
@@ -81,14 +72,6 @@ target_include_directories(test_concat PRIVATE
 
 target_link_directories(test_concat PRIVATE ${CUDA_BASE}/lib64)
 
-target_link_libraries(test_concat
-    PRIVATE
-        cublas
-        cublasLt
-        cudart
-        nvidia-ml
-)
-
 add_executable(test_where tests/test_where.cpp)
 set_source_files_properties(tests/test_where.cpp PROPERTIES LANGUAGE CUDA)
 enable_language(CUDA)
@@ -116,14 +99,6 @@ target_include_directories(test_where PRIVATE
 
 target_link_directories(test_where PRIVATE ${CUDA_BASE}/lib64)
 
-target_link_libraries(test_where
-    PRIVATE
-        cublas
-        cublasLt
-        cudart
-        nvidia-ml
-)
-
 add_executable(test_topk tests/test_topk.cpp)
 set_source_files_properties(tests/test_topk.cpp PROPERTIES LANGUAGE CUDA)
 enable_language(CUDA)
@@ -150,17 +125,3 @@ target_include_directories(test_topk PRIVATE
 )
 
 target_link_directories(test_topk PRIVATE ${CUDA_BASE}/lib64)
-
-target_link_libraries(test_topk
-    PRIVATE
-        cublas
-        cublasLt
-        cudart
-        nvidia-ml
-)
-
-# Optional clean
-add_custom_target(clean-all
-  COMMAND ${CMAKE_COMMAND} -E rm -f test_transpose *.d *.o *.so
-  COMMENT "Cleaning all generated files"
-)
diff --git a/README.md b/README.md
index 9b510cc..fcc10b8 100644
--- a/README.md
+++ b/README.md
@@ -47,19 +47,12 @@ cmake --build build
 
 where the following flags can be configured by the user:
 - `CUDA_BASE` (default: "/usr/local/cuda-13.1"): CUDA base path
-- `TBB_BASE` (default: "/usr"): TBB base path
 - `ALPAKA_BASE` (default: "external/alpaka"): Alpaka base path
 - `CUDA_ARCH` (default: "sm_75"): CUDA architecture
 - `CMAKE_CUDA_COMPILER` (default: "/usr/local/cuda-13.1/bin/nvcc"): Cuda compiler path
 
-### Running integration tests on an NVIDIA GPU
-
-To run SOFIE integration tests:
+To run the tests, simply execute `test_*` executables produced in `build/`.
 
-```
-cd tests/sofie_integration
-cmake -S. -Bbuild
-cmake --build build
-```
+### Running integration tests on an NVIDIA GPU
 
-with the same configurable flags listed in the section above.
+TODO
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index 8f89690..109de94 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -12,13 +12,13 @@ using Idx = std::size_t;
 
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
 using DevAcc = alpaka::DevCudaRt;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
 using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::NonBlocking>;
 
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 using DevAcc = alpaka::DevCpu;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
 using Acc = alpaka::AccCpuThreads<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
 
 #else
 #error Please define a single one of ALPAKA_ACC_GPU_CUDA_ENABLED, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
@@ -49,12 +49,12 @@ int main() {
     std::vector<T> INPUT(numElems);
     for (auto& val : INPUT) val = distrib_real(gen);
 
-    // Setup the accelerator, host and queue
-    auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0u);
-    auto devHost = alpaka::getDevByIdx(PlatHost{}, 0u);
-    alpaka::Queue<Acc, alpaka::Blocking> queue{devAcc};
+    // Setup devices and queue
+    auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0);
+    auto devHost = alpaka::getDevByIdx(PlatHost{}, 0);
+    QueueAcc queue{devAcc};
 
-    // Allocate buffers
+    // Create extents
     auto extentIn = alpaka::Vec<Dim, Idx>(rows, cols);
     auto extentOut = alpaka::Vec<Dim, Idx>(cols, rows);
 
@@ -77,8 +77,18 @@ int main() {
     }
 
     // 2) host -> accelerator
-    alpaka::memcpy(queue, aIn, hIn);
-    alpaka::wait(queue);
+    {
+        T* pAIn = alpaka::getPtrNative(aIn);
+        T* pHIn = alpaka::getPtrNative(hIn);
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        // For GPU, use cudaMemcpy directly
+        cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
+#else
+        // For CPU, use memcpy
+        std::memcpy(pAIn, pHIn, numElems * sizeof(T));
+#endif
+    }
 
     // Prepare kernel arguments
     auto input_strides = alpaka::Vec<Dim, Idx>(cols, 1);
@@ -93,8 +103,8 @@ int main() {
     const std::size_t blocksX = (cols + threadsX - 1) / threadsX;
     const std::size_t blocksY = (rows + threadsY - 1) / threadsY;
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksY, blocksX),
+                                                          alpaka::Vec<Dim, Idx>(threadsY, threadsX), extentOut};
 
     // Launch kernel
     TransposeKernel kernel;
@@ -104,11 +114,19 @@ int main() {
 
     alpaka::wait(queue);
 
-    // Final data transfer: accelerator -> host
-    alpaka::memcpy(queue, hOut, aOut);
-    alpaka::wait(queue);
+    // Copy device -> host using 1D memcpy
+    {
+        T* pAOut = alpaka::getPtrNative(aOut);
+        T* pHOut = alpaka::getPtrNative(hOut);
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
+#else
+        std::memcpy(pHOut, pAOut, numElems * sizeof(T));
+#endif
+    }
 
-    // Print result
+    // Check results
     std::cout << "Output is of shape " << cols << "x" << rows << "\n";
 
     {

From 3793b0cdf534a78477ae361d2e5e236aa512b1df Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 03:07:31 +0530
Subject: [PATCH 09/33] fix: get concat tests to work on GPU

---
 tests/test_concat.cpp | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index f3bfd7e..2ab5baf 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -15,13 +15,13 @@ using Idx = std::size_t;
 
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
 using DevAcc = alpaka::DevCudaRt;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
 using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::NonBlocking>;
 
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 using DevAcc = alpaka::DevCpu;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
 using Acc = alpaka::AccCpuThreads<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
 
 #else
 #error Please define a single one of ALPAKA_ACC_GPU_CUDA_ENABLED, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
@@ -63,10 +63,12 @@ int main() {
         for (auto& val : INPUT[k]) val = distrib_real(gen);
     }
 
+    const std::size_t numElems = total_rows * cols;
+
     // Setup the accelerator, host and queue
     auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0u);
     auto devHost = alpaka::getDevByIdx(PlatHost{}, 0u);
-    alpaka::Queue<Acc, alpaka::Blocking> queue{devAcc};
+    QueueAcc queue{devAcc};
 
     // Allocate buffers & initial data transfer
     using BufAcc = decltype(alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim, Idx>{}));
@@ -98,15 +100,22 @@ int main() {
         for (std::size_t i = 0; i < INPUT[k].size(); ++i) pHost[i] = INPUT[k][i];
 
         // 2) host -> accelerator
-        alpaka::memcpy(queue, aIn_bufs.back(), hIn_bufs.back());
+        T* pAIn = alpaka::getPtrNative(aIn_bufs.back());
+        T* pHIn = alpaka::getPtrNative(hIn_bufs.back());
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        // For GPU, use cudaMemcpy directly
+        cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
+#else
+        // For CPU, use memcpy
+        std::memcpy(pAIn, pHIn, numElems * sizeof(T));
+#endif
     }
 
     // Allocate output buffers
     auto aOut = alpaka::allocBuf<T, Idx>(devAcc, extentOut);
     auto hOut = alpaka::allocBuf<T, Idx>(devHost, extentOut);
 
-    alpaka::wait(queue);
-
     // Prepare kernel arguments
     std::array<T const*, NumInputs> aIn_ptrs;
     for (std::size_t k = 0; k < NumInputs; ++k) {
@@ -130,8 +139,8 @@ int main() {
     const std::size_t blocksX = (out_rows + threadsX - 1) / threadsX;
     const std::size_t blocksY = (out_cols + threadsY - 1) / threadsY;
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksY, blocksX),
+                                                          alpaka::Vec<Dim, Idx>(threadsY, threadsX), extentOut};
 
     // Launch kernel
     ConcatKernel kernel;
@@ -142,8 +151,14 @@ int main() {
     alpaka::wait(queue);
 
     // Final data transfer: accelerator -> host
-    alpaka::memcpy(queue, hOut, aOut);
-    alpaka::wait(queue);
+    T* pAOut = alpaka::getPtrNative(aOut);
+    T* pHOut = alpaka::getPtrNative(hOut);
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+    cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
+#else
+    std::memcpy(pHOut, pAOut, numElems * sizeof(T));
+#endif
 
     // Print result
     std::cout << "Output is of shape " << out_rows << "x" << out_cols << "\n";

From ea83974f294460de717dab562ade7d350b8bece9 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 03:16:29 +0530
Subject: [PATCH 10/33] fix: get topk tests to work on CUDA

---
 tests/test_topk.cpp | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index b92a076..e0c0eee 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -16,13 +16,13 @@ using Idx = std::size_t;
 
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
 using DevAcc = alpaka::DevCudaRt;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
 using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::NonBlocking>;
 
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 using DevAcc = alpaka::DevCpu;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
 using Acc = alpaka::AccCpuThreads<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
 
 #else
 #error Please define a single one of ALPAKA_ACC_GPU_CUDA_ENABLED, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
@@ -56,7 +56,7 @@ int main() {
     // Setup the accelerator, host and queue
     auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0u);
     auto devHost = alpaka::getDevByIdx(PlatHost{}, 0u);
-    alpaka::Queue<Acc, alpaka::Blocking> queue{devAcc};
+    QueueAcc queue{devAcc};
 
     // Allocate buffers
     auto extentIn = alpaka::Vec<Dim, Idx>(rows, cols);
@@ -78,8 +78,18 @@ int main() {
     }
 
     // 2) host -> accelerator
-    alpaka::memcpy(queue, aIn, hIn);
-    alpaka::wait(queue);
+    {
+        T* pAIn = alpaka::getPtrNative(aIn);
+        T* pHIn = alpaka::getPtrNative(hIn);
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        // For GPU, use cudaMemcpy directly
+        cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
+#else
+        // For CPU, use memcpy
+        std::memcpy(pAIn, pHIn, numElems * sizeof(T));
+#endif
+    }
 
     // Prepare kernel arguments
     T const padding_value = -1.0;
@@ -115,8 +125,16 @@ int main() {
     alpaka::wait(queue);
 
     // Final data transfer: accelerator -> host
-    alpaka::memcpy(queue, hOut, aOut);
-    alpaka::wait(queue);
+    {
+        T* pAOut = alpaka::getPtrNative(aOut);
+        T* pHOut = alpaka::getPtrNative(hOut);
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
+#else
+        std::memcpy(pHOut, pAOut, numElems * sizeof(T));
+#endif
+    }
 
     // Print result
     std::cout << "Output is of shape " << rows << "x" << K << "\n";

From a99d99c54793f2c2586c7b1d7dadbbf7b76bd739 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 03:26:32 +0530
Subject: [PATCH 11/33] fix: get where tests to work on CUDA

---
 tests/test_where.cpp | 42 +++++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/tests/test_where.cpp b/tests/test_where.cpp
index 3ec5feb..0c1f4c1 100644
--- a/tests/test_where.cpp
+++ b/tests/test_where.cpp
@@ -12,13 +12,13 @@ using Idx = std::size_t;
 
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
 using DevAcc = alpaka::DevCudaRt;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
 using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::NonBlocking>;
 
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 using DevAcc = alpaka::DevCpu;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
 using Acc = alpaka::AccCpuThreads<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
 
 #else
 #error Please define a single one of ALPAKA_ACC_GPU_CUDA_ENABLED, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
@@ -58,7 +58,7 @@ int main() {
     // Setup the accelerator, host and queue
     auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0u);
     auto devHost = alpaka::getDevByIdx(PlatHost{}, 0u);
-    alpaka::Queue<Acc, alpaka::Blocking> queue{devAcc};
+    QueueAcc queue{devAcc};
 
     // Allocate buffers
     auto extent = alpaka::Vec<Dim, Idx>(rows, cols);
@@ -89,10 +89,26 @@ int main() {
     }
 
     // 2) host -> accelerator
-    alpaka::memcpy(queue, aIn_X, hIn_X);
-    alpaka::memcpy(queue, aIn_Y, hIn_Y);
-    alpaka::memcpy(queue, aIn_Cond, hIn_Cond);
-    alpaka::wait(queue);
+    {
+        T* pAIn_X = alpaka::getPtrNative(aIn_X);
+        T* pAIn_Y = alpaka::getPtrNative(aIn_Y);
+        T* pAIn_Cond = alpaka::getPtrNative(aIn_Cond);
+        T* pHIn_X = alpaka::getPtrNative(hIn_X);
+        T* pHIn_Y = alpaka::getPtrNative(hIn_Y);
+        T* pHIn_Cond = alpaka::getPtrNative(hIn_Cond);
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        // For GPU, use cudaMemcpy directly
+        cudaMemcpy(pAIn_X, pHIn_X, numElems * sizeof(T), cudaMemcpyHostToDevice);
+        cudaMemcpy(pAIn_Y, pHIn_Y, numElems * sizeof(T), cudaMemcpyHostToDevice);
+        cudaMemcpy(pAIn_Cond, pHIn_Cond, numElems * sizeof(T), cudaMemcpyHostToDevice);
+#else
+        // For CPU, use memcpy
+        std::memcpy(pAIn_X, pHIn_X, numElems * sizeof(T));
+        std::memcpy(pAIn_Y, pHIn_Y, numElems * sizeof(T));
+        std::memcpy(pAIn_Cond, pHIn_Cond, numElems * sizeof(T));
+#endif
+    }
 
     // Prepare kernel arguments
     auto strides = alpaka::Vec<Dim, Idx>(cols, 1);
@@ -115,8 +131,16 @@ int main() {
     alpaka::wait(queue);
 
     // Final data transfer: accelerator -> host
-    alpaka::memcpy(queue, hOut, aOut);
-    alpaka::wait(queue);
+    {
+        T* pAOut = alpaka::getPtrNative(aOut);
+        T* pHOut = alpaka::getPtrNative(hOut);
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
+#else
+        std::memcpy(pHOut, pAOut, numElems * sizeof(T));
+#endif
+    }
 
     // Print result
     std::cout << "Output is of shape " << rows << "x" << cols << "\n";

From fab75f2dacd23872998878037d052c84f9dba8c7 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 03:54:45 +0530
Subject: [PATCH 12/33] fix CPU tests

---
 tests/test_topk.cpp      | 12 +++++-------
 tests/test_transpose.cpp | 14 ++++++--------
 tests/test_where.cpp     | 14 ++++++--------
 3 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index e0c0eee..8d2be0e 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -79,15 +79,14 @@ int main() {
 
     // 2) host -> accelerator
     {
-        T* pAIn = alpaka::getPtrNative(aIn);
-        T* pHIn = alpaka::getPtrNative(hIn);
-
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         // For GPU, use cudaMemcpy directly
+        T* pAIn = alpaka::getPtrNative(aIn);
+        T* pHIn = alpaka::getPtrNative(hIn);
         cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
 #else
         // For CPU, use memcpy
-        std::memcpy(pAIn, pHIn, numElems * sizeof(T));
+        alpaka::memcpy(queue, aIn, hIn);
 #endif
     }
 
@@ -126,13 +125,12 @@ int main() {
 
     // Final data transfer: accelerator -> host
     {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAOut = alpaka::getPtrNative(aOut);
         T* pHOut = alpaka::getPtrNative(hOut);
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
 #else
-        std::memcpy(pHOut, pAOut, numElems * sizeof(T));
+        alpaka::memcpy(queue, hOut, aOut);
 #endif
     }
 
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index 109de94..d2d0da4 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -78,15 +78,14 @@ int main() {
 
     // 2) host -> accelerator
     {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAIn = alpaka::getPtrNative(aIn);
         T* pHIn = alpaka::getPtrNative(hIn);
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         // For GPU, use cudaMemcpy directly
         cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
 #else
         // For CPU, use memcpy
-        std::memcpy(pAIn, pHIn, numElems * sizeof(T));
+        alpaka::memcpy(queue, aIn, hIn);
 #endif
     }
 
@@ -114,19 +113,18 @@ int main() {
 
     alpaka::wait(queue);
 
-    // Copy device -> host using 1D memcpy
+    // Final data transfer: accelerator -> host
     {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAOut = alpaka::getPtrNative(aOut);
         T* pHOut = alpaka::getPtrNative(hOut);
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
 #else
-        std::memcpy(pHOut, pAOut, numElems * sizeof(T));
+        alpaka::memcpy(queue, hOut, aOut);
 #endif
     }
 
-    // Check results
+    // Print results
     std::cout << "Output is of shape " << cols << "x" << rows << "\n";
 
     {
diff --git a/tests/test_where.cpp b/tests/test_where.cpp
index 0c1f4c1..9a45334 100644
--- a/tests/test_where.cpp
+++ b/tests/test_where.cpp
@@ -90,23 +90,22 @@ int main() {
 
     // 2) host -> accelerator
     {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAIn_X = alpaka::getPtrNative(aIn_X);
         T* pAIn_Y = alpaka::getPtrNative(aIn_Y);
         T* pAIn_Cond = alpaka::getPtrNative(aIn_Cond);
         T* pHIn_X = alpaka::getPtrNative(hIn_X);
         T* pHIn_Y = alpaka::getPtrNative(hIn_Y);
         T* pHIn_Cond = alpaka::getPtrNative(hIn_Cond);
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         // For GPU, use cudaMemcpy directly
         cudaMemcpy(pAIn_X, pHIn_X, numElems * sizeof(T), cudaMemcpyHostToDevice);
         cudaMemcpy(pAIn_Y, pHIn_Y, numElems * sizeof(T), cudaMemcpyHostToDevice);
         cudaMemcpy(pAIn_Cond, pHIn_Cond, numElems * sizeof(T), cudaMemcpyHostToDevice);
 #else
         // For CPU, use memcpy
-        std::memcpy(pAIn_X, pHIn_X, numElems * sizeof(T));
-        std::memcpy(pAIn_Y, pHIn_Y, numElems * sizeof(T));
-        std::memcpy(pAIn_Cond, pHIn_Cond, numElems * sizeof(T));
+        alpaka::memcpy(queue, aIn_X, hIn_X);
+        alpaka::memcpy(queue, aIn_Y, hIn_Y);
+        alpaka::memcpy(queue, aIn_Cond, hIn_Cond);
 #endif
     }
 
@@ -132,13 +131,12 @@ int main() {
 
     // Final data transfer: accelerator -> host
     {
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAOut = alpaka::getPtrNative(aOut);
         T* pHOut = alpaka::getPtrNative(hOut);
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
 #else
-        std::memcpy(pHOut, pAOut, numElems * sizeof(T));
+        alpaka::memcpy(queue, hOut, aOut);
 #endif
     }
 

From ad54955ef038b80ab49770c663034916fb52c892 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 03:59:46 +0530
Subject: [PATCH 13/33] fix concat gpu tests

---
 tests/test_concat.cpp    | 11 +++++------
 tests/test_transpose.cpp |  2 +-
 tests/test_where.cpp     |  2 +-
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index 2ab5baf..c013ce7 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -100,15 +100,15 @@ int main() {
         for (std::size_t i = 0; i < INPUT[k].size(); ++i) pHost[i] = INPUT[k][i];
 
         // 2) host -> accelerator
-        T* pAIn = alpaka::getPtrNative(aIn_bufs.back());
-        T* pHIn = alpaka::getPtrNative(hIn_bufs.back());
 
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         // For GPU, use cudaMemcpy directly
+        T* pAIn = alpaka::getPtrNative(aIn_bufs.back());
+        T* pHIn = alpaka::getPtrNative(hIn_bufs.back());
         cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
 #else
         // For CPU, use memcpy
-        std::memcpy(pAIn, pHIn, numElems * sizeof(T));
+        alpaka::memcpy(queue, aIn_bufs.back(), hIn_bufs.back());
 #endif
     }
 
@@ -151,13 +151,12 @@ int main() {
     alpaka::wait(queue);
 
     // Final data transfer: accelerator -> host
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
     T* pAOut = alpaka::getPtrNative(aOut);
     T* pHOut = alpaka::getPtrNative(hOut);
-
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
     cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
 #else
-    std::memcpy(pHOut, pAOut, numElems * sizeof(T));
+    alpaka::memcpy(queue, hOut, aOut);
 #endif
 
     // Print result
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index d2d0da4..1102d19 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -79,9 +79,9 @@ int main() {
     // 2) host -> accelerator
     {
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        // For GPU, use cudaMemcpy directly
         T* pAIn = alpaka::getPtrNative(aIn);
         T* pHIn = alpaka::getPtrNative(hIn);
-        // For GPU, use cudaMemcpy directly
         cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
 #else
         // For CPU, use memcpy
diff --git a/tests/test_where.cpp b/tests/test_where.cpp
index 9a45334..3aeb3e2 100644
--- a/tests/test_where.cpp
+++ b/tests/test_where.cpp
@@ -91,13 +91,13 @@ int main() {
     // 2) host -> accelerator
     {
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        // For GPU, use cudaMemcpy directly
         T* pAIn_X = alpaka::getPtrNative(aIn_X);
         T* pAIn_Y = alpaka::getPtrNative(aIn_Y);
         T* pAIn_Cond = alpaka::getPtrNative(aIn_Cond);
         T* pHIn_X = alpaka::getPtrNative(hIn_X);
         T* pHIn_Y = alpaka::getPtrNative(hIn_Y);
         T* pHIn_Cond = alpaka::getPtrNative(hIn_Cond);
-        // For GPU, use cudaMemcpy directly
         cudaMemcpy(pAIn_X, pHIn_X, numElems * sizeof(T), cudaMemcpyHostToDevice);
         cudaMemcpy(pAIn_Y, pHIn_Y, numElems * sizeof(T), cudaMemcpyHostToDevice);
         cudaMemcpy(pAIn_Cond, pHIn_Cond, numElems * sizeof(T), cudaMemcpyHostToDevice);

From 197681d23fa987779ff6216d4581095004384109 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 04:00:02 +0530
Subject: [PATCH 14/33] have removed large notebook

---
 .pre-commit-config.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 507e7db..687d069 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,7 +14,6 @@ repos:
     rev: v6.0.0
     hooks:
       - id: check-added-large-files
-        args: ['--maxkb=3000']
       - id: check-case-conflict
       - id: check-merge-conflict
       - id: check-symlinks

From e1b2bb4c0b75ac0d310b0af02802b5b5afb3cc40 Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 04:17:51 +0530
Subject: [PATCH 15/33] fix dangling code

---
 tests/test_concat.cpp | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index c013ce7..c1f2009 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -100,16 +100,17 @@ int main() {
         for (std::size_t i = 0; i < INPUT[k].size(); ++i) pHost[i] = INPUT[k][i];
 
         // 2) host -> accelerator
-
+        {
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-        // For GPU, use cudaMemcpy directly
-        T* pAIn = alpaka::getPtrNative(aIn_bufs.back());
-        T* pHIn = alpaka::getPtrNative(hIn_bufs.back());
-        cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
+            // For GPU, use cudaMemcpy directly
+            T* pAIn = alpaka::getPtrNative(aIn_bufs.back());
+            T* pHIn = alpaka::getPtrNative(hIn_bufs.back());
+            cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
 #else
-        // For CPU, use memcpy
-        alpaka::memcpy(queue, aIn_bufs.back(), hIn_bufs.back());
+            // For CPU, use memcpy
+            alpaka::memcpy(queue, aIn_bufs.back(), hIn_bufs.back());
 #endif
+        }
     }
 
     // Allocate output buffers
@@ -150,14 +151,16 @@ int main() {
 
     alpaka::wait(queue);
 
-    // Final data transfer: accelerator -> host
+    {
+        // Final data transfer: accelerator -> host
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-    T* pAOut = alpaka::getPtrNative(aOut);
-    T* pHOut = alpaka::getPtrNative(hOut);
-    cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
+        T* pAOut = alpaka::getPtrNative(aOut);
+        T* pHOut = alpaka::getPtrNative(hOut);
+        cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
 #else
-    alpaka::memcpy(queue, hOut, aOut);
+        alpaka::memcpy(queue, hOut, aOut);
 #endif
+    }
 
     // Print result
     std::cout << "Output is of shape " << out_rows << "x" << out_cols << "\n";

From d1fb51a1e36fd653b94db09bbfd396e1a4f6a9db Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 04:29:23 +0530
Subject: [PATCH 16/33] update readme a bit"

---
 README.md | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 94b3891..b3218a2 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ git clone https://github.com/Saransh-cpp/SOFIE-ALPAKA --recursive
 To build all kernels and tests in `bin/`:
 
 ```
-make all
+make all -j10
 ```
 
 ### Running tests on a threaded CPU
@@ -35,7 +35,7 @@ make all
 To run all kernel tests (and build if not built before):
 
 ```
-make test
+make test -j10
 ```
 
 ### Building kernels and tests on an NVIDIA GPU
@@ -44,17 +44,26 @@ To build all the kernels and tests in `build/`
 
 ```
 cmake -S. -Bbuild
-cmake --build build
+cmake --build build -j10
 ```
 
 where the following flags can be configured by the user:
 - `CUDA_BASE` (default: "/usr/local/cuda-13.1"): CUDA base path
 - `ALPAKA_BASE` (default: "external/alpaka"): Alpaka base path
 - `CUDA_ARCH` (default: "sm_75"): CUDA architecture
-- `CMAKE_CUDA_COMPILER` (default: "/usr/local/cuda-13.1/bin/nvcc"): Cuda compiler path
+- `CMAKE_CUDA_COMPILER` (default: "/usr/local/cuda-12.5/bin/nvcc"): Cuda compiler path
 
 To run the tests, simply execute `test_*` executables produced in `build/`.
 
 ### Running integration tests on an NVIDIA GPU
 
-TODO
+1. Port a kernel to [SOFIE](https://github.com/ML4EP/SOFIE) on a stand-alone branch (against the `gpu/alpaka` branch) (see https://github.com/ML4EP/SOFIE/pull/7 and https://github.com/ML4EP/SOFIE/pull/8 for reference).
+2. Make sure there is a corresponding `onnx` model in `SOFIE/src/SOFIE_core/test/input_models/`.
+3. Make sure there is a reference output in `SOFIE/src/SOFIE_core/test/input_models/references`.
+4. Follow instructions in SOFIE's README to build and run tests with CUDA (remember to set `-DCUDA_ARCH` as per your GPU's architecture).
+
+The relevant header and DAT files will be generated in `SOFIE/build/src/SOFIE_core/test/`.
+
+#### Kernels already ported to SOFIE
+
+`Transpose` and `Concat` kernels have already been ported to SOFIE (pull requests not merged yet). This repository has an updated implementation for both of these kernels, and two other kernels, which must be ported in the future.

From 03f3aa1bfd13b367cbfff83e08bcaadef157471a Mon Sep 17 00:00:00 2001
From: Saransh <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 04:30:53 +0530
Subject: [PATCH 17/33] add pre-commit badge

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index b3218a2..c63d4f5 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # SOFIE-ALPAKA
 
 [![Build and Test on CPU](https://github.com/Saransh-cpp/SOFIE-ALPAKA/actions/workflows/build_and_test.yml/badge.svg?branch=main)](https://github.com/Saransh-cpp/SOFIE-ALPAKA/actions/workflows/build_and_test.yml)
+[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/Saransh-cpp/SOFIE-ALPAKA/main.svg)](https://results.pre-commit.ci/latest/github/Saransh-cpp/SOFIE-ALPAKA/main)
 
 Kernels for heterogeneous architectures written in [Alpaka](https://alpaka.readthedocs.io/en/stable/) (An Abstraction Library for Parallel Kernel Acceleration) for [SOFIE](https://github.com/ML4EP/SOFIE) (System for Optimised Fast Inference code Emit).
 

From 8d386c509cf02132d8b55dbfdd8db1a8ae835146 Mon Sep 17 00:00:00 2001
From: Saransh Chopra <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 04:38:32 +0530
Subject: [PATCH 18/33] oops, update cuda version in cmake

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0aa6202..44ac7c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ set(CMAKE_CUDA_STANDARD 20)
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
 # --- User-configurable options ---
-set(CUDA_BASE "/usr/local/cuda-13.1" CACHE PATH "CUDA base path")
+set(CUDA_BASE "/usr/local/cuda-12.5" CACHE PATH "CUDA base path")
 set(ALPAKA_BASE "external/alpaka" CACHE PATH "Alpaka base path")
 
 # --- Compiler flags ---
@@ -15,7 +15,7 @@ set(CXXFLAGS -O2 -g -DALPAKA_HAS_STD_ATOMIC_REF)
 set(CXX_HOST_FLAGS -fPIC -pthread)
 set(CUDA_ARCH "sm_75")
 set(CXX_CUDA_FLAGS -arch=${CUDA_ARCH} -Wno-deprecated-gpu-targets --extended-lambda --expt-relaxed-constexpr)
-set(CMAKE_CUDA_COMPILER "/usr/local/cuda-13.1/bin/nvcc" CACHE PATH "Cuda compiler path")
+set(CMAKE_CUDA_COMPILER "/usr/local/cuda-12.5/bin/nvcc" CACHE PATH "Cuda compiler path")
 
 # --- Executables ---
 add_executable(test_transpose tests/test_transpose.cpp)

From a3efbab300edd21c13b04436f9bc21e695e36926 Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Wed, 17 Dec 2025 13:23:13 +0100
Subject: [PATCH 19/33] Fixing tests

---
 Makefile                 | 37 +++++++++++++++++++++++++++++++------
 run.py                   | 19 +++++++++----------
 tests/test_concat.cpp    | 26 +++++++++++++++++++++++---
 tests/test_topk.cpp      | 25 ++++++++++++++++++++++++-
 tests/test_transpose.cpp | 26 +++++++++++++++++++++++---
 tests/test_where.cpp     | 26 +++++++++++++++++++++++---
 6 files changed, 133 insertions(+), 26 deletions(-)

diff --git a/Makefile b/Makefile
index 62a4b6e..b54a26e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,40 @@
 CXX      := g++
-CXXFLAGS ?= -std=c++17 -O2 -Wall
+CXXFLAGS ?= -std=c++17 -O3 -Wall
+LDFLAGS  ?=
 
+# Path setup: Mac + Linux compatible
 KERNEL_DIR          	 ?= kernels
 TEST_DIR            	 ?= tests
 ALPAKA_DIR          	 ?= $(CURDIR)/external/alpaka/include
 CPLUS_INCLUDE_PATH       ?= /opt/homebrew/include
+LIBRARY_PATH         	 ?= /opt/homebrew/lib
 BIN_DIR                  ?= bin
-ALPAKA_ACCELERATOR_FLAG  ?= ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
 
+LDFLAGS += -L$(LIBRARY_PATH)
+
+# Accelerator selection (CPU options)
+# Debugging (slow, checks everything)
+# ALPAKA_ACCELERATOR_FLAG  ?= ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+
+# Performance (fast, single core)
+# ALPAKA_ACCELERATOR_FLAG ?= ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+
+# High performance (fast, multi-core TBB)
+ALPAKA_ACCELERATOR_FLAG ?= ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED 
+
+# Conditional flags
+# Auto-add -ltbb if TBB is selected
+ifneq (,$(findstring TBB,$(ALPAKA_ACCELERATOR_FLAG)))
+    LDFLAGS += -ltbb
+endif
+
+# Auto-add -fopenmp if OMP is selected
+ifneq (,$(findstring OMP,$(ALPAKA_ACCELERATOR_FLAG)))
+    CXXFLAGS += -fopenmp
+    LDFLAGS  += -fopenmp
+endif
+
+# Build rules
 KERNEL_HEADERS := $(wildcard $(KERNEL_DIR)/*.hpp)
 KERNEL_NAMES := $(patsubst $(KERNEL_DIR)/%.hpp,%,$(KERNEL_HEADERS))
 EXECUTABLES := $(patsubst %,$(BIN_DIR)/test_%.out,$(KERNEL_NAMES))
@@ -26,7 +53,7 @@ test: $(EXECUTABLES)
 
 $(BIN_DIR)/test_%.out: $(TEST_DIR)/test_%.cpp $(KERNEL_DIR)/%.hpp | $(BIN_DIR)
 	@echo "Building test for kernel: $*"
-	$(CXX) $(CXXFLAGS) -I$(ALPAKA_DIR) -I$(CPLUS_INCLUDE_PATH) -D$(ALPAKA_ACCELERATOR_FLAG) $< -o $@
+	$(CXX) $(CXXFLAGS) -I$(ALPAKA_DIR) -I$(CPLUS_INCLUDE_PATH) -D$(ALPAKA_ACCELERATOR_FLAG) $< -o $@ $(LDFLAGS)
 
 $(BIN_DIR):
 	mkdir -p $(BIN_DIR)
@@ -34,6 +61,4 @@ $(BIN_DIR):
 clean:
 	rm -rf $(BIN_DIR)
 
-test:
-
-.PHONY = all test clean
+.PHONY: all test clean
diff --git a/run.py b/run.py
index e12d460..41def8d 100644
--- a/run.py
+++ b/run.py
@@ -14,7 +14,8 @@
 BENCHMARK_SIZES = [
     512,
     1024,
-    # 2048 # Be careful
+    2048,
+    4096
 ]
 
 def build_kernel_tests():
@@ -80,8 +81,7 @@ def main():
     if not build_kernel_tests():
         sys.exit(1)
 
-    print("Bandwidth calculated based on kernel execution time only,")
-    print("if the result is 0 you're probably using the CPU itself as the accelerator\n")
+    print("Bandwidth calculated based on kernel execution time only")
 
     # Benchmark Phase
     for EXECUTABLE_PATH in EXECUTABLE_PATHS:
@@ -90,15 +90,11 @@ def main():
         print("-" * 65)
 
         for N in BENCHMARK_SIZES:
-        
             res = run_benchmark(EXECUTABLE_PATH, [N])
-        
             if res:
                 k_ms, t_ms = res
             
                 # Bandwidth Calculation (approximate)
-                # Transpose reads N*N floats and writes N*N floats
-                # Total Bytes = 2 * N * N * 4 bytes (for float32)
                 total_bytes = 0.0
 
                 if EXECUTABLE_PATH == "./bin/test_transpose.out":
@@ -107,7 +103,7 @@ def main():
                     total_bytes = 24 * N * N
                 elif EXECUTABLE_PATH == "./bin/test_where.out":
                     total_bytes = 13 * N * N 
-                else:
+                elif EXECUTABLE_PATH == "./bin/test_topk.out":
                     k = 4
                     total_bytes = 4 * N * N + 4 * N * k
             
@@ -118,9 +114,12 @@ def main():
                 else:
                     bandwidth_gbs = 0.0
 
-                print(f"{N:<12} | {k_ms:<12.4f} | {t_ms:<12.4f} | {bandwidth_gbs:<18.2f}")
+                print(f"{N:<12} | {k_ms:<12.4f} | {t_ms:<12.4f} | {bandwidth_gbs:<18.4f}")
+
+        print("-" * 65)
 
-        print("-" * 65, "\n")
+        if EXECUTABLE_PATH != EXECUTABLE_PATHS[-1]:
+            print("")
 
 if __name__ == "__main__":
     main()
diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index cd86549..5d0eb04 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -19,6 +19,16 @@ using DevAcc = alpaka::DevCudaRt;
 using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
 using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
 
+#elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
+
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 using DevAcc = alpaka::DevCpu;
 using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
@@ -138,9 +148,19 @@ int main(int argc, char* argv[]) {
     auto output_strides = alpaka::Vec<Dim, Idx>(out_cols, 1);
 
     // Work division: 2D mapping of threads to elements
-    const std::size_t threadsX = 16, threadsY = 16;
-    const std::size_t blocksX = (out_rows + threadsX - 1) / threadsX;
-    const std::size_t blocksY = (out_cols + threadsY - 1) / threadsY;
+    std::size_t threadsX = 16, threadsY = 16;
+    std::size_t blocksX = (out_rows + threadsX - 1) / threadsX;
+    std::size_t blocksY = (out_cols + threadsY - 1) / threadsY;
+
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+
+    threadsX = 1;
+    threadsY = 1;
+    blocksX = 64;
+    blocksY = 1;
+#endif
 
     auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
         alpaka::Vec<Dim, Idx>(blocksX, blocksY),
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 2c247f1..b93d508 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -20,6 +20,16 @@ using DevAcc = alpaka::DevCudaRt;
 using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
 using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
 
+#elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
+
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 using DevAcc = alpaka::DevCpu;
 using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
@@ -92,7 +102,16 @@ int main(int argc, char* argv[]) {
 
     alpaka::Vec<Dim, Idx> threadsPerBlock;
     alpaka::Vec<Dim, Idx> blocksPerGrid;
-    Idx const TARGET_BLOCK_SIZE = 16;
+    Idx TARGET_BLOCK_SIZE = 16;
+    bool limitBlocks = false;
+
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+
+    TARGET_BLOCK_SIZE = 1;
+    limitBlocks = true;
+#endif
 
     for (std::size_t d = 0; d < Dim::value; ++d) {
         if (d == TopkAxis) {
@@ -101,6 +120,10 @@ int main(int argc, char* argv[]) {
         }
         else {
             threadsPerBlock[d] = TARGET_BLOCK_SIZE;
+
+            if (limitBlocks) {
+                blocksPerGrid[d] = std::min(grid_elements[d], std::size_t(64));
+            }
             blocksPerGrid[d] = (grid_elements[d] + threadsPerBlock[d] - 1) /
                                threadsPerBlock[d];
         }
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index 27f583c..ddd1224 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -16,6 +16,16 @@ using DevAcc = alpaka::DevCudaRt;
 using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
 using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
 
+#elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
+
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 using DevAcc = alpaka::DevCpu;
 using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
@@ -86,9 +96,19 @@ int main(int argc, char* argv[]) {
     auto perm = alpaka::Vec<Dim, Idx>(1, 0);
 
     // Work division: 2D mapping of threads to elements
-    const std::size_t threadsX = 16, threadsY = 16;
-    const std::size_t blocksX = (cols + threadsX - 1) / threadsX;
-    const std::size_t blocksY = (rows + threadsY - 1) / threadsY;
+    std::size_t threadsX = 16, threadsY = 16;
+    std::size_t blocksX = (cols + threadsX - 1) / threadsX;
+    std::size_t blocksY = (rows + threadsY - 1) / threadsY;
+
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+
+    threadsX = 1;
+    threadsY = 1;
+    blocksX = 64;
+    blocksY = 1;
+#endif
 
     auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
         alpaka::Vec<Dim, Idx>(blocksX, blocksY),
diff --git a/tests/test_where.cpp b/tests/test_where.cpp
index af080b6..37263af 100644
--- a/tests/test_where.cpp
+++ b/tests/test_where.cpp
@@ -16,6 +16,16 @@ using DevAcc = alpaka::DevCudaRt;
 using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
 using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
 
+#elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
+
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 using DevAcc = alpaka::DevCpu;
 using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
@@ -91,9 +101,19 @@ int main(int argc, char* argv[]) {
     auto strides = alpaka::Vec<Dim, Idx>(cols, 1);
 
     // Work division: 2D mapping of threads to elements
-    const std::size_t threadsX = 16, threadsY = 16;
-    const std::size_t blocksX = (cols + threadsX - 1) / threadsX;
-    const std::size_t blocksY = (rows + threadsY - 1) / threadsY;
+    std::size_t threadsX = 16, threadsY = 16;
+    std::size_t blocksX = (cols + threadsX - 1) / threadsX;
+    std::size_t blocksY = (rows + threadsY - 1) / threadsY;
+
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+
+    threadsX = 1;
+    threadsY = 1;
+    blocksX = 64;
+    blocksY = 1;
+#endif
 
     auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
         alpaka::Vec<Dim, Idx>(blocksX, blocksY),

From d1fb9cad3eb7323980042b1b8127f2fe381331c3 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 17 Dec 2025 12:27:43 +0000
Subject: [PATCH 20/33] style: pre-commit fixes

---
 Makefile                 |  2 +-
 run.py                   |  2 +-
 tests/test_concat.cpp    |  3 +--
 tests/test_topk.cpp      | 30 ++++++++++--------------------
 tests/test_transpose.cpp |  3 +--
 tests/test_where.cpp     |  3 +--
 6 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/Makefile b/Makefile
index b54a26e..09111d0 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ LDFLAGS += -L$(LIBRARY_PATH)
 # ALPAKA_ACCELERATOR_FLAG ?= ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
 
 # High performance (fast, multi-core TBB)
-ALPAKA_ACCELERATOR_FLAG ?= ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED 
+ALPAKA_ACCELERATOR_FLAG ?= ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
 
 # Conditional flags
 # Auto-add -ltbb if TBB is selected
diff --git a/run.py b/run.py
index 5b37bc0..25a15e6 100644
--- a/run.py
+++ b/run.py
@@ -102,7 +102,7 @@ def main():
                 elif EXECUTABLE_PATH == "./bin/test_concat.out":
                     total_bytes = 24 * N * N
                 elif EXECUTABLE_PATH == "./bin/test_where.out":
-                    total_bytes = 13 * N * N 
+                    total_bytes = 13 * N * N
                 elif EXECUTABLE_PATH == "./bin/test_topk.out":
                     k = 4
                     total_bytes = 4 * N * N + 4 * N * k
diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index 1873b94..df807cd 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -147,8 +147,7 @@ int main(int argc, char* argv[]) {
     std::size_t blocksX = (out_rows + threadsX - 1) / threadsX;
     std::size_t blocksY = (out_cols + threadsY - 1) / threadsY;
 
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
-    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
     defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 
     threadsX = 1;
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 9ac5dbe..fb8463c 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -64,8 +64,7 @@ int main(int argc, char* argv[]) {
         rows = std::atoi(argv[1]);
         cols = rows;
         std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
-    }
-    else {
+    } else {
         std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
     }
 
@@ -105,8 +104,7 @@ int main(int argc, char* argv[]) {
     Idx TARGET_BLOCK_SIZE = 16;
     bool limitBlocks = false;
 
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
-    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
     defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 
     TARGET_BLOCK_SIZE = 1;
@@ -117,22 +115,18 @@ int main(int argc, char* argv[]) {
         if (d == TopkAxis) {
             threadsPerBlock[d] = 1;
             blocksPerGrid[d] = 1;
-        }
-        else {
+        } else {
             threadsPerBlock[d] = TARGET_BLOCK_SIZE;
 
             if (limitBlocks) {
                 blocksPerGrid[d] = std::min(grid_elements[d], std::size_t(64));
-            }
-            else {
-                blocksPerGrid[d] = (grid_elements[d] + threadsPerBlock[d] - 1) /
-                                   threadsPerBlock[d];
+            } else {
+                blocksPerGrid[d] = (grid_elements[d] + threadsPerBlock[d] - 1) / threadsPerBlock[d];
             }
         }
     }
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
-        blocksPerGrid, threadsPerBlock, grid_elements};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{blocksPerGrid, threadsPerBlock, grid_elements};
 
     // Initial data transfer
     // 1) INPUT -> host buffer (safe via raw pointer)
@@ -151,10 +145,8 @@ int main(int argc, char* argv[]) {
 
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
-                      alpaka::getPtrNative(aOut), input_strides, output_strides,
-                      grid_elements, TopkAxis, extentIn[TopkAxis],
-                      padding_value);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), input_strides,
+                      output_strides, grid_elements, TopkAxis, extentIn[TopkAxis], padding_value);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -208,10 +200,8 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms =
-        end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms =
-        end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index 5ded30b..c5e86a4 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -99,8 +99,7 @@ int main(int argc, char* argv[]) {
     std::size_t blocksX = (cols + threadsX - 1) / threadsX;
     std::size_t blocksY = (rows + threadsY - 1) / threadsY;
 
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
-    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
     defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 
     threadsX = 1;
diff --git a/tests/test_where.cpp b/tests/test_where.cpp
index 62869b5..8b25983 100644
--- a/tests/test_where.cpp
+++ b/tests/test_where.cpp
@@ -103,8 +103,7 @@ int main(int argc, char* argv[]) {
     std::size_t blocksX = (cols + threadsX - 1) / threadsX;
     std::size_t blocksY = (rows + threadsY - 1) / threadsY;
 
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
-    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
     defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 
     threadsX = 1;

From ed0d640f63251367cc079f8cd0341354b643b169 Mon Sep 17 00:00:00 2001
From: PietroFumagalli <pfuma02@gmail.com>
Date: Wed, 17 Dec 2025 15:36:37 +0100
Subject: [PATCH 21/33] pytorch benchmarking

---
 run_torch.py | 236 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 236 insertions(+)
 create mode 100644 run_torch.py

diff --git a/run_torch.py b/run_torch.py
new file mode 100644
index 0000000..50bda2d
--- /dev/null
+++ b/run_torch.py
@@ -0,0 +1,236 @@
+import subprocess
+import sys
+import os
+import re
+import time
+
+try: 
+    import torch
+    HAS_TORCH = True
+except Exception:  # not a bare except: KeyboardInterrupt / SystemExit must still propagate
+    HAS_TORCH = False  # PyTorch is optional; the script then benchmarks the C++ side only
+    print("PyTorch not found, running only C++.\n")
+
+# Configuration
+EXECUTABLE_PATHS = [
+    "./bin/test_transpose.out",
+    "./bin/test_concat.out",
+    "./bin/test_where.out",
+    "./bin/test_topk.out"
+]
+
+BENCHMARK_SIZES = [
+    512,
+    1024,
+    2048,
+    4096
+]
+
+def build_kernel_tests():
+    """
+    Build the kernel test executables by invoking `make` in the current directory.
+    Returns True when the build succeeds and False on any failure.
+    """
+    print("Building kernel tests with Make...")
+
+    # Without a Makefile there is nothing to build; bail out before spawning make.
+    if not os.path.exists("Makefile"):
+        print("Error: Makefile not found in current directory")
+        return False
+
+    try:
+        # check=True turns a non-zero exit status into CalledProcessError.
+        subprocess.run(["make"], check=True)
+    except subprocess.CalledProcessError:
+        print("Build failed. Please fix C++ errors before running benchmarks")
+        return False
+    except FileNotFoundError:
+        print("Error: 'make' command not found. Is it installed?")
+        return False
+
+    print("Build successful\n")
+    return True
+
+def get_op_name(executable_path):
+    """Return the first known operation name contained in the path, else "unknown"."""
+    for candidate in ("transpose", "concat", "where", "topk"):  # order = match priority
+        if candidate in executable_path:
+            return candidate
+    return "unknown"
+
+def run_pytorch_benchmark(op_name, N, num_repeats=10, warmup=5):
+    """
+    Times `num_repeats` back-to-back calls of the PyTorch op matching `op_name` on NxN inputs.
+    Returns the mean time per call in ms, or None if torch is missing or `op_name` is unknown.
+    """
+    if not HAS_TORCH:
+        return None
+
+    # Prefer CUDA when it is available, otherwise run on the CPU
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    
+    # Build the inputs once, outside the timed region; `op` is a closure over them
+    if op_name == "transpose":
+        x = torch.randn(N, N, device=device, dtype=torch.float32)
+        op = lambda: x.t().contiguous()
+        
+    elif op_name == "concat":
+        t1 = torch.randn(N, N, device=device, dtype=torch.float32)
+        t2 = torch.randn(N, N, device=device, dtype=torch.float32)
+        t3 = torch.randn(N, N, device=device, dtype=torch.float32)
+        op = lambda: torch.cat((t1, t2, t3), dim=1)
+        
+    elif op_name == "where":
+        cond = torch.randint(0, 2, (N, N), device=device, dtype=torch.bool)
+        x = torch.randn(N, N, device=device, dtype=torch.float32)
+        y = torch.randn(N, N, device=device, dtype=torch.float32)
+        op = lambda: torch.where(cond, x, y)
+        
+    elif op_name == "topk":
+        k = 4
+        x = torch.randn(N, N, device=device, dtype=torch.float32)
+        op = lambda: torch.topk(x, k)
+    else:
+        return None
+    # NOTE(review): the block below is a string literal, not code: no warmup runs and `warmup` is unused.
+    '''
+    # Warmup 
+    for _ in range(warmup):
+        op()
+    '''
+
+    if device.type == 'cuda':
+        torch.cuda.synchronize()  # presumably drains pending GPU work (input creation) before timing
+
+    # Timed region: num_repeats calls measured together, averaged at the end
+    if device.type == 'cuda':
+        # GPU: bracket the loop with CUDA events and synchronize before reading the elapsed time
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        start_event.record()
+        for _ in range(num_repeats):
+            op()
+        end_event.record()
+        torch.cuda.synchronize()
+        total_ms = start_event.elapsed_time(end_event)
+        
+    else:
+        # CPU: wall-clock the loop with time.perf_counter()
+        start_time = time.perf_counter()
+        for _ in range(num_repeats):
+            op()
+        end_time = time.perf_counter()
+        total_ms = (end_time - start_time) * 1000.0 # convert seconds to ms
+
+    return total_ms / num_repeats
+
+def run_cpp_benchmark(executable_path, args):
+    """
+    Runs the compiled executable with arguments and parses the timings it prints.
+    Returns (kernel_ms, total_ms) as floats, or None if the run or the parsing fails.
+    """
+    if not os.path.exists(executable_path):
+        print(f"Error: Executable '{executable_path}' not found after build")
+        return None
+
+    N = args[0]
+    # C++ streams may print doubles in scientific notation (e.g. 1e-05), so accept an exponent.
+    number = r"(\d+\.?\d*(?:[eE][-+]?\d+)?)"
+
+    try:
+        cmd = [executable_path] + [str(a) for a in args]
+
+        # Run and capture output for parsing; check=True raises on a non-zero exit status
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        output = result.stdout
+    except subprocess.CalledProcessError as e:
+        print(f"Execution failed with return code {e.returncode}")
+        print("Stderr:", e.stderr)
+        return None
+
+    kernel_match = re.search(r"TIME_KERNEL_MS:\s+" + number, output)
+    total_match = re.search(r"TIME_TOTAL_MS:\s+" + number, output)
+    if kernel_match and total_match:
+        return float(kernel_match.group(1)), float(total_match.group(1))
+    print(f"Output parsing failed for size {N}x{N}.")
+    print("Printing raw output from the cpp executable")
+    print(output)
+    return None
+
+def main():
+    """Builds the C++ tests, then prints a per-operation C++ vs PyTorch timing table."""
+    if not build_kernel_tests():
+        sys.exit(1)  # same policy as run.py: never benchmark stale or missing binaries
+
+    device_name = "CPU"
+    if HAS_TORCH and torch.cuda.is_available():
+        device_name = f"GPU ({torch.cuda.get_device_name(0)})"
+    
+    print(f"\n{'Benchmarking System':^100}")
+    print(f"{f'PyTorch Device: {device_name}':^100}")
+    print("-" * 100)
+
+    for EXECUTABLE_PATH in EXECUTABLE_PATHS:
+        op_name = get_op_name(EXECUTABLE_PATH)
+        print(f"Operation: {op_name.upper()}")
+        
+        # --- Flexible Headers ---
+        # K = Kernel Time, T = Total Time
+        if HAS_TORCH:
+            header = (f"{'SIZE':<6} | {'CPP(K)':<9} | {'CPP(T)':<9} | {'TORCH':<9} | "
+                      f"{'CPP GB/s':<9} | {'TORCH GB/s':<11} | {'SPEEDUP':<8}")
+        else:
+            header = (f"{'SIZE':<6} | {'CPP(K)':<10} | {'CPP(T)':<10} | {'CPP GB/s':<12}")
+            
+        print(header)
+        print("-" * len(header))
+
+        for N in BENCHMARK_SIZES:
+            # 1. Run C++ Benchmark
+            cpp_res = run_cpp_benchmark(EXECUTABLE_PATH, [N])
+            if cpp_res:
+                cpp_k_ms, cpp_t_ms = cpp_res
+            else:
+                cpp_k_ms, cpp_t_ms = None, None
+
+            # 2. Run PyTorch Benchmark
+            torch_ms = run_pytorch_benchmark(op_name, N) if HAS_TORCH else None
+
+            # 3. Calculate Bandwidth (Using Kernel Time)
+            total_bytes = 0.0
+            if op_name == "transpose": total_bytes = 8 * N * N
+            elif op_name == "concat":  total_bytes = 24 * N * N
+            elif op_name == "where":   total_bytes = 13 * N * N
+            elif op_name == "topk":    total_bytes = 4 * N * N + 16 * N # approx topk
+
+            # GB/s = (Bytes/1e9) / (ms/1000)
+            cpp_bw = (total_bytes / 1e9) / (cpp_k_ms / 1000.0) if (cpp_k_ms and cpp_k_ms > 0) else 0.0
+            torch_bw = (total_bytes / 1e9) / (torch_ms / 1000.0) if (torch_ms and torch_ms > 0) else 0.0
+
+            # Formatting: only a missing measurement (None) is an error; 0.0 is a valid time
+            c_k_str = f"{cpp_k_ms:.4f}" if cpp_k_ms is not None else "ERR"
+            c_t_str = f"{cpp_t_ms:.4f}" if cpp_t_ms is not None else "ERR"
+            c_bw_str = f"{cpp_bw:.2f}" if cpp_k_ms else "-"
+
+            if HAS_TORCH:
+                t_ms_str = f"{torch_ms:.4f}" if torch_ms is not None else "ERR"
+                t_bw_str = f"{torch_bw:.2f}" if torch_ms else "-"
+                
+                # Compare PyTorch Time vs C++ Kernel Time
+                speedup_str = "-"
+                if cpp_k_ms and torch_ms and cpp_k_ms > 0:
+                    # ratio = torch / cpp: above 1.0 the C++ kernel is faster, below 1.0 PyTorch is
+                    ratio = torch_ms / cpp_k_ms
+                    speedup_str = f"{ratio:.2f}x"
+
+                print(f"{N:<6} | {c_k_str:<9} | {c_t_str:<9} | {t_ms_str:<9} | "
+                      f"{c_bw_str:<9} | {t_bw_str:<11} | {speedup_str:<8}")
+            else:
+                print(f"{N:<6} | {c_k_str:<10} | {c_t_str:<10} | {c_bw_str:<12}")
+
+        print("-" * 100)
+        print("")
+if __name__ == "__main__":
+    main()
+

From b9d33b2a2fbd55a56838eb1e11f0eaa9715c2a2c Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Wed, 17 Dec 2025 15:36:42 +0100
Subject: [PATCH 22/33] Added warmup runs and trivial kernel

---
 kernels/trivial.hpp      |  36 ++++++++
 run.py                   |  15 ++--
 tests/test_concat.cpp    |  45 ++++++----
 tests/test_topk.cpp      |  12 ++-
 tests/test_transpose.cpp |  30 +++++--
 tests/test_trivial.cpp   | 172 +++++++++++++++++++++++++++++++++++++++
 tests/test_where.cpp     |  39 ++++++---
 7 files changed, 307 insertions(+), 42 deletions(-)
 create mode 100644 kernels/trivial.hpp
 create mode 100644 tests/test_trivial.cpp

diff --git a/kernels/trivial.hpp b/kernels/trivial.hpp
new file mode 100644
index 0000000..4a0236c
--- /dev/null
+++ b/kernels/trivial.hpp
@@ -0,0 +1,36 @@
+#ifndef TRIVIAL_KERNEL_HPP
+#define TRIVIAL_KERNEL_HPP
+
+#include <alpaka/alpaka.hpp>
+
+namespace alpaka_kernels {
+
+struct TrivialKernel {  // Copy functor: writes input[i] to output[i] for every index it visits
+    template <typename TAcc, typename T, typename Dim, typename Idx>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output,
+                                  alpaka::Vec<Dim, Idx> output_strides,
+                                  alpaka::Vec<Dim, Idx> output_shape) const {
+        using DimAcc = alpaka::Dim<TAcc>;
+        static_assert(DimAcc::value == Dim::value,
+                      "Accelerator and data dimensions must match!");
+
+        constexpr std::size_t D = Dim::value;
+        auto elements = alpaka::uniformElementsND(acc, output_shape);  // presumably this thread's N-D indices; confirm in alpaka docs
+
+        for (auto const& idx : elements) {
+            Idx linear_idx = 0;
+
+            // Flatten idx into one offset: sum of coord * output_strides[d] over all D dimensions
+            for (std::size_t d = 0; d < D; ++d) {
+                Idx const coord = idx[d];
+                linear_idx += coord * output_strides[d];
+            }
+            // Same offset for both buffers. NOTE(review): assumes input shares output's layout; confirm
+            output[linear_idx] = input[linear_idx];
+        }
+    }
+};
+
+}  // namespace alpaka_kernels
+
+#endif  // TRIVIAL_KERNEL_HPP
diff --git a/run.py b/run.py
index 5b37bc0..a333a33 100644
--- a/run.py
+++ b/run.py
@@ -5,17 +5,19 @@
 
 # Configuration
 EXECUTABLE_PATHS = [
-    "./bin/test_transpose.out",
+    "./bin/test_trivial.out",
     "./bin/test_concat.out",
-    "./bin/test_where.out",
-    "./bin/test_topk.out"
+    "./bin/test_transpose.out",
+    "./bin/test_topk.out",
+    "./bin/test_where.out"
 ]
 
 BENCHMARK_SIZES = [
     512,
     1024,
     2048,
-    4096
+    4096,
+    8192
 ]
 
 def build_kernel_tests():
@@ -97,10 +99,13 @@ def main():
                 # Bandwidth Calculation (approximate)
                 total_bytes = 0.0
 
+                if EXECUTABLE_PATH == "./bin/test_trivial.out":
+                    total_bytes = 8 * N * N
                 if EXECUTABLE_PATH == "./bin/test_transpose.out":
                     total_bytes = 8 * N * N
                 elif EXECUTABLE_PATH == "./bin/test_concat.out":
-                    total_bytes = 24 * N * N
+                    concat_num = 3
+                    total_bytes = 8 * concat_num * N * N
                 elif EXECUTABLE_PATH == "./bin/test_where.out":
                     total_bytes = 13 * N * N 
                 elif EXECUTABLE_PATH == "./bin/test_topk.out":
diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index 1873b94..2483097 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -69,7 +69,8 @@ int main(int argc, char* argv[]) {
             val = cols;
             total_rows += val;
         }
-    } else {
+    }
+    else {
         std::cout << "Using random dimensions ";
         for (auto& val : in_rows) {
             val = distrib_int(gen);
@@ -78,7 +79,8 @@ int main(int argc, char* argv[]) {
     }
 
     for (std::size_t k = 0; k < NumInputs; ++k)
-        std::cout << in_rows[k] << "x" << cols << ((k < NumInputs - 1) ? ", " : "\n");
+        std::cout << in_rows[k] << "x" << cols
+                  << ((k < NumInputs - 1) ? ", " : "\n");
 
     std::array<std::vector<T>, NumInputs> INPUT;
     for (std::size_t k = 0; k < NumInputs; ++k) {
@@ -92,8 +94,10 @@ int main(int argc, char* argv[]) {
     alpaka::Queue<Acc, alpaka::Blocking> queue{devAcc};
 
     // Allocate buffers
-    using BufAcc = decltype(alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim, Idx>{}));
-    using BufHost = decltype(alpaka::allocBuf<T, Idx>(devHost, alpaka::Vec<Dim, Idx>{}));
+    using BufAcc =
+        decltype(alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim, Idx>{}));
+    using BufHost =
+        decltype(alpaka::allocBuf<T, Idx>(devHost, alpaka::Vec<Dim, Idx>{}));
 
     std::vector<BufAcc> aIn_bufs;
     aIn_bufs.reserve(NumInputs);
@@ -117,7 +121,8 @@ int main(int argc, char* argv[]) {
 
         // INPUT to host buffer data transfer (safe via raw pointers)
         T* pHost = alpaka::getPtrNative(hIn_bufs.back());
-        for (std::size_t i = 0; i < INPUT[k].size(); ++i) pHost[i] = INPUT[k][i];
+        for (std::size_t i = 0; i < INPUT[k].size(); ++i)
+            pHost[i] = INPUT[k][i];
     }
 
     // Allocate output buffers
@@ -157,8 +162,18 @@ int main(int argc, char* argv[]) {
     blocksY = 1;
 #endif
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
+        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+
+    // Warmup run
+    ConcatKernel kernel;
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs,
+                      alpaka::getPtrNative(aOut), input_strides_vec,
+                      output_strides, extentOut, axis_sizes, ConcatAxis);
+
+    alpaka::wait(queue);
 
     // Host to accelerator data transfer
     auto start_total = now();
@@ -170,12 +185,11 @@ int main(int argc, char* argv[]) {
     alpaka::wait(queue);
 
     // Launch kernel
-    ConcatKernel kernel;
-
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs, alpaka::getPtrNative(aOut), input_strides_vec, output_strides,
-                      extentOut, axis_sizes, ConcatAxis);
+    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs,
+                      alpaka::getPtrNative(aOut), input_strides_vec,
+                      output_strides, extentOut, axis_sizes, ConcatAxis);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -189,7 +203,8 @@ int main(int argc, char* argv[]) {
     std::cout << "Output is of shape " << out_rows << "x" << out_cols << "\n";
 
     std::vector<T> expected;
-    for (const auto& vec : INPUT) expected.insert(expected.end(), vec.begin(), vec.end());
+    for (const auto& vec : INPUT)
+        expected.insert(expected.end(), vec.begin(), vec.end());
 
     {
         T* pHost = alpaka::getPtrNative(hOut);
@@ -203,8 +218,10 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms =
+        end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms =
+        end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 9ac5dbe..f8908d9 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -134,6 +134,16 @@ int main(int argc, char* argv[]) {
     auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
         blocksPerGrid, threadsPerBlock, grid_elements};
 
+    // Warmup run
+    TopKKernel<K, MaxRegisters> kernel;
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
+                      alpaka::getPtrNative(aOut), input_strides, output_strides,
+                      grid_elements, TopkAxis, extentIn[TopkAxis],
+                      padding_value);
+
+    alpaka::wait(queue);
+
     // Initial data transfer
     // 1) INPUT -> host buffer (safe via raw pointer)
     {
@@ -147,8 +157,6 @@ int main(int argc, char* argv[]) {
     alpaka::wait(queue);
 
     // Launch kernel
-    TopKKernel<K, MaxRegisters> kernel;
-
     auto start_kernel = now();
 
     alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index 5ded30b..b698674 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -60,7 +60,8 @@ int main(int argc, char* argv[]) {
         rows = std::atoi(argv[1]);
         cols = rows;
         std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
-    } else {
+    }
+    else {
         std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
     }
 
@@ -109,8 +110,18 @@ int main(int argc, char* argv[]) {
     blocksY = 1;
 #endif
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
+        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+
+    // Warmup run
+    TransposeKernel kernel;
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
+                      alpaka::getPtrNative(aOut), input_strides, output_strides,
+                      extentOut, perm);
+
+    alpaka::wait(queue);
 
     // Initial data transfer
     // 1) INPUT -> host buffer (safe via raw pointer)
@@ -125,12 +136,11 @@ int main(int argc, char* argv[]) {
     alpaka::wait(queue);
 
     // Launch kernel
-    TransposeKernel kernel;
-
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), input_strides,
-                      output_strides, extentOut, perm);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
+                      alpaka::getPtrNative(aOut), input_strides, output_strides,
+                      extentOut, perm);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -160,8 +170,10 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms =
+        end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms =
+        end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
diff --git a/tests/test_trivial.cpp b/tests/test_trivial.cpp
new file mode 100644
index 0000000..763b2b1
--- /dev/null
+++ b/tests/test_trivial.cpp
@@ -0,0 +1,172 @@
+#include <alpaka/alpaka.hpp>
+#include <chrono>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "../kernels/trivial.hpp"
+
+// Test domain parameters: a 2-D index space addressed with std::size_t
+constexpr std::size_t NumDims = 2;
+using Dim = alpaka::DimInt<NumDims>;
+using Idx = std::size_t;
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)  // the first enabled back end in this chain is used
+using DevAcc = alpaka::DevCudaRt;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
+using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+
+#elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
+
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+using DevAcc = alpaka::DevCpu;
+using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
+using Acc = alpaka::AccCpuThreads<Dim, Idx>;
+
+#else
+#error Please define one of ALPAKA_ACC_GPU_CUDA_ENABLED, ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED, ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
+
+#endif
+
+using DevHost = alpaka::DevCpu;  // the host side always uses the CPU device and platform
+using PlatAcc = alpaka::Platform<DevAcc>;
+using PlatHost = alpaka::PlatformCpu;
+
+auto now() { return std::chrono::steady_clock::now(); }  // monotonic clock: safe for measuring intervals
+
+int main(int argc, char* argv[]) {  // runs TrivialKernel on a random matrix, checks output == input, prints timings
+    using namespace alpaka_kernels;
+    using T = float;
+
+    // Random engine: drives both the random dimensions and the matrix contents
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> distrib_int(50, 500);
+    std::uniform_real_distribution<float> distrib_real(-1.0f, 1.0f);
+
+    // Input matrix dimensions (random unless argv[1] overrides them below)
+    std::size_t rows = distrib_int(gen);
+    std::size_t cols = distrib_int(gen);
+
+    if (argc >= 2) {
+        rows = std::atoi(argv[1]);  // NOTE(review): unchecked; 0 or a negative value is not rejected
+        cols = rows;  // argv[1] always selects a square NxN input
+        std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
+    }
+    else {
+        std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
+    }
+
+    const std::size_t numElems = rows * cols;
+
+    std::vector<T> INPUT(numElems);  // reference data, also used for the final check
+    for (auto& val : INPUT) val = distrib_real(gen);
+
+    // Setup the accelerator, host and queue
+    auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0u);
+    auto devHost = alpaka::getDevByIdx(PlatHost{}, 0u);
+    alpaka::Queue<Acc, alpaka::Blocking> queue{devAcc};  // NOTE(review): the QueueAcc alias is not used here - confirm intent
+
+    // Allocate buffers (all four share the same rows x cols extent)
+    auto extent = alpaka::Vec<Dim, Idx>(rows, cols);
+
+    // 1) Accelerator buffers
+    auto aIn = alpaka::allocBuf<T, Idx>(devAcc, extent);
+    auto aOut = alpaka::allocBuf<T, Idx>(devAcc, extent);
+
+    // 2) Host buffers
+    auto hIn = alpaka::allocBuf<T, Idx>(devHost, extent);
+    auto hOut = alpaka::allocBuf<T, Idx>(devHost, extent);
+
+    // Prepare kernel arguments: strides matching the i * cols + j indexing used in the check below
+    auto output_strides = alpaka::Vec<Dim, Idx>(cols, 1);
+
+    // Work division: 2D mapping of threads to elements (16x16 threads per block, block counts rounded up)
+    std::size_t threadsX = 16, threadsY = 16;
+    std::size_t blocksX = (cols + threadsX - 1) / threadsX;
+    std::size_t blocksY = (rows + threadsY - 1) / threadsY;
+
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+
+    threadsX = 1;  // CPU back ends: one thread per block and a fixed 64x1 grid, independent of the input size
+    threadsY = 1;
+    blocksX = 64;
+    blocksY = 1;
+#endif
+
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
+        alpaka::Vec<Dim, Idx>(blocksX, blocksY),  // NOTE(review): blocksX comes from cols but sits in the slot that extent fills with rows - confirm for rows != cols
+        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
+
+    // Warmup run: executes before aIn is filled, so its result is not checked
+    TrivialKernel kernel;
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
+                      alpaka::getPtrNative(aOut), output_strides, extent);
+
+    alpaka::wait(queue);
+
+    // Initial data transfer
+    // 1) INPUT -> host buffer (safe via raw pointer)
+    {
+        T* pHost = alpaka::getPtrNative(hIn);
+        for (Idx i = 0; i < numElems; ++i) pHost[i] = INPUT[i];
+    }
+
+    // 2) host -> accelerator (the total timer starts here, so it covers both copies and the kernel)
+    auto start_total = now();
+    alpaka::memcpy(queue, aIn, hIn);
+    alpaka::wait(queue);
+
+    // Launch kernel: the kernel timer brackets only exec + wait
+    auto start_kernel = now();
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
+                      alpaka::getPtrNative(aOut), output_strides, extent);
+
+    alpaka::wait(queue);
+    auto end_kernel = now();
+
+    // Final data transfer: accelerator -> host
+    alpaka::memcpy(queue, hOut, aOut);
+    alpaka::wait(queue);
+    auto end_total = now();
+
+    // Print result, then verify that the output equals the input element for element
+    std::cout << "Output is of shape " << rows << "x" << cols << "\n";
+
+    {
+        T* pHost = alpaka::getPtrNative(hOut);
+        for (std::size_t i = 0; i < rows; ++i) {
+            for (std::size_t j = 0; j < cols; ++j) {
+                T valOut = pHost[i * cols + j];
+                T valIn = INPUT[i * cols + j];
+
+                if (valIn != valOut) {  // exact comparison: the first differing element fails the run
+                    std::cerr << "Failed!\n";
+                    return 1;
+                }
+            }
+        }
+    }
+
+    std::cout << "Correct!\n";
+
+    std::chrono::duration<double, std::milli> kernel_ms =
+        end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms =
+        end_total - start_total;
+
+    std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
+    std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
+    return 0;
+}
diff --git a/tests/test_where.cpp b/tests/test_where.cpp
index 62869b5..16ce827 100644
--- a/tests/test_where.cpp
+++ b/tests/test_where.cpp
@@ -62,7 +62,8 @@ int main(int argc, char* argv[]) {
         rows = std::atoi(argv[1]);
         cols = rows;
         std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
-    } else {
+    }
+    else {
         std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
     }
 
@@ -73,7 +74,8 @@ int main(int argc, char* argv[]) {
 
     for (auto& val : INPUT_X) val = distrib_real(gen) * 100.0;
     for (auto& val : INPUT_Y) val = distrib_real(gen);
-    for (std::size_t i = 0; i < numElems; ++i) INPUT_COND[i] = distrib_bool(gen);
+    for (std::size_t i = 0; i < numElems; ++i)
+        INPUT_COND[i] = distrib_bool(gen);
 
     // Setup the accelerator, host and queue
     auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0u);
@@ -113,8 +115,19 @@ int main(int argc, char* argv[]) {
     blocksY = 1;
 #endif
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
+        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
+
+    // Warmup run
+    WhereKernel kernel;
+
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond),
+                      alpaka::getPtrNative(aIn_X), alpaka::getPtrNative(aIn_Y),
+                      alpaka::getPtrNative(aOut), strides, strides, strides,
+                      strides, extent);
+
+    alpaka::wait(queue);
 
     // Initial data transfer
     // 1) INPUT -> host buffer (safe via raw pointer)
@@ -137,13 +150,12 @@ int main(int argc, char* argv[]) {
     alpaka::wait(queue);
 
     // Launch kernel
-    WhereKernel kernel;
-
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond), alpaka::getPtrNative(aIn_X),
-                      alpaka::getPtrNative(aIn_Y), alpaka::getPtrNative(aOut), strides, strides, strides, strides,
-                      extent);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond),
+                      alpaka::getPtrNative(aIn_X), alpaka::getPtrNative(aIn_Y),
+                      alpaka::getPtrNative(aOut), strides, strides, strides,
+                      strides, extent);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -161,7 +173,8 @@ int main(int argc, char* argv[]) {
         for (std::size_t i = 0; i < rows; ++i) {
             for (std::size_t j = 0; j < cols; ++j) {
                 T valOut = pHost[i * cols + j];
-                T valIn = INPUT_COND[i * cols + j] ? INPUT_X[i * cols + j] : INPUT_Y[i * cols + j];
+                T valIn = INPUT_COND[i * cols + j] ? INPUT_X[i * cols + j]
+                                                   : INPUT_Y[i * cols + j];
 
                 if (valIn != valOut) {
                     std::cerr << "Failed!\n";
@@ -173,8 +186,10 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms =
+        end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms =
+        end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;

From 0cc67fc6ffaa18ff139cccfeecacb062e749de61 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 17 Dec 2025 14:36:49 +0000
Subject: [PATCH 23/33] style: pre-commit fixes

---
 run_torch.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/run_torch.py b/run_torch.py
index 50bda2d..db33ea4 100644
--- a/run_torch.py
+++ b/run_torch.py
@@ -4,7 +4,7 @@
 import re
 import time
 
-try: 
+try:
     import torch
     HAS_TORCH = True
 except:
@@ -68,24 +68,24 @@ def run_pytorch_benchmark(op_name, N, num_repeats=10, warmup=5):
 
     # Detect device
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    
+
     # Setup Data
     if op_name == "transpose":
         x = torch.randn(N, N, device=device, dtype=torch.float32)
         op = lambda: x.t().contiguous()
-        
+
     elif op_name == "concat":
         t1 = torch.randn(N, N, device=device, dtype=torch.float32)
         t2 = torch.randn(N, N, device=device, dtype=torch.float32)
         t3 = torch.randn(N, N, device=device, dtype=torch.float32)
         op = lambda: torch.cat((t1, t2, t3), dim=1)
-        
+
     elif op_name == "where":
         cond = torch.randint(0, 2, (N, N), device=device, dtype=torch.bool)
         x = torch.randn(N, N, device=device, dtype=torch.float32)
         y = torch.randn(N, N, device=device, dtype=torch.float32)
         op = lambda: torch.where(cond, x, y)
-        
+
     elif op_name == "topk":
         k = 4
         x = torch.randn(N, N, device=device, dtype=torch.float32)
@@ -94,7 +94,7 @@ def run_pytorch_benchmark(op_name, N, num_repeats=10, warmup=5):
         return None
 
     '''
-    # Warmup 
+    # Warmup
     for _ in range(warmup):
         op()
     '''
@@ -114,7 +114,7 @@ def run_pytorch_benchmark(op_name, N, num_repeats=10, warmup=5):
         end_event.record()
         torch.cuda.synchronize()
         total_ms = start_event.elapsed_time(end_event)
-        
+
     else:
         # CPU Timing (Synchronous)
         start_time = time.perf_counter()
@@ -164,7 +164,7 @@ def main():
     device_name = "CPU"
     if HAS_TORCH and torch.cuda.is_available():
         device_name = f"GPU ({torch.cuda.get_device_name(0)})"
-    
+
     print(f"\n{'Benchmarking System':^100}")
     print(f"{f'PyTorch Device: {device_name}':^100}")
     print("-" * 100)
@@ -172,7 +172,7 @@ def main():
     for EXECUTABLE_PATH in EXECUTABLE_PATHS:
         op_name = get_op_name(EXECUTABLE_PATH)
         print(f"Operation: {op_name.upper()}")
-        
+
         # --- Flexible Headers ---
         # K = Kernel Time, T = Total Time
         if HAS_TORCH:
@@ -180,7 +180,7 @@ def main():
                       f"{'CPP GB/s':<9} | {'TORCH GB/s':<11} | {'SPEEDUP':<8}")
         else:
             header = (f"{'SIZE':<6} | {'CPP(K)':<10} | {'CPP(T)':<10} | {'CPP GB/s':<12}")
-            
+
         print(header)
         print("-" * len(header))
 
@@ -214,11 +214,11 @@ def main():
             if HAS_TORCH:
                 t_ms_str = f"{torch_ms:.4f}" if torch_ms else "ERR"
                 t_bw_str = f"{torch_bw:.2f}" if torch_ms else "-"
-                
+
                 # Compare PyTorch Time vs C++ Kernel Time
                 speedup_str = "-"
                 if cpp_k_ms and torch_ms and cpp_k_ms > 0:
-                    ratio = torch_ms / cpp_k_ms 
+                    ratio = torch_ms / cpp_k_ms
                     # If ratio < 1.0, PyTorch is faster. If > 1.0, C++ is faster.
                     # Usually 'Speedup' means (Baseline / New), so let's do (Torch / CPP)
                     # ratio 0.5x means PyTorch took half the time of CPP
@@ -233,4 +233,3 @@ def main():
         print("")
 if __name__ == "__main__":
     main()
-

From b7aecd96cb1b08626d55ab714f12ac51c0056930 Mon Sep 17 00:00:00 2001
From: Saransh Chopra <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 20:16:04 +0530
Subject: [PATCH 24/33] fix transpose and where kernels for GPU

---
 kernels/concat.hpp       |  56 +++++++++---
 kernels/transpose.hpp    |  45 +++++++---
 kernels/where.hpp        |  39 +++++++--
 run.py                   | 178 +++++++++++++++++++++++++++++----------
 tests/test_concat.cpp    |  39 +++++----
 tests/test_topk.cpp      |  28 +++---
 tests/test_transpose.cpp |  38 +++++----
 tests/test_where.cpp     |  37 ++++----
 8 files changed, 324 insertions(+), 136 deletions(-)

diff --git a/kernels/concat.hpp b/kernels/concat.hpp
index fdbedf7..f2cd54d 100644
--- a/kernels/concat.hpp
+++ b/kernels/concat.hpp
@@ -12,39 +12,67 @@ struct ConcatKernel {
                                   std::array<alpaka::Vec<Dim, Idx>, N> input_strides_vec,
                                   alpaka::Vec<Dim, Idx> output_strides, alpaka::Vec<Dim, Idx> output_shape,
                                   std::array<Idx, N> axis_sizes, std::size_t concat_axis) const {
-        using DimAcc = alpaka::Dim<TAcc>;
-        static_assert(DimAcc::value == Dim::value, "Accelerator and data dims must match");
-
         constexpr std::size_t D = Dim::value;
-        auto elements = alpaka::uniformElementsND(acc, output_shape);
 
-        for (auto const& idx : elements) {
-            Idx concat_coord = idx[concat_axis];
+        // Get global thread index and total threads
+        auto const threadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const threadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+
+        // Convert to linear thread index
+        Idx global_thread_idx = 0;
+        Idx stride = 1;
+        for (std::size_t d = 0; d < D; ++d) {
+            global_thread_idx += threadIdx[d] * stride;
+            stride *= threadExtent[d];
+        }
+
+        // Total number of output elements
+        Idx total_elements = 1;
+        for (std::size_t d = 0; d < D; ++d) {
+            total_elements *= output_shape[d];
+        }
+
+        // Grid-stride loop
+        for (Idx elem_idx = global_thread_idx; elem_idx < total_elements; elem_idx += threadExtent.prod()) {
+            // Convert linear index to multi-dimensional output index
+            Idx remaining = elem_idx;
+            alpaka::Vec<Dim, Idx> out_idx;
+            for (int d = D - 1; d >= 0; --d) {
+                out_idx[d] = remaining % output_shape[d];
+                remaining /= output_shape[d];
+            }
+
+            // Determine which input tensor this element comes from
+            Idx concat_coord = out_idx[concat_axis];
             std::size_t chosen = 0;
             Idx offset = 0;
 
-            // Find which input matrix this pixel belongs to
+            // Find the input tensor that contains this coordinate
             for (std::size_t k = 0; k < N; ++k) {
                 Idx const sz = axis_sizes[k];
                 if (concat_coord < offset + sz) {
                     chosen = k;
                     break;
                 }
-
                 offset += sz;
             }
 
-            // Compute input and output indexes
-            Idx input_idx = 0;
+            // Compute output linear index
             Idx output_idx = 0;
             for (std::size_t d = 0; d < D; ++d) {
-                Idx const out_coord = idx[d];
-                output_idx += out_coord * output_strides[d];
+                output_idx += out_idx[d] * output_strides[d];
+            }
 
-                Idx const in_coord = out_coord - offset * (d == concat_axis);
-                input_idx += in_coord * input_strides_vec[chosen][d];
+            // Compute input linear index (adjust for concat axis offset)
+            alpaka::Vec<Dim, Idx> in_idx = out_idx;
+            in_idx[concat_axis] = concat_coord - offset;
+
+            Idx input_idx = 0;
+            for (std::size_t d = 0; d < D; ++d) {
+                input_idx += in_idx[d] * input_strides_vec[chosen][d];
             }
 
+            // Copy the element
             output[output_idx] = input_ptrs[chosen][input_idx];
         }
     }
diff --git a/kernels/transpose.hpp b/kernels/transpose.hpp
index 4732544..d4de617 100644
--- a/kernels/transpose.hpp
+++ b/kernels/transpose.hpp
@@ -10,21 +10,46 @@ struct TransposeKernel {
     ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output, alpaka::Vec<Dim, Idx> input_strides,
                                   alpaka::Vec<Dim, Idx> output_strides, alpaka::Vec<Dim, Idx> output_shape,
                                   alpaka::Vec<Dim, Idx> perm) const {
-        using DimAcc = alpaka::Dim<TAcc>;
-        static_assert(DimAcc::value == Dim::value, "Accelerator and data dimensions must match!");
-
         constexpr std::size_t D = Dim::value;
-        auto elements = alpaka::uniformElementsND(acc, output_shape);
 
-        for (auto const& idx : elements) {
-            Idx input_idx = 0;
+        // Get global thread index (maps to output element)
+        auto const threadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const threadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+
+        // Convert to linear index
+        Idx global_thread_idx = 0;
+        Idx stride = 1;
+        for (std::size_t d = 0; d < D; ++d) {
+            global_thread_idx += threadIdx[d] * stride;
+            stride *= threadExtent[d];
+        }
+
+        // Total number of output elements
+        Idx total_elements = 1;
+        for (std::size_t d = 0; d < D; ++d) {
+            total_elements *= output_shape[d];
+        }
+
+        // Process elements with stride equal to total threads (grid-stride loop)
+        for (Idx elem_idx = global_thread_idx; elem_idx < total_elements; elem_idx += threadExtent.prod()) {
+            // Convert linear index to multi-dimensional output index
+            Idx remaining = elem_idx;
+            alpaka::Vec<Dim, Idx> out_idx;
+            for (int d = D - 1; d >= 0; --d) {
+                out_idx[d] = remaining % output_shape[d];
+                remaining /= output_shape[d];
+            }
+
+            // Compute output linear index
             Idx output_idx = 0;
+            for (std::size_t d = 0; d < D; ++d) {
+                output_idx += out_idx[d] * output_strides[d];
+            }
 
-            // Compute input and output indexes
+            // Compute input linear index using permutation
+            Idx input_idx = 0;
             for (std::size_t d = 0; d < D; ++d) {
-                Idx const out_coord = idx[d];
-                output_idx += out_coord * output_strides[d];
-                input_idx += out_coord * input_strides[perm[d]];
+                input_idx += out_idx[d] * input_strides[perm[d]];
             }
 
             output[output_idx] = input[input_idx];
diff --git a/kernels/where.hpp b/kernels/where.hpp
index ee49767..5267a22 100644
--- a/kernels/where.hpp
+++ b/kernels/where.hpp
@@ -11,28 +11,51 @@ struct WhereKernel {
                                   alpaka::Vec<Dim, Idx> cond_strides, alpaka::Vec<Dim, Idx> x_strides,
                                   alpaka::Vec<Dim, Idx> y_strides, alpaka::Vec<Dim, Idx> out_strides,
                                   alpaka::Vec<Dim, Idx> output_shape) const {
-        using DimAcc = alpaka::Dim<TAcc>;
-        static_assert(DimAcc::value == Dim::value, "Accelerator and data dims must match");
-
         constexpr std::size_t D = Dim::value;
-        auto elements = alpaka::uniformElementsND(acc, output_shape);
 
-        for (auto const& idx : elements) {
-            // Compute input and output indexes
+        // Get global thread index and total threads
+        auto const threadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const threadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+
+        // Convert to linear thread index
+        Idx global_thread_idx = 0;
+        Idx stride = 1;
+        for (std::size_t d = 0; d < D; ++d) {
+            global_thread_idx += threadIdx[d] * stride;
+            stride *= threadExtent[d];
+        }
+
+        // Total number of output elements
+        Idx total_elements = 1;
+        for (std::size_t d = 0; d < D; ++d) {
+            total_elements *= output_shape[d];
+        }
+
+        // Grid-stride loop: each thread processes multiple elements if needed
+        for (Idx elem_idx = global_thread_idx; elem_idx < total_elements; elem_idx += threadExtent.prod()) {
+            // Convert linear index to multi-dimensional output index
+            Idx remaining = elem_idx;
+            alpaka::Vec<Dim, Idx> out_idx;
+            for (int d = D - 1; d >= 0; --d) {
+                out_idx[d] = remaining % output_shape[d];
+                remaining /= output_shape[d];
+            }
+
+            // Compute linear indices for all arrays
             Idx cond_idx = 0;
             Idx x_idx = 0;
             Idx y_idx = 0;
             Idx output_idx = 0;
 
             for (std::size_t d = 0; d < D; ++d) {
-                Idx const coord = idx[d];
-
+                Idx const coord = out_idx[d];
                 cond_idx += coord * cond_strides[d];
                 x_idx += coord * x_strides[d];
                 y_idx += coord * y_strides[d];
                 output_idx += coord * out_strides[d];
             }
 
+            // Perform the where operation
             output[output_idx] = condition[cond_idx] ? x[x_idx] : y[y_idx];
         }
     }
diff --git a/run.py b/run.py
index 25a15e6..2b3922e 100644
--- a/run.py
+++ b/run.py
@@ -2,15 +2,23 @@
 import sys
 import os
 import re
+import argparse
 
 # Configuration
-EXECUTABLE_PATHS = [
+EXECUTABLE_PATHS_CPU = [
     "./bin/test_transpose.out",
     "./bin/test_concat.out",
     "./bin/test_where.out",
     "./bin/test_topk.out"
 ]
 
+EXECUTABLE_PATHS_GPU = [
+    "./build/test_transpose",
+    "./build/test_concat",
+    "./build/test_where",
+    "./build/test_topk"
+]
+
 BENCHMARK_SIZES = [
     512,
     1024,
@@ -18,7 +26,7 @@
     4096
 ]
 
-def build_kernel_tests():
+def build_kernel_tests_cpu():
     """
     Calls the Makefile to build the kernel tests.
     Returns True if successful, False otherwise.
@@ -31,7 +39,7 @@ def build_kernel_tests():
             return False
 
         # Run 'make'.
-        subprocess.run(["make"], check=True)
+        subprocess.run(["make", "-j8"], check=True)
 
         print("Build successful\n")
         return True
@@ -43,6 +51,34 @@ def build_kernel_tests():
         print("Error: 'make' command not found. Is it installed?")
         return False
 
+
+def build_kernel_tests_gpu():
+    """
+    Configures and builds the kernel tests with CMake into ./build.
+    Returns True if successful, False otherwise.
+    """
+    print("Building kernel tests with CMake...")
+    try:
+        # 'cmake -S.' reads CMakeLists.txt from the current directory
+        if not os.path.exists("CMakeLists.txt"):
+            print("Error: CMakeLists.txt not found in current directory")
+            return False
+
+        # Configure into ./build, then build with 8 parallel jobs
+        subprocess.run(["cmake", "-S.", "-Bbuild"], check=True)
+        subprocess.run(["cmake", "--build", "build", "-j8"], check=True)
+
+        print("Build successful\n")
+        return True
+
+    except subprocess.CalledProcessError:
+        print("Build failed. Please fix C++ errors before running benchmarks")
+        return False
+    except FileNotFoundError:
+        print("Error: 'cmake' command not found. Is it installed?")
+        return False
+
+
 def run_benchmark(executable_path, args):
     """
     Runs the compiled executable with arguments.
@@ -76,50 +112,102 @@ def run_benchmark(executable_path, args):
         print(f"Execution failed with return code {e.returncode}")
         print("Stderr:", e.stderr)
 
-def main():
-    # Build Phase
-    if not build_kernel_tests():
-        sys.exit(1)
 
-    print("Bandwidth calculated based on kernel execution time only")
+def _approx_bytes_moved(executable_path, N):
+    """
+    Approximates how many bytes one kernel run moves for an NxN input.
+
+    The rule is picked by test name, so "./bin/test_where.out" and
+    "./build/test_where" share the same formula.
+
+    Args:
+        executable_path: path of the benchmark executable.
+        N: side length of the square input.
+
+    Returns:
+        The byte count, or 0.0 when the executable has no known rule.
+    """
+    test_name = os.path.splitext(os.path.basename(executable_path))[0]
+
+    # NOTE(review): the per-element factors presumably count 4-byte values
+    # read and written by each kernel - confirm against the kernels
+    if test_name == "test_transpose":
+        return 8 * N * N
+    if test_name == "test_concat":
+        return 24 * N * N
+    if test_name == "test_where":
+        return 13 * N * N
+    if test_name == "test_topk":
+        k = 4
+        return 4 * N * N + 4 * N * k
+
+    # Unknown executable: the caller then reports a bandwidth of 0
+    return 0.0
+
+
+def main(gpu = False):
+    """
+    Builds the kernel tests, then benchmarks every executable at every size
+    in BENCHMARK_SIZES and prints one result table per executable.
+
+    Args:
+        gpu: if True, build with CMake and run EXECUTABLE_PATHS_GPU;
+            otherwise build with the Makefile and run EXECUTABLE_PATHS_CPU.
+
+    Exits the process with status 1 when the build step fails.
+    """
+    # The two back ends differ only in the build step and in the list of
+    # executables; everything after that is shared.
+    if gpu:
+        build_kernel_tests = build_kernel_tests_gpu
+        executable_paths = EXECUTABLE_PATHS_GPU
+    else:
+        build_kernel_tests = build_kernel_tests_cpu
+        executable_paths = EXECUTABLE_PATHS_CPU
+
+    # Build Phase
+    if not build_kernel_tests():
+        sys.exit(1)
+
+    print("Bandwidth calculated based on kernel execution time only")
+
+    # Benchmark Phase
+    for executable_path in executable_paths:
+        print(f"Benchmarking {executable_path}")
+        print(f"{'SIZE (NxN)':<12} | {'KERNEL (ms)':<12} | {'TOTAL (ms)':<12} | {'BANDWIDTH (GB/s)':<18}")
+        print("-" * 65)
+
+        for N in BENCHMARK_SIZES:
+            res = run_benchmark(executable_path, [N])
+            if not res:
+                # No timings for this size: leave the row out of the table
+                continue
+
+            k_ms, t_ms = res
+
+            # Bandwidth Calculation (approximate)
+            total_bytes = _approx_bytes_moved(executable_path, N)
+
+            # GB/s = (Bytes / 1e9) / (Seconds)
+            # Time is in ms, so divide by 1000.0
+            if k_ms > 0:
+                bandwidth_gbs = (total_bytes / 1e9) / (k_ms / 1000.0)
+            else:
+                bandwidth_gbs = 0.0
+
+            print(f"{N:<12} | {k_ms:<12.4f} | {t_ms:<12.4f} | {bandwidth_gbs:<18.4f}")
+
+        print("-" * 65)
+
+        # Blank line between tables, but not after the last one
+        if executable_path != executable_paths[-1]:
+            print("")
 
-    # Benchmark Phase
-    for EXECUTABLE_PATH in EXECUTABLE_PATHS:
-        print(f"Benchmarking {EXECUTABLE_PATH}")
-        print(f"{'SIZE (NxN)':<12} | {'KERNEL (ms)':<12} | {'TOTAL (ms)':<12} | {'BANDWIDTH (GB/s)':<18}")
-        print("-" * 65)
 
-        for N in BENCHMARK_SIZES:
-            res = run_benchmark(EXECUTABLE_PATH, [N])
-            if res:
-                k_ms, t_ms = res
-
-                # Bandwidth Calculation (approximate)
-                total_bytes = 0.0
-
-                if EXECUTABLE_PATH == "./bin/test_transpose.out":
-                    total_bytes = 8 * N * N
-                elif EXECUTABLE_PATH == "./bin/test_concat.out":
-                    total_bytes = 24 * N * N
-                elif EXECUTABLE_PATH == "./bin/test_where.out":
-                    total_bytes = 13 * N * N
-                elif EXECUTABLE_PATH == "./bin/test_topk.out":
-                    k = 4
-                    total_bytes = 4 * N * N + 4 * N * k
-
-                # GB/s = (Bytes / 1e9) / (Seconds)
-                # Time is in ms, so divide by 1000.0
-                if k_ms > 0:
-                    bandwidth_gbs = (total_bytes / 1e9) / (k_ms / 1000.0)
-                else:
-                    bandwidth_gbs = 0.0
-
-                print(f"{N:<12} | {k_ms:<12.4f} | {t_ms:<12.4f} | {bandwidth_gbs:<18.4f}")
-
-        print("-" * 65)
+if __name__ == "__main__":
 
-        if EXECUTABLE_PATH != EXECUTABLE_PATHS[-1]:
-            print("")
+    parser = argparse.ArgumentParser(description='Benchmark runner')  # CLI: a single switch selects the back end
+    parser.add_argument('--gpu', help='Build with CMake and benchmark the GPU executables in ./build (default: Makefile build of the CPU executables in ./bin)', action='store_true')
+    args = parser.parse_args()
 
-if __name__ == "__main__":
-    main()
+    main(args.gpu)
diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index 74d3e59..200166c 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -144,18 +144,21 @@ int main(int argc, char* argv[]) {
 
     auto output_strides = alpaka::Vec<Dim, Idx>(out_cols, 1);
 
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+
+    std::size_t threadsX = 1;
+    std::size_t threadsY = 1;
+    std::size_t blocksX = 64;
+    std::size_t blocksY = 1;
+
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+
     // Work division: 2D mapping of threads to elements
-    std::size_t threadsX = 16, threadsY = 16;
+    std::size_t threadsX = 4, threadsY = 4;
     std::size_t blocksX = (out_rows + threadsX - 1) / threadsX;
     std::size_t blocksY = (out_cols + threadsY - 1) / threadsY;
 
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
-    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
-
-    threadsX = 1;
-    threadsY = 1;
-    blocksX = 64;
-    blocksY = 1;
 #endif
 
     auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksY, blocksX),
@@ -167,14 +170,16 @@ int main(int argc, char* argv[]) {
     for (std::size_t k = 0; k < NumInputs; ++k) {
         // 2) host -> accelerator
         {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-            // For GPU, use cudaMemcpy directly
-            T* pAIn = alpaka::getPtrNative(aIn_bufs.back());
-            T* pHIn = alpaka::getPtrNative(hIn_bufs.back());
-            cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
-#else
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
             // For CPU, use memcpy
             alpaka::memcpy(queue, aIn_bufs[k], hIn_bufs[k]);
+
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+            // For GPU, use cudaMemcpy directly
+            T* pAIn = alpaka::getPtrNative(aIn_bufs[k]);
+            T* pHIn = alpaka::getPtrNative(hIn_bufs[k]);
+            cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
 #endif
         }
     }
@@ -192,12 +197,14 @@ int main(int argc, char* argv[]) {
 
     {
         // Final data transfer: accelerator -> host
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+        alpaka::memcpy(queue, hOut, aOut);
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAOut = alpaka::getPtrNative(aOut);
         T* pHOut = alpaka::getPtrNative(hOut);
         cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
 #else
-        alpaka::memcpy(queue, hOut, aOut);
 #endif
     }
     auto end_total = now();
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 3bb0a8e..1816e7a 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -101,14 +101,18 @@ int main(int argc, char* argv[]) {
 
     alpaka::Vec<Dim, Idx> threadsPerBlock;
     alpaka::Vec<Dim, Idx> blocksPerGrid;
-    Idx TARGET_BLOCK_SIZE = 16;
-    bool limitBlocks = false;
 
 #if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
     defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 
-    TARGET_BLOCK_SIZE = 1;
-    limitBlocks = true;
+    Idx TARGET_BLOCK_SIZE = 1;
+    bool limitBlocks = true;
+
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+
+    Idx TARGET_BLOCK_SIZE = 16;
+    bool limitBlocks = false;
+
 #endif
 
     for (std::size_t d = 0; d < Dim::value; ++d) {
@@ -138,14 +142,15 @@ int main(int argc, char* argv[]) {
     // 2) host -> accelerator
     auto start_total = now();
     {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+        // For CPU, use memcpy
+        alpaka::memcpy(queue, aIn, hIn);
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         // For GPU, use cudaMemcpy directly
         T* pAIn = alpaka::getPtrNative(aIn);
         T* pHIn = alpaka::getPtrNative(hIn);
         cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
-#else
-        // For CPU, use memcpy
-        alpaka::memcpy(queue, aIn, hIn);
 #endif
     }
 
@@ -162,12 +167,13 @@ int main(int argc, char* argv[]) {
 
     // Final data transfer: accelerator -> host
     {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+        alpaka::memcpy(queue, hOut, aOut);
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAOut = alpaka::getPtrNative(aOut);
         T* pHOut = alpaka::getPtrNative(hOut);
         cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
-#else
-        alpaka::memcpy(queue, hOut, aOut);
 #endif
     }
     auto end_total = now();
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index 62ef923..0be9ba3 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -18,13 +18,13 @@ using QueueAcc = alpaka::Queue<Acc, alpaka::NonBlocking>;
 
 #elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
 using DevAcc = alpaka::DevCpu;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
 using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
 
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
 using DevAcc = alpaka::DevCpu;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
 using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
 
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 using DevAcc = alpaka::DevCpu;
@@ -94,18 +94,21 @@ int main(int argc, char* argv[]) {
     // For transpose out[j,i] = in[i,j], so perm = {1,0}
     auto perm = alpaka::Vec<Dim, Idx>(1, 0);
 
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+
+    std::size_t threadsX = 1;
+    std::size_t threadsY = 1;
+    std::size_t blocksX = 64;
+    std::size_t blocksY = 1;
+
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+
     // Work division: 2D mapping of threads to elements
     std::size_t threadsX = 16, threadsY = 16;
     std::size_t blocksX = (cols + threadsX - 1) / threadsX;
     std::size_t blocksY = (rows + threadsY - 1) / threadsY;
 
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
-    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
-
-    threadsX = 1;
-    threadsY = 1;
-    blocksX = 64;
-    blocksY = 1;
 #endif
 
     auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksY, blocksX),
@@ -119,15 +122,17 @@ int main(int argc, char* argv[]) {
     }
 
     // 2) host -> accelerator
+    auto start_total = now();
     {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+        // For CPU, use memcpy
+        alpaka::memcpy(queue, aIn, hIn);
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         // For GPU, use cudaMemcpy directly
         T* pAIn = alpaka::getPtrNative(aIn);
         T* pHIn = alpaka::getPtrNative(hIn);
         cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
-#else
-        // For CPU, use memcpy
-        alpaka::memcpy(queue, aIn, hIn);
 #endif
     }
 
@@ -144,12 +149,13 @@ int main(int argc, char* argv[]) {
 
     // Final data transfer: accelerator -> host
     {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+        alpaka::memcpy(queue, hOut, aOut);
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAOut = alpaka::getPtrNative(aOut);
         T* pHOut = alpaka::getPtrNative(hOut);
         cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
-#else
-        alpaka::memcpy(queue, hOut, aOut);
 #endif
     }
     auto end_total = now();
diff --git a/tests/test_where.cpp b/tests/test_where.cpp
index ae5829b..a9bfc80 100644
--- a/tests/test_where.cpp
+++ b/tests/test_where.cpp
@@ -98,18 +98,21 @@ int main(int argc, char* argv[]) {
     // Prepare kernel arguments
     auto strides = alpaka::Vec<Dim, Idx>(cols, 1);
 
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+
+    std::size_t threadsX = 1;
+    std::size_t threadsY = 1;
+    std::size_t blocksX = 64;
+    std::size_t blocksY = 1;
+
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+
     // Work division: 2D mapping of threads to elements
     std::size_t threadsX = 16, threadsY = 16;
     std::size_t blocksX = (cols + threadsX - 1) / threadsX;
     std::size_t blocksY = (rows + threadsY - 1) / threadsY;
 
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
-    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
-
-    threadsX = 1;
-    threadsY = 1;
-    blocksX = 64;
-    blocksY = 1;
 #endif
 
     auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
@@ -131,7 +134,13 @@ int main(int argc, char* argv[]) {
     // 2) host -> accelerator
     auto start_total = now();
     {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+        // For CPU, use memcpy
+        alpaka::memcpy(queue, aIn_X, hIn_X);
+        alpaka::memcpy(queue, aIn_Y, hIn_Y);
+        alpaka::memcpy(queue, aIn_Cond, hIn_Cond);
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         // For GPU, use cudaMemcpy directly
         T* pAIn_X = alpaka::getPtrNative(aIn_X);
         T* pAIn_Y = alpaka::getPtrNative(aIn_Y);
@@ -142,11 +151,6 @@ int main(int argc, char* argv[]) {
         cudaMemcpy(pAIn_X, pHIn_X, numElems * sizeof(T), cudaMemcpyHostToDevice);
         cudaMemcpy(pAIn_Y, pHIn_Y, numElems * sizeof(T), cudaMemcpyHostToDevice);
         cudaMemcpy(pAIn_Cond, pHIn_Cond, numElems * sizeof(T), cudaMemcpyHostToDevice);
-#else
-        // For CPU, use memcpy
-        alpaka::memcpy(queue, aIn_X, hIn_X);
-        alpaka::memcpy(queue, aIn_Y, hIn_Y);
-        alpaka::memcpy(queue, aIn_Cond, hIn_Cond);
 #endif
     }
 
@@ -164,12 +168,13 @@ int main(int argc, char* argv[]) {
 
     // Final data transfer: accelerator -> host
     {
-#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+        alpaka::memcpy(queue, hOut, aOut);
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAOut = alpaka::getPtrNative(aOut);
         T* pHOut = alpaka::getPtrNative(hOut);
         cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
-#else
-        alpaka::memcpy(queue, hOut, aOut);
 #endif
     }
     auto end_total = now();

From d8553ea861857b13fa4ba62d0f7dc85d5faeb0b9 Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Wed, 17 Dec 2025 16:05:45 +0100
Subject: [PATCH 25/33] PyTorch!

---
 run.py       | 194 +++++++++++++++++++++++++++++++++---------
 run_torch.py | 235 ---------------------------------------------------
 2 files changed, 156 insertions(+), 273 deletions(-)
 delete mode 100644 run_torch.py

diff --git a/run.py b/run.py
index 27a00d4..2ea30ff 100644
--- a/run.py
+++ b/run.py
@@ -2,13 +2,21 @@
 import sys
 import os
 import re
+import time
+
+try:
+    import torch
+    HAS_TORCH = True
+except:
+    HAS_TORCH = False
+    print("PyTorch not found, running only C++.\n")
 
 # Configuration
 EXECUTABLE_PATHS = [
     "./bin/test_trivial.out",
     "./bin/test_concat.out",
-    "./bin/test_transpose.out",
     "./bin/test_topk.out",
+    "./bin/test_transpose.out",
     "./bin/test_where.out"
 ]
 
@@ -45,7 +53,86 @@ def build_kernel_tests():
         print("Error: 'make' command not found. Is it installed?")
         return False
 
-def run_benchmark(executable_path, args):
+def get_op_name(executable_path):
+    if "trivial" in executable_path: return "trivial"
+    if "transpose" in executable_path: return "transpose"
+    if "concat" in executable_path: return "concat"
+    if "where" in executable_path: return "where"
+    if "topk" in executable_path: return "topk"
+    return "unknown"
+
+def run_pytorch_benchmark(op_name, N, num_repeats=1, warmup=0):
+    """
+    Runs the equivalent operation in PyTorch and measures execution time.
+    Compatible with both CPU and GPU.
+    """
+    if not HAS_TORCH:
+        return None
+
+    # Detect device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Setup Data
+    if op_name == "trivial":
+        x = torch.randn(N, N, device=device, dtype=torch.float32)
+        op = lambda: x.clone()
+
+    elif op_name == "transpose":
+        x = torch.randn(N, N, device=device, dtype=torch.float32)
+        op = lambda: x.t().contiguous()
+
+    elif op_name == "concat":
+        t1 = torch.randn(N, N, device=device, dtype=torch.float32)
+        t2 = torch.randn(N, N, device=device, dtype=torch.float32)
+        t3 = torch.randn(N, N, device=device, dtype=torch.float32)
+        op = lambda: torch.cat((t1, t2, t3), dim=1)
+
+    elif op_name == "where":
+        cond = torch.randint(0, 2, (N, N), device=device, dtype=torch.bool)
+        x = torch.randn(N, N, device=device, dtype=torch.float32)
+        y = torch.randn(N, N, device=device, dtype=torch.float32)
+        op = lambda: torch.where(cond, x, y)
+
+    elif op_name == "topk":
+        k = 4
+        x = torch.randn(N, N, device=device, dtype=torch.float32)
+        op = lambda: torch.topk(x, k)
+    else:
+        return None
+
+    '''
+    # Warmup
+    for _ in range(warmup):
+        op()
+    '''
+
+    if device.type == 'cuda':
+        torch.cuda.synchronize()
+
+    #  Benchmarking
+    if device.type == 'cuda':
+        # GPU Timing (Asynchronous)
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        start_event.record()
+        for _ in range(num_repeats):
+            op()
+        end_event.record()
+        torch.cuda.synchronize()
+        total_ms = start_event.elapsed_time(end_event)
+
+    else:
+        # CPU Timing (Synchronous)
+        start_time = time.perf_counter()
+        for _ in range(num_repeats):
+            op()
+        end_time = time.perf_counter()
+        total_ms = (end_time - start_time) * 1000.0 # convert seconds to ms
+
+    return total_ms / num_repeats
+
+def run_cpp_benchmark(executable_path, args):
     """
     Runs the compiled executable with arguments.
     """
@@ -79,49 +166,80 @@ def run_benchmark(executable_path, args):
         print("Stderr:", e.stderr)
 
 def main():
-    # Build Phase
+    # Build System
+    print(f"\n{'Build System':^80}")
+    print("-" * 80)
     if not build_kernel_tests():
         sys.exit(1)
 
-    print("Bandwidth calculated based on kernel execution time only")
+    # Benchmarking System
+    device_name = "CPU"
+    if HAS_TORCH and torch.cuda.is_available():
+        device_name = f"GPU ({torch.cuda.get_device_name(0)})"
+
+    print(f"\n{'Benchmarking System':^80}")
+    print(f"{f'PyTorch Device: {device_name}':^80}")
+    print("-" * 80)
 
-    # Benchmark Phase
     for EXECUTABLE_PATH in EXECUTABLE_PATHS:
-        print(f"Benchmarking {EXECUTABLE_PATH}")
-        print(f"{'SIZE (NxN)':<12} | {'KERNEL (ms)':<12} | {'TOTAL (ms)':<12} | {'BANDWIDTH (GB/s)':<18}")
-        print("-" * 65)
+        op_name = get_op_name(EXECUTABLE_PATH)
+        print(f"Operation: {op_name.upper()}")
+
+        # Flexible Headers
+        # K = Kernel Time, T = Total Time
+        if HAS_TORCH:
+            header = (f"{'SIZE':<6} | {'CPP(K)':<9} | {'CPP(T)':<9} | {'TORCH':<9} | "
+                      f"{'CPP GB/s':<9} | {'TORCH GB/s':<11} | {'SPEEDUP':<8}")
+        else:
+            header = (f"{'SIZE':<6} | {'CPP(K)':<10} | {'CPP(T)':<10} | {'CPP GB/s':<12}")
+
+        print(header)
+        print("-" * len(header))
 
         for N in BENCHMARK_SIZES:
-            res = run_benchmark(EXECUTABLE_PATH, [N])
-            if res:
-                k_ms, t_ms = res
-
-                # Bandwidth Calculation (approximate)
-                total_bytes = 0.0
-
-                if EXECUTABLE_PATH == "./bin/test_trivial.out":
-                    total_bytes = 8 * N * N
-                if EXECUTABLE_PATH == "./bin/test_transpose.out":
-                    total_bytes = 8 * N * N
-                elif EXECUTABLE_PATH == "./bin/test_concat.out":
-                    concat_num = 3
-                    total_bytes = 8 * concat_num * N * N
-                elif EXECUTABLE_PATH == "./bin/test_where.out":
-                    total_bytes = 13 * N * N
-                elif EXECUTABLE_PATH == "./bin/test_topk.out":
-                    k = 4
-                    total_bytes = 4 * N * N + 4 * N * k
-
-                # GB/s = (Bytes / 1e9) / (Seconds)
-                # Time is in ms, so divide by 1000.0
-                if k_ms > 0:
-                    bandwidth_gbs = (total_bytes / 1e9) / (k_ms / 1000.0)
-                else:
-                    bandwidth_gbs = 0.0
-
-                print(f"{N:<12} | {k_ms:<12.4f} | {t_ms:<12.4f} | {bandwidth_gbs:<18.4f}")
-
-        print("-" * 65)
+            # 1. Run C++ Benchmark
+            cpp_res = run_cpp_benchmark(EXECUTABLE_PATH, [N])
+            if cpp_res:
+                cpp_k_ms, cpp_t_ms = cpp_res
+            else:
+                cpp_k_ms, cpp_t_ms = None, None
+
+            # 2. Run PyTorch Benchmark
+            torch_ms = run_pytorch_benchmark(op_name, N) if HAS_TORCH else None
+
+            # 3. Calculate Bandwidth (Using Kernel Time)
+            total_bytes = 0.0
+            if op_name == "trivial": total_bytes = 8 * N * N
+            elif op_name == "transpose": total_bytes = 8 * N * N
+            elif op_name == "concat":  total_bytes = 24 * N * N
+            elif op_name == "where":   total_bytes = 13 * N * N
+            elif op_name == "topk":    total_bytes = 4 * N * N + 16 * N
+
+            # GB/s = (Bytes/1e9) / (ms/1000)
+            cpp_bw = (total_bytes / 1e9) / (cpp_k_ms / 1000.0) if (cpp_k_ms and cpp_k_ms > 0) else 0.0
+            torch_bw = (total_bytes / 1e9) / (torch_ms / 1000.0) if (torch_ms and torch_ms > 0) else 0.0
+
+            # Formatting
+            c_k_str = f"{cpp_k_ms:.4f}" if cpp_k_ms else "ERR"
+            c_t_str = f"{cpp_t_ms:.4f}" if cpp_t_ms else "ERR"
+            c_bw_str = f"{cpp_bw:.2f}" if cpp_k_ms else "-"
+
+            if HAS_TORCH:
+                t_ms_str = f"{torch_ms:.4f}" if torch_ms else "ERR"
+                t_bw_str = f"{torch_bw:.2f}" if torch_ms else "-"
+
+                # Compare PyTorch Time vs C++ Kernel Time
+                speedup_str = "-"
+                if cpp_k_ms and torch_ms and cpp_k_ms > 0:
+                    ratio = torch_ms / cpp_k_ms
+                    speedup_str = f"{ratio:.2f}x"
+
+                print(f"{N:<6} | {c_k_str:<9} | {c_t_str:<9} | {t_ms_str:<9} | "
+                      f"{c_bw_str:<9} | {t_bw_str:<11} | {speedup_str:<8}")
+            else:
+                print(f"{N:<6} | {c_k_str:<10} | {c_t_str:<10} | {c_bw_str:<12}")
+
+        print("-" * len(header))
 
         if EXECUTABLE_PATH != EXECUTABLE_PATHS[-1]:
             print("")
diff --git a/run_torch.py b/run_torch.py
deleted file mode 100644
index db33ea4..0000000
--- a/run_torch.py
+++ /dev/null
@@ -1,235 +0,0 @@
-import subprocess
-import sys
-import os
-import re
-import time
-
-try:
-    import torch
-    HAS_TORCH = True
-except:
-    HAS_TORCH = False
-    print("PyTorch not found, running only C++.\n")
-
-# Configuration
-EXECUTABLE_PATHS = [
-    "./bin/test_transpose.out",
-    "./bin/test_concat.out",
-    "./bin/test_where.out",
-    "./bin/test_topk.out"
-]
-
-BENCHMARK_SIZES = [
-    512,
-    1024,
-    2048,
-    4096
-]
-
-def build_kernel_tests():
-    """
-    Calls the Makefile to build the kernel tests.
-    Returns True if successful, False otherwise.
-    """
-    print("Building kernel tests with Make...")
-    try:
-        # Check if Makefile exists
-        if not os.path.exists("Makefile"):
-            print("Error: Makefile not found in current directory")
-            return False
-
-        # Run 'make'.
-        subprocess.run(["make"], check=True)
-
-        print("Build successful\n")
-        return True
-
-    except subprocess.CalledProcessError:
-        print("Build failed. Please fix C++ errors before running benchmarks")
-        return False
-    except FileNotFoundError:
-        print("Error: 'make' command not found. Is it installed?")
-        return False
-
-def get_op_name(executable_path):
-    if "transpose" in executable_path: return "transpose"
-    if "concat" in executable_path: return "concat"
-    if "where" in executable_path: return "where"
-    if "topk" in executable_path: return "topk"
-    return "unknown"
-
-def run_pytorch_benchmark(op_name, N, num_repeats=10, warmup=5):
-    """
-    Runs the equivalent operation in PyTorch and measures execution time.
-    Compatible with both CPU and GPU.
-    """
-    if not HAS_TORCH:
-        return None
-
-    # Detect device
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    # Setup Data
-    if op_name == "transpose":
-        x = torch.randn(N, N, device=device, dtype=torch.float32)
-        op = lambda: x.t().contiguous()
-
-    elif op_name == "concat":
-        t1 = torch.randn(N, N, device=device, dtype=torch.float32)
-        t2 = torch.randn(N, N, device=device, dtype=torch.float32)
-        t3 = torch.randn(N, N, device=device, dtype=torch.float32)
-        op = lambda: torch.cat((t1, t2, t3), dim=1)
-
-    elif op_name == "where":
-        cond = torch.randint(0, 2, (N, N), device=device, dtype=torch.bool)
-        x = torch.randn(N, N, device=device, dtype=torch.float32)
-        y = torch.randn(N, N, device=device, dtype=torch.float32)
-        op = lambda: torch.where(cond, x, y)
-
-    elif op_name == "topk":
-        k = 4
-        x = torch.randn(N, N, device=device, dtype=torch.float32)
-        op = lambda: torch.topk(x, k)
-    else:
-        return None
-
-    '''
-    # Warmup
-    for _ in range(warmup):
-        op()
-    '''
-
-    if device.type == 'cuda':
-        torch.cuda.synchronize()
-
-    #  Benchmarking
-    if device.type == 'cuda':
-        # GPU Timing (Asynchronous)
-        start_event = torch.cuda.Event(enable_timing=True)
-        end_event = torch.cuda.Event(enable_timing=True)
-
-        start_event.record()
-        for _ in range(num_repeats):
-            op()
-        end_event.record()
-        torch.cuda.synchronize()
-        total_ms = start_event.elapsed_time(end_event)
-
-    else:
-        # CPU Timing (Synchronous)
-        start_time = time.perf_counter()
-        for _ in range(num_repeats):
-            op()
-        end_time = time.perf_counter()
-        total_ms = (end_time - start_time) * 1000.0 # convert seconds to ms
-
-    return total_ms / num_repeats
-
-def run_cpp_benchmark(executable_path, args):
-    """
-    Runs the compiled executable with arguments.
-    """
-    if not os.path.exists(executable_path):
-        print(f"Error: Executable '{executable_path}' not found after build")
-        return
-
-    N = args[0]
-
-    try:
-        # Construct the command
-        cmd = [executable_path] + [str(a) for a in args]
-
-        # Run and capture output for parsing
-        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
-        output = result.stdout
-
-        kernel_match = re.search(r"TIME_KERNEL_MS:\s+(\d+\.?\d*)", output)
-        total_match = re.search(r"TIME_TOTAL_MS:\s+(\d+\.?\d*)", output)
-
-        if kernel_match and total_match:
-            return float(kernel_match.group(1)), float(total_match.group(1))
-        else:
-            print(f"Output parsing failed for size {N}x{N}.")
-            print("Printing raw output from the cpp executable")
-            print(output)
-            return None
-
-    except subprocess.CalledProcessError as e:
-        print(f"Execution failed with return code {e.returncode}")
-        print("Stderr:", e.stderr)
-
-def main():
-    build_kernel_tests()
-
-    device_name = "CPU"
-    if HAS_TORCH and torch.cuda.is_available():
-        device_name = f"GPU ({torch.cuda.get_device_name(0)})"
-
-    print(f"\n{'Benchmarking System':^100}")
-    print(f"{f'PyTorch Device: {device_name}':^100}")
-    print("-" * 100)
-
-    for EXECUTABLE_PATH in EXECUTABLE_PATHS:
-        op_name = get_op_name(EXECUTABLE_PATH)
-        print(f"Operation: {op_name.upper()}")
-
-        # --- Flexible Headers ---
-        # K = Kernel Time, T = Total Time
-        if HAS_TORCH:
-            header = (f"{'SIZE':<6} | {'CPP(K)':<9} | {'CPP(T)':<9} | {'TORCH':<9} | "
-                      f"{'CPP GB/s':<9} | {'TORCH GB/s':<11} | {'SPEEDUP':<8}")
-        else:
-            header = (f"{'SIZE':<6} | {'CPP(K)':<10} | {'CPP(T)':<10} | {'CPP GB/s':<12}")
-
-        print(header)
-        print("-" * len(header))
-
-        for N in BENCHMARK_SIZES:
-            # 1. Run C++ Benchmark
-            cpp_res = run_cpp_benchmark(EXECUTABLE_PATH, [N])
-            if cpp_res:
-                cpp_k_ms, cpp_t_ms = cpp_res
-            else:
-                cpp_k_ms, cpp_t_ms = None, None
-
-            # 2. Run PyTorch Benchmark
-            torch_ms = run_pytorch_benchmark(op_name, N) if HAS_TORCH else None
-
-            # 3. Calculate Bandwidth (Using Kernel Time)
-            total_bytes = 0.0
-            if op_name == "transpose": total_bytes = 8 * N * N
-            elif op_name == "concat":  total_bytes = 24 * N * N
-            elif op_name == "where":   total_bytes = 13 * N * N
-            elif op_name == "topk":    total_bytes = 4 * N * N + 16 * N # approx topk
-
-            # GB/s = (Bytes/1e9) / (ms/1000)
-            cpp_bw = (total_bytes / 1e9) / (cpp_k_ms / 1000.0) if (cpp_k_ms and cpp_k_ms > 0) else 0.0
-            torch_bw = (total_bytes / 1e9) / (torch_ms / 1000.0) if (torch_ms and torch_ms > 0) else 0.0
-
-            # Formatting
-            c_k_str = f"{cpp_k_ms:.4f}" if cpp_k_ms else "ERR"
-            c_t_str = f"{cpp_t_ms:.4f}" if cpp_t_ms else "ERR"
-            c_bw_str = f"{cpp_bw:.2f}" if cpp_k_ms else "-"
-
-            if HAS_TORCH:
-                t_ms_str = f"{torch_ms:.4f}" if torch_ms else "ERR"
-                t_bw_str = f"{torch_bw:.2f}" if torch_ms else "-"
-
-                # Compare PyTorch Time vs C++ Kernel Time
-                speedup_str = "-"
-                if cpp_k_ms and torch_ms and cpp_k_ms > 0:
-                    ratio = torch_ms / cpp_k_ms
-                    # If ratio < 1.0, PyTorch is faster. If > 1.0, C++ is faster.
-                    # Usually 'Speedup' means (Baseline / New), so let's do (Torch / CPP)
-                    # ratio 0.5x means PyTorch took half the time of CPP
-                    speedup_str = f"{ratio:.2f}x"
-
-                print(f"{N:<6} | {c_k_str:<9} | {c_t_str:<9} | {t_ms_str:<9} | "
-                      f"{c_bw_str:<9} | {t_bw_str:<11} | {speedup_str:<8}")
-            else:
-                print(f"{N:<6} | {c_k_str:<10} | {c_t_str:<10} | {c_bw_str:<12}")
-
-        print("-" * 100)
-        print("")
-if __name__ == "__main__":
-    main()

From 5597b6b29606b210b9b46d80e192059c47521b37 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 17 Dec 2025 15:05:56 +0000
Subject: [PATCH 26/33] style: pre-commit fixes

---
 kernels/trivial.hpp      |  6 ++----
 tests/test_concat.cpp    | 39 ++++++++++++++-------------------------
 tests/test_topk.cpp      |  6 ++----
 tests/test_transpose.cpp | 24 +++++++++---------------
 tests/test_trivial.cpp   | 25 ++++++++++---------------
 tests/test_where.cpp     | 34 +++++++++++++---------------------
 6 files changed, 50 insertions(+), 84 deletions(-)

diff --git a/kernels/trivial.hpp b/kernels/trivial.hpp
index 4a0236c..e799152 100644
--- a/kernels/trivial.hpp
+++ b/kernels/trivial.hpp
@@ -7,12 +7,10 @@ namespace alpaka_kernels {
 
 struct TrivialKernel {
     template <typename TAcc, typename T, typename Dim, typename Idx>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output,
-                                  alpaka::Vec<Dim, Idx> output_strides,
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output, alpaka::Vec<Dim, Idx> output_strides,
                                   alpaka::Vec<Dim, Idx> output_shape) const {
         using DimAcc = alpaka::Dim<TAcc>;
-        static_assert(DimAcc::value == Dim::value,
-                      "Accelerator and data dimensions must match!");
+        static_assert(DimAcc::value == Dim::value, "Accelerator and data dimensions must match!");
 
         constexpr std::size_t D = Dim::value;
         auto elements = alpaka::uniformElementsND(acc, output_shape);
diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index 5be7e6c..de487b9 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -69,8 +69,7 @@ int main(int argc, char* argv[]) {
             val = cols;
             total_rows += val;
         }
-    }
-    else {
+    } else {
         std::cout << "Using random dimensions ";
         for (auto& val : in_rows) {
             val = distrib_int(gen);
@@ -79,8 +78,7 @@ int main(int argc, char* argv[]) {
     }
 
     for (std::size_t k = 0; k < NumInputs; ++k)
-        std::cout << in_rows[k] << "x" << cols
-                  << ((k < NumInputs - 1) ? ", " : "\n");
+        std::cout << in_rows[k] << "x" << cols << ((k < NumInputs - 1) ? ", " : "\n");
 
     std::array<std::vector<T>, NumInputs> INPUT;
     for (std::size_t k = 0; k < NumInputs; ++k) {
@@ -94,10 +92,8 @@ int main(int argc, char* argv[]) {
     alpaka::Queue<Acc, alpaka::Blocking> queue{devAcc};
 
     // Allocate buffers
-    using BufAcc =
-        decltype(alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim, Idx>{}));
-    using BufHost =
-        decltype(alpaka::allocBuf<T, Idx>(devHost, alpaka::Vec<Dim, Idx>{}));
+    using BufAcc = decltype(alpaka::allocBuf<T, Idx>(devAcc, alpaka::Vec<Dim, Idx>{}));
+    using BufHost = decltype(alpaka::allocBuf<T, Idx>(devHost, alpaka::Vec<Dim, Idx>{}));
 
     std::vector<BufAcc> aIn_bufs;
     aIn_bufs.reserve(NumInputs);
@@ -121,8 +117,7 @@ int main(int argc, char* argv[]) {
 
         // INPUT to host buffer data transfer (safe via raw pointers)
         T* pHost = alpaka::getPtrNative(hIn_bufs.back());
-        for (std::size_t i = 0; i < INPUT[k].size(); ++i)
-            pHost[i] = INPUT[k][i];
+        for (std::size_t i = 0; i < INPUT[k].size(); ++i) pHost[i] = INPUT[k][i];
     }
 
     // Allocate output buffers
@@ -161,16 +156,14 @@ int main(int argc, char* argv[]) {
     blocksY = 1;
 #endif
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
-        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
 
     // Warmup run
     ConcatKernel kernel;
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs,
-                      alpaka::getPtrNative(aOut), input_strides_vec,
-                      output_strides, extentOut, axis_sizes, ConcatAxis);
+    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs, alpaka::getPtrNative(aOut), input_strides_vec, output_strides,
+                      extentOut, axis_sizes, ConcatAxis);
 
     alpaka::wait(queue);
 
@@ -186,9 +179,8 @@ int main(int argc, char* argv[]) {
     // Launch kernel
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs,
-                      alpaka::getPtrNative(aOut), input_strides_vec,
-                      output_strides, extentOut, axis_sizes, ConcatAxis);
+    alpaka::exec<Acc>(queue, workDiv, kernel, aIn_ptrs, alpaka::getPtrNative(aOut), input_strides_vec, output_strides,
+                      extentOut, axis_sizes, ConcatAxis);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -202,8 +194,7 @@ int main(int argc, char* argv[]) {
     std::cout << "Output is of shape " << out_rows << "x" << out_cols << "\n";
 
     std::vector<T> expected;
-    for (const auto& vec : INPUT)
-        expected.insert(expected.end(), vec.begin(), vec.end());
+    for (const auto& vec : INPUT) expected.insert(expected.end(), vec.begin(), vec.end());
 
     {
         T* pHost = alpaka::getPtrNative(hOut);
@@ -217,10 +208,8 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms =
-        end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms =
-        end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 8d72621..23e5864 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -131,10 +131,8 @@ int main(int argc, char* argv[]) {
     // Warmup run
     TopKKernel<K, MaxRegisters> kernel;
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
-                      alpaka::getPtrNative(aOut), input_strides, output_strides,
-                      grid_elements, TopkAxis, extentIn[TopkAxis],
-                      padding_value);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), input_strides,
+                      output_strides, grid_elements, TopkAxis, extentIn[TopkAxis], padding_value);
 
     alpaka::wait(queue);
 
diff --git a/tests/test_transpose.cpp b/tests/test_transpose.cpp
index 80e9dc5..caa9016 100644
--- a/tests/test_transpose.cpp
+++ b/tests/test_transpose.cpp
@@ -60,8 +60,7 @@ int main(int argc, char* argv[]) {
         rows = std::atoi(argv[1]);
         cols = rows;
         std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
-    }
-    else {
+    } else {
         std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
     }
 
@@ -109,16 +108,14 @@ int main(int argc, char* argv[]) {
     blocksY = 1;
 #endif
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
-        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extentOut};
 
     // Warmup run
     TransposeKernel kernel;
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
-                      alpaka::getPtrNative(aOut), input_strides, output_strides,
-                      extentOut, perm);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), input_strides,
+                      output_strides, extentOut, perm);
 
     alpaka::wait(queue);
 
@@ -137,9 +134,8 @@ int main(int argc, char* argv[]) {
     // Launch kernel
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
-                      alpaka::getPtrNative(aOut), input_strides, output_strides,
-                      extentOut, perm);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), input_strides,
+                      output_strides, extentOut, perm);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -169,10 +165,8 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms =
-        end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms =
-        end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
diff --git a/tests/test_trivial.cpp b/tests/test_trivial.cpp
index 763b2b1..d959d7c 100644
--- a/tests/test_trivial.cpp
+++ b/tests/test_trivial.cpp
@@ -60,8 +60,7 @@ int main(int argc, char* argv[]) {
         rows = std::atoi(argv[1]);
         cols = rows;
         std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
-    }
-    else {
+    } else {
         std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
     }
 
@@ -94,8 +93,7 @@ int main(int argc, char* argv[]) {
     std::size_t blocksX = (cols + threadsX - 1) / threadsX;
     std::size_t blocksY = (rows + threadsY - 1) / threadsY;
 
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || \
-    defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
     defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 
     threadsX = 1;
@@ -104,14 +102,13 @@ int main(int argc, char* argv[]) {
     blocksY = 1;
 #endif
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
-        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
 
     // Warmup run
     TrivialKernel kernel;
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
-                      alpaka::getPtrNative(aOut), output_strides, extent);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), output_strides,
+                      extent);
 
     alpaka::wait(queue);
 
@@ -130,8 +127,8 @@ int main(int argc, char* argv[]) {
     // Launch kernel
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn),
-                      alpaka::getPtrNative(aOut), output_strides, extent);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn), alpaka::getPtrNative(aOut), output_strides,
+                      extent);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -161,10 +158,8 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms =
-        end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms =
-        end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;
diff --git a/tests/test_where.cpp b/tests/test_where.cpp
index 4ca2c84..0b72602 100644
--- a/tests/test_where.cpp
+++ b/tests/test_where.cpp
@@ -62,8 +62,7 @@ int main(int argc, char* argv[]) {
         rows = std::atoi(argv[1]);
         cols = rows;
         std::cout << "Using input dimensions " << rows << "x" << cols << "\n";
-    }
-    else {
+    } else {
         std::cout << "Using random dimensions " << rows << "x" << cols << "\n";
     }
 
@@ -74,8 +73,7 @@ int main(int argc, char* argv[]) {
 
     for (auto& val : INPUT_X) val = distrib_real(gen) * 100.0;
     for (auto& val : INPUT_Y) val = distrib_real(gen);
-    for (std::size_t i = 0; i < numElems; ++i)
-        INPUT_COND[i] = distrib_bool(gen);
+    for (std::size_t i = 0; i < numElems; ++i) INPUT_COND[i] = distrib_bool(gen);
 
     // Setup the accelerator, host and queue
     auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0u);
@@ -114,17 +112,15 @@ int main(int argc, char* argv[]) {
     blocksY = 1;
 #endif
 
-    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{
-        alpaka::Vec<Dim, Idx>(blocksX, blocksY),
-        alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
+    auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
+                                                          alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
 
     // Warmup run
     WhereKernel kernel;
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond),
-                      alpaka::getPtrNative(aIn_X), alpaka::getPtrNative(aIn_Y),
-                      alpaka::getPtrNative(aOut), strides, strides, strides,
-                      strides, extent);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond), alpaka::getPtrNative(aIn_X),
+                      alpaka::getPtrNative(aIn_Y), alpaka::getPtrNative(aOut), strides, strides, strides, strides,
+                      extent);
 
     alpaka::wait(queue);
 
@@ -151,10 +147,9 @@ int main(int argc, char* argv[]) {
     // Launch kernel
     auto start_kernel = now();
 
-    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond),
-                      alpaka::getPtrNative(aIn_X), alpaka::getPtrNative(aIn_Y),
-                      alpaka::getPtrNative(aOut), strides, strides, strides,
-                      strides, extent);
+    alpaka::exec<Acc>(queue, workDiv, kernel, alpaka::getPtrNative(aIn_Cond), alpaka::getPtrNative(aIn_X),
+                      alpaka::getPtrNative(aIn_Y), alpaka::getPtrNative(aOut), strides, strides, strides, strides,
+                      extent);
 
     alpaka::wait(queue);
     auto end_kernel = now();
@@ -172,8 +167,7 @@ int main(int argc, char* argv[]) {
         for (std::size_t i = 0; i < rows; ++i) {
             for (std::size_t j = 0; j < cols; ++j) {
                 T valOut = pHost[i * cols + j];
-                T valIn = INPUT_COND[i * cols + j] ? INPUT_X[i * cols + j]
-                                                   : INPUT_Y[i * cols + j];
+                T valIn = INPUT_COND[i * cols + j] ? INPUT_X[i * cols + j] : INPUT_Y[i * cols + j];
 
                 if (valIn != valOut) {
                     std::cerr << "Failed!\n";
@@ -185,10 +179,8 @@ int main(int argc, char* argv[]) {
 
     std::cout << "Correct!\n";
 
-    std::chrono::duration<double, std::milli> kernel_ms =
-        end_kernel - start_kernel;
-    std::chrono::duration<double, std::milli> total_ms =
-        end_total - start_total;
+    std::chrono::duration<double, std::milli> kernel_ms = end_kernel - start_kernel;
+    std::chrono::duration<double, std::milli> total_ms = end_total - start_total;
 
     std::cout << "TIME_KERNEL_MS: " << kernel_ms.count() << std::endl;
     std::cout << "TIME_TOTAL_MS: " << total_ms.count() << std::endl;

From 44618e424505c9e1cb8dd56144217f92782a5e4e Mon Sep 17 00:00:00 2001
From: Saransh Chopra <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 20:37:11 +0530
Subject: [PATCH 27/33] fix other kernels

---
 kernels/topk.hpp      | 144 ++++++++++++++++++++++++++++++------------
 run.py                |   2 +-
 tests/test_concat.cpp |   8 +--
 tests/test_topk.cpp   |   2 +-
 4 files changed, 110 insertions(+), 46 deletions(-)

diff --git a/kernels/topk.hpp b/kernels/topk.hpp
index 7dac153..4f65024 100644
--- a/kernels/topk.hpp
+++ b/kernels/topk.hpp
@@ -13,20 +13,56 @@ struct TopKKernel {
     ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output, alpaka::Vec<Dim, Idx> input_strides,
                                   alpaka::Vec<Dim, Idx> output_strides, alpaka::Vec<Dim, Idx> output_shape,
                                   Idx topk_axis, Idx topk_axis_size, T padding_value) const {
-        using DimAcc = alpaka::Dim<TAcc>;
-        static_assert(DimAcc::value == Dim::value, "Accelerator and data dimensions must match!");
-
         if constexpr (K == 0) return;
 
         constexpr std::size_t D = Dim::value;
-        auto elements = alpaka::uniformElementsND(acc, output_shape);
 
-        for (auto const& idx : elements) {
+        // Total number of output positions (excluding the K dimension)
+        // Each thread handles one position and finds top K along the topk_axis
+        Idx total_positions = 1;
+        for (std::size_t d = 0; d < D; ++d) {
+            if (d != static_cast<std::size_t>(topk_axis)) {
+                total_positions *= output_shape[d];
+            }
+        }
+
+        // Get global thread index and total threads
+        auto const threadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const threadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+
+        // Convert to linear thread index
+        Idx global_thread_idx = 0;
+        Idx stride = 1;
+        for (std::size_t d = 0; d < D; ++d) {
+            global_thread_idx += threadIdx[d] * stride;
+            stride *= threadExtent[d];
+        }
+
+        // Grid-stride loop over positions (each position is a slice along topk_axis)
+        for (Idx pos_idx = global_thread_idx; pos_idx < total_positions; pos_idx += threadExtent.prod()) {
+            // Convert linear position index to multi-dimensional index (excluding topk_axis)
+            Idx remaining = pos_idx;
+            alpaka::Vec<Dim, Idx> full_idx;
+
+            // Initialize full index
+            for (std::size_t d = 0; d < D; ++d) {
+                full_idx[d] = 0;
+            }
+
+            // Fill in coordinates for dimensions except topk_axis
+            for (int d = D - 1; d >= 0; --d) {
+                if (static_cast<std::size_t>(d) != static_cast<std::size_t>(topk_axis)) {
+                    Idx dim_size = output_shape[d];
+                    full_idx[d] = remaining % dim_size;
+                    remaining /= dim_size;
+                }
+            }
+
+            // Compute base indices for input and output
             Idx input_base_idx = 0;
             Idx output_base_idx = 0;
-
             for (std::size_t d = 0; d < D; ++d) {
-                Idx const coord = idx[d];
+                Idx const coord = full_idx[d];
                 input_base_idx += coord * input_strides[d];
                 output_base_idx += coord * output_strides[d];
             }
@@ -34,67 +70,95 @@ struct TopKKernel {
             Idx const input_topk_axis_stride = input_strides[topk_axis];
             Idx const output_topk_axis_stride = output_strides[topk_axis];
 
-            // Use registers
+            // Use registers for small K
             if constexpr (K <= MaxRegisters && K > 0) {
                 T top_vals[K];
                 Idx count = 0;
 
+                // Initialize with padding value
+                for (Idx i = 0; i < K; ++i) {
+                    top_vals[i] = padding_value;
+                }
+
+                // Process all elements along the topk_axis
                 for (Idx j = 0; j < topk_axis_size; ++j) {
                     Idx const input_idx = input_base_idx + (j * input_topk_axis_stride);
                     T const val = input[input_idx];
 
-                    if (count == K && val <= top_vals[K - 1]) continue;
-
-                    Idx insert_pos = 0;
-                    while (insert_pos < count && val <= top_vals[insert_pos]) {
-                        insert_pos++;
-                    }
-
-                    if (insert_pos < K) {
-                        Idx const last = std::min(count, K - 1);
-                        for (Idx s = last; s > insert_pos; --s) {
-                            top_vals[s] = top_vals[s - 1];
+                    if (count < K) {
+                        // Fill the array first
+                        Idx insert_pos = count;
+                        while (insert_pos > 0 && val < top_vals[insert_pos - 1]) {
+                            top_vals[insert_pos] = top_vals[insert_pos - 1];
+                            insert_pos--;
+                        }
+                        top_vals[insert_pos] = val;
+                        count++;
+                    } else if (val > top_vals[0]) {
+                        // Replace smallest element
+                        Idx insert_pos = 0;
+                        while (insert_pos < K - 1 && val > top_vals[insert_pos + 1]) {
+                            top_vals[insert_pos] = top_vals[insert_pos + 1];
+                            insert_pos++;
                         }
-
                         top_vals[insert_pos] = val;
-                        if (count < K) count++;
                     }
                 }
 
+                // Write the `count` values found largest-first; pad remaining slots (topk_axis_size < K)
                 for (Idx i = 0; i < K; ++i) {
                     Idx const output_idx = output_base_idx + (i * output_topk_axis_stride);
-                    output[output_idx] = (i < count) ? top_vals[i] : padding_value;
+                    output[output_idx] = (i < count) ? top_vals[count - 1 - i] : padding_value;
                 }
             }
-            // Use global memory
+            // Use global memory for large K
             else {
+                // Use output buffer as temporary storage
+                for (Idx i = 0; i < K; ++i) {
+                    Idx const output_idx = output_base_idx + (i * output_topk_axis_stride);
+                    output[output_idx] = padding_value;
+                }
+
                 Idx count = 0;
+
+                // Process all elements along the topk_axis
                 for (Idx j = 0; j < topk_axis_size; ++j) {
                     Idx const input_idx = input_base_idx + (j * input_topk_axis_stride);
                     T const val = input[input_idx];
 
-                    if (count == K && val <= output[output_base_idx + (K - 1) * output_topk_axis_stride]) continue;
-
-                    Idx insert_pos = 0;
-                    while (insert_pos < count) {
-                        if (val > output[output_base_idx + insert_pos * output_topk_axis_stride]) break;
-                        insert_pos++;
-                    }
-
-                    if (insert_pos < K) {
-                        Idx const last = std::min(count, K - 1);
-                        for (Idx s = last; s > insert_pos; --s) {
-                            output[output_base_idx + s * output_topk_axis_stride] =
-                                output[output_base_idx + (s - 1) * output_topk_axis_stride];
+                    if (count < K) {
+                        // Fill the output first
+                        Idx insert_pos = count;
+                        while (insert_pos > 0 &&
+                               val < output[output_base_idx + (insert_pos - 1) * output_topk_axis_stride]) {
+                            Idx src_idx = output_base_idx + (insert_pos - 1) * output_topk_axis_stride;
+                            Idx dst_idx = output_base_idx + insert_pos * output_topk_axis_stride;
+                            output[dst_idx] = output[src_idx];
+                            insert_pos--;
+                        }
+                        output[output_base_idx + insert_pos * output_topk_axis_stride] = val;
+                        count++;
+                    } else if (val > output[output_base_idx]) {
+                        // Replace smallest element (at position 0 since we store ascending)
+                        Idx insert_pos = 0;
+                        while (insert_pos < K - 1 &&
+                               val > output[output_base_idx + (insert_pos + 1) * output_topk_axis_stride]) {
+                            Idx src_idx = output_base_idx + (insert_pos + 1) * output_topk_axis_stride;
+                            Idx dst_idx = output_base_idx + insert_pos * output_topk_axis_stride;
+                            output[dst_idx] = output[src_idx];
+                            insert_pos++;
                         }
-
                         output[output_base_idx + insert_pos * output_topk_axis_stride] = val;
-                        if (count < K) count++;
                     }
                 }
 
-                for (Idx i = count; i < K; ++i) {
-                    output[output_base_idx + i * output_topk_axis_stride] = padding_value;
+                // Reverse only the `count` filled slots: largest first, trailing padding untouched
+                for (Idx i = 0; i < count / 2; ++i) {
+                    Idx idx1 = output_base_idx + i * output_topk_axis_stride;
+                    Idx idx2 = output_base_idx + (count - 1 - i) * output_topk_axis_stride;
+                    T temp = output[idx1];
+                    output[idx1] = output[idx2];
+                    output[idx2] = temp;
                 }
             }
         }
diff --git a/run.py b/run.py
index 2b3922e..da085d8 100644
--- a/run.py
+++ b/run.py
@@ -23,7 +23,7 @@
     512,
     1024,
     2048,
-    4096
+    4096,
 ]
 
 def build_kernel_tests_cpu():
diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index 200166c..ece9bd6 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -86,8 +86,6 @@ int main(int argc, char* argv[]) {
         for (auto& val : INPUT[k]) val = distrib_real(gen);
     }
 
-    const std::size_t numElems = total_rows * cols;
-
     // Setup the accelerator, host and queue
     auto devAcc = alpaka::getDevByIdx(PlatAcc{}, 0u);
     auto devHost = alpaka::getDevByIdx(PlatHost{}, 0u);
@@ -155,7 +153,7 @@ int main(int argc, char* argv[]) {
 #elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
 
     // Work division: 2D mapping of threads to elements
-    std::size_t threadsX = 4, threadsY = 4;
+    std::size_t threadsX = 16, threadsY = 16;
     std::size_t blocksX = (out_rows + threadsX - 1) / threadsX;
     std::size_t blocksY = (out_cols + threadsY - 1) / threadsY;
 
@@ -179,7 +177,8 @@ int main(int argc, char* argv[]) {
             // For GPU, use cudaMemcpy directly
             T* pAIn = alpaka::getPtrNative(aIn_bufs[k]);
             T* pHIn = alpaka::getPtrNative(hIn_bufs[k]);
-            cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
+            std::size_t numElems_k = in_rows[k] * cols;  // ACTUAL size of this buffer
+            cudaMemcpy(pAIn, pHIn, numElems_k * sizeof(T), cudaMemcpyHostToDevice);
 #endif
         }
     }
@@ -203,6 +202,7 @@ int main(int argc, char* argv[]) {
 #elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAOut = alpaka::getPtrNative(aOut);
         T* pHOut = alpaka::getPtrNative(hOut);
+        const std::size_t numElems = total_rows * cols;
         cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
 #else
 #endif
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 1816e7a..0bf2506 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -173,7 +173,7 @@ int main(int argc, char* argv[]) {
 #elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
         T* pAOut = alpaka::getPtrNative(aOut);
         T* pHOut = alpaka::getPtrNative(hOut);
-        cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
+        cudaMemcpy(pHOut, pAOut, rows * K * sizeof(T), cudaMemcpyDeviceToHost);
 #endif
     }
     auto end_total = now();

From 88fa53e15c53910ef048e91be0f01450b1577967 Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Wed, 17 Dec 2025 16:08:48 +0100
Subject: [PATCH 28/33] PyTorch!

---
 run.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/run.py b/run.py
index 2ea30ff..b6cca73 100644
--- a/run.py
+++ b/run.py
@@ -100,11 +100,9 @@ def run_pytorch_benchmark(op_name, N, num_repeats=1, warmup=0):
     else:
         return None
 
-    '''
     # Warmup
     for _ in range(warmup):
         op()
-    '''
 
     if device.type == 'cuda':
         torch.cuda.synchronize()
@@ -128,7 +126,7 @@ def run_pytorch_benchmark(op_name, N, num_repeats=1, warmup=0):
         for _ in range(num_repeats):
             op()
         end_time = time.perf_counter()
-        total_ms = (end_time - start_time) * 1000.0 # convert seconds to ms
+        total_ms = (end_time - start_time) * 1000.0
 
     return total_ms / num_repeats
 

From fda7a01b2d6a83de1f083a62a2489c77814c0dd8 Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Wed, 17 Dec 2025 16:14:44 +0100
Subject: [PATCH 29/33] run.py updates

---
 run.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/run.py b/run.py
index b6cca73..2017c29 100644
--- a/run.py
+++ b/run.py
@@ -165,8 +165,8 @@ def run_cpp_benchmark(executable_path, args):
 
 def main():
     # Build System
-    print(f"\n{'Build System':^80}")
-    print("-" * 80)
+    print(f"\n{'Build System':^100}")
+    print("-" * 100)
     if not build_kernel_tests():
         sys.exit(1)
 
@@ -175,9 +175,9 @@ def main():
     if HAS_TORCH and torch.cuda.is_available():
         device_name = f"GPU ({torch.cuda.get_device_name(0)})"
 
-    print(f"\n{'Benchmarking System':^80}")
-    print(f"{f'PyTorch Device: {device_name}':^80}")
-    print("-" * 80)
+    print(f"\n{'Benchmarking System':^100}")
+    print(f"{f'PyTorch Device: {device_name}':^100}")
+    print("-" * 100)
 
     for EXECUTABLE_PATH in EXECUTABLE_PATHS:
         op_name = get_op_name(EXECUTABLE_PATH)

From 779a2ce36e6d61c62e5494d14be5e63f6208783f Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Wed, 17 Dec 2025 16:34:30 +0100
Subject: [PATCH 30/33] run.py

---
 run.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/run.py b/run.py
index 2017c29..bc40cdf 100644
--- a/run.py
+++ b/run.py
@@ -75,28 +75,34 @@ def run_pytorch_benchmark(op_name, N, num_repeats=1, warmup=0):
     # Setup Data
     if op_name == "trivial":
         x = torch.randn(N, N, device=device, dtype=torch.float32)
-        op = lambda: x.clone()
+        y = torch.empty_like(x)
+        op = lambda: y.copy_(x)
 
     elif op_name == "transpose":
         x = torch.randn(N, N, device=device, dtype=torch.float32)
-        op = lambda: x.t().contiguous()
+        y = torch.empty(N, N, device=device, dtype=torch.float32)
+        op = lambda: y.copy_(x.t())
 
     elif op_name == "concat":
         t1 = torch.randn(N, N, device=device, dtype=torch.float32)
         t2 = torch.randn(N, N, device=device, dtype=torch.float32)
         t3 = torch.randn(N, N, device=device, dtype=torch.float32)
-        op = lambda: torch.cat((t1, t2, t3), dim=1)
+        out_tensor = torch.empty(N, 3*N, device=device, dtype=torch.float32)
+        op = lambda: torch.cat((t1, t2, t3), dim=1, out=out_tensor)
 
     elif op_name == "where":
         cond = torch.randint(0, 2, (N, N), device=device, dtype=torch.bool)
         x = torch.randn(N, N, device=device, dtype=torch.float32)
         y = torch.randn(N, N, device=device, dtype=torch.float32)
-        op = lambda: torch.where(cond, x, y)
+        out_tensor = torch.empty_like(x)
+        op = lambda: torch.where(cond, x, y, out=out_tensor)
 
     elif op_name == "topk":
         k = 4
         x = torch.randn(N, N, device=device, dtype=torch.float32)
-        op = lambda: torch.topk(x, k)
+        values = torch.empty(N, k, device=device, dtype=torch.float32)
+        indices = torch.empty(N, k, device=device, dtype=torch.long)
+        op = lambda: torch.topk(x, k, out=(values, indices))
     else:
         return None
 

From 86e312fbed0744c1012bd653d4fae563af9f2d46 Mon Sep 17 00:00:00 2001
From: Francesco Derme <francesco.derme02@gmail.com>
Date: Wed, 17 Dec 2025 16:38:02 +0100
Subject: [PATCH 31/33] run.py

---
 run.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/run.py b/run.py
index bc40cdf..02f3469 100644
--- a/run.py
+++ b/run.py
@@ -21,6 +21,8 @@
 ]
 
 BENCHMARK_SIZES = [
+    128,
+    256,
     512,
     1024,
     2048,

From 14c1975d126236ec1118d488f5251769b8ed622b Mon Sep 17 00:00:00 2001
From: Saransh Chopra <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 21:09:29 +0530
Subject: [PATCH 32/33] add GPU to run.py

---
 run.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/run.py b/run.py
index cc8908f..c75ceb8 100644
--- a/run.py
+++ b/run.py
@@ -34,7 +34,7 @@
     1024,
     2048,
     4096,
-    # 8192
+    8192
 ]
 
 def build_kernel_tests_cpu():
@@ -217,7 +217,8 @@ def main(gpu = False):
             device_name = f"GPU ({torch.cuda.get_device_name(0)})"
 
         print(f"\n{'Benchmarking System':^80}")
-        print(f"{f'PyTorch Device: {device_name}':^80}")
+        if HAS_TORCH:
+            print(f"{f'PyTorch Device: {device_name}':^80}")
         print("-" * 80)
 
         for EXECUTABLE_PATH in EXECUTABLE_PATHS_GPU:
@@ -227,10 +228,10 @@ def main(gpu = False):
             # Flexible Headers
             # K = Kernel Time, T = Total Time
             if HAS_TORCH:
-                header = (f"{'SIZE':<6} | {'CPP(K)':<9} | {'CPP(T)':<9} | {'TORCH':<9} | "
-                        f"{'CPP GB/s':<9} | {'TORCH GB/s':<11} | {'SPEEDUP':<8}")
+                header = (f"{'SIZE':<6} | {'CUDA(K)':<9} | {'CUDA(T)':<9} | {'TORCH':<9} | "
+                        f"{'CUDA GB/s':<9} | {'TORCH GB/s':<11} | {'SPEEDUP':<8}")
             else:
-                header = (f"{'SIZE':<6} | {'CPP(K)':<10} | {'CPP(T)':<10} | {'CPP GB/s':<12}")
+                header = (f"{'SIZE':<6} | {'CUDA(K)':<10} | {'CUDA(T)':<10} | {'CUDA GB/s':<12}")
 
             print(header)
             print("-" * len(header))
@@ -292,7 +293,8 @@ def main(gpu = False):
             device_name = f"GPU ({torch.cuda.get_device_name(0)})"
 
         print(f"\n{'Benchmarking System':^80}")
-        print(f"{f'PyTorch Device: {device_name}':^80}")
+        if HAS_TORCH:
+            print(f"{f'PyTorch Device: {device_name}':^80}")
         print("-" * 80)
 
         for EXECUTABLE_PATH in EXECUTABLE_PATHS_CPU:
@@ -302,10 +304,10 @@ def main(gpu = False):
             # Flexible Headers
             # K = Kernel Time, T = Total Time
             if HAS_TORCH:
-                header = (f"{'SIZE':<6} | {'CPP(K)':<9} | {'CPP(T)':<9} | {'TORCH':<9} | "
-                        f"{'CPP GB/s':<9} | {'TORCH GB/s':<11} | {'SPEEDUP':<8}")
+                header = (f"{'SIZE':<6} | {'CPU(K)':<9} | {'CPU(T)':<9} | {'TORCH':<9} | "
+                        f"{'CPU GB/s':<9} | {'TORCH GB/s':<11} | {'SPEEDUP':<8}")
             else:
-                header = (f"{'SIZE':<6} | {'CPP(K)':<10} | {'CPP(T)':<10} | {'CPP GB/s':<12}")
+                header = (f"{'SIZE':<6} | {'CPU(K)':<10} | {'CPU(T)':<10} | {'CPU GB/s':<12}")
 
             print(header)
             print("-" * len(header))

From f5c9c66e6354d04af128101867a99703953e45f2 Mon Sep 17 00:00:00 2001
From: Saransh Chopra <saransh0701@gmail.com>
Date: Wed, 17 Dec 2025 21:56:59 +0530
Subject: [PATCH 33/33] fix trivial kernel for GPU

---
 kernels/trivial.hpp    | 32 +++++++++++---------
 tests/test_trivial.cpp | 69 +++++++++++++++++++++++++++++++-----------
 2 files changed, 68 insertions(+), 33 deletions(-)

diff --git a/kernels/trivial.hpp b/kernels/trivial.hpp
index e799152..5be9b67 100644
--- a/kernels/trivial.hpp
+++ b/kernels/trivial.hpp
@@ -7,24 +7,26 @@ namespace alpaka_kernels {
 
 struct TrivialKernel {
     template <typename TAcc, typename T, typename Dim, typename Idx>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output, alpaka::Vec<Dim, Idx> output_strides,
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output, alpaka::Vec<Dim, Idx> /*output_strides*/,
                                   alpaka::Vec<Dim, Idx> output_shape) const {
-        using DimAcc = alpaka::Dim<TAcc>;
-        static_assert(DimAcc::value == Dim::value, "Accelerator and data dimensions must match!");
-
-        constexpr std::size_t D = Dim::value;
-        auto elements = alpaka::uniformElementsND(acc, output_shape);
-
-        for (auto const& idx : elements) {
-            Idx linear_idx = 0;
+        // Get global thread index
+        auto const threadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const threadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+
+        // Convert to linear thread index
+        Idx global_thread_idx = 0;
+        Idx stride = 1;
+        for (std::size_t d = 0; d < Dim::value; ++d) {
+            global_thread_idx += threadIdx[d] * stride;
+            stride *= threadExtent[d];
+        }
 
-            // Compute input and output indexes
-            for (std::size_t d = 0; d < D; ++d) {
-                Idx const coord = idx[d];
-                linear_idx += coord * output_strides[d];
-            }
+        // Total number of elements
+        Idx total_elements = output_shape.prod();
 
-            output[linear_idx] = input[linear_idx];
+        // Simple grid-stride copy for contiguous memory
+        for (Idx i = global_thread_idx; i < total_elements; i += threadExtent.prod()) {
+            output[i] = input[i];
         }
     }
 };
diff --git a/tests/test_trivial.cpp b/tests/test_trivial.cpp
index 9da80a8..d81b7ec 100644
--- a/tests/test_trivial.cpp
+++ b/tests/test_trivial.cpp
@@ -13,23 +13,23 @@ using Idx = std::size_t;
 
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
 using DevAcc = alpaka::DevCudaRt;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::NonBlocking>;
 using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::NonBlocking>;
 
 #elif defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
 using DevAcc = alpaka::DevCpu;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
 using Acc = alpaka::AccCpuTbbBlocks<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
 
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
 using DevAcc = alpaka::DevCpu;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
 using Acc = alpaka::AccCpuSerial<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
 
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
 using DevAcc = alpaka::DevCpu;
-using QueueAcc = alpaka::Queue<DevAcc, alpaka::Blocking>;
 using Acc = alpaka::AccCpuThreads<Dim, Idx>;
+using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
 
 #else
 #error Please define a single one of ALPAKA_ACC_GPU_CUDA_ENABLED, ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED
@@ -88,19 +88,39 @@ int main(int argc, char* argv[]) {
     // Prepare kernel arguments
     auto output_strides = alpaka::Vec<Dim, Idx>(cols, 1);
 
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+    std::size_t threadsX = 1;
+    std::size_t threadsY = 1;
+    std::size_t blocksX = 64;
+    std::size_t blocksY = 1;
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
     // Work division: 2D mapping of threads to elements
     std::size_t threadsX = 16, threadsY = 16;
     std::size_t blocksX = (cols + threadsX - 1) / threadsX;
     std::size_t blocksY = (rows + threadsY - 1) / threadsY;
+#endif
 
+    // Initial data transfer
+    // 1) INPUT -> host buffer (safe via raw pointer)
+    {
+        T* pHost = alpaka::getPtrNative(hIn);
+        for (Idx i = 0; i < numElems; ++i) pHost[i] = INPUT[i];
+    }
+
+    // 2) host -> accelerator
+    {
 #if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
     defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
-
-    threadsX = 1;
-    threadsY = 1;
-    blocksX = 64;
-    blocksY = 1;
+        // For CPU, use memcpy
+        alpaka::memcpy(queue, aIn, hIn);
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        // For GPU, use cudaMemcpy directly
+        T* pAIn = alpaka::getPtrNative(aIn);
+        T* pHIn = alpaka::getPtrNative(hIn);
+        cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
 #endif
+    }
 
     auto const workDiv = alpaka::WorkDivMembers<Dim, Idx>{alpaka::Vec<Dim, Idx>(blocksX, blocksY),
                                                           alpaka::Vec<Dim, Idx>(threadsX, threadsY), extent};
@@ -112,17 +132,21 @@ int main(int argc, char* argv[]) {
 
     alpaka::wait(queue);
 
-    // Initial data transfer
-    // 1) INPUT -> host buffer (safe via raw pointer)
+    // 2) host -> accelerator (again for timing)
+    auto start_total = now();
     {
-        T* pHost = alpaka::getPtrNative(hIn);
-        for (Idx i = 0; i < numElems; ++i) pHost[i] = INPUT[i];
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+        // For CPU, use memcpy
+        alpaka::memcpy(queue, aIn, hIn);
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        // For GPU, use cudaMemcpy directly
+        T* pAIn = alpaka::getPtrNative(aIn);
+        T* pHIn = alpaka::getPtrNative(hIn);
+        cudaMemcpy(pAIn, pHIn, numElems * sizeof(T), cudaMemcpyHostToDevice);
+#endif
     }
 
-    // 2) host -> accelerator
-    auto start_total = now();
-    alpaka::memcpy(queue, aIn, hIn);
-
     // Launch kernel
     auto start_kernel = now();
 
@@ -133,7 +157,16 @@ int main(int argc, char* argv[]) {
     auto end_kernel = now();
 
     // Final data transfer: accelerator -> host
-    alpaka::memcpy(queue, hOut, aOut);
+    {
+#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED) || \
+    defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
+        alpaka::memcpy(queue, hOut, aOut);
+#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+        T* pAOut = alpaka::getPtrNative(aOut);
+        T* pHOut = alpaka::getPtrNative(hOut);
+        cudaMemcpy(pHOut, pAOut, numElems * sizeof(T), cudaMemcpyDeviceToHost);
+#endif
+    }
     auto end_total = now();
 
     // Print result